Replace plain decoder with utf8 (#922)

2024-11-10 15:14:38 +00:00 · 2022-11-15 09:36:01 -08:00 · 2022-11-15 09:36:01 -08:00 · d3b24fa592
commit d3b24fa592
parent 42a82fc7e1
5 changed files with 133 additions and 16 deletions
--- a/pkg/decoders/decoders.go
+++ b/pkg/decoders/decoders.go
@ -6,7 +6,7 @@ import (

 func DefaultDecoders() []Decoder {
 	return []Decoder{
-		&Plain{},
+		&UTF8{}, // takes over the first slot from the removed Plain decoder
 		&Base64{},
 	}
 }
--- a/pkg/decoders/plain.go
+++ b/pkg/decoders/plain.go
@ -1,14 +0,0 @@
-package decoders
-
-import (
-	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
-)
-
-// Ensure the Decoder satisfies the interface at compile time
-var _ Decoder = (*Plain)(nil)
-
-type Plain struct{}
-
-func (d *Plain) FromChunk(chunk *sources.Chunk) *sources.Chunk {
-	return chunk
-}
--- a/pkg/decoders/utf8.go
+++ b/pkg/decoders/utf8.go
@ -0,0 +1,44 @@
+package decoders
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+type UTF8 struct{} // UTF8 is a stateless decoder; FromChunk passes valid UTF-8 through and reduces other data to printable ASCII runs.
+
+func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk { // FromChunk returns nil for a nil or empty chunk, otherwise the same chunk pointer it was given.
+	if chunk == nil || len(chunk.Data) == 0 {
+		return nil
+	}
+
+	if !utf8.Valid(chunk.Data) { // not valid UTF-8: overwrite chunk.Data in place with its printable substrings
+		chunk.Data = extractSubstrings(chunk.Data)
+		return chunk
+	}
+
+	return chunk // valid UTF-8 is handed back unmodified
+}
+
+// extractSubstrings performs similarly to the strings binutil,
+// extracting contiguous runs of printable ASCII (space through tilde)
+// from b; trimmed runs of 6+ bytes are kept and joined with newlines.
+func extractSubstrings(b []byte) []byte {
+	fields := bytes.FieldsFunc(b, func(r rune) bool {
+		// https://www.rapidtables.com/code/text/ascii-table.html
+		// split on anything that is not ascii space through tilde
+		return !(r > 31 && r < 127)
+	})
+
+	keep := [][]byte{}
+	for _, field := range fields {
+		// Drop fields shorter than 6 bytes; length is measured after trimming.
+		if bts := bytes.TrimSpace(field); len(bts) > 5 {
+			keep = append(keep, bts) // the trimmed form is what gets kept
+		}
+	}
+
+	return bytes.Join(keep, []byte("\n")) // one kept field per line
+}
--- a/pkg/decoders/utf8_test.go
+++ b/pkg/decoders/utf8_test.go
@ -0,0 +1,87 @@
+package decoders
+
+import (
+	"testing"
+
+	"github.com/kylelemons/godebug/pretty"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+func TestUTF8_FromChunk(t *testing.T) { // table-driven test of UTF8.FromChunk: valid UTF-8 input, partially invalid input, and a nil chunk
+	type args struct {
+		chunk *sources.Chunk
+	}
+	tests := []struct {
+		name    string
+		d       *UTF8 // not read by the loop below, which constructs its own decoder
+		args    args
+		want    *sources.Chunk
+		wantErr bool // not read by the loop below
+	}{
+		{
+			name: "successful UTF8 decode",
+			d:    &UTF8{},
+			args: args{
+				chunk: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
+			},
+			want:    &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
+			wantErr: false,
+		},
+		{
+			name: "successful binary decode", // input starts with bytes that are not valid UTF-8; only the long printable run is expected back
+			d:    &UTF8{},
+			args: args{
+				chunk: &sources.Chunk{Data: []byte("\xf0\x28\x8c\x28 not-entirely utf8 chunk that should decode successfully")},
+			},
+			want:    &sources.Chunk{Data: []byte("( not-entirely utf8 chunk that should decode successfully")},
+			wantErr: false,
+		},
+		{
+			name: "unsuccessful decode", // a nil chunk is expected to come back as nil
+			d:    &UTF8{},
+			args: args{
+				chunk: nil,
+			},
+			want:    nil,
+			wantErr: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			d := &UTF8{}
+			got := d.FromChunk(tt.args.chunk)
+			if got != nil && tt.want != nil { // both non-nil: compare only the Data payloads, as strings
+				if diff := pretty.Compare(string(got.Data), string(tt.want.Data)); diff != "" {
+					t.Errorf("%s: Plain.FromChunk() diff: (-got +want)\n%s", tt.name, diff) // NOTE(review): message says "Plain" but the decoder under test is UTF8
+				}
+			} else { // at least one side is nil: compare the chunk pointers themselves
+				if diff := pretty.Compare(got, tt.want); diff != "" {
+					t.Errorf("%s: Plain.FromChunk() diff: (-got +want)\n%s", tt.name, diff) // NOTE(review): same stale "Plain" label as above
+				}
+			}
+		})
+	}
+}
+
+var testBytes = []byte(`some words   with random spaces and
+	
+newlines with           
+arbitrary length           
+of
+
+	hey
+
+the lines themselves.
+
+and
+short
+words
+that
+go
+away.`)
+
+func Benchmark_extractSubstrings(b *testing.B) { // times extractSubstrings on the fixed testBytes sample
+	for i := 0; i < b.N; i++ {
+		extractSubstrings(testBytes) // result is discarded
+	}
+}
--- a/pkg/engine/engine.go
+++ b/pkg/engine/engine.go
@ -185,7 +185,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
 			for _, decoder := range e.decoders {
 				var decoderType detectorspb.DecoderType
 				switch decoder.(type) {
-				case *decoders.Plain:
+				case *decoders.UTF8:
 					decoderType = detectorspb.DecoderType_PLAIN
 				case *decoders.Base64:
 					decoderType = detectorspb.DecoderType_BASE64