Replace plain decoder with utf8 (#922)

This commit is contained in:
Bill Rich 2022-11-15 09:36:01 -08:00 committed by GitHub
parent 42a82fc7e1
commit d3b24fa592
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 133 additions and 16 deletions

View file

@ -6,7 +6,7 @@ import (
// DefaultDecoders returns a newly allocated slice holding one instance of
// each decoder named in the literal below, in the order written.
func DefaultDecoders() []Decoder {
return []Decoder{
// NOTE(review): this hunk is rendered without +/- markers. Going by the
// commit title ("Replace plain decoder with utf8"), &Plain{} looks like the
// removed line and &UTF8{} its replacement - confirm against the real file.
&Plain{},
&UTF8{},
&Base64{},
}
}

View file

@ -1,14 +0,0 @@
package decoders
import (
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// Compile-time check that *Plain satisfies the Decoder interface.
var _ Decoder = (*Plain)(nil)
// Plain is a stateless decoder that performs no transformation.
type Plain struct{}
// FromChunk returns the chunk it was given, unmodified. A nil chunk is
// returned as nil.
func (d *Plain) FromChunk(chunk *sources.Chunk) *sources.Chunk {
return chunk
}

44
pkg/decoders/utf8.go Normal file
View file

@ -0,0 +1,44 @@
package decoders
import (
"bytes"
"unicode/utf8"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// UTF8 is a stateless decoder. Chunks whose data is valid UTF-8 pass through
// untouched; any other data is reduced to its printable ASCII substrings.
type UTF8 struct{}

// FromChunk returns nil when chunk is nil or carries no data. Otherwise it
// returns the same chunk pointer it was given: unchanged when chunk.Data is
// valid UTF-8, or with chunk.Data overwritten by the result of
// extractSubstrings when it is not.
func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	if utf8.Valid(chunk.Data) {
		return chunk
	}

	// Invalid UTF-8 (e.g. binary content): keep only the readable pieces.
	// The caller's chunk is mutated in place rather than copied.
	chunk.Data = extractSubstrings(chunk.Data)
	return chunk
}
// extractSubstrings works much like the strings binutil: it extracts the
// contiguous runs of printable ASCII from b, trims surrounding whitespace
// from each run, discards runs that are then shorter than six bytes, and
// returns the remaining runs joined by newlines. The result is empty when
// nothing qualifies.
func extractSubstrings(b []byte) []byte {
	// A run ends at any rune outside ASCII space (32) through tilde (126).
	// Bytes that do not decode as UTF-8 also fall outside that range.
	// https://www.rapidtables.com/code/text/ascii-table.html
	isSeparator := func(r rune) bool {
		return r < ' ' || r > '~'
	}

	// Runs shorter than this are dropped.
	const minLen = 6

	var kept [][]byte
	for _, run := range bytes.FieldsFunc(b, isSeparator) {
		trimmed := bytes.TrimSpace(run)
		if len(trimmed) < minLen {
			continue
		}
		kept = append(kept, trimmed)
	}
	return bytes.Join(kept, []byte{'\n'})
}

87
pkg/decoders/utf8_test.go Normal file
View file

@ -0,0 +1,87 @@
package decoders
import (
"testing"
"github.com/kylelemons/godebug/pretty"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// TestUTF8_FromChunk checks UTF8.FromChunk against valid UTF-8 input, input
// containing invalid bytes, and the nil / empty-data inputs for which it
// returns nil.
func TestUTF8_FromChunk(t *testing.T) {
	type args struct {
		chunk *sources.Chunk
	}
	tests := []struct {
		name string
		d    *UTF8
		args args
		want *sources.Chunk
	}{
		{
			name: "successful UTF8 decode",
			d:    &UTF8{},
			args: args{
				chunk: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
			},
			want: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
		},
		{
			name: "successful binary decode",
			d:    &UTF8{},
			args: args{
				chunk: &sources.Chunk{Data: []byte("\xf0\x28\x8c\x28 not-entirely utf8 chunk that should decode successfully")},
			},
			want: &sources.Chunk{Data: []byte("( not-entirely utf8 chunk that should decode successfully")},
		},
		{
			name: "unsuccessful decode",
			d:    &UTF8{},
			args: args{
				chunk: nil,
			},
			want: nil,
		},
		{
			name: "empty chunk data",
			d:    &UTF8{},
			args: args{
				chunk: &sources.Chunk{},
			},
			want: nil,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Use the decoder from the table rather than a fresh one, so the
			// d field is actually exercised.
			got := tt.d.FromChunk(tt.args.chunk)

			// When both sides are non-nil, only the payload is compared.
			if got != nil && tt.want != nil {
				if diff := pretty.Compare(string(got.Data), string(tt.want.Data)); diff != "" {
					t.Errorf("UTF8.FromChunk() diff: (-got +want)\n%s", diff)
				}
				return
			}

			// At least one side is nil: compare the pointers' contents as a
			// whole so that a nil/non-nil mismatch is reported.
			if diff := pretty.Compare(got, tt.want); diff != "" {
				t.Errorf("UTF8.FromChunk() diff: (-got +want)\n%s", diff)
			}
		})
	}
}
// testBytes is the fixed input passed to extractSubstrings by
// Benchmark_extractSubstrings: plain text spread over several lines, some of
// them shorter than six characters.
var testBytes = []byte(`some words with random spaces and
newlines with
arbitrary length
of
hey
the lines themselves.
and
short
words
that
go
away.`)
// benchmarkSink receives the benchmark's final result so the compiler cannot
// treat the extractSubstrings call as dead code.
var benchmarkSink []byte

// Benchmark_extractSubstrings measures extractSubstrings on testBytes and
// reports allocations per operation alongside the timing.
func Benchmark_extractSubstrings(b *testing.B) {
	b.ReportAllocs()

	var out []byte
	for i := 0; i < b.N; i++ {
		out = extractSubstrings(testBytes)
	}
	benchmarkSink = out
}

View file

@ -185,7 +185,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
for _, decoder := range e.decoders {
var decoderType detectorspb.DecoderType
switch decoder.(type) {
case *decoders.Plain:
case *decoders.UTF8:
decoderType = detectorspb.DecoderType_PLAIN
case *decoders.Base64:
decoderType = detectorspb.DecoderType_BASE64