mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 15:14:38 +00:00
Replace plain decoder with utf8 (#922)
This commit is contained in:
parent
42a82fc7e1
commit
d3b24fa592
5 changed files with 133 additions and 16 deletions
|
@ -6,7 +6,7 @@ import (
|
|||
|
||||
// DefaultDecoders returns the list of decoders to use when none are
// specified explicitly.
// NOTE(review): this is a rendered diff hunk, so the removed and added lines
// appear together; per the commit title, &Plain{} is presumably the removed
// entry and &UTF8{} its replacement — confirm against the repository.
func DefaultDecoders() []Decoder {
|
||||
return []Decoder{
|
||||
&Plain{},
|
||||
&UTF8{},
|
||||
&Base64{},
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
package decoders
|
||||
|
||||
import (
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
)
|
||||
|
||||
// Ensure the Decoder satisfies the interface at compile time
|
||||
var _ Decoder = (*Plain)(nil)
|
||||
|
||||
// Plain is a decoder that carries no state and applies no transformation.
type Plain struct{}
|
||||
|
||||
// FromChunk returns the chunk it was given, unchanged (including nil).
func (d *Plain) FromChunk(chunk *sources.Chunk) *sources.Chunk {
|
||||
return chunk
|
||||
}
|
44
pkg/decoders/utf8.go
Normal file
44
pkg/decoders/utf8.go
Normal file
|
@ -0,0 +1,44 @@
|
|||
package decoders
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
)
|
||||
|
||||
type UTF8 struct{}
|
||||
|
||||
func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
|
||||
if chunk == nil || len(chunk.Data) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if !utf8.Valid(chunk.Data) {
|
||||
chunk.Data = extractSubstrings(chunk.Data)
|
||||
return chunk
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractSubstrings works much like the `strings` binutil: it pulls the
// contiguous runs of printable ASCII characters out of b.
//
// The input is split on every rune outside the range ASCII space (32) through
// tilde (126); see https://www.rapidtables.com/code/text/ascii-table.html.
// Each run then has its surrounding whitespace trimmed, and runs shorter than
// six bytes after trimming are discarded. The surviving runs are joined with
// a single newline. When nothing survives, the result is empty.
func extractSubstrings(b []byte) []byte {
	// minLen is the smallest trimmed length a run must have to be kept.
	const minLen = 6

	// isSeparator reports whether r lies outside ' ' through '~'.
	isSeparator := func(r rune) bool {
		return r < ' ' || r > '~'
	}

	var kept [][]byte
	for _, run := range bytes.FieldsFunc(b, isSeparator) {
		trimmed := bytes.TrimSpace(run)
		if len(trimmed) < minLen {
			continue
		}
		kept = append(kept, trimmed)
	}

	return bytes.Join(kept, []byte{'\n'})
}
|
87
pkg/decoders/utf8_test.go
Normal file
87
pkg/decoders/utf8_test.go
Normal file
|
@ -0,0 +1,87 @@
|
|||
package decoders
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/kylelemons/godebug/pretty"
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
)
|
||||
|
||||
func TestUTF8_FromChunk(t *testing.T) {
|
||||
type args struct {
|
||||
chunk *sources.Chunk
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
d *UTF8
|
||||
args args
|
||||
want *sources.Chunk
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "successful UTF8 decode",
|
||||
d: &UTF8{},
|
||||
args: args{
|
||||
chunk: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
|
||||
},
|
||||
want: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "successful binary decode",
|
||||
d: &UTF8{},
|
||||
args: args{
|
||||
chunk: &sources.Chunk{Data: []byte("\xf0\x28\x8c\x28 not-entirely utf8 chunk that should decode successfully")},
|
||||
},
|
||||
want: &sources.Chunk{Data: []byte("( not-entirely utf8 chunk that should decode successfully")},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "unsuccessful decode",
|
||||
d: &UTF8{},
|
||||
args: args{
|
||||
chunk: nil,
|
||||
},
|
||||
want: nil,
|
||||
wantErr: false,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
d := &UTF8{}
|
||||
got := d.FromChunk(tt.args.chunk)
|
||||
if got != nil && tt.want != nil {
|
||||
if diff := pretty.Compare(string(got.Data), string(tt.want.Data)); diff != "" {
|
||||
t.Errorf("%s: Plain.FromChunk() diff: (-got +want)\n%s", tt.name, diff)
|
||||
}
|
||||
} else {
|
||||
if diff := pretty.Compare(got, tt.want); diff != "" {
|
||||
t.Errorf("%s: Plain.FromChunk() diff: (-got +want)\n%s", tt.name, diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// testBytes is the fixture passed to extractSubstrings by
// Benchmark_extractSubstrings.
// NOTE(review): the literal's exact whitespace and blank lines appear to have
// been altered by the diff rendering — confirm against the repository.
var testBytes = []byte(`some words with random spaces and
|
||||
|
||||
newlines with
|
||||
arbitrary length
|
||||
of
|
||||
|
||||
hey
|
||||
|
||||
the lines themselves.
|
||||
|
||||
and
|
||||
short
|
||||
words
|
||||
that
|
||||
go
|
||||
away.`)
|
||||
|
||||
func Benchmark_extractSubstrings(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
extractSubstrings(testBytes)
|
||||
}
|
||||
}
|
|
@ -185,7 +185,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
|
|||
for _, decoder := range e.decoders {
|
||||
var decoderType detectorspb.DecoderType
|
||||
switch decoder.(type) {
|
||||
case *decoders.Plain:
|
||||
case *decoders.UTF8:
|
||||
decoderType = detectorspb.DecoderType_PLAIN
|
||||
case *decoders.Base64:
|
||||
decoderType = detectorspb.DecoderType_BASE64
|
||||
|
|
Loading…
Reference in a new issue