Keyword optimization (#1144)

* init

* ignore trufflehog binary and added comment

* remove unused keywords in chunk, better comment

* remove keywords from engine struct
This commit is contained in:
Zachary Rice 2023-03-02 11:32:37 -06:00 committed by GitHub
parent c72840de67
commit 4777b77ec6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 8 deletions

5
.gitignore vendored
View file

@ -1,4 +1,7 @@
.idea
dist
.env
*.test
*.test
# binary
trufflehog

1
go.mod
View file

@ -120,6 +120,7 @@ require (
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
github.com/onsi/ginkgo v1.16.5 // indirect
github.com/onsi/gomega v1.23.0 // indirect
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 // indirect
github.com/pierrec/lz4/v4 v4.1.14 // indirect
github.com/pjbgf/sha1cd v0.2.3 // indirect
github.com/pkg/diff v0.0.0-20200914180035-5b29258ca4f7 // indirect

2
go.sum
View file

@ -281,6 +281,8 @@ github.com/onsi/gomega v1.23.0/go.mod h1:Z/NWtiqwBrwUt4/2loMmHL63EDLnYHmVbuBpDr2
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs=
github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE=
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 h1:lL+y4Xv20pVlCGyLzNHRC0I0rIHhIL1lTvHizoS/dU8=
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw=
github.com/pierrec/lz4/v4 v4.1.14 h1:+fL8AQEZtz/ijeNnpduH0bROTu0O3NZAlPjQxGn8LwE=
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=

View file

@ -9,6 +9,7 @@ import (
"sync/atomic"
"time"
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
"google.golang.org/protobuf/proto"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
@ -36,6 +37,10 @@ type Engine struct {
// If there are multiple unverified results for the same chunk for the same detector,
// only the first one will be kept.
filterUnverified bool
// prefilter is an Aho-Corasick automaton used for efficient multi-pattern
// string matching against a set of words (the keywords of the loaded detectors)
prefilter ahocorasick.AhoCorasick
}
type EngineOption func(*Engine)
@ -110,7 +115,6 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
}
// Set defaults.
if e.concurrency == 0 {
numCPU := runtime.NumCPU()
ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU)
@ -128,6 +132,23 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
e.detectors[false] = []detectors.Detector{}
}
// build ahocorasick prefilter for efficient string matching
// on keywords
keywords := []string{}
for _, d := range e.detectors[false] {
keywords = append(keywords, d.Keywords()...)
}
for _, d := range e.detectors[true] {
keywords = append(keywords, d.Keywords()...)
}
builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
AsciiCaseInsensitive: true,
MatchOnlyWholeWords: false,
MatchKind: ahocorasick.LeftMostLongestMatch,
DFA: true,
})
e.prefilter = builder.Build(keywords)
ctx.Logger().V(2).Info("loaded decoders", "count", len(e.decoders))
ctx.Logger().V(2).Info("loaded detectors",
"total", len(e.detectors[true])+len(e.detectors[false]),
@ -208,6 +229,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
func (e *Engine) detectorWorker(ctx context.Context) {
for originalChunk := range e.chunks {
for chunk := range sources.Chunker(originalChunk) {
matchedKeywords := make(map[string]struct{})
atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data)))
for _, decoder := range e.decoders {
var decoderType detectorspb.DecoderType
@ -224,21 +246,29 @@ func (e *Engine) detectorWorker(ctx context.Context) {
if decoded == nil {
continue
}
dataLower := strings.ToLower(string(decoded.Data))
matches := e.prefilter.FindAll(dataLower)
for _, m := range matches {
matchedKeywords[dataLower[m.Start():m.End()]] = struct{}{}
}
for verify, detectorsSet := range e.detectors {
for _, detector := range detectorsSet {
start := time.Now()
foundKeyword := false
chunkContainsKeyword := false
for _, kw := range detector.Keywords() {
if strings.Contains(dataLower, strings.ToLower(kw)) {
foundKeyword = true
break
if _, ok := matchedKeywords[strings.ToLower(kw)]; ok {
chunkContainsKeyword = true
}
}
if !foundKeyword {
if !chunkContainsKeyword {
continue
}
start := time.Now()
results, err := func() ([]detectors.Result, error) {
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
defer cancel()