From 4777b77ec6f92bd4f19eee819b73a339343644f3 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 2 Mar 2023 11:32:37 -0600 Subject: [PATCH] Keyword optimization (#1144) * init * ignore trufflehog binary and added comment * remove unused keywords in chunk, better comment * remove keywords from engine struct --- .gitignore | 5 ++++- go.mod | 1 + go.sum | 2 ++ pkg/engine/engine.go | 44 +++++++++++++++++++++++++++++++++++++------- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 416391864..e9a3dd8a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ .idea dist .env -*.test \ No newline at end of file +*.test + +# binary +trufflehog diff --git a/go.mod b/go.mod index 4a810d439..03a726868 100644 --- a/go.mod +++ b/go.mod @@ -120,6 +120,7 @@ require ( github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect github.com/onsi/ginkgo v1.16.5 // indirect github.com/onsi/gomega v1.23.0 // indirect + github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect github.com/pjbgf/sha1cd v0.2.3 // indirect github.com/pkg/diff v0.0.0-20200914180035-5b29258ca4f7 // indirect diff --git a/go.sum b/go.sum index 7b61144b1..2304f479f 100644 --- a/go.sum +++ b/go.sum @@ -281,6 +281,8 @@ github.com/onsi/gomega v1.23.0/go.mod h1:Z/NWtiqwBrwUt4/2loMmHL63EDLnYHmVbuBpDr2 github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= +github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 h1:lL+y4Xv20pVlCGyLzNHRC0I0rIHhIL1lTvHizoS/dU8= +github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw= github.com/pierrec/lz4/v4 v4.1.14 
h1:+fL8AQEZtz/ijeNnpduH0bROTu0O3NZAlPjQxGn8LwE= github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 6b179adec..211e79c1e 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -9,6 +9,7 @@ import ( "sync/atomic" "time" + ahocorasick "github.com/petar-dambovaliev/aho-corasick" "google.golang.org/protobuf/proto" "github.com/trufflesecurity/trufflehog/v3/pkg/common" @@ -36,6 +37,10 @@ type Engine struct { // If there are multiple unverified results for the same chunk for the same detector, // only the first one will be kept. filterUnverified bool + + // prefilter is an ahocorasick struct used for doing efficient string + // matching given a set of words (keywords from the rules in the config) + prefilter ahocorasick.AhoCorasick } type EngineOption func(*Engine) @@ -110,7 +115,6 @@ func Start(ctx context.Context, options ...EngineOption) *Engine { } // Set defaults. - if e.concurrency == 0 { numCPU := runtime.NumCPU() ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU) @@ -128,6 +132,23 @@ func Start(ctx context.Context, options ...EngineOption) *Engine { e.detectors[false] = []detectors.Detector{} } + // build ahocorasick prefilter for efficient string matching + // on keywords + keywords := []string{} + for _, d := range e.detectors[false] { + keywords = append(keywords, d.Keywords()...) + } + for _, d := range e.detectors[true] { + keywords = append(keywords, d.Keywords()...) 
+ } + builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{ + AsciiCaseInsensitive: true, + MatchOnlyWholeWords: false, + MatchKind: ahocorasick.LeftMostLongestMatch, + DFA: true, + }) + e.prefilter = builder.Build(keywords) + ctx.Logger().V(2).Info("loaded decoders", "count", len(e.decoders)) ctx.Logger().V(2).Info("loaded detectors", "total", len(e.detectors[true])+len(e.detectors[false]), @@ -208,6 +229,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration { func (e *Engine) detectorWorker(ctx context.Context) { for originalChunk := range e.chunks { for chunk := range sources.Chunker(originalChunk) { + matchedKeywords := make(map[string]struct{}) atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data))) for _, decoder := range e.decoders { var decoderType detectorspb.DecoderType @@ -224,21 +246,29 @@ func (e *Engine) detectorWorker(ctx context.Context) { if decoded == nil { continue } + dataLower := strings.ToLower(string(decoded.Data)) + matches := e.prefilter.FindAll(dataLower) + + for _, m := range matches { + matchedKeywords[dataLower[m.Start():m.End()]] = struct{}{} + } + for verify, detectorsSet := range e.detectors { for _, detector := range detectorsSet { - start := time.Now() - foundKeyword := false + chunkContainsKeyword := false for _, kw := range detector.Keywords() { - if strings.Contains(dataLower, strings.ToLower(kw)) { - foundKeyword = true - break + if _, ok := matchedKeywords[strings.ToLower(kw)]; ok { + chunkContainsKeyword = true } } - if !foundKeyword { + + if !chunkContainsKeyword { continue } + start := time.Now() + results, err := func() ([]detectors.Result, error) { ctx, cancel := context.WithTimeout(ctx, time.Second*10) defer cancel()