mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
Keyword optimization (#1144)
* init * ignore trufflehog binary and added comment * remove unused keywords in chunk, better comment * remove keywords from engine struct
This commit is contained in:
parent
c72840de67
commit
4777b77ec6
4 changed files with 44 additions and 8 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,3 +2,6 @@
|
|||
dist
|
||||
.env
|
||||
*.test
|
||||
|
||||
# binary
|
||||
trufflehog
|
||||
|
|
1
go.mod
1
go.mod
|
@ -120,6 +120,7 @@ require (
|
|||
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
|
||||
github.com/onsi/ginkgo v1.16.5 // indirect
|
||||
github.com/onsi/gomega v1.23.0 // indirect
|
||||
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 // indirect
|
||||
github.com/pierrec/lz4/v4 v4.1.14 // indirect
|
||||
github.com/pjbgf/sha1cd v0.2.3 // indirect
|
||||
github.com/pkg/diff v0.0.0-20200914180035-5b29258ca4f7 // indirect
|
||||
|
|
2
go.sum
2
go.sum
|
@ -281,6 +281,8 @@ github.com/onsi/gomega v1.23.0/go.mod h1:Z/NWtiqwBrwUt4/2loMmHL63EDLnYHmVbuBpDr2
|
|||
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
|
||||
github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs=
|
||||
github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE=
|
||||
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 h1:lL+y4Xv20pVlCGyLzNHRC0I0rIHhIL1lTvHizoS/dU8=
|
||||
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw=
|
||||
github.com/pierrec/lz4/v4 v4.1.14 h1:+fL8AQEZtz/ijeNnpduH0bROTu0O3NZAlPjQxGn8LwE=
|
||||
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
||||
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
|
||||
|
|
|
@ -9,6 +9,7 @@ import (
|
|||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
|
||||
"google.golang.org/protobuf/proto"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
||||
|
@ -36,6 +37,10 @@ type Engine struct {
|
|||
// If there are multiple unverified results for the same chunk for the same detector,
|
||||
// only the first one will be kept.
|
||||
filterUnverified bool
|
||||
|
||||
// prefilter is an ahocorasick struct used for doing efficient string
|
||||
// matching given a set of words (keywords from the rules in the config)
|
||||
prefilter ahocorasick.AhoCorasick
|
||||
}
|
||||
|
||||
type EngineOption func(*Engine)
|
||||
|
@ -110,7 +115,6 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
|
|||
}
|
||||
|
||||
// Set defaults.
|
||||
|
||||
if e.concurrency == 0 {
|
||||
numCPU := runtime.NumCPU()
|
||||
ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU)
|
||||
|
@ -128,6 +132,23 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
|
|||
e.detectors[false] = []detectors.Detector{}
|
||||
}
|
||||
|
||||
// build ahocorasick prefilter for efficient string matching
|
||||
// on keywords
|
||||
keywords := []string{}
|
||||
for _, d := range e.detectors[false] {
|
||||
keywords = append(keywords, d.Keywords()...)
|
||||
}
|
||||
for _, d := range e.detectors[true] {
|
||||
keywords = append(keywords, d.Keywords()...)
|
||||
}
|
||||
builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
|
||||
AsciiCaseInsensitive: true,
|
||||
MatchOnlyWholeWords: false,
|
||||
MatchKind: ahocorasick.LeftMostLongestMatch,
|
||||
DFA: true,
|
||||
})
|
||||
e.prefilter = builder.Build(keywords)
|
||||
|
||||
ctx.Logger().V(2).Info("loaded decoders", "count", len(e.decoders))
|
||||
ctx.Logger().V(2).Info("loaded detectors",
|
||||
"total", len(e.detectors[true])+len(e.detectors[false]),
|
||||
|
@ -208,6 +229,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
|
|||
func (e *Engine) detectorWorker(ctx context.Context) {
|
||||
for originalChunk := range e.chunks {
|
||||
for chunk := range sources.Chunker(originalChunk) {
|
||||
matchedKeywords := make(map[string]struct{})
|
||||
atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data)))
|
||||
for _, decoder := range e.decoders {
|
||||
var decoderType detectorspb.DecoderType
|
||||
|
@ -224,21 +246,29 @@ func (e *Engine) detectorWorker(ctx context.Context) {
|
|||
if decoded == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
dataLower := strings.ToLower(string(decoded.Data))
|
||||
matches := e.prefilter.FindAll(dataLower)
|
||||
|
||||
for _, m := range matches {
|
||||
matchedKeywords[dataLower[m.Start():m.End()]] = struct{}{}
|
||||
}
|
||||
|
||||
for verify, detectorsSet := range e.detectors {
|
||||
for _, detector := range detectorsSet {
|
||||
start := time.Now()
|
||||
foundKeyword := false
|
||||
chunkContainsKeyword := false
|
||||
for _, kw := range detector.Keywords() {
|
||||
if strings.Contains(dataLower, strings.ToLower(kw)) {
|
||||
foundKeyword = true
|
||||
break
|
||||
if _, ok := matchedKeywords[strings.ToLower(kw)]; ok {
|
||||
chunkContainsKeyword = true
|
||||
}
|
||||
}
|
||||
if !foundKeyword {
|
||||
|
||||
if !chunkContainsKeyword {
|
||||
continue
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
|
||||
results, err := func() ([]detectors.Result, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
|
||||
defer cancel()
|
||||
|
|
Loading…
Reference in a new issue