mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
Keyword optimization (#1144)
* init * ignore trufflehog binary and added comment * remove unused keywords in chunk, better comment * remove keywords from engine struct
This commit is contained in:
parent
c72840de67
commit
4777b77ec6
4 changed files with 44 additions and 8 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,4 +1,7 @@
|
||||||
.idea
|
.idea
|
||||||
dist
|
dist
|
||||||
.env
|
.env
|
||||||
*.test
|
*.test
|
||||||
|
|
||||||
|
# binary
|
||||||
|
trufflehog
|
||||||
|
|
1
go.mod
1
go.mod
|
@ -120,6 +120,7 @@ require (
|
||||||
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
|
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
|
||||||
github.com/onsi/ginkgo v1.16.5 // indirect
|
github.com/onsi/ginkgo v1.16.5 // indirect
|
||||||
github.com/onsi/gomega v1.23.0 // indirect
|
github.com/onsi/gomega v1.23.0 // indirect
|
||||||
|
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 // indirect
|
||||||
github.com/pierrec/lz4/v4 v4.1.14 // indirect
|
github.com/pierrec/lz4/v4 v4.1.14 // indirect
|
||||||
github.com/pjbgf/sha1cd v0.2.3 // indirect
|
github.com/pjbgf/sha1cd v0.2.3 // indirect
|
||||||
github.com/pkg/diff v0.0.0-20200914180035-5b29258ca4f7 // indirect
|
github.com/pkg/diff v0.0.0-20200914180035-5b29258ca4f7 // indirect
|
||||||
|
|
2
go.sum
2
go.sum
|
@ -281,6 +281,8 @@ github.com/onsi/gomega v1.23.0/go.mod h1:Z/NWtiqwBrwUt4/2loMmHL63EDLnYHmVbuBpDr2
|
||||||
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
|
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
|
||||||
github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs=
|
github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs=
|
||||||
github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE=
|
github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE=
|
||||||
|
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 h1:lL+y4Xv20pVlCGyLzNHRC0I0rIHhIL1lTvHizoS/dU8=
|
||||||
|
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw=
|
||||||
github.com/pierrec/lz4/v4 v4.1.14 h1:+fL8AQEZtz/ijeNnpduH0bROTu0O3NZAlPjQxGn8LwE=
|
github.com/pierrec/lz4/v4 v4.1.14 h1:+fL8AQEZtz/ijeNnpduH0bROTu0O3NZAlPjQxGn8LwE=
|
||||||
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
||||||
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
|
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
|
||||||
|
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
|
||||||
"google.golang.org/protobuf/proto"
|
"google.golang.org/protobuf/proto"
|
||||||
|
|
||||||
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
||||||
|
@ -36,6 +37,10 @@ type Engine struct {
|
||||||
// If there are multiple unverified results for the same chunk for the same detector,
|
// If there are multiple unverified results for the same chunk for the same detector,
|
||||||
// only the first one will be kept.
|
// only the first one will be kept.
|
||||||
filterUnverified bool
|
filterUnverified bool
|
||||||
|
|
||||||
|
// prefilter is an ahocorasick struct used for doing efficient string
|
||||||
|
// matching given a set of words (keywords from the rules in the config)
|
||||||
|
prefilter ahocorasick.AhoCorasick
|
||||||
}
|
}
|
||||||
|
|
||||||
type EngineOption func(*Engine)
|
type EngineOption func(*Engine)
|
||||||
|
@ -110,7 +115,6 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set defaults.
|
// Set defaults.
|
||||||
|
|
||||||
if e.concurrency == 0 {
|
if e.concurrency == 0 {
|
||||||
numCPU := runtime.NumCPU()
|
numCPU := runtime.NumCPU()
|
||||||
ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU)
|
ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU)
|
||||||
|
@ -128,6 +132,23 @@ func Start(ctx context.Context, options ...EngineOption) *Engine {
|
||||||
e.detectors[false] = []detectors.Detector{}
|
e.detectors[false] = []detectors.Detector{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// build ahocorasick prefilter for efficient string matching
|
||||||
|
// on keywords
|
||||||
|
keywords := []string{}
|
||||||
|
for _, d := range e.detectors[false] {
|
||||||
|
keywords = append(keywords, d.Keywords()...)
|
||||||
|
}
|
||||||
|
for _, d := range e.detectors[true] {
|
||||||
|
keywords = append(keywords, d.Keywords()...)
|
||||||
|
}
|
||||||
|
builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
|
||||||
|
AsciiCaseInsensitive: true,
|
||||||
|
MatchOnlyWholeWords: false,
|
||||||
|
MatchKind: ahocorasick.LeftMostLongestMatch,
|
||||||
|
DFA: true,
|
||||||
|
})
|
||||||
|
e.prefilter = builder.Build(keywords)
|
||||||
|
|
||||||
ctx.Logger().V(2).Info("loaded decoders", "count", len(e.decoders))
|
ctx.Logger().V(2).Info("loaded decoders", "count", len(e.decoders))
|
||||||
ctx.Logger().V(2).Info("loaded detectors",
|
ctx.Logger().V(2).Info("loaded detectors",
|
||||||
"total", len(e.detectors[true])+len(e.detectors[false]),
|
"total", len(e.detectors[true])+len(e.detectors[false]),
|
||||||
|
@ -208,6 +229,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
|
||||||
func (e *Engine) detectorWorker(ctx context.Context) {
|
func (e *Engine) detectorWorker(ctx context.Context) {
|
||||||
for originalChunk := range e.chunks {
|
for originalChunk := range e.chunks {
|
||||||
for chunk := range sources.Chunker(originalChunk) {
|
for chunk := range sources.Chunker(originalChunk) {
|
||||||
|
matchedKeywords := make(map[string]struct{})
|
||||||
atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data)))
|
atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data)))
|
||||||
for _, decoder := range e.decoders {
|
for _, decoder := range e.decoders {
|
||||||
var decoderType detectorspb.DecoderType
|
var decoderType detectorspb.DecoderType
|
||||||
|
@ -224,21 +246,29 @@ func (e *Engine) detectorWorker(ctx context.Context) {
|
||||||
if decoded == nil {
|
if decoded == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
dataLower := strings.ToLower(string(decoded.Data))
|
dataLower := strings.ToLower(string(decoded.Data))
|
||||||
|
matches := e.prefilter.FindAll(dataLower)
|
||||||
|
|
||||||
|
for _, m := range matches {
|
||||||
|
matchedKeywords[dataLower[m.Start():m.End()]] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
for verify, detectorsSet := range e.detectors {
|
for verify, detectorsSet := range e.detectors {
|
||||||
for _, detector := range detectorsSet {
|
for _, detector := range detectorsSet {
|
||||||
start := time.Now()
|
chunkContainsKeyword := false
|
||||||
foundKeyword := false
|
|
||||||
for _, kw := range detector.Keywords() {
|
for _, kw := range detector.Keywords() {
|
||||||
if strings.Contains(dataLower, strings.ToLower(kw)) {
|
if _, ok := matchedKeywords[strings.ToLower(kw)]; ok {
|
||||||
foundKeyword = true
|
chunkContainsKeyword = true
|
||||||
break
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !foundKeyword {
|
|
||||||
|
if !chunkContainsKeyword {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
results, err := func() ([]detectors.Result, error) {
|
results, err := func() ([]detectors.Result, error) {
|
||||||
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
|
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
Loading…
Reference in a new issue