Re-add detector version (#2060)

#2010 mistakenly removed detector version tracking from the Aho-Corasick wrapper. This PR re-adds it.
This commit is contained in:
Cody Rose 2023-10-30 15:34:33 -04:00 committed by GitHub
parent 3c2270ae65
commit 45059864f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 120 additions and 12 deletions

View file

@ -9,6 +9,14 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// detectorKey is used to identify a detector in the keywordsToDetectors map.
// Multiple detectors can have the same detector type but different versions.
// This allows us to identify a detector by its type and version.
// It is also the key type of the detectorsByKey map.
type detectorKey struct {
// detectorType is the value returned by the detector's Type method.
detectorType detectorspb.DetectorType
// version is the value returned by Version when the detector implements
// detectors.Versioner, and the zero value otherwise (see createDetectorKey).
version int
}
// AhoCorasickCore encapsulates the operations and data structures used for keyword matching via the
// Aho-Corasick algorithm. It is responsible for constructing and managing the trie for efficient
// substring searches, as well as mapping keywords to their associated detectors for rapid lookups.
@ -21,30 +29,31 @@ type AhoCorasickCore struct {
// type and then again from detector type to detector. We could
// go straight from keywords to detectors but doing it this way makes
// some consuming code a little cleaner.)
keywordsToDetectorTypes map[string][]detectorspb.DetectorType
detectorsByType map[detectorspb.DetectorType]detectors.Detector
keywordsToDetectors map[string][]detectorKey
detectorsByKey map[detectorKey]detectors.Detector
}
// NewAhoCorasickCore allocates and initializes a new instance of AhoCorasickCore. It uses the
// provided detector slice to build a map from lowercased keywords to detector keys (type plus
// version), a map from detector keys to detectors, and the Aho-Corasick prefilter trie.
//
// Detectors are indexed by detectorKey rather than by type alone so that several versions of
// the same detector type can be registered without overwriting one another.
func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
	keywordsToDetectors := make(map[string][]detectorKey)
	detectorsByKey := make(map[detectorKey]detectors.Detector, len(allDetectors))
	var keywords []string
	for _, d := range allDetectors {
		key := createDetectorKey(d)
		detectorsByKey[key] = d
		for _, kw := range d.Keywords() {
			// Keywords are indexed (and added to the trie) in lowercase.
			kwLower := strings.ToLower(kw)
			keywords = append(keywords, kwLower)
			keywordsToDetectors[kwLower] = append(keywordsToDetectors[kwLower], key)
		}
	}

	return &AhoCorasickCore{
		keywordsToDetectors: keywordsToDetectors,
		detectorsByKey:      detectorsByKey,
		prefilter:           *ahocorasick.NewTrieBuilder().AddStrings(keywords).Build(),
	}
}
@ -58,12 +67,23 @@ func (ac *AhoCorasickCore) MatchString(input string) []*ahocorasick.Match {
// This method is designed to reuse the same map for performance optimization,
// reducing the need for repeated allocations within each detector worker in the engine.
func (ac *AhoCorasickCore) PopulateDetectorsByMatch(match *ahocorasick.Match, detectors map[detectorspb.DetectorType]detectors.Detector) bool {
matchedDetectorTypes, ok := ac.keywordsToDetectorTypes[match.MatchString()]
matchedDetectorKeys, ok := ac.keywordsToDetectors[match.MatchString()]
if !ok {
return false
}
for _, t := range matchedDetectorTypes {
detectors[t] = ac.detectorsByType[t]
for _, key := range matchedDetectorKeys {
detectors[key.detectorType] = ac.detectorsByKey[key]
}
return true
}
// createDetectorKey builds the detectorKey for d from its type and version.
// Detectors that do not implement detectors.Versioner get the zero version.
func createDetectorKey(d detectors.Detector) detectorKey {
	key := detectorKey{detectorType: d.Type()}
	if versioned, isVersioned := d.(detectors.Versioner); isVersioned {
		key.version = versioned.Version()
	}
	return key
}

View file

@ -0,0 +1,88 @@
package engine
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// TestDetectorType is the detector type reported by both stub detectors in this file.
// NOTE(review): -1 is presumably chosen so it cannot collide with a real detector type — confirm.
const TestDetectorType = -1
// testDetectorV1 is a stub detector that reports TestDetectorType at version 1
// and advertises the keyword "a".
type testDetectorV1 struct{}

// FromData returns an empty, non-nil result slice and a nil error.
func (testDetectorV1) FromData(ctx context.Context, verify bool, data []byte) ([]detectors.Result, error) {
	return []detectors.Result{}, nil
}

// Keywords returns the single keyword "a".
func (testDetectorV1) Keywords() []string { return []string{"a"} }

// Type returns TestDetectorType.
func (testDetectorV1) Type() detectorspb.DetectorType { return TestDetectorType }

// Version returns 1.
func (testDetectorV1) Version() int { return 1 }
// testDetectorV2 is a stub detector that reports TestDetectorType at version 2
// and advertises the keyword "b".
type testDetectorV2 struct{}

// FromData returns an empty, non-nil result slice and a nil error.
func (testDetectorV2) FromData(ctx context.Context, verify bool, data []byte) ([]detectors.Result, error) {
	return []detectors.Result{}, nil
}

// Keywords returns the single keyword "b".
func (testDetectorV2) Keywords() []string { return []string{"b"} }

// Type returns TestDetectorType.
func (testDetectorV2) Type() detectorspb.DetectorType { return TestDetectorType }

// Version returns 2.
func (testDetectorV2) Version() int { return 2 }
// Compile-time checks that both stub detectors satisfy detectors.Detector and
// detectors.Versioner.
var (
	_ detectors.Detector  = (*testDetectorV1)(nil)
	_ detectors.Detector  = (*testDetectorV2)(nil)
	_ detectors.Versioner = (*testDetectorV1)(nil)
	_ detectors.Versioner = (*testDetectorV2)(nil)
)
// TestAhoCorasickCore_MultipleDetectorVersionsMatchable verifies that two detectors sharing a
// detector type but reporting different versions can both be registered in one AhoCorasickCore
// and that each is returned for its own keyword.
func TestAhoCorasickCore_MultipleDetectorVersionsMatchable(t *testing.T) {
	testCases := []struct {
		matchString string
		detector    detectors.Detector
	}{
		{matchString: "a", detector: testDetectorV1{}},
		{matchString: "b", detector: testDetectorV2{}},
	}

	allDetectors := make([]detectors.Detector, 0, len(testCases))
	for _, tt := range testCases {
		allDetectors = append(allDetectors, tt.detector)
	}

	ac := NewAhoCorasickCore(allDetectors)

	for _, tt := range testCases {
		matches := ac.MatchString(tt.matchString)
		// Skip the rest of this case when the match count is wrong: indexing matches[0]
		// on an empty slice would panic and hide the assertion failure.
		if !assert.Len(t, matches, 1, "keyword %q", tt.matchString) {
			continue
		}

		matchingDetectors := make(map[detectorspb.DetectorType]detectors.Detector)
		populated := ac.PopulateDetectorsByMatch(matches[0], matchingDetectors)
		assert.True(t, populated, "keyword %q should map to a detector", tt.matchString)
		assert.Len(t, matchingDetectors, 1, "keyword %q", tt.matchString)
		assert.Equal(t, tt.detector, matchingDetectors[TestDetectorType])
	}
}