Support multiple detectors per match (#2065)

#1711 inadvertently removed the ability to match multiple custom detectors, or multiple detectors of the same type but different versions, to a given keyword. (#2060 re-added support for multiple versions of detectors globally, and #2064 re-added support for multiple custom detectors globally, but neither fixed trufflehog's inability to support multiple such detectors for a given keyword match.) This PR re-adds the removed functionality (and narrows the AhoCorasickCore interface in the process).
This commit is contained in:
Cody Rose 2023-11-03 12:26:18 -04:00 committed by GitHub
parent 600903f391
commit 7a156330b5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 58 additions and 90 deletions

View file

@ -9,12 +9,14 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// detectorKey is used to identify a detector in the keywordsToDetectors map.
// DetectorKey is used to identify a detector in the keywordsToDetectors map.
// Multiple detectors can have the same detector type but different versions.
// This allows us to identify a detector by its type and version. An
// additional (optional) field is provided to disambiguate multiple custom
// detectors.
type detectorKey struct {
// detectors. This type is exported, even though none of its fields are, so
// that the AhoCorasickCore can populate passed-in maps keyed on this type
// without exposing any of its internals to consumers.
type DetectorKey struct {
detectorType detectorspb.DetectorType
version int
customDetectorName string
@ -32,16 +34,16 @@ type AhoCorasickCore struct {
// type and then again from detector type to detector. We could
// go straight from keywords to detectors but doing it this way makes
// some consuming code a little cleaner.)
keywordsToDetectors map[string][]detectorKey
detectorsByKey map[detectorKey]detectors.Detector
keywordsToDetectors map[string][]DetectorKey
detectorsByKey map[DetectorKey]detectors.Detector
}
// NewAhoCorasickCore allocates and initializes a new instance of AhoCorasickCore. It uses the
// provided detector slice to create a map from keywords to detectors and build the Aho-Corasick
// prefilter trie.
func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
keywordsToDetectors := make(map[string][]detectorKey)
detectorsByKey := make(map[detectorKey]detectors.Detector, len(allDetectors))
keywordsToDetectors := make(map[string][]DetectorKey)
detectorsByKey := make(map[DetectorKey]detectors.Detector, len(allDetectors))
var keywords []string
for _, d := range allDetectors {
key := createDetectorKey(d)
@ -60,29 +62,20 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
}
}
// MatchString performs a string match using the Aho-Corasick algorithm, returning an array of matches.
// Designed for internal use within the AhoCorasickCore component.
func (ac *AhoCorasickCore) MatchString(input string) []*ahocorasick.Match {
return ac.prefilter.MatchString(strings.ToLower(input))
// PopulateMatchingDetectors adds to the given map every detector that has at least one keyword
// matching the provided chunk data. The chunk data is lower-cased before it is run through the
// prefilter. The caller's map is filled in place, rather than a new map being allocated and
// returned, because this method is called once per chunk and that many allocations have a
// noticeable performance cost. Entries already present in the map are left untouched unless a
// matching detector shares their key.
func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors map[DetectorKey]detectors.Detector) {
	lowered := strings.ToLower(chunkData)
	matches := ac.prefilter.MatchString(lowered)
	for _, match := range matches {
		// A single keyword may belong to several detectors; record each of them.
		keys := ac.keywordsToDetectors[match.MatchString()]
		for _, key := range keys {
			detectors[key] = ac.detectorsByKey[key]
		}
	}
}
// PopulateDetectorsByMatch looks up the keyword of the given Aho-Corasick match and copies every
// detector registered for that keyword into the supplied map, keyed by detector type. It reports
// whether the matched keyword is known to the core. The caller's map is written to in place so
// that the same map can be reused across calls instead of being reallocated.
// Because the map is keyed by detector type alone, detectors that share a type occupy the same
// entry, so only the last one written for that type is kept.
func (ac *AhoCorasickCore) PopulateDetectorsByMatch(match *ahocorasick.Match, detectors map[detectorspb.DetectorType]detectors.Detector) bool {
	keys, found := ac.keywordsToDetectors[match.MatchString()]
	if found {
		for i := range keys {
			detectors[keys[i].detectorType] = ac.detectorsByKey[keys[i]]
		}
	}
	return found
}
// createDetectorKey creates a unique key for each detector. This key is based on type and version;
// it ensures faster lookups and reduces redundancy in our main detector store.
func createDetectorKey(d detectors.Detector) detectorKey {
// createDetectorKey creates a unique key for each detector from its type, version, and, for
// custom regex detectors, its name.
func createDetectorKey(d detectors.Detector) DetectorKey {
detectorType := d.Type()
var version int
if v, ok := d.(detectors.Versioner); ok {
@ -92,5 +85,5 @@ func createDetectorKey(d detectors.Detector) detectorKey {
if r, ok := d.(*custom_detectors.CustomRegexWebhook); ok {
customDetectorName = r.GetName()
}
return detectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
return DetectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
}

View file

@ -21,7 +21,7 @@ func (d testDetectorV1) FromData(ctx context.Context, verify bool, data []byte)
}
func (d testDetectorV1) Keywords() []string {
return []string{"a"}
return []string{"a", "b"}
}
func (d testDetectorV1) Type() detectorspb.DetectorType {
@ -40,7 +40,7 @@ func (d testDetectorV2) FromData(ctx context.Context, verify bool, data []byte)
}
func (d testDetectorV2) Keywords() []string {
return []string{"b"}
return []string{"a"}
}
func (d testDetectorV2) Type() detectorspb.DetectorType {
@ -66,72 +66,51 @@ func TestAhoCorasickCore_MultipleCustomDetectorsMatchable(t *testing.T) {
customDetector2, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
Name: "custom detector 2",
Keywords: []string{"b"},
Keywords: []string{"a"},
Regex: map[string]string{"": ""},
})
assert.Nil(t, err)
testCases := []struct {
matchString string
detector detectors.Detector
}{
{
matchString: "a",
detector: customDetector1,
},
{
matchString: "b",
detector: customDetector2,
},
}
var allDetectors []detectors.Detector
for _, tt := range testCases {
allDetectors = append(allDetectors, tt.detector)
}
allDetectors := []detectors.Detector{customDetector1, customDetector2}
ac := NewAhoCorasickCore(allDetectors)
for _, tt := range testCases {
matches := ac.MatchString(tt.matchString)
assert.Equal(t, 1, len(matches))
matchingDetectors := make(map[detectorspb.DetectorType]detectors.Detector)
ac.PopulateDetectorsByMatch(matches[0], matchingDetectors)
assert.Equal(t, 1, len(matchingDetectors))
assert.Equal(t, tt.detector, matchingDetectors[detectorspb.DetectorType_CustomRegex])
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}
func TestAhoCorasickCore_MultipleDetectorVersionsMatchable(t *testing.T) {
testCases := []struct {
matchString string
detector detectors.Detector
}{
{
matchString: "a",
detector: testDetectorV1{},
},
{
matchString: "b",
detector: testDetectorV2{},
},
}
var allDetectors []detectors.Detector
for _, tt := range testCases {
allDetectors = append(allDetectors, tt.detector)
}
v1 := testDetectorV1{}
v2 := testDetectorV2{}
allDetectors := []detectors.Detector{v1, v2}
ac := NewAhoCorasickCore(allDetectors)
for _, tt := range testCases {
matches := ac.MatchString(tt.matchString)
assert.Equal(t, 1, len(matches))
matchingDetectors := make(map[detectorspb.DetectorType]detectors.Detector)
ac.PopulateDetectorsByMatch(matches[0], matchingDetectors)
assert.Equal(t, 1, len(matchingDetectors))
assert.Equal(t, tt.detector, matchingDetectors[TestDetectorType])
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}
func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) {
d := testDetectorV1{}
allDetectors := []detectors.Detector{d}
ac := NewAhoCorasickCore(allDetectors)
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a a b b", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}

View file

@ -458,7 +458,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
// Reuse the same map to avoid allocations.
const avgDetectorsPerChunk = 2
chunkSpecificDetectors := make(map[detectorspb.DetectorType]detectors.Detector, avgDetectorsPerChunk)
chunkSpecificDetectors := make(map[DetectorKey]detectors.Detector, avgDetectorsPerChunk)
for originalChunk := range e.ChunksChan() {
for chunk := range sources.Chunker(originalChunk) {
atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data)))
@ -469,11 +469,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
continue
}
for _, match := range e.ahoCorasickCore.MatchString(string(decoded.Chunk.Data)) {
if !e.ahoCorasickCore.PopulateDetectorsByMatch(match, chunkSpecificDetectors) {
continue
}
}
e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors)
for k, detector := range chunkSpecificDetectors {
decoded.Chunk.Verify = e.verify