mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 15:14:38 +00:00
Support multiple detectors per match (#2065)
#1711 inadvertently removed the ability to match multiple custom detectors, or multiple detectors of the same type but different versions, to a given keyword. (#2060 re-added support for multiple versions of detectors globally, and #2064 re-added support for multiple custom detectors globally, but neither fixed trufflehog's inability to support multiple such detectors for a given keyword match.) This PR re-adds the removed functionality (and narrows the AhoCorasickCore interface in the process).
This commit is contained in:
parent
600903f391
commit
7a156330b5
3 changed files with 58 additions and 90 deletions
|
@ -9,12 +9,14 @@ import (
|
|||
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
|
||||
)
|
||||
|
||||
// detectorKey is used to identify a detector in the keywordsToDetectors map.
|
||||
// DetectorKey is used to identify a detector in the keywordsToDetectors map.
|
||||
// Multiple detectors can have the same detector type but different versions.
|
||||
// This allows us to identify a detector by its type and version. An
|
||||
// additional (optional) field is provided to disambiguate multiple custom
|
||||
// detectors.
|
||||
type detectorKey struct {
|
||||
// detectors. This type is exported even though none of its fields are so
|
||||
// that the AhoCorasickCore can populate passed-in maps keyed on this type
|
||||
// without exposing any of its internals to consumers.
|
||||
type DetectorKey struct {
|
||||
detectorType detectorspb.DetectorType
|
||||
version int
|
||||
customDetectorName string
|
||||
|
@ -32,16 +34,16 @@ type AhoCorasickCore struct {
|
|||
// type and then again from detector type to detector. We could
|
||||
// go straight from keywords to detectors but doing it this way makes
|
||||
// some consuming code a little cleaner.)
|
||||
keywordsToDetectors map[string][]detectorKey
|
||||
detectorsByKey map[detectorKey]detectors.Detector
|
||||
keywordsToDetectors map[string][]DetectorKey
|
||||
detectorsByKey map[DetectorKey]detectors.Detector
|
||||
}
|
||||
|
||||
// NewAhoCorasickCore allocates and initializes a new instance of AhoCorasickCore. It uses the
|
||||
// provided detector slice to create a map from keywords to detectors and build the Aho-Corasick
|
||||
// prefilter trie.
|
||||
func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
|
||||
keywordsToDetectors := make(map[string][]detectorKey)
|
||||
detectorsByKey := make(map[detectorKey]detectors.Detector, len(allDetectors))
|
||||
keywordsToDetectors := make(map[string][]DetectorKey)
|
||||
detectorsByKey := make(map[DetectorKey]detectors.Detector, len(allDetectors))
|
||||
var keywords []string
|
||||
for _, d := range allDetectors {
|
||||
key := createDetectorKey(d)
|
||||
|
@ -60,29 +62,20 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
|
|||
}
|
||||
}
|
||||
|
||||
// MatchString performs a string match using the Aho-Corasick algorithm, returning an array of matches.
|
||||
// Designed for internal use within the AhoCorasickCore component.
|
||||
func (ac *AhoCorasickCore) MatchString(input string) []*ahocorasick.Match {
|
||||
return ac.prefilter.MatchString(strings.ToLower(input))
|
||||
// PopulateMatchingDetectors populates the given detector map with all the detectors matching the
|
||||
// provided input. This method populates an existing map rather than allocating a new one because
|
||||
// it will be called once per chunk and that many allocations have a noticeable performance cost.
|
||||
func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors map[DetectorKey]detectors.Detector) {
|
||||
for _, m := range ac.prefilter.MatchString(strings.ToLower(chunkData)) {
|
||||
for _, k := range ac.keywordsToDetectors[m.MatchString()] {
|
||||
detectors[k] = ac.detectorsByKey[k]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PopulateDetectorsByMatch populates the given detectorMap based on the Aho-Corasick match results.
|
||||
// This method is designed to reuse the same map for performance optimization,
|
||||
// reducing the need for repeated allocations within each detector worker in the engine.
|
||||
func (ac *AhoCorasickCore) PopulateDetectorsByMatch(match *ahocorasick.Match, detectors map[detectorspb.DetectorType]detectors.Detector) bool {
|
||||
matchedDetectorKeys, ok := ac.keywordsToDetectors[match.MatchString()]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
for _, key := range matchedDetectorKeys {
|
||||
detectors[key.detectorType] = ac.detectorsByKey[key]
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// createDetectorKey creates a unique key for each detector. This key is based on type and version;
|
||||
// it ensures faster lookups and reduces redundancy in our main detector store.
|
||||
func createDetectorKey(d detectors.Detector) detectorKey {
|
||||
// createDetectorKey creates a unique key for each detector from its type, version, and, for
|
||||
// custom regex detectors, its name.
|
||||
func createDetectorKey(d detectors.Detector) DetectorKey {
|
||||
detectorType := d.Type()
|
||||
var version int
|
||||
if v, ok := d.(detectors.Versioner); ok {
|
||||
|
@ -92,5 +85,5 @@ func createDetectorKey(d detectors.Detector) detectorKey {
|
|||
if r, ok := d.(*custom_detectors.CustomRegexWebhook); ok {
|
||||
customDetectorName = r.GetName()
|
||||
}
|
||||
return detectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
|
||||
return DetectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ func (d testDetectorV1) FromData(ctx context.Context, verify bool, data []byte)
|
|||
}
|
||||
|
||||
func (d testDetectorV1) Keywords() []string {
|
||||
return []string{"a"}
|
||||
return []string{"a", "b"}
|
||||
}
|
||||
|
||||
func (d testDetectorV1) Type() detectorspb.DetectorType {
|
||||
|
@ -40,7 +40,7 @@ func (d testDetectorV2) FromData(ctx context.Context, verify bool, data []byte)
|
|||
}
|
||||
|
||||
func (d testDetectorV2) Keywords() []string {
|
||||
return []string{"b"}
|
||||
return []string{"a"}
|
||||
}
|
||||
|
||||
func (d testDetectorV2) Type() detectorspb.DetectorType {
|
||||
|
@ -66,72 +66,51 @@ func TestAhoCorasickCore_MultipleCustomDetectorsMatchable(t *testing.T) {
|
|||
|
||||
customDetector2, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
|
||||
Name: "custom detector 2",
|
||||
Keywords: []string{"b"},
|
||||
Keywords: []string{"a"},
|
||||
Regex: map[string]string{"": ""},
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
|
||||
testCases := []struct {
|
||||
matchString string
|
||||
detector detectors.Detector
|
||||
}{
|
||||
{
|
||||
matchString: "a",
|
||||
detector: customDetector1,
|
||||
},
|
||||
{
|
||||
matchString: "b",
|
||||
detector: customDetector2,
|
||||
},
|
||||
}
|
||||
|
||||
var allDetectors []detectors.Detector
|
||||
for _, tt := range testCases {
|
||||
allDetectors = append(allDetectors, tt.detector)
|
||||
}
|
||||
allDetectors := []detectors.Detector{customDetector1, customDetector2}
|
||||
|
||||
ac := NewAhoCorasickCore(allDetectors)
|
||||
|
||||
for _, tt := range testCases {
|
||||
matches := ac.MatchString(tt.matchString)
|
||||
assert.Equal(t, 1, len(matches))
|
||||
|
||||
matchingDetectors := make(map[detectorspb.DetectorType]detectors.Detector)
|
||||
ac.PopulateDetectorsByMatch(matches[0], matchingDetectors)
|
||||
assert.Equal(t, 1, len(matchingDetectors))
|
||||
assert.Equal(t, tt.detector, matchingDetectors[detectorspb.DetectorType_CustomRegex])
|
||||
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
|
||||
ac.PopulateMatchingDetectors("a", detectorsMap)
|
||||
matchingDetectors := make([]detectors.Detector, 0, 2)
|
||||
for _, d := range detectorsMap {
|
||||
matchingDetectors = append(matchingDetectors, d)
|
||||
}
|
||||
assert.ElementsMatch(t, allDetectors, matchingDetectors)
|
||||
}
|
||||
|
||||
func TestAhoCorasickCore_MultipleDetectorVersionsMatchable(t *testing.T) {
|
||||
testCases := []struct {
|
||||
matchString string
|
||||
detector detectors.Detector
|
||||
}{
|
||||
{
|
||||
matchString: "a",
|
||||
detector: testDetectorV1{},
|
||||
},
|
||||
{
|
||||
matchString: "b",
|
||||
detector: testDetectorV2{},
|
||||
},
|
||||
}
|
||||
|
||||
var allDetectors []detectors.Detector
|
||||
for _, tt := range testCases {
|
||||
allDetectors = append(allDetectors, tt.detector)
|
||||
}
|
||||
v1 := testDetectorV1{}
|
||||
v2 := testDetectorV2{}
|
||||
allDetectors := []detectors.Detector{v1, v2}
|
||||
|
||||
ac := NewAhoCorasickCore(allDetectors)
|
||||
|
||||
for _, tt := range testCases {
|
||||
matches := ac.MatchString(tt.matchString)
|
||||
assert.Equal(t, 1, len(matches))
|
||||
|
||||
matchingDetectors := make(map[detectorspb.DetectorType]detectors.Detector)
|
||||
ac.PopulateDetectorsByMatch(matches[0], matchingDetectors)
|
||||
assert.Equal(t, 1, len(matchingDetectors))
|
||||
assert.Equal(t, tt.detector, matchingDetectors[TestDetectorType])
|
||||
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
|
||||
ac.PopulateMatchingDetectors("a", detectorsMap)
|
||||
matchingDetectors := make([]detectors.Detector, 0, 2)
|
||||
for _, d := range detectorsMap {
|
||||
matchingDetectors = append(matchingDetectors, d)
|
||||
}
|
||||
assert.ElementsMatch(t, allDetectors, matchingDetectors)
|
||||
}
|
||||
|
||||
func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) {
|
||||
d := testDetectorV1{}
|
||||
allDetectors := []detectors.Detector{d}
|
||||
|
||||
ac := NewAhoCorasickCore(allDetectors)
|
||||
|
||||
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
|
||||
ac.PopulateMatchingDetectors("a a b b", detectorsMap)
|
||||
matchingDetectors := make([]detectors.Detector, 0, 2)
|
||||
for _, d := range detectorsMap {
|
||||
matchingDetectors = append(matchingDetectors, d)
|
||||
}
|
||||
assert.ElementsMatch(t, allDetectors, matchingDetectors)
|
||||
}
|
||||
|
|
|
@ -458,7 +458,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
|
|||
|
||||
// Reuse the same map to avoid allocations.
|
||||
const avgDetectorsPerChunk = 2
|
||||
chunkSpecificDetectors := make(map[detectorspb.DetectorType]detectors.Detector, avgDetectorsPerChunk)
|
||||
chunkSpecificDetectors := make(map[DetectorKey]detectors.Detector, avgDetectorsPerChunk)
|
||||
for originalChunk := range e.ChunksChan() {
|
||||
for chunk := range sources.Chunker(originalChunk) {
|
||||
atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data)))
|
||||
|
@ -469,11 +469,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
|
|||
continue
|
||||
}
|
||||
|
||||
for _, match := range e.ahoCorasickCore.MatchString(string(decoded.Chunk.Data)) {
|
||||
if !e.ahoCorasickCore.PopulateDetectorsByMatch(match, chunkSpecificDetectors) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors)
|
||||
|
||||
for k, detector := range chunkSpecificDetectors {
|
||||
decoded.Chunk.Verify = e.verify
|
||||
|
|
Loading…
Reference in a new issue