trufflehog/pkg/engine/ahocorasickcore.go
Cody Rose 7a156330b5
Support multiple detectors per match (#2065)
#1711 inadvertently removed the ability to match multiple custom detectors, or multiple detectors of the same type but different version, to a given keyword. (#2060 re-added support for multiple versions of detectors globally, and #2064 re-added support for multiple custom detectors globally, but neither fixed trufflehog's inability to support multiple such detectors for a given keyword match.) This PR re-adds the removed functionality (and narrows the AhoCorasickCore interface in the process.)
2023-11-03 12:26:18 -04:00

89 lines
3.8 KiB
Go

package engine
import (
"strings"
ahocorasick "github.com/BobuSumisu/aho-corasick"
"github.com/trufflesecurity/trufflehog/v3/pkg/custom_detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// DetectorKey is used to identify a detector in the keywordsToDetectors map.
// Multiple detectors can have the same detector type but different versions.
// This allows us to identify a detector by its type and version. An
// additional (optional) field is provided to disambiguate multiple custom
// detectors. This type is exported even though none of its fields are so
// that the AhoCorasickCore can populate passed-in maps keyed on this type
// without exposing any of its internals to consumers.
type DetectorKey struct {
detectorType detectorspb.DetectorType
version int
customDetectorName string
}
// AhoCorasickCore encapsulates the operations and data structures used for keyword matching via the
// Aho-Corasick algorithm. It is responsible for constructing and managing the trie for efficient
// substring searches, as well as mapping keywords to their associated detectors for rapid lookups.
type AhoCorasickCore struct {
// prefilter is a ahocorasick struct used for doing efficient string
// matching given a set of words. (keywords from the rules in the config)
prefilter ahocorasick.Trie
// Maps for efficient lookups during detection.
// (This implementation maps in two layers: from keywords to detector
// type and then again from detector type to detector. We could
// go straight from keywords to detectors but doing it this way makes
// some consuming code a little cleaner.)
keywordsToDetectors map[string][]DetectorKey
detectorsByKey map[DetectorKey]detectors.Detector
}
// NewAhoCorasickCore allocates and initializes a new instance of AhoCorasickCore. It uses the
// provided detector slice to create a map from keywords to detectors and build the Aho-Corasick
// prefilter trie.
func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
keywordsToDetectors := make(map[string][]DetectorKey)
detectorsByKey := make(map[DetectorKey]detectors.Detector, len(allDetectors))
var keywords []string
for _, d := range allDetectors {
key := createDetectorKey(d)
detectorsByKey[key] = d
for _, kw := range d.Keywords() {
kwLower := strings.ToLower(kw)
keywords = append(keywords, kwLower)
keywordsToDetectors[kwLower] = append(keywordsToDetectors[kwLower], key)
}
}
return &AhoCorasickCore{
keywordsToDetectors: keywordsToDetectors,
detectorsByKey: detectorsByKey,
prefilter: *ahocorasick.NewTrieBuilder().AddStrings(keywords).Build(),
}
}
// PopulateMatchingDetectors populates the given detector slice with all the detectors matching the
// provided input. This method populates an existing map rather than allocating a new one because
// it will be called once per chunk and that many allocations has a noticeable performance cost.
func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors map[DetectorKey]detectors.Detector) {
for _, m := range ac.prefilter.MatchString(strings.ToLower(chunkData)) {
for _, k := range ac.keywordsToDetectors[m.MatchString()] {
detectors[k] = ac.detectorsByKey[k]
}
}
}
// createDetectorKey creates a unique key for each detector from its type, version, and, for
// custom regex detectors, its name.
func createDetectorKey(d detectors.Detector) DetectorKey {
detectorType := d.Type()
var version int
if v, ok := d.(detectors.Versioner); ok {
version = v.Version()
}
var customDetectorName string
if r, ok := d.(*custom_detectors.CustomRegexWebhook); ok {
customDetectorName = r.GetName()
}
return DetectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
}