diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index 32a5d5940..33dc20348 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -30,21 +30,25 @@ func (k DetectorKey) Type() detectorspb.DetectorType { return k.detectorType } // spanCalculator is an interface that defines a method for calculating a match span // in the chunk data. This allows for different strategies to be used without changing the core logic. type spanCalculator interface { - calculateSpan(startIdx int64, chunkData []byte, detector detectors.Detector) matchSpan + calculateSpan(params spanCalculationParams) matchSpan +} + +// spanCalculationParams provides the necessary context for calculating match spans, +// including the starting index in the chunk, the chunk data itself, and the detector being used. +type spanCalculationParams struct { + startIdx int64 + chunkData []byte + detector detectors.Detector } // EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data. // This is used when we want to match against the full length of the provided chunk. type EntireChunkSpanCalculator struct{} -// calculateSpans returns the match span as the length of the chunk data, +// calculateSpan returns the match span as the length of the chunk data, // effectively using the entire chunk for matching. -func (e *EntireChunkSpanCalculator) calculateSpan( - startIdx int64, - chunkData []byte, - _ detectors.Detector, -) matchSpan { - return matchSpan{startOffset: startIdx, endOffset: int64(len(chunkData))} +func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { + return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))} } // maxMatchLengthSpanCalculator is a strategy that calculates match spans based on a default max @@ -59,26 +63,22 @@ func newMaxMatchLengthSpanCalculator(maxMatchLength int64) *maxMatchLengthSpanCa // calculateSpans computes the match spans based on the start index and the max match length. // If the detector provides an override value, it uses that instead of the default max match length. -func (m *maxMatchLengthSpanCalculator) calculateSpan( - startIdx int64, - chunkData []byte, - detector detectors.Detector, -) matchSpan { +func (m *maxMatchLengthSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { maxSize := m.maxMatchLength - switch d := detector.(type) { + switch d := params.detector.(type) { case detectors.MultiPartCredentialProvider: maxSize = d.MaxCredentialSpan() case detectors.MaxSecretSizeProvider: maxSize = d.MaxSecretSize() default: // Use the default max match length } - endIdx := startIdx + maxSize - if endIdx > int64(len(chunkData)) { - endIdx = int64(len(chunkData)) + endIdx := params.startIdx + maxSize + if endIdx > int64(len(params.chunkData)) { + endIdx = int64(len(params.chunkData)) } - return matchSpan{startOffset: startIdx, endOffset: endIdx} + return matchSpan{startOffset: params.startIdx, endOffset: endIdx} } // CoreOption is a functional option type for configuring an AhoCorasickCore instance. @@ -232,7 +232,13 @@ func (ac *Core) FindDetectorMatches(chunkData []byte) []*DetectorMatch { detectorMatch := detectorMatches[k] startIdx := m.Pos() - span := ac.spanCalculator.calculateSpan(startIdx, chunkData, detectorMatch.Detector) + span := ac.spanCalculator.calculateSpan( + spanCalculationParams{ + startIdx: startIdx, + chunkData: chunkData, + detector: detectorMatch.Detector, + }, + ) detectorMatch.addMatchSpan(span) } } diff --git a/pkg/engine/ahocorasick/ahocorasickcore_test.go b/pkg/engine/ahocorasick/ahocorasickcore_test.go index fb8d4be52..209ed326e 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore_test.go +++ b/pkg/engine/ahocorasick/ahocorasickcore_test.go @@ -128,6 +128,7 @@ func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) { func TestFindDetectorMatches(t *testing.T) { testCases := []struct { name string + opts []CoreOption detectors []detectors.Detector sampleData string expectedResult map[DetectorKey][][]int64 @@ -174,6 +175,50 @@ func TestFindDetectorMatches(t *testing.T) { CreateDetectorKey(testDetectorV2{}): {{43, 555}, {854, 856}}, }, }, + { + name: "single matchSpan; entireSpanChunkCalculator", + opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})}, + detectors: []detectors.Detector{ + testDetectorV3{}, + }, + sampleData: "This is a sample data containing keyword truffle", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV3{}): {{0, 48}}, + }, + }, + { + name: "Multiple matches overlapping; entireSpanChunkCalculator", + opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})}, + detectors: []detectors.Detector{ + testDetectorV1{}, + }, + sampleData: "This is a sample data containing keyword a", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV1{}): {{0, 42}}, + }, + }, + { + name: "Multiple matches; entireSpanChunkCalculator", + opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})}, + detectors: []detectors.Detector{ + testDetectorV2{}, + }, + sampleData: `This is the first occurrence of the letter a. + Lorem ipsum dolor sit met, consectetur dipiscing elit. Sed uctor, + mgn bibendum bibendum, ugue ugue tincidunt ugue, + eget ultricies ugue ugue id ugue. Meens liquet libero + c libero molestie, nec mlesud ugue ugue eget. Donec + sed ugue. Sed euismod, ugue sit met liqum lcini, + ugue ugue tincidunt ugue, eget ultricies ugue ugue id + ugue. Meens liquet libero c libero molestie, nec + mlesud ugue ugue eget. Donec sed ugue. Sed euismod, + ugue sit met liqum lcini, ugue ugue tincidunt ugue, + eget ultricies ugue ugue id ugue. Meens liquet libero + c libero molestie, nec mlesud ugue ugue eget. This is the second occurrence of the letter a.`, + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV2{}): {{0, 856}}, + }, + }, { name: "No matches", detectors: []detectors.Detector{ @@ -190,7 +235,7 @@ func TestFindDetectorMatches(t *testing.T) { t.Run(tc.name, func(t *testing.T) { t.Parallel() - ac := NewAhoCorasickCore(tc.detectors) + ac := NewAhoCorasickCore(tc.detectors, tc.opts...) detectorMatches := ac.FindDetectorMatches([]byte(tc.sampleData)) // Verify that all matching detectors and their matches are returned.