mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
[fix] - Correctly calculate EntireChunkSpanCalculator span (#2924)
* fix bug when calculating the span for the entire chunk span calculator * fix rename
This commit is contained in:
parent
babe48fdd1
commit
bef4a46b65
2 changed files with 71 additions and 20 deletions
|
@ -30,21 +30,25 @@ func (k DetectorKey) Type() detectorspb.DetectorType { return k.detectorType }
|
|||
// spanCalculator is an interface that defines a method for calculating a match span
|
||||
// in the chunk data. This allows for different strategies to be used without changing the core logic.
|
||||
type spanCalculator interface {
|
||||
calculateSpan(startIdx int64, chunkData []byte, detector detectors.Detector) matchSpan
|
||||
calculateSpan(params spanCalculationParams) matchSpan
|
||||
}
|
||||
|
||||
// spanCalculationParams provides the necessary context for calculating match spans,
|
||||
// including the starting index in the chunk, the chunk data itself, and the detector being used.
|
||||
type spanCalculationParams struct {
|
||||
startIdx int64
|
||||
chunkData []byte
|
||||
detector detectors.Detector
|
||||
}
|
||||
|
||||
// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data.
|
||||
// This is used when we want to match against the full length of the provided chunk.
|
||||
type EntireChunkSpanCalculator struct{}
|
||||
|
||||
// calculateSpans returns the match span as the length of the chunk data,
|
||||
// calculateSpan returns the match span as the length of the chunk data,
|
||||
// effectively using the entire chunk for matching.
|
||||
func (e *EntireChunkSpanCalculator) calculateSpan(
|
||||
startIdx int64,
|
||||
chunkData []byte,
|
||||
_ detectors.Detector,
|
||||
) matchSpan {
|
||||
return matchSpan{startOffset: startIdx, endOffset: int64(len(chunkData))}
|
||||
func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
|
||||
return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))}
|
||||
}
|
||||
|
||||
// maxMatchLengthSpanCalculator is a strategy that calculates match spans based on a default max
|
||||
|
@ -59,26 +63,22 @@ func newMaxMatchLengthSpanCalculator(maxMatchLength int64) *maxMatchLengthSpanCa
|
|||
|
||||
// calculateSpan computes the match span based on the start index and the max match length.
|
||||
// If the detector provides an override value, it uses that instead of the default max match length.
|
||||
func (m *maxMatchLengthSpanCalculator) calculateSpan(
|
||||
startIdx int64,
|
||||
chunkData []byte,
|
||||
detector detectors.Detector,
|
||||
) matchSpan {
|
||||
func (m *maxMatchLengthSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
|
||||
maxSize := m.maxMatchLength
|
||||
|
||||
switch d := detector.(type) {
|
||||
switch d := params.detector.(type) {
|
||||
case detectors.MultiPartCredentialProvider:
|
||||
maxSize = d.MaxCredentialSpan()
|
||||
case detectors.MaxSecretSizeProvider:
|
||||
maxSize = d.MaxSecretSize()
|
||||
default: // Use the default max match length
|
||||
}
|
||||
endIdx := startIdx + maxSize
|
||||
if endIdx > int64(len(chunkData)) {
|
||||
endIdx = int64(len(chunkData))
|
||||
endIdx := params.startIdx + maxSize
|
||||
if endIdx > int64(len(params.chunkData)) {
|
||||
endIdx = int64(len(params.chunkData))
|
||||
}
|
||||
|
||||
return matchSpan{startOffset: startIdx, endOffset: endIdx}
|
||||
return matchSpan{startOffset: params.startIdx, endOffset: endIdx}
|
||||
}
|
||||
|
||||
// CoreOption is a functional option type for configuring an AhoCorasickCore instance.
|
||||
|
@ -232,7 +232,13 @@ func (ac *Core) FindDetectorMatches(chunkData []byte) []*DetectorMatch {
|
|||
|
||||
detectorMatch := detectorMatches[k]
|
||||
startIdx := m.Pos()
|
||||
span := ac.spanCalculator.calculateSpan(startIdx, chunkData, detectorMatch.Detector)
|
||||
span := ac.spanCalculator.calculateSpan(
|
||||
spanCalculationParams{
|
||||
startIdx: startIdx,
|
||||
chunkData: chunkData,
|
||||
detector: detectorMatch.Detector,
|
||||
},
|
||||
)
|
||||
detectorMatch.addMatchSpan(span)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -128,6 +128,7 @@ func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) {
|
|||
func TestFindDetectorMatches(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
opts []CoreOption
|
||||
detectors []detectors.Detector
|
||||
sampleData string
|
||||
expectedResult map[DetectorKey][][]int64
|
||||
|
@ -174,6 +175,50 @@ func TestFindDetectorMatches(t *testing.T) {
|
|||
CreateDetectorKey(testDetectorV2{}): {{43, 555}, {854, 856}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "single matchSpan; entireSpanChunkCalculator",
|
||||
opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})},
|
||||
detectors: []detectors.Detector{
|
||||
testDetectorV3{},
|
||||
},
|
||||
sampleData: "This is a sample data containing keyword truffle",
|
||||
expectedResult: map[DetectorKey][][]int64{
|
||||
CreateDetectorKey(testDetectorV3{}): {{0, 48}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Multiple matches overlapping; entireSpanChunkCalculator",
|
||||
opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})},
|
||||
detectors: []detectors.Detector{
|
||||
testDetectorV1{},
|
||||
},
|
||||
sampleData: "This is a sample data containing keyword a",
|
||||
expectedResult: map[DetectorKey][][]int64{
|
||||
CreateDetectorKey(testDetectorV1{}): {{0, 42}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Multiple matches; entireSpanChunkCalculator",
|
||||
opts: []CoreOption{WithSpanCalculator(&EntireChunkSpanCalculator{})},
|
||||
detectors: []detectors.Detector{
|
||||
testDetectorV2{},
|
||||
},
|
||||
sampleData: `This is the first occurrence of the letter a.
|
||||
Lorem ipsum dolor sit met, consectetur dipiscing elit. Sed uctor,
|
||||
mgn bibendum bibendum, ugue ugue tincidunt ugue,
|
||||
eget ultricies ugue ugue id ugue. Meens liquet libero
|
||||
c libero molestie, nec mlesud ugue ugue eget. Donec
|
||||
sed ugue. Sed euismod, ugue sit met liqum lcini,
|
||||
ugue ugue tincidunt ugue, eget ultricies ugue ugue id
|
||||
ugue. Meens liquet libero c libero molestie, nec
|
||||
mlesud ugue ugue eget. Donec sed ugue. Sed euismod,
|
||||
ugue sit met liqum lcini, ugue ugue tincidunt ugue,
|
||||
eget ultricies ugue ugue id ugue. Meens liquet libero
|
||||
c libero molestie, nec mlesud ugue ugue eget. This is the second occurrence of the letter a.`,
|
||||
expectedResult: map[DetectorKey][][]int64{
|
||||
CreateDetectorKey(testDetectorV2{}): {{0, 856}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "No matches",
|
||||
detectors: []detectors.Detector{
|
||||
|
@ -190,7 +235,7 @@ func TestFindDetectorMatches(t *testing.T) {
|
|||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ac := NewAhoCorasickCore(tc.detectors)
|
||||
ac := NewAhoCorasickCore(tc.detectors, tc.opts...)
|
||||
detectorMatches := ac.FindDetectorMatches([]byte(tc.sampleData))
|
||||
|
||||
// Verify that all matching detectors and their matches are returned.
|
||||
|
|
Loading…
Reference in a new issue