[feat] - Update span calculation logic to use offset magnitude (#2957)

* Add a default start offset

* update

* use keywordIdx
This commit is contained in:
ahrav 2024-06-11 09:12:31 -07:00 committed by GitHub
parent 68bea576db
commit bf77251543
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 227 additions and 41 deletions

View file

@ -38,6 +38,12 @@ type MaxSecretSizeProvider interface {
MaxSecretSize() int64
}
// StartOffsetProvider is an optional interface that a detector can implement to
// control how far before a matched keyword its match span begins. The
// adjustableSpanCalculator subtracts StartOffset() from the keyword's index in
// the chunk and clamps the result at zero, so a larger value extends the span
// further back towards the start of the chunk.
type StartOffsetProvider interface {
// StartOffset returns the distance to look back from the keyword index
// when computing the start of the span.
StartOffset() int64
}
// MultiPartCredentialProvider is an optional interface that a detector can implement
// to indicate its compatibility with multi-part credentials and provide the maximum
// secret size for the credential it finds.

View file

@ -20,6 +20,7 @@ type Scanner struct{}
var _ detectors.Detector = (*Scanner)(nil)
var _ detectors.CustomFalsePositiveChecker = (*Scanner)(nil)
var _ detectors.MaxSecretSizeProvider = (*Scanner)(nil)
var _ detectors.StartOffsetProvider = (*Scanner)(nil)
var (
keyPat = regexp.MustCompile(`\{[^{]+auth_provider_x509_cert_url[^}]+\}`)
@ -50,10 +51,15 @@ func (s Scanner) Keywords() []string {
return []string{"provider_x509"}
}
const maxGCPKeySize = 4096
const maxGCPKeySize = 2048
// ProvideMaxSecretSize returns the maximum size of a secret that this detector can find.
func (s Scanner) MaxSecretSize() int64 { return maxGCPKeySize }
// MaxSecretSize returns the maximum size of a secret that this detector can find.
func (Scanner) MaxSecretSize() int64 { return maxGCPKeySize }
// startOffset is the value reported by StartOffset: the distance before the
// matched keyword at which the span calculated for this detector starts.
// NOTE(review): presumably this is large because the "provider_x509" keyword
// sits well inside the JSON key matched by keyPat — confirm the chosen value.
const startOffset = 4096
// StartOffset returns the start offset for the secret this detector finds.
func (Scanner) StartOffset() int64 { return startOffset }
// FromData will find and optionally verify GCP secrets in a given set of bytes.
func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {

View file

@ -26,6 +26,7 @@ type Scanner struct {
// Ensure the Scanner satisfies the interface at compile time.
var _ detectors.Detector = (*Scanner)(nil)
var _ detectors.MaxSecretSizeProvider = (*Scanner)(nil)
var _ detectors.StartOffsetProvider = (*Scanner)(nil)
var (
defaultClient = common.SaneHttpClient()
@ -48,8 +49,13 @@ func (s Scanner) Keywords() []string {
const maxGCPADCKeySize = 1024
// ProvideMaxSecretSize returns the maximum size of a secret that this detector can find.
func (s Scanner) MaxSecretSize() int64 { return maxGCPADCKeySize }
// MaxSecretSize returns the maximum size of a secret that this detector can find.
func (Scanner) MaxSecretSize() int64 { return maxGCPADCKeySize }
// startOffset is the value reported by StartOffset: the distance before the
// matched keyword at which the span calculated for this detector starts.
// It reuses maxGCPADCKeySize, so the look-back distance equals the maximum
// secret size reported by MaxSecretSize.
const startOffset = maxGCPADCKeySize
// StartOffset returns the start offset for the secret this detector finds.
func (Scanner) StartOffset() int64 { return startOffset }
// FromData will find and optionally verify Gcpapplicationdefaultcredentials secrets in a given set of bytes.
func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {

View file

@ -34,11 +34,11 @@ type spanCalculator interface {
}
// spanCalculationParams provides the necessary context for calculating match spans,
// including the starting index in the chunk, the chunk data itself, and the detector being used.
// including the keyword index in the chunk, the chunk data itself, and the detector being used.
type spanCalculationParams struct {
startIdx int64
chunkData []byte
detector detectors.Detector
keywordIdx int64 // Index of the keyword in the chunk data
chunkData []byte
detector detectors.Detector
}
// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data.
@ -51,34 +51,44 @@ func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams)
return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))}
}
// maxMatchLengthSpanCalculator is a strategy that calculates match spans based on a default max
// match length or values provided by detectors. This allows for more granular control over the match span.
type maxMatchLengthSpanCalculator struct{ maxMatchLength int64 }
// adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude
// or values provided by specific detectors to adjust the start and end indices of the span, allowing
// for more granular control over the match.
//
// offsetMagnitude is the default distance the span extends on each side of the keyword index:
// without detector overrides the span runs from keywordIdx-offsetMagnitude to
// keywordIdx+offsetMagnitude, clamped to the bounds of the chunk data.
type adjustableSpanCalculator struct{ offsetMagnitude int64 }
// newMaxMatchLengthSpanCalculator creates a new instance of maxMatchLengthSpanCalculator with the
// specified max match length.
func newMaxMatchLengthSpanCalculator(maxMatchLength int64) *maxMatchLengthSpanCalculator {
return &maxMatchLengthSpanCalculator{maxMatchLength: maxMatchLength}
// newAdjustableSpanCalculator creates a new instance of adjustableSpanCalculator with the
// specified offset magnitude, i.e. the default distance the span extends before and after
// the keyword index when the detector does not supply its own values.
func newAdjustableSpanCalculator(offsetMagnitude int64) *adjustableSpanCalculator {
	return &adjustableSpanCalculator{offsetMagnitude: offsetMagnitude}
}
// calculateSpans computes the match spans based on the start index and the max match length.
// If the detector provides an override value, it uses that instead of the default max match length.
func (m *maxMatchLengthSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
maxSize := m.maxMatchLength
// calculateSpan computes the match span based on the keyword index and the offset magnitude.
// If the detector provides an override value, it uses that instead of the default offset magnitude to
// calculate the maximum size of the span.
// The start index of the span is also adjusted if the detector provides a start offset.
// The returned span is clamped so that it never starts before index 0 and never ends past
// the length of the chunk data.
func (m *adjustableSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
keywordIdx := params.keywordIdx
switch d := params.detector.(type) {
case detectors.MultiPartCredentialProvider:
maxSize = d.MaxCredentialSpan()
case detectors.MaxSecretSizeProvider:
maxSize = d.MaxSecretSize()
default: // Use the default max match length
// Defaults: the span reaches offsetMagnitude on either side of the keyword.
// Note that maxSize holds the (unclamped) end index of the span, not a length.
maxSize := keywordIdx + m.offsetMagnitude
startOffset := keywordIdx - m.offsetMagnitude
// Check if the detector implements each interface and update values accordingly.
// This CAN'T be done in a switch statement because a detector can implement multiple interfaces.
if provider, ok := params.detector.(detectors.MultiPartCredentialProvider); ok {
maxSize = provider.MaxCredentialSpan() + keywordIdx
startOffset = keywordIdx - provider.MaxCredentialSpan()
}
endIdx := params.startIdx + maxSize
if endIdx > int64(len(params.chunkData)) {
endIdx = int64(len(params.chunkData))
// MaxSecretSizeProvider only moves the end of the span. It is checked after
// MultiPartCredentialProvider, so its value wins if a detector implements both.
if provider, ok := params.detector.(detectors.MaxSecretSizeProvider); ok {
maxSize = provider.MaxSecretSize() + keywordIdx
}
// StartOffsetProvider only moves the start of the span. It is checked last, so it
// overrides the start derived from the default or from MaxCredentialSpan.
if provider, ok := params.detector.(detectors.StartOffsetProvider); ok {
startOffset = keywordIdx - provider.StartOffset()
}
return matchSpan{startOffset: params.startIdx, endOffset: endIdx}
// Clamp both ends to the bounds of the chunk data.
startIdx := max(startOffset, 0)
endIdx := min(maxSize, int64(len(params.chunkData)))
return matchSpan{startOffset: startIdx, endOffset: endIdx}
}
// CoreOption is a functional option type for configuring an AhoCorasickCore instance.
@ -123,19 +133,19 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector, opts ...CoreOption) *
}
}
const maxMatchLength int64 = 512
ac := &Core{
const defaultOffsetRadius int64 = 512
core := &Core{
keywordsToDetectors: keywordsToDetectors,
detectorsByKey: detectorsByKey,
prefilter: *ahocorasick.NewTrieBuilder().AddStrings(keywords).Build(),
spanCalculator: newMaxMatchLengthSpanCalculator(maxMatchLength), // Default span calculator
spanCalculator: newAdjustableSpanCalculator(defaultOffsetRadius), // Default span calculator
}
for _, opt := range opts {
opt(ac)
opt(core)
}
return ac
return core
}
// DetectorMatch represents a detected pattern's metadata in a data chunk.
@ -234,9 +244,9 @@ func (ac *Core) FindDetectorMatches(chunkData []byte) []*DetectorMatch {
startIdx := m.Pos()
span := ac.spanCalculator.calculateSpan(
spanCalculationParams{
startIdx: startIdx,
chunkData: chunkData,
detector: detectorMatch.Detector,
keywordIdx: startIdx,
chunkData: chunkData,
detector: detectorMatch.Detector,
},
)
detectorMatch.addMatchSpan(span)

View file

@ -63,6 +63,64 @@ func (testDetectorV3) Type() detectorspb.DetectorType {
func (testDetectorV3) Version() int { return 1 }
// Compile-time checks that testDetectorV4 satisfies the interfaces it is used to exercise.
var (
	_ detectors.Detector                    = (*testDetectorV4)(nil)
	_ detectors.MultiPartCredentialProvider = (*testDetectorV4)(nil)
	_ detectors.StartOffsetProvider         = (*testDetectorV4)(nil)
)

// testDetectorV4 is a stub detector keyed on "password" that reports both a
// maximum credential span (15) and a start offset (5).
type testDetectorV4 struct{}

// FromData never finds anything; it returns an empty, non-nil result slice.
func (testDetectorV4) FromData(context.Context, bool, []byte) ([]detectors.Result, error) {
	return []detectors.Result{}, nil
}

// Keywords returns the single keyword this stub is matched on.
func (testDetectorV4) Keywords() []string {
	return []string{"password"}
}

// Type returns the shared test detector type.
func (testDetectorV4) Type() detectorspb.DetectorType {
	return TestDetectorType
}

// Version returns the stub's version.
func (testDetectorV4) Version() int {
	return 1
}

// MaxCredentialSpan returns the span override reported through MultiPartCredentialProvider.
func (testDetectorV4) MaxCredentialSpan() int64 {
	return 15
}

// StartOffset returns the look-back override reported through StartOffsetProvider.
func (testDetectorV4) StartOffset() int64 {
	return 5
}
// Compile-time checks that testDetectorV5 satisfies the interfaces it is used to exercise.
var (
	_ detectors.Detector              = (*testDetectorV5)(nil)
	_ detectors.MaxSecretSizeProvider = (*testDetectorV5)(nil)
	_ detectors.StartOffsetProvider   = (*testDetectorV5)(nil)
)

// testDetectorV5 is a stub detector keyed on "password" that reports both a
// maximum secret size (10) and a start offset (3).
type testDetectorV5 struct{}

// FromData never finds anything; it returns an empty, non-nil result slice.
func (testDetectorV5) FromData(context.Context, bool, []byte) ([]detectors.Result, error) {
	return []detectors.Result{}, nil
}

// Keywords returns the single keyword this stub is matched on.
func (testDetectorV5) Keywords() []string {
	return []string{"password"}
}

// Type returns the shared test detector type.
func (testDetectorV5) Type() detectorspb.DetectorType {
	return TestDetectorType
}

// Version returns the stub's version.
func (testDetectorV5) Version() int {
	return 1
}

// MaxSecretSize returns the size override reported through MaxSecretSizeProvider.
func (testDetectorV5) MaxSecretSize() int64 {
	return 10
}

// StartOffset returns the look-back override reported through StartOffsetProvider.
func (testDetectorV5) StartOffset() int64 {
	return 3
}
// Compile-time checks that testDetectorV6 satisfies the interfaces it is used to exercise.
var (
	_ detectors.Detector            = (*testDetectorV6)(nil)
	_ detectors.StartOffsetProvider = (*testDetectorV6)(nil)
)

// testDetectorV6 is a stub detector keyed on "password" that only overrides the
// start offset (1); the end of its span falls back to the calculator's default.
type testDetectorV6 struct{}

// FromData never finds anything; it returns an empty, non-nil result slice.
func (testDetectorV6) FromData(context.Context, bool, []byte) ([]detectors.Result, error) {
	return make([]detectors.Result, 0), nil
}

// Keywords returns the single keyword this stub is matched on.
func (testDetectorV6) Keywords() []string { return []string{"password"} }

// Type returns the shared test detector type.
func (testDetectorV6) Type() detectorspb.DetectorType { return TestDetectorType }

// Version returns the stub's version.
func (testDetectorV6) Version() int { return 1 }

// StartOffset returns the look-back override reported through StartOffsetProvider.
func (testDetectorV6) StartOffset() int64 { return 1 }
var _ detectors.Detector = (*testDetectorV1)(nil)
var _ detectors.Detector = (*testDetectorV2)(nil)
var _ detectors.Versioner = (*testDetectorV1)(nil)
@ -141,7 +199,7 @@ func TestFindDetectorMatches(t *testing.T) {
},
sampleData: "This is a sample data containing keyword truffle",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV3{}): {{41, 48}},
CreateDetectorKey(testDetectorV3{}): {{0, 48}},
},
},
{
@ -151,7 +209,7 @@ func TestFindDetectorMatches(t *testing.T) {
},
sampleData: "This is a sample data containing keyword a",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV1{}): {{8, 42}},
CreateDetectorKey(testDetectorV1{}): {{0, 42}},
},
},
{
@ -172,7 +230,7 @@ func TestFindDetectorMatches(t *testing.T) {
eget ultricies ugue ugue id ugue. Meens liquet libero
c libero molestie, nec mlesud ugue ugue eget. This is the second occurrence of the letter a.`,
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV2{}): {{43, 555}, {854, 856}},
CreateDetectorKey(testDetectorV2{}): {{0, 856}},
},
},
{
@ -219,6 +277,106 @@ func TestFindDetectorMatches(t *testing.T) {
CreateDetectorKey(testDetectorV2{}): {{0, 856}},
},
},
{
name: "keyword in the middle of the credential; MultiPartCredentialProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV4{},
},
sampleData: "This is a password in the middle of some data",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV4{}): {{5, 25}},
},
},
{
name: "keyword at the end of the credential; MultiPartCredentialProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV4{},
},
sampleData: "This data ends with a password",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV4{}): {{17, 30}},
},
},
{
name: "keyword near the start of the data; MultiPartCredentialProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV4{},
},
sampleData: "a password at the start",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV4{}): {{0, 17}},
},
},
{
name: "keyword in the middle of the credential; MaxSecretSizeProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV5{},
},
sampleData: "This is a password in the middle of some data",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV5{}): {{7, 20}},
},
},
{
name: "keyword at the end of the credential; MaxSecretSizeProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV5{},
},
sampleData: "This data ends with a password",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV5{}): {{19, 30}},
},
},
{
name: "keyword near the start of the data; MaxSecretSizeProvider, StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV5{},
},
sampleData: "a password at the start",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV5{}): {{0, 12}},
},
},
{
name: "keyword in the middle of the credential; StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV6{},
},
sampleData: "This is a password in the middle of some data",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV6{}): {{9, 45}},
},
},
{
name: "keyword at the end of the credential; StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV6{},
},
sampleData: "This data ends with a password",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV6{}): {{21, 30}},
},
},
{
name: "keyword near the start of the data; StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV6{},
},
sampleData: "a password at the start",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV6{}): {{1, 23}},
},
},
{
name: "multiple keyword in the middle of the credential; StartOffsetProvider",
detectors: []detectors.Detector{
testDetectorV6{},
},
sampleData: "This is a password in the middle of some data, and another password at the end!",
expectedResult: map[DetectorKey][][]int64{
CreateDetectorKey(testDetectorV6{}): {{9, 79}},
},
},
{
name: "No matches",
detectors: []detectors.Detector{