[fix] - Refactor Filtering Logic to Fix Known False Positive Handling in Overlapping Cases (#2946)

* Apply the standard result filters to verification overlap results

* add test
This commit is contained in:
ahrav 2024-06-11 07:25:12 -07:00 committed by GitHub
parent 11b80dbdf9
commit 68bea576db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 72 additions and 9 deletions

View file

@ -735,6 +735,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
detectorKeysWithResults[detector.Key] = detector
}
results = e.filterResults(ctx, detector, results, e.logFilteredUnverified)
for _, res := range results {
var val []byte
if res.RawV2 != nil {
@ -844,15 +845,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
}
if e.filterUnverified {
results = detectors.CleanResults(results)
}
results = detectors.FilterKnownFalsePositives(ctx, data.detector, results, e.logFilteredUnverified)
if e.filterEntropy != nil {
results = detectors.FilterResultsWithEntropy(ctx, results, *e.filterEntropy, e.logFilteredUnverified)
}
results = e.filterResults(ctx, data.detector, results, e.logFilteredUnverified)
for _, res := range results {
e.processResult(ctx, data, res)
@ -861,6 +854,26 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
data.wgDoneFn()
}
// filterResults runs the engine's result filters over the results produced by a
// single detector and returns whatever survives. The filters are applied in a
// fixed order:
//
//  1. detectors.CleanResults, only when e.filterUnverified is set;
//  2. detectors.FilterKnownFalsePositives, always;
//  3. detectors.FilterResultsWithEntropy, only when e.filterEntropy is non-nil.
//
// logFilteredUnverified is forwarded unchanged to the false-positive and
// entropy filters. Keeping the sequence in one place lets every stage of the
// detection pipeline share the same filtering behavior.
func (e *Engine) filterResults(
	ctx context.Context,
	detector detectors.Detector,
	results []detectors.Result,
	logFilteredUnverified bool,
) []detectors.Result {
	filtered := results
	if e.filterUnverified {
		filtered = detectors.CleanResults(filtered)
	}

	filtered = detectors.FilterKnownFalsePositives(ctx, detector, filtered, logFilteredUnverified)

	// No entropy threshold configured: nothing further to filter.
	if e.filterEntropy == nil {
		return filtered
	}
	return detectors.FilterResultsWithEntropy(ctx, filtered, *e.filterEntropy, logFilteredUnverified)
}
func (e *Engine) processResult(ctx context.Context, data detectableChunk, res detectors.Result) {
ignoreLinePresent := false
if SupportsLineNumbers(data.chunk.SourceType) {

View file

@ -393,6 +393,41 @@ func TestVerificationOverlapChunk(t *testing.T) {
assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount)
}
// TestVerificationOverlapChunkFalsePositive scans the
// verificationoverlap_secrets_fp.txt fixture using the detectors loaded from
// verificationoverlap_detectors_fp.yaml, with verification disabled and
// verification-overlap tracking enabled, and asserts that no unverified
// secrets are counted in the engine metrics.
func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
	secretsPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
	assert.NoError(t, err)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	detectorsPath, err := filepath.Abs("./testdata/verificationoverlap_detectors_fp.yaml")
	assert.NoError(t, err)
	detectorConf, err := config.Read(detectorsPath)
	assert.NoError(t, err)

	eng, err := Start(ctx,
		WithConcurrency(1),
		WithDecoders(decoders.DefaultDecoders()...),
		WithDetectors(detectorConf.Detectors...),
		WithVerify(false),
		WithPrinter(new(discardPrinter)),
		withVerificationOverlapTracking(),
	)
	assert.NoError(t, err)

	err = eng.ScanFileSystem(ctx, sources.FilesystemConfig{Paths: []string{secretsPath}})
	assert.NoError(t, err)

	// Block until every chunk has been processed before reading metrics.
	assert.NoError(t, eng.Finish(ctx))

	// The only candidate secret is a false positive, so nothing should be reported.
	assert.Equal(t, uint64(0), eng.GetMetrics().UnverifiedSecretsFound)
}
func TestFragmentFirstLineAndLink(t *testing.T) {
tests := []struct {
name string

View file

@ -0,0 +1,13 @@
# config.yaml
detectors:
- name: detector1
keywords:
- sample
regex:
api_key: \b(sample-[a-zA-Z-0-9]{59})\b
- name: detector2
keywords:
- ample
regex:
api_key: \b(ssample-[a-zA-Z-0-9]{59})\b

View file

@ -0,0 +1,2 @@
POSTMAN_API_KEY="ssample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"