[feat] - Add Option to Retain False Positives During Detection (#2967)

* provide a mechanism to retain false positive findings * update * reorganize * revert comment * update test * typo * fix test * fix test * update * update
2024-11-10 07:04:24 +00:00 · 2024-06-18 09:40:21 -07:00 · 2024-06-18 09:40:21 -07:00 · 347e8a6683
commit 347e8a6683
parent a0108df67a
4 changed files with 113 additions and 25 deletions
--- a/pkg/detectors/falsepositives.go
+++ b/pkg/detectors/falsepositives.go
@ -2,6 +2,7 @@ package detectors

 import (
 	_ "embed"
+	"fmt"
 	"math"
 	"strings"
 	"unicode"
@ -151,22 +152,26 @@ func FilterResultsWithEntropy(ctx context.Context, results []Result, entropy flo
 }

 // FilterKnownFalsePositives filters out known false positives from the results.
-func FilterKnownFalsePositives(ctx context.Context, detector Detector, results []Result, shouldLog bool) []Result {
+func FilterKnownFalsePositives(ctx context.Context, detector Detector, results []Result) []Result {
 	var filteredResults []Result

 	isFalsePositive := GetFalsePositiveCheck(detector)

 	for _, result := range results {
-		if !result.Verified && result.Raw != nil {
-			isFp, reason := isFalsePositive(result)
-			if !isFp {
-				filteredResults = append(filteredResults, result)
-			} else if shouldLog {
-				ctx.Logger().Info("Filtered out known false positive", "result", result, "reason", reason)
-			}
-		} else {
+		if len(result.Raw) == 0 {
+			ctx.Logger().Error(fmt.Errorf("empty raw"), "invalid result; skipping")
+			continue
+		}
+
+		if result.Verified {
+			filteredResults = append(filteredResults, result)
+			continue
+		}
+
+		if isFp, _ := isFalsePositive(result); !isFp {
 			filteredResults = append(filteredResults, result)
 		}
 	}
+
 	return filteredResults
 }
--- a/pkg/detectors/falsepositives_test.go
+++ b/pkg/detectors/falsepositives_test.go
@ -42,7 +42,7 @@ func TestFilterKnownFalsePositives_DefaultLogic(t *testing.T) {
 	expected := []Result{
 		{Raw: []byte("hga8adshla3434g")},
 	}
-	filtered := FilterKnownFalsePositives(logContext.Background(), fakeDetector{}, results, false)
+	filtered := FilterKnownFalsePositives(logContext.Background(), fakeDetector{}, results)
 	assert.ElementsMatch(t, expected, filtered)
 }

@ -58,7 +58,7 @@ func TestFilterKnownFalsePositives_CustomLogic(t *testing.T) {
 		{Raw: []byte("number")},
 		{Raw: []byte("hga8adshla3434g")},
 	}
-	filtered := FilterKnownFalsePositives(logContext.Background(), customFalsePositiveChecker{}, results, false)
+	filtered := FilterKnownFalsePositives(logContext.Background(), customFalsePositiveChecker{}, results)
 	assert.ElementsMatch(t, expected, filtered)
 }

--- a/pkg/engine/engine.go
+++ b/pkg/engine/engine.go
@ -156,7 +156,7 @@ type Engine struct {
 	notifyVerifiedResults   bool
 	notifyUnverifiedResults bool
 	notifyUnknownResults    bool
-	logFilteredUnverified   bool
+	retainFalsePositives    bool
 	verificationOverlap     bool
 	printAvgDetectorTime    bool
 	// By default, the engine will only scan a subset of the chunk if a detector matches the chunk.
@ -206,7 +206,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
 		filterUnverified:              cfg.FilterUnverified,
 		filterEntropy:                 cfg.FilterEntropy,
 		printAvgDetectorTime:          cfg.PrintAvgDetectorTime,
-		logFilteredUnverified:         cfg.LogFilteredUnverified,
+		retainFalsePositives:          cfg.LogFilteredUnverified,
 		verificationOverlap:           cfg.VerificationOverlap,
 		sourceManager:                 cfg.SourceManager,
 		scanEntireChunk:               cfg.ShouldScanEntireChunk,
@ -279,8 +279,10 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
 		_, ok = results["unverified"]
 		engine.notifyUnverifiedResults = ok

-		_, ok = results["filtered_unverified"]
-		engine.logFilteredUnverified = ok
+		if _, ok = results["filtered_unverified"]; ok {
+			engine.retainFalsePositives = ok
+			engine.notifyUnverifiedResults = ok
+		}
 	}

 	if err := engine.initialize(ctx); err != nil {
@ -893,7 +895,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
 					detectorKeysWithResults[detector.Key] = detector
 				}

-				results = e.filterResults(ctx, detector, results, e.logFilteredUnverified)
+				results = e.filterResults(ctx, detector, results)
 				for _, res := range results {
 					var val []byte
 					if res.RawV2 != nil {
@ -1024,7 +1026,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
 			e.metrics.detectorAvgTime.Store(detectorName, avgTime)
 		}

-		results = e.filterResults(ctx, data.detector, results, e.logFilteredUnverified)
+		results = e.filterResults(ctx, data.detector, results)

 		for _, res := range results {
 			e.processResult(ctx, data, res, isFalsePositive)
@ -1038,16 +1040,17 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {

 func (e *Engine) filterResults(
 	ctx context.Context,
-	detector detectors.Detector,
+	detector *ahocorasick.DetectorMatch,
 	results []detectors.Result,
-	logFilteredUnverified bool,
 ) []detectors.Result {
 	if e.filterUnverified {
 		results = detectors.CleanResults(results)
 	}
-	results = detectors.FilterKnownFalsePositives(ctx, detector, results, logFilteredUnverified)
+	if !e.retainFalsePositives {
+		results = detectors.FilterKnownFalsePositives(ctx, detector.Detector, results)
+	}
 	if e.filterEntropy != 0 {
-		results = detectors.FilterResultsWithEntropy(ctx, results, e.filterEntropy, logFilteredUnverified)
+		results = detectors.FilterResultsWithEntropy(ctx, results, e.filterEntropy, e.retainFalsePositives)
 	}
 	return results
 }
--- a/pkg/engine/engine_test.go
+++ b/pkg/engine/engine_test.go
@ -448,6 +448,43 @@ func TestVerificationOverlapChunk(t *testing.T) {
 	assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount)
 }

+const (
+	TestDetectorType  = -1
+	TestDetectorType2 = -2
+)
+
+var _ detectors.Detector = (*testDetectorV1)(nil)
+
+type testDetectorV1 struct{}
+
+func (testDetectorV1) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
+	result := detectors.Result{
+		DetectorType: TestDetectorType,
+		Raw:          []byte("ssample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"),
+	}
+	return []detectors.Result{result}, nil
+}
+
+func (testDetectorV1) Keywords() []string { return []string{"sample"} }
+
+func (testDetectorV1) Type() detectorspb.DetectorType { return TestDetectorType }
+
+var _ detectors.Detector = (*testDetectorV2)(nil)
+
+type testDetectorV2 struct{}
+
+func (testDetectorV2) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
+	result := detectors.Result{
+		DetectorType: TestDetectorType,
+		Raw:          []byte("sample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"),
+	}
+	return []detectors.Result{result}, nil
+}
+
+func (testDetectorV2) Keywords() []string { return []string{"ample"} }
+
+func (testDetectorV2) Type() detectorspb.DetectorType { return TestDetectorType2 }
+
 func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
 	ctx := context.Background()

@ -457,6 +494,50 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
 	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
 	defer cancel()

+	const defaultOutputBufferSize = 64
+	opts := []func(*sources.SourceManager){
+		sources.WithSourceUnits(),
+		sources.WithBufferedOutput(defaultOutputBufferSize),
+	}
+
+	sourceManager := sources.NewManager(opts...)
+
+	c := Config{
+		Concurrency:   1,
+		Decoders:      decoders.DefaultDecoders(),
+		Detectors:     []detectors.Detector{testDetectorV1{}, testDetectorV2{}},
+		Verify:        false,
+		SourceManager: sourceManager,
+		Dispatcher:    NewPrinterDispatcher(new(discardPrinter)),
+	}
+
+	e, err := NewEngine(ctx, &c)
+	assert.NoError(t, err)
+
+	e.verificationOverlapTracker = new(verificationOverlapTracker)
+
+	e.Start(ctx)
+
+	cfg := sources.FilesystemConfig{Paths: []string{absPath}}
+	err = e.ScanFileSystem(ctx, cfg)
+	assert.NoError(t, err)
+
+	// Wait for all the chunks to be processed.
+	assert.NoError(t, e.Finish(ctx))
+	// We want 0 because the secret is a false positive.
+	want := uint64(0)
+	assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
+}
+
+func TestRetainFalsePositives(t *testing.T) {
+	ctx := context.Background()
+
+	absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
+	assert.NoError(t, err)
+
+	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
 	confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors_fp.yaml")
 	assert.NoError(t, err)
 	conf, err := config.Read(confPath)
@ -477,13 +558,12 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
 		Verify:        false,
 		SourceManager: sourceManager,
 		Dispatcher:    NewPrinterDispatcher(new(discardPrinter)),
+		Results:       map[string]struct{}{"filtered_unverified": {}},
 	}

 	e, err := NewEngine(ctx, &c)
 	assert.NoError(t, err)

-	e.verificationOverlapTracker = new(verificationOverlapTracker)
-
 	e.Start(ctx)

 	cfg := sources.FilesystemConfig{Paths: []string{absPath}}
@ -492,8 +572,8 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {

 	// Wait for all the chunks to be processed.
 	assert.NoError(t, e.Finish(ctx))
-	// We want 0 because the secret is a false positive.
-	want := uint64(0)
+	// We want 1 because the secret is a false positive and we are retaining it.
+	want := uint64(1)
 	assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
 }