[feat] - Add Option to Retain False Positives During Detection (#2967)

* provide a mechanism to retain false positive findings

* update

* reorganize

* revert comment

* update test

* typo

* fix test

* fix test

* update

* update
This commit is contained in:
ahrav 2024-06-18 09:40:21 -07:00 committed by GitHub
parent a0108df67a
commit 347e8a6683
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 113 additions and 25 deletions

View file

@ -2,6 +2,7 @@ package detectors
import (
_ "embed"
"fmt"
"math"
"strings"
"unicode"
@ -151,22 +152,26 @@ func FilterResultsWithEntropy(ctx context.Context, results []Result, entropy flo
}
// FilterKnownFalsePositives filters out known false positives from the results.
func FilterKnownFalsePositives(ctx context.Context, detector Detector, results []Result, shouldLog bool) []Result {
func FilterKnownFalsePositives(ctx context.Context, detector Detector, results []Result) []Result {
var filteredResults []Result
isFalsePositive := GetFalsePositiveCheck(detector)
for _, result := range results {
if !result.Verified && result.Raw != nil {
isFp, reason := isFalsePositive(result)
if !isFp {
filteredResults = append(filteredResults, result)
} else if shouldLog {
ctx.Logger().Info("Filtered out known false positive", "result", result, "reason", reason)
}
} else {
if len(result.Raw) == 0 {
ctx.Logger().Error(fmt.Errorf("empty raw"), "invalid result; skipping")
continue
}
if result.Verified {
filteredResults = append(filteredResults, result)
continue
}
if isFp, _ := isFalsePositive(result); !isFp {
filteredResults = append(filteredResults, result)
}
}
return filteredResults
}

View file

@ -42,7 +42,7 @@ func TestFilterKnownFalsePositives_DefaultLogic(t *testing.T) {
expected := []Result{
{Raw: []byte("hga8adshla3434g")},
}
filtered := FilterKnownFalsePositives(logContext.Background(), fakeDetector{}, results, false)
filtered := FilterKnownFalsePositives(logContext.Background(), fakeDetector{}, results)
assert.ElementsMatch(t, expected, filtered)
}
@ -58,7 +58,7 @@ func TestFilterKnownFalsePositives_CustomLogic(t *testing.T) {
{Raw: []byte("number")},
{Raw: []byte("hga8adshla3434g")},
}
filtered := FilterKnownFalsePositives(logContext.Background(), customFalsePositiveChecker{}, results, false)
filtered := FilterKnownFalsePositives(logContext.Background(), customFalsePositiveChecker{}, results)
assert.ElementsMatch(t, expected, filtered)
}

View file

@ -156,7 +156,7 @@ type Engine struct {
notifyVerifiedResults bool
notifyUnverifiedResults bool
notifyUnknownResults bool
logFilteredUnverified bool
retainFalsePositives bool
verificationOverlap bool
printAvgDetectorTime bool
// By default, the engine will only scan a subset of the chunk if a detector matches the chunk.
@ -206,7 +206,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
filterUnverified: cfg.FilterUnverified,
filterEntropy: cfg.FilterEntropy,
printAvgDetectorTime: cfg.PrintAvgDetectorTime,
logFilteredUnverified: cfg.LogFilteredUnverified,
retainFalsePositives: cfg.LogFilteredUnverified,
verificationOverlap: cfg.VerificationOverlap,
sourceManager: cfg.SourceManager,
scanEntireChunk: cfg.ShouldScanEntireChunk,
@ -279,8 +279,10 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
_, ok = results["unverified"]
engine.notifyUnverifiedResults = ok
_, ok = results["filtered_unverified"]
engine.logFilteredUnverified = ok
if _, ok = results["filtered_unverified"]; ok {
engine.retainFalsePositives = ok
engine.notifyUnverifiedResults = ok
}
}
if err := engine.initialize(ctx); err != nil {
@ -893,7 +895,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
detectorKeysWithResults[detector.Key] = detector
}
results = e.filterResults(ctx, detector, results, e.logFilteredUnverified)
results = e.filterResults(ctx, detector, results)
for _, res := range results {
var val []byte
if res.RawV2 != nil {
@ -1024,7 +1026,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
}
results = e.filterResults(ctx, data.detector, results, e.logFilteredUnverified)
results = e.filterResults(ctx, data.detector, results)
for _, res := range results {
e.processResult(ctx, data, res, isFalsePositive)
@ -1038,16 +1040,17 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
func (e *Engine) filterResults(
ctx context.Context,
detector detectors.Detector,
detector *ahocorasick.DetectorMatch,
results []detectors.Result,
logFilteredUnverified bool,
) []detectors.Result {
if e.filterUnverified {
results = detectors.CleanResults(results)
}
results = detectors.FilterKnownFalsePositives(ctx, detector, results, logFilteredUnverified)
if !e.retainFalsePositives {
results = detectors.FilterKnownFalsePositives(ctx, detector.Detector, results)
}
if e.filterEntropy != 0 {
results = detectors.FilterResultsWithEntropy(ctx, results, e.filterEntropy, logFilteredUnverified)
results = detectors.FilterResultsWithEntropy(ctx, results, e.filterEntropy, e.retainFalsePositives)
}
return results
}

View file

@ -448,6 +448,43 @@ func TestVerificationOverlapChunk(t *testing.T) {
assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount)
}
// Detector type identifiers used by the stub detectors in this test file.
// Both values are negative untyped constants.
const (
// TestDetectorType is returned by testDetectorV1.Type and is also the
// DetectorType set on the results produced by both stub detectors.
TestDetectorType = -1
// TestDetectorType2 is returned by testDetectorV2.Type.
TestDetectorType2 = -2
)
// Compile-time assertion that testDetectorV1 satisfies detectors.Detector.
var _ detectors.Detector = (*testDetectorV1)(nil)

// testDetectorV1 is a stateless stub detector for tests: it ignores whatever
// it is given and always reports the same single result.
type testDetectorV1 struct{}

// FromData ignores the context, the verify flag and the input bytes, and
// returns exactly one result tagged with TestDetectorType whose Raw value is
// a fixed string. The error is always nil.
func (testDetectorV1) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	const secret = "ssample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"
	return []detectors.Result{
		{
			DetectorType: TestDetectorType,
			Raw:          []byte(secret),
		},
	}, nil
}

// Keywords returns the single keyword "sample".
func (testDetectorV1) Keywords() []string {
	return []string{"sample"}
}

// Type reports TestDetectorType as this detector's type.
func (testDetectorV1) Type() detectorspb.DetectorType {
	return TestDetectorType
}
// Compile-time assertion that testDetectorV2 satisfies detectors.Detector.
var _ detectors.Detector = (*testDetectorV2)(nil)

// testDetectorV2 is a second stateless stub detector for tests: it ignores
// whatever it is given and always reports the same single result.
type testDetectorV2 struct{}

// FromData ignores the context, the verify flag and the input bytes, and
// returns exactly one result with a fixed Raw value. The error is always nil.
//
// NOTE(review): the result is tagged with TestDetectorType even though Type
// reports TestDetectorType2 — confirm this mismatch is intentional.
func (testDetectorV2) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	const secret = "sample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"
	return []detectors.Result{
		{
			DetectorType: TestDetectorType,
			Raw:          []byte(secret),
		},
	}, nil
}

// Keywords returns the single keyword "ample".
func (testDetectorV2) Keywords() []string {
	return []string{"ample"}
}

// Type reports TestDetectorType2 as this detector's type.
func (testDetectorV2) Type() detectorspb.DetectorType {
	return TestDetectorType2
}
func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
ctx := context.Background()
@ -457,6 +494,50 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
c := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: []detectors.Detector{testDetectorV1{}, testDetectorV2{}},
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.verificationOverlapTracker = new(verificationOverlapTracker)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
err = e.ScanFileSystem(ctx, cfg)
assert.NoError(t, err)
// Wait for all the chunks to be processed.
assert.NoError(t, e.Finish(ctx))
// We want 0 because the secret is a false positive.
want := uint64(0)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
func TestRetainFalsePositives(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
assert.NoError(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors_fp.yaml")
assert.NoError(t, err)
conf, err := config.Read(confPath)
@ -477,13 +558,12 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
Results: map[string]struct{}{"filtered_unverified": {}},
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.verificationOverlapTracker = new(verificationOverlapTracker)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
@ -492,8 +572,8 @@ func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
// Wait for all the chunks to be processed.
assert.NoError(t, e.Finish(ctx))
// We want 0 because the secret is a false positive.
want := uint64(0)
// We want 1 because the secret is a false positive and we are retaining it.
want := uint64(1)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}