trufflehog/pkg/engine/engine_test.go
Cody Rose f39a5254ff
Customize results cleaning (using smuggled interface) (#3235)
We have identified some cases in which the results "cleaning" logic (the logic that eliminates superfluous results) should not run. In order to allow this, we need to expose the cleaning logic to the engine. This PR does so by doing these things:

- Create a CustomResultsCleaner interface that can be implemented by detectors that want to use custom cleaning logic
- Implement this interface for the aws and awssessionkey detectors (and remove their previous invocation of their custom cleaning logic)
- Modify the engine to invoke this logic (conditionally)

This PR also removes the "custom" cleaning logic for the opsgenie, razorpay, and twilio detectors, because it was added erroneously.

This is an alternative implementation of #3233.
2024-08-21 09:42:20 -04:00

1035 lines
30 KiB
Go

package engine
import (
aCtx "context"
"fmt"
"math/rand"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/gitlab/v2"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/config"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/custom_detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/decoders"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// fakeDetectorKeyword is the single keyword returned by Keywords() on both fake detectors.
const fakeDetectorKeyword = "fakedetector"

// fakeDetectorV1 and fakeDetectorV2 are stub detectors that differ only in the
// version they report and in the raw secret they return.
type (
	fakeDetectorV1 struct{}
	fakeDetectorV2 struct{}
)

// Compile-time assertions that both stubs implement detectors.Detector and detectors.Versioner.
var (
	_ detectors.Detector  = (*fakeDetectorV1)(nil)
	_ detectors.Versioner = (*fakeDetectorV1)(nil)
	_ detectors.Detector  = (*fakeDetectorV2)(nil)
	_ detectors.Versioner = (*fakeDetectorV2)(nil)
)
// FromData ignores all of its arguments and always reports one verified result
// whose raw value is "fake secret v1".
func (fakeDetectorV1) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	secret := detectors.Result{
		DetectorType: detectorspb.DetectorType(-1),
		Verified:     true,
		Raw:          []byte("fake secret v1"),
	}
	return []detectors.Result{secret}, nil
}

// Keywords returns the shared fake detector keyword.
func (fakeDetectorV1) Keywords() []string {
	return []string{fakeDetectorKeyword}
}

// Type returns the placeholder detector type -1.
func (fakeDetectorV1) Type() detectorspb.DetectorType {
	return detectorspb.DetectorType(-1)
}

// Version reports version 1.
func (fakeDetectorV1) Version() int {
	return 1
}
// FromData ignores all of its arguments and always reports one verified result
// whose raw value is "fake secret v2".
func (fakeDetectorV2) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	secret := detectors.Result{
		DetectorType: detectorspb.DetectorType(-1),
		Verified:     true,
		Raw:          []byte("fake secret v2"),
	}
	return []detectors.Result{secret}, nil
}

// Keywords returns the shared fake detector keyword.
func (fakeDetectorV2) Keywords() []string {
	return []string{fakeDetectorKeyword}
}

// Type returns the placeholder detector type -1.
func (fakeDetectorV2) Type() detectorspb.DetectorType {
	return detectorspb.DetectorType(-1)
}

// Version reports version 2.
func (fakeDetectorV2) Version() int {
	return 2
}
// TestFragmentLineOffset checks the line offset and the ignore flag returned by
// FragmentLineOffset for secrets located on different lines of a chunk, with and
// without a "trufflehog:ignore" marker.
func TestFragmentLineOffset(t *testing.T) {
	// Each case carries the chunk contents and the raw secret as plain strings;
	// the chunk and result values are assembled inside the subtest.
	cases := []struct {
		name       string
		data       string
		raw        string
		wantLine   int64
		wantIgnore bool
	}{
		{
			name:       "ignore found on same line",
			data:       "line1\nline2\nsecret here trufflehog:ignore\nline4",
			raw:        "secret here",
			wantLine:   2,
			wantIgnore: true,
		},
		{
			name:       "no ignore",
			data:       "line1\nline2\nsecret here\nline4",
			raw:        "secret here",
			wantLine:   2,
			wantIgnore: false,
		},
		{
			name:       "ignore on different line",
			data:       "line1\nline2\ntrufflehog:ignore\nline4\nsecret here\nline6",
			raw:        "secret here",
			wantLine:   4,
			wantIgnore: false,
		},
		{
			name:       "match on consecutive lines",
			data:       "line1\nline2\ntrufflehog:ignore\nline4\nsecret\nhere\nline6",
			raw:        "secret\nhere",
			wantLine:   4,
			wantIgnore: false,
		},
		{
			name:       "ignore on last consecutive lines",
			data:       "line1\nline2\nline3\nsecret\nhere // trufflehog:ignore\nline5",
			raw:        "secret\nhere",
			wantLine:   3,
			wantIgnore: true,
		},
		{
			name:       "ignore on last line",
			data:       "line1\nline2\nline3\nsecret here // trufflehog:ignore",
			raw:        "secret here",
			wantLine:   3,
			wantIgnore: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			chunk := &sources.Chunk{Data: []byte(tc.data)}
			result := &detectors.Result{Raw: []byte(tc.raw)}

			gotLine, gotIgnore := FragmentLineOffset(chunk, result)
			if gotLine != tc.wantLine {
				t.Errorf("Expected line offset to be %d, got %d", tc.wantLine, gotLine)
			}
			if gotIgnore != tc.wantIgnore {
				t.Errorf("Expected isIgnored to be %v, got %v", tc.wantIgnore, gotIgnore)
			}
		})
	}
}
// setupFragmentLineOffsetBench builds a chunk of totalLines newline-terminated lines.
// Every line reads "line<i>" except the one at index needleLine, which contains
// "needle". It returns the chunk together with a result whose Raw value is that needle.
func setupFragmentLineOffsetBench(totalLines, needleLine int) (*sources.Chunk, *detectors.Result) {
	needle := []byte("needle")
	buf := make([]byte, 0, 4096)

	for lineNo := 0; lineNo < totalLines; lineNo++ {
		if lineNo == needleLine {
			buf = append(buf, needle...)
			buf = append(buf, '\n')
		} else {
			buf = append(buf, fmt.Sprintf("line%d\n", lineNo)...)
		}
	}

	return &sources.Chunk{Data: buf}, &detectors.Result{Raw: needle}
}
// BenchmarkFragmentLineOffsetStart measures FragmentLineOffset when the needle
// sits at line index 2 of a 512-line chunk.
func BenchmarkFragmentLineOffsetStart(b *testing.B) {
	const (
		totalLines = 512
		needleLine = 2
	)
	chunk, result := setupFragmentLineOffsetBench(totalLines, needleLine)
	for n := b.N; n > 0; n-- {
		FragmentLineOffset(chunk, result)
	}
}
// BenchmarkFragmentLineOffsetMiddle measures FragmentLineOffset when the needle
// sits at line index 256 of a 512-line chunk.
func BenchmarkFragmentLineOffsetMiddle(b *testing.B) {
	const (
		totalLines = 512
		needleLine = 256
	)
	chunk, result := setupFragmentLineOffsetBench(totalLines, needleLine)
	for n := b.N; n > 0; n-- {
		FragmentLineOffset(chunk, result)
	}
}
// BenchmarkFragmentLineOffsetEnd measures FragmentLineOffset when the needle
// sits at line index 510 of a 512-line chunk.
func BenchmarkFragmentLineOffsetEnd(b *testing.B) {
	const (
		totalLines = 512
		needleLine = 510
	)
	chunk, result := setupFragmentLineOffsetBench(totalLines, needleLine)
	for n := b.N; n > 0; n-- {
		FragmentLineOffset(chunk, result)
	}
}
// TestDefaultDecoders makes sure that DefaultDecoders always returns the UTF8 decoder first.
// Technically a decoder test but we want this to run and fail in CI.
func TestDefaultDecoders(t *testing.T) {
	ds := decoders.DefaultDecoders()
	// Guard the index below: an empty list should fail the test with a clear
	// message rather than panic with an index-out-of-range.
	if len(ds) == 0 {
		t.Fatal("DefaultDecoders() returned no decoders, expected UTF8 decoder to be first")
	}
	if _, ok := ds[0].(*decoders.UTF8); !ok {
		t.Errorf("DefaultDecoders() = %v, expected UTF8 decoder to be first", ds)
	}
}
// TestSupportsLineNumbers verifies, per source type, whether SupportsLineNumbers
// reports line-number support.
func TestSupportsLineNumbers(t *testing.T) {
	cases := []struct {
		name   string
		source sourcespb.SourceType
		want   bool
	}{
		{name: "Git source", source: sourcespb.SourceType_SOURCE_TYPE_GIT, want: true},
		{name: "Github source", source: sourcespb.SourceType_SOURCE_TYPE_GITHUB, want: true},
		{name: "Gitlab source", source: sourcespb.SourceType_SOURCE_TYPE_GITLAB, want: true},
		{name: "Bitbucket source", source: sourcespb.SourceType_SOURCE_TYPE_BITBUCKET, want: true},
		{name: "Gerrit source", source: sourcespb.SourceType_SOURCE_TYPE_GERRIT, want: true},
		{name: "Github unauthenticated org source", source: sourcespb.SourceType_SOURCE_TYPE_GITHUB_UNAUTHENTICATED_ORG, want: true},
		{name: "Public Git source", source: sourcespb.SourceType_SOURCE_TYPE_PUBLIC_GIT, want: true},
		{name: "Filesystem source", source: sourcespb.SourceType_SOURCE_TYPE_FILESYSTEM, want: true},
		{name: "Azure Repos source", source: sourcespb.SourceType_SOURCE_TYPE_AZURE_REPOS, want: true},
		{name: "Unsupported type", source: sourcespb.SourceType_SOURCE_TYPE_BUILDKITE, want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, SupportsLineNumbers(tc.source))
		})
	}
}
// BenchmarkSupportsLineNumbersLoop measures repeated SupportsLineNumbers lookups
// for the GitHub source type.
func BenchmarkSupportsLineNumbersLoop(b *testing.B) {
	const src = sourcespb.SourceType_SOURCE_TYPE_GITHUB
	for n := b.N; n > 0; n-- {
		SupportsLineNumbers(src)
	}
}
// TestEngine_DuplicateSecrets is a test that detects ALL duplicate secrets with the same decoder.
// It scans testdata/secrets.txt with the default detectors, verification disabled,
// and expects five unverified secrets to be counted.
func TestEngine_DuplicateSecrets(t *testing.T) {
	ctx := context.Background()

	absPath, err := filepath.Abs("./testdata/secrets.txt")
	assert.Nil(t, err)

	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	const defaultOutputBufferSize = 64
	opts := []func(*sources.SourceManager){
		sources.WithSourceUnits(),
		sources.WithBufferedOutput(defaultOutputBufferSize),
	}

	sourceManager := sources.NewManager(opts...)

	conf := Config{
		Concurrency:   1,
		Decoders:      decoders.DefaultDecoders(),
		Detectors:     DefaultDetectors(),
		Verify:        false,
		SourceManager: sourceManager,
		Dispatcher:    NewPrinterDispatcher(new(discardPrinter)),
	}

	e, err := NewEngine(ctx, &conf)
	// Stop here on failure: the engine value cannot be used if construction failed.
	if err != nil {
		t.Fatalf("creating engine: %v", err)
	}

	e.Start(ctx)

	cfg := sources.FilesystemConfig{Paths: []string{absPath}}
	// A failed scan must fail the test; returning silently would report a pass
	// without ever reaching the metric assertion below.
	if err := e.ScanFileSystem(ctx, cfg); err != nil {
		t.Fatalf("scanning %q: %v", absPath, err)
	}

	// Wait for all the chunks to be processed.
	assert.Nil(t, e.Finish(ctx))
	want := uint64(5)
	assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
// TestEngine_VersionedDetectorsVerifiedSecrets is a test that detects ALL verified secrets across
// versioned detectors. It writes the fake detector keyword to a temporary file, scans it with
// both fake detector versions, and expects two verified secrets to be counted.
func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
	defer cancel()

	tmpFile, err := os.CreateTemp("", "testfile")
	// Stop here on failure: tmpFile is not usable if creation failed.
	if err != nil {
		t.Fatalf("creating temp file: %v", err)
	}
	// Deferred calls run last-in-first-out, so the file is closed before it is removed.
	defer os.Remove(tmpFile.Name())
	defer tmpFile.Close()

	_, err = tmpFile.WriteString(fmt.Sprintf("test data using keyword %s", fakeDetectorKeyword))
	assert.NoError(t, err)

	const defaultOutputBufferSize = 64
	opts := []func(*sources.SourceManager){
		sources.WithSourceUnits(),
		sources.WithBufferedOutput(defaultOutputBufferSize),
	}

	sourceManager := sources.NewManager(opts...)

	conf := Config{
		Concurrency:   1,
		Decoders:      decoders.DefaultDecoders(),
		Detectors:     []detectors.Detector{new(fakeDetectorV1), new(fakeDetectorV2)},
		Verify:        true,
		SourceManager: sourceManager,
		Dispatcher:    NewPrinterDispatcher(new(discardPrinter)),
	}

	e, err := NewEngine(ctx, &conf)
	if err != nil {
		t.Fatalf("creating engine: %v", err)
	}

	e.Start(ctx)

	cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}}
	// A failed scan must fail the test; returning silently would report a pass
	// without ever reaching the metric assertion below.
	if err := e.ScanFileSystem(ctx, cfg); err != nil {
		t.Fatalf("scanning %q: %v", tmpFile.Name(), err)
	}

	assert.NoError(t, e.Finish(ctx))
	want := uint64(2)
	assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound)
}
// TestEngine_CustomDetectorsDetectorsVerifiedSecrets is a test that covers an edge case where there are
// multiple detectors with the same type, keywords and regex that match the same secret.
// This ensures that those secrets get verified.
func TestEngine_CustomDetectorsDetectorsVerifiedSecrets(t *testing.T) {
	tmpFile, err := os.CreateTemp("", "testfile")
	// Stop here on failure: tmpFile is not usable if creation failed.
	if err != nil {
		t.Fatalf("creating temp file: %v", err)
	}
	// Deferred calls run last-in-first-out, so the file is closed before it is removed.
	defer os.Remove(tmpFile.Name())
	defer tmpFile.Close()

	_, err = tmpFile.WriteString("test stuff")
	assert.Nil(t, err)

	// Verification endpoint that answers every request with 200 OK.
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer ts.Close()

	customDetector1, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
		Name:     "custom detector 1",
		Keywords: []string{"test"},
		Regex:    map[string]string{"test": "\\w+"},
		Verify:   []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}},
	})
	assert.Nil(t, err)

	customDetector2, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
		Name:     "custom detector 2",
		Keywords: []string{"test"},
		Regex:    map[string]string{"test": "\\w+"},
		Verify:   []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}},
	})
	assert.Nil(t, err)

	allDetectors := []detectors.Detector{customDetector1, customDetector2}

	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
	defer cancel()

	const defaultOutputBufferSize = 64
	opts := []func(*sources.SourceManager){
		sources.WithSourceUnits(),
		sources.WithBufferedOutput(defaultOutputBufferSize),
	}

	sourceManager := sources.NewManager(opts...)

	conf := Config{
		Concurrency:   1,
		Decoders:      decoders.DefaultDecoders(),
		Detectors:     allDetectors,
		Verify:        true,
		SourceManager: sourceManager,
		Dispatcher:    NewPrinterDispatcher(new(discardPrinter)),
	}

	e, err := NewEngine(ctx, &conf)
	if err != nil {
		t.Fatalf("creating engine: %v", err)
	}

	e.Start(ctx)

	cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}}
	// A failed scan must fail the test; returning silently would report a pass
	// without ever reaching the metric assertion below.
	if err := e.ScanFileSystem(ctx, cfg); err != nil {
		t.Fatalf("scanning %q: %v", tmpFile.Name(), err)
	}

	assert.Nil(t, e.Finish(ctx))
	// We should have 4 verified secrets, 2 for each custom detector.
	want := uint64(4)
	assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound)
}
// TestVerificationOverlapChunk scans testdata/verificationoverlap_secrets.txt using the detectors
// read from testdata/verificationoverlap_detectors.yaml. It expects two unverified secrets and
// no verification-overlap duplicates to be recorded.
func TestVerificationOverlapChunk(t *testing.T) {
	ctx := context.Background()

	absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets.txt")
	assert.Nil(t, err)

	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors.yaml")
	assert.Nil(t, err)
	conf, err := config.Read(confPath)
	// Stop here on failure: conf.Detectors is read below.
	if err != nil {
		t.Fatalf("reading config %q: %v", confPath, err)
	}

	const defaultOutputBufferSize = 64
	opts := []func(*sources.SourceManager){
		sources.WithSourceUnits(),
		sources.WithBufferedOutput(defaultOutputBufferSize),
	}

	sourceManager := sources.NewManager(opts...)

	c := Config{
		Concurrency:      1,
		Decoders:         decoders.DefaultDecoders(),
		Detectors:        conf.Detectors,
		IncludeDetectors: "904", // isolate this test to only the custom detectors provided
		Verify:           false,
		SourceManager:    sourceManager,
		Dispatcher:       NewPrinterDispatcher(new(discardPrinter)),
	}

	e, err := NewEngine(ctx, &c)
	if err != nil {
		t.Fatalf("creating engine: %v", err)
	}

	// Install a fresh tracker so the duplicate count asserted below starts at zero.
	e.verificationOverlapTracker = new(verificationOverlapTracker)

	e.Start(ctx)

	cfg := sources.FilesystemConfig{Paths: []string{absPath}}
	// A failed scan must fail the test; returning silently would report a pass
	// without ever reaching the assertions below.
	if err := e.ScanFileSystem(ctx, cfg); err != nil {
		t.Fatalf("scanning %q: %v", absPath, err)
	}

	// Wait for all the chunks to be processed.
	assert.Nil(t, e.Finish(ctx))
	// We want TWO secrets that match both the custom regexes.
	want := uint64(2)
	assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)

	// We want 0 because these are custom detectors and verification should still occur.
	wantDupe := 0
	assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount)
}
// Placeholder detector type values reported by the test detectors.
const (
	TestDetectorType  = -1
	TestDetectorType2 = -2
)

// Compile-time assertion that testDetectorV1 implements detectors.Detector.
var _ detectors.Detector = (*testDetectorV1)(nil)

// testDetectorV1 is a stub detector keyed on "sample" that always reports the same raw value.
type testDetectorV1 struct{}

// FromData ignores all of its arguments and returns one fixed result of type TestDetectorType.
func (testDetectorV1) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	return []detectors.Result{
		{
			DetectorType: TestDetectorType,
			Raw:          []byte("ssample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"),
		},
	}, nil
}

// Keywords returns the single keyword "sample".
func (testDetectorV1) Keywords() []string {
	return []string{"sample"}
}

// Type returns TestDetectorType.
func (testDetectorV1) Type() detectorspb.DetectorType {
	return TestDetectorType
}
// Compile-time assertion that testDetectorV2 implements detectors.Detector.
var _ detectors.Detector = (*testDetectorV2)(nil)

// testDetectorV2 is a stub detector keyed on "ample" that always reports the same raw value.
type testDetectorV2 struct{}

// FromData ignores all of its arguments and returns one fixed result. The result
// carries TestDetectorType even though Type() reports TestDetectorType2.
func (testDetectorV2) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
	return []detectors.Result{
		{
			DetectorType: TestDetectorType,
			Raw:          []byte("sample-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r"),
		},
	}, nil
}

// Keywords returns the single keyword "ample".
func (testDetectorV2) Keywords() []string {
	return []string{"ample"}
}

// Type returns TestDetectorType2.
func (testDetectorV2) Type() detectorspb.DetectorType {
	return TestDetectorType2
}
func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
assert.NoError(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
c := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: []detectors.Detector{testDetectorV1{}, testDetectorV2{}},
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.verificationOverlapTracker = new(verificationOverlapTracker)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
err = e.ScanFileSystem(ctx, cfg)
assert.NoError(t, err)
// Wait for all the chunks to be processed.
assert.NoError(t, e.Finish(ctx))
// We want 0 because the secret is a false positive.
want := uint64(0)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
func TestRetainFalsePositives(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
assert.NoError(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors_fp.yaml")
assert.NoError(t, err)
conf, err := config.Read(confPath)
assert.NoError(t, err)
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
c := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: conf.Detectors,
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
Results: map[string]struct{}{"filtered_unverified": {}},
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
err = e.ScanFileSystem(ctx, cfg)
assert.NoError(t, err)
// Wait for all the chunks to be processed.
assert.NoError(t, e.Finish(ctx))
// We want 1 because the secret is a false positive and we are retaining it.
want := uint64(1)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
func TestFragmentFirstLineAndLink(t *testing.T) {
tests := []struct {
name string
chunk *sources.Chunk
expectedLine int64
expectedLink string
}{
{
name: "Test Git Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Git{
Git: &source_metadatapb.Git{
Line: 10,
},
},
},
},
expectedLine: 10,
expectedLink: "", // Git doesn't support links
},
{
name: "Test Github Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{
Github: &source_metadatapb.Github{
Line: 5,
Link: "https://example.github.com",
},
},
},
},
expectedLine: 5,
expectedLink: "https://example.github.com",
},
{
name: "Test Azure Repos Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_AzureRepos{
AzureRepos: &source_metadatapb.AzureRepos{
Line: 5,
Link: "https://example.azure.com",
},
},
},
},
expectedLine: 5,
expectedLink: "https://example.azure.com",
},
{
name: "Unsupported Type",
chunk: &sources.Chunk{},
expectedLine: 0,
expectedLink: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
line, linePtr, link := FragmentFirstLineAndLink(tt.chunk)
assert.Equal(t, tt.expectedLink, link, "Mismatch in link")
assert.Equal(t, tt.expectedLine, line, "Mismatch in line")
if linePtr != nil {
assert.Equal(t, tt.expectedLine, *linePtr, "Mismatch in linePtr value")
}
})
}
}
// TestSetLink checks that UpdateLink writes the expected line-qualified link into each
// supported metadata type, and that it reports an error for the cases marked wantErr.
func TestSetLink(t *testing.T) {
	tests := []struct {
		name     string
		input    *source_metadatapb.MetaData
		link     string
		line     int64
		wantLink string
		wantErr  bool
	}{
		{
			name: "Github link set",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Github{
					Github: &source_metadatapb.Github{},
				},
			},
			link:     "https://github.com/example",
			line:     42,
			wantLink: "https://github.com/example#L42",
		},
		{
			name: "Gitlab link set",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Gitlab{
					Gitlab: &source_metadatapb.Gitlab{},
				},
			},
			link:     "https://gitlab.com/example",
			line:     10,
			wantLink: "https://gitlab.com/example#L10",
		},
		{
			name: "Bitbucket link set",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Bitbucket{
					Bitbucket: &source_metadatapb.Bitbucket{},
				},
			},
			link:     "https://bitbucket.com/example",
			line:     8,
			wantLink: "https://bitbucket.com/example#L8",
		},
		{
			name: "Filesystem link set",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Filesystem{
					Filesystem: &source_metadatapb.Filesystem{},
				},
			},
			link:     "file:///path/to/example",
			line:     3,
			wantLink: "file:///path/to/example#L3",
		},
		{
			name: "Azure Repos link set",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_AzureRepos{
					AzureRepos: &source_metadatapb.AzureRepos{},
				},
			},
			link:     "https://dev.azure.com/example",
			line:     3,
			wantLink: "https://dev.azure.com/example?line=3",
		},
		{
			name: "Unsupported metadata type",
			input: &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Git{
					Git: &source_metadatapb.Git{},
				},
			},
			link:    "https://git.example.com/link",
			line:    5,
			wantErr: true,
		},
		{
			name:    "Metadata nil",
			input:   nil,
			link:    "https://some.link",
			line:    1,
			wantErr: true,
		},
	}

	ctx := context.Background()
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := UpdateLink(ctx, tt.input, tt.link, tt.line)

			// Error cases must actually produce an error; without this check a nil
			// error would let them pass unnoticed.
			if tt.wantErr {
				assert.Error(t, err)
				return
			}
			if err != nil {
				t.Errorf("Unexpected error: %v", err)
			}

			switch data := tt.input.GetData().(type) {
			case *source_metadatapb.MetaData_Github:
				assert.Equal(t, tt.wantLink, data.Github.Link, "Github link mismatch")
			case *source_metadatapb.MetaData_Gitlab:
				assert.Equal(t, tt.wantLink, data.Gitlab.Link, "Gitlab link mismatch")
			case *source_metadatapb.MetaData_Bitbucket:
				assert.Equal(t, tt.wantLink, data.Bitbucket.Link, "Bitbucket link mismatch")
			case *source_metadatapb.MetaData_Filesystem:
				assert.Equal(t, tt.wantLink, data.Filesystem.Link, "Filesystem link mismatch")
			case *source_metadatapb.MetaData_AzureRepos:
				assert.Equal(t, tt.wantLink, data.AzureRepos.Link, "Azure Repos link mismatch")
			}
		})
	}
}
func TestLikelyDuplicate(t *testing.T) {
// Initialize detectors
// (not actually calling detector FromData or anything, just using detector struct for key creation)
detectorA := ahocorasick.DetectorMatch{
Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0]),
Detector: DefaultDetectors()[0],
}
detectorB := ahocorasick.DetectorMatch{
Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1]),
Detector: DefaultDetectors()[1],
}
// Define test cases
tests := []struct {
name string
val chunkSecretKey
dupes map[chunkSecretKey]struct{}
expected bool
}{
{
name: "exact duplicate different detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "non-duplicate length outside range",
val: chunkSecretKey{"short", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"muchlongerthanthevalstring", detectorB.Key}: {},
},
expected: false,
},
{
name: "similar within threshold",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "similar outside threshold",
val: chunkSecretKey{"anotherkey", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"completelydifferent", detectorB.Key}: {},
},
expected: false,
},
{
name: "empty strings",
val: chunkSecretKey{"", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{{"", detectorB.Key}: {}},
expected: true,
},
{
name: "similar within threshold same detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key}: {},
},
expected: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
result := likelyDuplicate(ctx, tc.val, tc.dupes)
if result != tc.expected {
t.Errorf("expected %v, got %v", tc.expected, result)
}
})
}
}
// customCleaner is a stub detector that also implements detectors.CustomResultsCleaner.
// Its cleaning step discards every result, and ignoreConfig is the value returned by
// ShouldCleanResultsIrrespectiveOfConfiguration.
type customCleaner struct {
	ignoreConfig bool
}

// Compile-time assertions for the two interfaces customCleaner implements.
var (
	_ detectors.CustomResultsCleaner = (*customCleaner)(nil)
	_ detectors.Detector             = (*customCleaner)(nil)
)

// FromData ignores its arguments and never reports a result.
func (customCleaner) FromData(aCtx.Context, bool, []byte) ([]detectors.Result, error) {
	return []detectors.Result{}, nil
}

// Keywords returns an empty keyword list.
func (customCleaner) Keywords() []string {
	return []string{}
}

// Type returns the placeholder detector type -1.
func (customCleaner) Type() detectorspb.DetectorType {
	return detectorspb.DetectorType(-1)
}

// CleanResults drops every result it is given and returns an empty slice.
func (customCleaner) CleanResults([]detectors.Result) []detectors.Result {
	return []detectors.Result{}
}

// ShouldCleanResultsIrrespectiveOfConfiguration returns the ignoreConfig field.
func (c customCleaner) ShouldCleanResultsIrrespectiveOfConfiguration() bool {
	return c.ignoreConfig
}
// TestFilterResults_CustomCleaner checks when Engine.filterResults invokes a detector's
// custom cleaner: when cleaning is configured on the engine, or when the detector asks
// to be cleaned irrespective of configuration.
func TestFilterResults_CustomCleaner(t *testing.T) {
	// Every case starts from one zero-value result. The stub cleaner removes all
	// results, so an empty want list means cleaning ran.
	cases := []struct {
		name               string
		cleaningConfigured bool
		ignoreConfig       bool
		input              []detectors.Result
		want               []detectors.Result
	}{
		{
			name:               "respect config to clean",
			cleaningConfigured: true,
			ignoreConfig:       false,
			input:              []detectors.Result{{}},
			want:               []detectors.Result{},
		},
		{
			name:               "respect config to not clean",
			cleaningConfigured: false,
			ignoreConfig:       false,
			input:              []detectors.Result{{}},
			want:               []detectors.Result{{}},
		},
		{
			name:               "clean irrespective of config",
			cleaningConfigured: false,
			ignoreConfig:       true,
			input:              []detectors.Result{{}},
			want:               []detectors.Result{},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			eng := Engine{
				filterUnverified:     tc.cleaningConfigured,
				retainFalsePositives: true,
			}
			detectorMatch := &ahocorasick.DetectorMatch{
				Detector: customCleaner{ignoreConfig: tc.ignoreConfig},
			}

			got := eng.filterResults(context.Background(), detectorMatch, tc.input)
			assert.ElementsMatch(t, tc.want, got)
		})
	}
}
// BenchmarkPopulateMatchingDetectors measures FindDetectorMatches on a single chunk of
// keyword-seeded random data, across a range of chunk sizes.
func BenchmarkPopulateMatchingDetectors(b *testing.B) {
	allDetectors := DefaultDetectors()
	ac := ahocorasick.NewAhoCorasickCore(allDetectors)

	// Generate sample data with keywords from detectors.
	dataSize := 1 << 20 // 1 MB
	sampleData := generateRandomDataWithKeywords(dataSize, allDetectors)

	smallChunk := 1 << 10  // 1 KB
	mediumChunk := 1 << 12 // 4 KB
	current := sources.TotalChunkSize
	largeChunk := 1 << 14 // 16 KB
	xlChunk := 1 << 15    // 32 KB
	xxlChunk := 1 << 16   // 64 KB
	xxxlChunk := 1 << 18  // 256 KB
	chunkSizes := []int{smallChunk, mediumChunk, current, largeChunk, xlChunk, xxlChunk, xxxlChunk}

	for _, chunkSize := range chunkSizes {
		b.Run(fmt.Sprintf("ChunkSize_%d", chunkSize), func(b *testing.B) {
			b.ReportAllocs()
			b.SetBytes(int64(chunkSize))

			// Create a single chunk of the desired size. The string-to-bytes copy is done
			// once here, outside the timed loop, so it does not count towards ns/op or the
			// reported allocations.
			// NOTE(review): assumes FindDetectorMatches does not mutate its input — confirm.
			chunk := []byte(sampleData[:chunkSize])

			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				ac.FindDetectorMatches(chunk) // Match against the single chunk
			}
		})
	}
}
// generateRandomDataWithKeywords returns a string of size pseudo-random alphanumeric
// characters in which detector keywords have been written over randomly chosen positions.
// The generator uses a fixed seed, so the output is reproducible for a given size and
// detector list.
//
// If detectors is empty, the random data is returned without any keywords. A chosen
// detector with no keywords, or a keyword that does not fit in size bytes, is skipped.
func generateRandomDataWithKeywords(size int, detectors []detectors.Detector) string {
	const charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

	// Target keyword density (keywords per character)
	// This ensures that the generated data has a reasonable number of keywords and is consistent
	// across different data sizes.
	const keywordDensity = 0.01

	data := make([]byte, size)
	r := rand.New(rand.NewSource(42)) // Seed for reproducibility

	for i := range data {
		data[i] = charset[r.Intn(len(charset))]
	}

	// rand.Intn panics on a non-positive bound, so there is nothing to choose from here.
	if len(detectors) == 0 {
		return string(data)
	}

	targetKeywords := int(float64(size) * keywordDensity)

	for i := 0; i < targetKeywords; i++ {
		keywords := detectors[r.Intn(len(detectors))].Keywords()
		if len(keywords) == 0 {
			continue
		}
		keyword := keywords[r.Intn(len(keywords))]
		// The keyword must be strictly shorter than the data so that a valid
		// insert position exists.
		if len(keyword) >= size {
			continue
		}
		insertPosition := r.Intn(size - len(keyword))
		copy(data[insertPosition:], keyword)
	}

	return string(data)
}
// TestEngine_ShouldVerifyChunk checks, for every combination of source-level and
// detector-level verify flags, whether Engine.shouldVerifyChunk follows the detector
// override or falls back to the source setting.
func TestEngine_ShouldVerifyChunk(t *testing.T) {
	// The two possible expectations: the override entry wins, or the source flag is used.
	detectorWins := func(_, detectorVerify bool) bool { return detectorVerify }
	sourceWins := func(sourceVerify, _ bool) bool { return sourceVerify }

	cases := []struct {
		name        string
		detector    detectors.Detector
		overrideKey config.DetectorID
		want        func(sourceVerify, detectorVerify bool) bool
	}{
		{
			name:        "detector override by exact version",
			detector:    &gitlab.Scanner{},
			overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 2},
			want:        detectorWins,
		},
		{
			name:        "detector override by versionless config",
			detector:    &gitlab.Scanner{},
			overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 0},
			want:        detectorWins,
		},
		{
			name:        "no detector override because of detector type mismatch",
			detector:    &gitlab.Scanner{},
			overrideKey: config.DetectorID{ID: detectorspb.DetectorType_NpmToken, Version: 2},
			want:        sourceWins,
		},
		{
			name:        "no detector override because of detector version mismatch",
			detector:    &gitlab.Scanner{},
			overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 1},
			want:        sourceWins,
		},
	}

	flags := []bool{true, false}
	eng := &Engine{verify: true}

	for _, tc := range cases {
		for _, sourceVerify := range flags {
			for _, detectorVerify := range flags {
				subtestName := fmt.Sprintf("%s (source verify = %v, detector verify = %v)", tc.name, sourceVerify, detectorVerify)
				t.Run(subtestName, func(t *testing.T) {
					overrides := map[config.DetectorID]bool{tc.overrideKey: detectorVerify}

					expected := tc.want(sourceVerify, detectorVerify)
					actual := eng.shouldVerifyChunk(sourceVerify, tc.detector, overrides)
					assert.Equal(t, expected, actual)
				})
			}
		}
	}
}