trufflehog/pkg/engine/engine_test.go
ahrav cb072603dc
Modularize scanning engine (#2887)
* POC: Modularize scanning engine.

* fix typo

* update interface name

* fix tests

* update test

* fix moar tests

* fix bug

* fixes.

* fix merge

* add detector verification overrides

* handle --no-verification flag

* support fp

* add test

* update name

* filter

* update test

* explicit use of detector

* updates
2024-06-13 13:47:09 -07:00

885 lines
25 KiB
Go

package engine
import (
aCtx "context"
"fmt"
"math/rand"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/gitlab/v2"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/config"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/custom_detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/decoders"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
const fakeDetectorKeyword = "fakedetector"
type fakeDetectorV1 struct{}
type fakeDetectorV2 struct{}
var _ detectors.Detector = (*fakeDetectorV1)(nil)
var _ detectors.Versioner = (*fakeDetectorV1)(nil)
var _ detectors.Detector = (*fakeDetectorV2)(nil)
var _ detectors.Versioner = (*fakeDetectorV2)(nil)
func (f fakeDetectorV1) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
return []detectors.Result{
{
DetectorType: detectorspb.DetectorType(-1),
Verified: true,
Raw: []byte("fake secret v1"),
},
}, nil
}
func (f fakeDetectorV1) Keywords() []string { return []string{fakeDetectorKeyword} }
func (f fakeDetectorV1) Type() detectorspb.DetectorType { return detectorspb.DetectorType(-1) }
func (f fakeDetectorV1) Version() int { return 1 }
func (f fakeDetectorV2) FromData(_ aCtx.Context, _ bool, _ []byte) ([]detectors.Result, error) {
return []detectors.Result{
{
DetectorType: detectorspb.DetectorType(-1),
Verified: true,
Raw: []byte("fake secret v2"),
},
}, nil
}
func (f fakeDetectorV2) Keywords() []string { return []string{fakeDetectorKeyword} }
func (f fakeDetectorV2) Type() detectorspb.DetectorType { return detectorspb.DetectorType(-1) }
func (f fakeDetectorV2) Version() int { return 2 }
func TestFragmentLineOffset(t *testing.T) {
tests := []struct {
name string
chunk *sources.Chunk
result *detectors.Result
expectedLine int64
ignore bool
}{
{
name: "ignore found on same line",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\nsecret here trufflehog:ignore\nline4"),
},
result: &detectors.Result{
Raw: []byte("secret here"),
},
expectedLine: 2,
ignore: true,
},
{
name: "no ignore",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\nsecret here\nline4"),
},
result: &detectors.Result{
Raw: []byte("secret here"),
},
expectedLine: 2,
ignore: false,
},
{
name: "ignore on different line",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\ntrufflehog:ignore\nline4\nsecret here\nline6"),
},
result: &detectors.Result{
Raw: []byte("secret here"),
},
expectedLine: 4,
ignore: false,
},
{
name: "match on consecutive lines",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\ntrufflehog:ignore\nline4\nsecret\nhere\nline6"),
},
result: &detectors.Result{
Raw: []byte("secret\nhere"),
},
expectedLine: 4,
ignore: false,
},
{
name: "ignore on last consecutive lines",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\nline3\nsecret\nhere // trufflehog:ignore\nline5"),
},
result: &detectors.Result{
Raw: []byte("secret\nhere"),
},
expectedLine: 3,
ignore: true,
},
{
name: "ignore on last line",
chunk: &sources.Chunk{
Data: []byte("line1\nline2\nline3\nsecret here // trufflehog:ignore"),
},
result: &detectors.Result{
Raw: []byte("secret here"),
},
expectedLine: 3,
ignore: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
lineOffset, isIgnored := FragmentLineOffset(tt.chunk, tt.result)
if lineOffset != tt.expectedLine {
t.Errorf("Expected line offset to be %d, got %d", tt.expectedLine, lineOffset)
}
if isIgnored != tt.ignore {
t.Errorf("Expected isIgnored to be %v, got %v", tt.ignore, isIgnored)
}
})
}
}
func setupFragmentLineOffsetBench(totalLines, needleLine int) (*sources.Chunk, *detectors.Result) {
data := make([]byte, 0, 4096)
needle := []byte("needle")
for i := 0; i < totalLines; i++ {
if i != needleLine {
data = append(data, []byte(fmt.Sprintf("line%d\n", i))...)
continue
}
data = append(data, needle...)
data = append(data, '\n')
}
chunk := &sources.Chunk{Data: data}
result := &detectors.Result{Raw: needle}
return chunk, result
}
func BenchmarkFragmentLineOffsetStart(b *testing.B) {
chunk, result := setupFragmentLineOffsetBench(512, 2)
for i := 0; i < b.N; i++ {
_, _ = FragmentLineOffset(chunk, result)
}
}
func BenchmarkFragmentLineOffsetMiddle(b *testing.B) {
chunk, result := setupFragmentLineOffsetBench(512, 256)
for i := 0; i < b.N; i++ {
_, _ = FragmentLineOffset(chunk, result)
}
}
func BenchmarkFragmentLineOffsetEnd(b *testing.B) {
chunk, result := setupFragmentLineOffsetBench(512, 510)
for i := 0; i < b.N; i++ {
_, _ = FragmentLineOffset(chunk, result)
}
}
// Test to make sure that DefaultDecoders always returns the UTF8 decoder first.
// Technically a decoder test but we want this to run and fail in CI
func TestDefaultDecoders(t *testing.T) {
ds := decoders.DefaultDecoders()
if _, ok := ds[0].(*decoders.UTF8); !ok {
t.Errorf("DefaultDecoders() = %v, expected UTF8 decoder to be first", ds)
}
}
func TestSupportsLineNumbers(t *testing.T) {
tests := []struct {
name string
sourceType sourcespb.SourceType
expectedValue bool
}{
{"Git source", sourcespb.SourceType_SOURCE_TYPE_GIT, true},
{"Github source", sourcespb.SourceType_SOURCE_TYPE_GITHUB, true},
{"Gitlab source", sourcespb.SourceType_SOURCE_TYPE_GITLAB, true},
{"Bitbucket source", sourcespb.SourceType_SOURCE_TYPE_BITBUCKET, true},
{"Gerrit source", sourcespb.SourceType_SOURCE_TYPE_GERRIT, true},
{"Github unauthenticated org source", sourcespb.SourceType_SOURCE_TYPE_GITHUB_UNAUTHENTICATED_ORG, true},
{"Public Git source", sourcespb.SourceType_SOURCE_TYPE_PUBLIC_GIT, true},
{"Filesystem source", sourcespb.SourceType_SOURCE_TYPE_FILESYSTEM, true},
{"Azure Repos source", sourcespb.SourceType_SOURCE_TYPE_AZURE_REPOS, true},
{"Unsupported type", sourcespb.SourceType_SOURCE_TYPE_BUILDKITE, false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := SupportsLineNumbers(tt.sourceType)
assert.Equal(t, tt.expectedValue, result)
})
}
}
func BenchmarkSupportsLineNumbersLoop(b *testing.B) {
sourceType := sourcespb.SourceType_SOURCE_TYPE_GITHUB
for i := 0; i < b.N; i++ {
_ = SupportsLineNumbers(sourceType)
}
}
// TestEngine_DuplicateSecrets is a test that detects ALL duplicate secrets with the same decoder.
func TestEngine_DuplicateSecrets(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/secrets.txt")
assert.Nil(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
conf := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: DefaultDetectors(),
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &conf)
assert.NoError(t, err)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
if err := e.ScanFileSystem(ctx, cfg); err != nil {
return
}
// Wait for all the chunks to be processed.
assert.Nil(t, e.Finish(ctx))
want := uint64(5)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
// TestEngine_VersionedDetectorsVerifiedSecrets is a test that detects ALL verified secrets across
// versioned detectors.
func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
defer cancel()
tmpFile, err := os.CreateTemp("", "testfile")
assert.Nil(t, err)
defer tmpFile.Close()
defer os.Remove(tmpFile.Name())
_, err = tmpFile.WriteString(fmt.Sprintf("test data using keyword %s", fakeDetectorKeyword))
assert.NoError(t, err)
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
conf := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: []detectors.Detector{new(fakeDetectorV1), new(fakeDetectorV2)},
Verify: true,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &conf)
assert.NoError(t, err)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}}
if err := e.ScanFileSystem(ctx, cfg); err != nil {
return
}
assert.NoError(t, e.Finish(ctx))
want := uint64(2)
assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound)
}
// TestEngine_CustomDetectorsDetectorsVerifiedSecrets is a test that covers an edge case where there are
// multiple detectors with the same type, keywords and regex that match the same secret.
// This ensures that those secrets get verified.
func TestEngine_CustomDetectorsDetectorsVerifiedSecrets(t *testing.T) {
tmpFile, err := os.CreateTemp("", "testfile")
assert.Nil(t, err)
defer tmpFile.Close()
defer os.Remove(tmpFile.Name())
_, err = tmpFile.WriteString("test stuff")
assert.Nil(t, err)
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer ts.Close()
customDetector1, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
Name: "custom detector 1",
Keywords: []string{"test"},
Regex: map[string]string{"test": "\\w+"},
Verify: []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}},
})
assert.Nil(t, err)
customDetector2, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{
Name: "custom detector 2",
Keywords: []string{"test"},
Regex: map[string]string{"test": "\\w+"},
Verify: []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}},
})
assert.Nil(t, err)
allDetectors := []detectors.Detector{customDetector1, customDetector2}
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
conf := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: allDetectors,
Verify: true,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &conf)
assert.NoError(t, err)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}}
if err := e.ScanFileSystem(ctx, cfg); err != nil {
return
}
assert.Nil(t, e.Finish(ctx))
// We should have 4 verified secrets, 2 for each custom detector.
want := uint64(4)
assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound)
}
func TestVerificationOverlapChunk(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets.txt")
assert.Nil(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors.yaml")
assert.Nil(t, err)
conf, err := config.Read(confPath)
assert.Nil(t, err)
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
c := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: conf.Detectors,
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.verificationOverlapTracker = new(verificationOverlapTracker)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
if err := e.ScanFileSystem(ctx, cfg); err != nil {
return
}
// Wait for all the chunks to be processed.
assert.Nil(t, e.Finish(ctx))
// We want TWO secrets that match both the custom regexes.
want := uint64(2)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
// We want 0 because these are custom detectors and verification should still occur.
wantDupe := 0
assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount)
}
func TestVerificationOverlapChunkFalsePositive(t *testing.T) {
ctx := context.Background()
absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets_fp.txt")
assert.NoError(t, err)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors_fp.yaml")
assert.NoError(t, err)
conf, err := config.Read(confPath)
assert.NoError(t, err)
const defaultOutputBufferSize = 64
opts := []func(*sources.SourceManager){
sources.WithSourceUnits(),
sources.WithBufferedOutput(defaultOutputBufferSize),
}
sourceManager := sources.NewManager(opts...)
c := Config{
Concurrency: 1,
Decoders: decoders.DefaultDecoders(),
Detectors: conf.Detectors,
Verify: false,
SourceManager: sourceManager,
Dispatcher: NewPrinterDispatcher(new(discardPrinter)),
}
e, err := NewEngine(ctx, &c)
assert.NoError(t, err)
e.verificationOverlapTracker = new(verificationOverlapTracker)
e.Start(ctx)
cfg := sources.FilesystemConfig{Paths: []string{absPath}}
err = e.ScanFileSystem(ctx, cfg)
assert.NoError(t, err)
// Wait for all the chunks to be processed.
assert.NoError(t, e.Finish(ctx))
// We want 0 because the secret is a false positive.
want := uint64(0)
assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound)
}
func TestFragmentFirstLineAndLink(t *testing.T) {
tests := []struct {
name string
chunk *sources.Chunk
expectedLine int64
expectedLink string
}{
{
name: "Test Git Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Git{
Git: &source_metadatapb.Git{
Line: 10,
},
},
},
},
expectedLine: 10,
expectedLink: "", // Git doesn't support links
},
{
name: "Test Github Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{
Github: &source_metadatapb.Github{
Line: 5,
Link: "https://example.github.com",
},
},
},
},
expectedLine: 5,
expectedLink: "https://example.github.com",
},
{
name: "Test Azure Repos Metadata",
chunk: &sources.Chunk{
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_AzureRepos{
AzureRepos: &source_metadatapb.AzureRepos{
Line: 5,
Link: "https://example.azure.com",
},
},
},
},
expectedLine: 5,
expectedLink: "https://example.azure.com",
},
{
name: "Unsupported Type",
chunk: &sources.Chunk{},
expectedLine: 0,
expectedLink: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
line, linePtr, link := FragmentFirstLineAndLink(tt.chunk)
assert.Equal(t, tt.expectedLink, link, "Mismatch in link")
assert.Equal(t, tt.expectedLine, line, "Mismatch in line")
if linePtr != nil {
assert.Equal(t, tt.expectedLine, *linePtr, "Mismatch in linePtr value")
}
})
}
}
func TestSetLink(t *testing.T) {
tests := []struct {
name string
input *source_metadatapb.MetaData
link string
line int64
wantLink string
wantErr bool
}{
{
name: "Github link set",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{
Github: &source_metadatapb.Github{},
},
},
link: "https://github.com/example",
line: 42,
wantLink: "https://github.com/example#L42",
},
{
name: "Gitlab link set",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Gitlab{
Gitlab: &source_metadatapb.Gitlab{},
},
},
link: "https://gitlab.com/example",
line: 10,
wantLink: "https://gitlab.com/example#L10",
},
{
name: "Bitbucket link set",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Bitbucket{
Bitbucket: &source_metadatapb.Bitbucket{},
},
},
link: "https://bitbucket.com/example",
line: 8,
wantLink: "https://bitbucket.com/example#L8",
},
{
name: "Filesystem link set",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Filesystem{
Filesystem: &source_metadatapb.Filesystem{},
},
},
link: "file:///path/to/example",
line: 3,
wantLink: "file:///path/to/example#L3",
},
{
name: "Azure Repos link set",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_AzureRepos{
AzureRepos: &source_metadatapb.AzureRepos{},
},
},
link: "https://dev.azure.com/example",
line: 3,
wantLink: "https://dev.azure.com/example?line=3",
},
{
name: "Unsupported metadata type",
input: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Git{
Git: &source_metadatapb.Git{},
},
},
link: "https://git.example.com/link",
line: 5,
wantErr: true,
},
{
name: "Metadata nil",
input: nil,
link: "https://some.link",
line: 1,
wantErr: true,
},
}
ctx := context.Background()
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := UpdateLink(ctx, tt.input, tt.link, tt.line)
if err != nil && !tt.wantErr {
t.Errorf("Unexpected error: %v", err)
}
if tt.wantErr {
return
}
switch data := tt.input.GetData().(type) {
case *source_metadatapb.MetaData_Github:
assert.Equal(t, tt.wantLink, data.Github.Link, "Github link mismatch")
case *source_metadatapb.MetaData_Gitlab:
assert.Equal(t, tt.wantLink, data.Gitlab.Link, "Gitlab link mismatch")
case *source_metadatapb.MetaData_Bitbucket:
assert.Equal(t, tt.wantLink, data.Bitbucket.Link, "Bitbucket link mismatch")
case *source_metadatapb.MetaData_Filesystem:
assert.Equal(t, tt.wantLink, data.Filesystem.Link, "Filesystem link mismatch")
case *source_metadatapb.MetaData_AzureRepos:
assert.Equal(t, tt.wantLink, data.AzureRepos.Link, "Azure Repos link mismatch")
}
})
}
}
func TestLikelyDuplicate(t *testing.T) {
// Initialize detectors
// (not actually calling detector FromData or anything, just using detector struct for key creation)
detectorA := ahocorasick.DetectorMatch{
Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0]),
Detector: DefaultDetectors()[0],
}
detectorB := ahocorasick.DetectorMatch{
Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1]),
Detector: DefaultDetectors()[1],
}
// Define test cases
tests := []struct {
name string
val chunkSecretKey
dupes map[chunkSecretKey]struct{}
expected bool
}{
{
name: "exact duplicate different detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "non-duplicate length outside range",
val: chunkSecretKey{"short", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"muchlongerthanthevalstring", detectorB.Key}: {},
},
expected: false,
},
{
name: "similar within threshold",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "similar outside threshold",
val: chunkSecretKey{"anotherkey", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"completelydifferent", detectorB.Key}: {},
},
expected: false,
},
{
name: "empty strings",
val: chunkSecretKey{"", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{{"", detectorB.Key}: {}},
expected: true,
},
{
name: "similar within threshold same detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key}: {},
},
expected: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
result := likelyDuplicate(ctx, tc.val, tc.dupes)
if result != tc.expected {
t.Errorf("expected %v, got %v", tc.expected, result)
}
})
}
}
func BenchmarkPopulateMatchingDetectors(b *testing.B) {
allDetectors := DefaultDetectors()
ac := ahocorasick.NewAhoCorasickCore(allDetectors)
// Generate sample data with keywords from detectors.
dataSize := 1 << 20 // 1 MB
sampleData := generateRandomDataWithKeywords(dataSize, allDetectors)
smallChunk := 1 << 10 // 1 KB
mediumChunk := 1 << 12 // 4 KB
current := sources.TotalChunkSize
largeChunk := 1 << 14 // 16 KB
xlChunk := 1 << 15 // 32 KB
xxlChunk := 1 << 16 // 64 KB
xxxlChunk := 1 << 18 // 256 KB
chunkSizes := []int{smallChunk, mediumChunk, current, largeChunk, xlChunk, xxlChunk, xxxlChunk}
for _, chunkSize := range chunkSizes {
b.Run(fmt.Sprintf("ChunkSize_%d", chunkSize), func(b *testing.B) {
b.ReportAllocs()
b.SetBytes(int64(chunkSize))
// Create a single chunk of the desired size.
chunk := sampleData[:chunkSize]
b.ResetTimer()
for i := 0; i < b.N; i++ {
ac.FindDetectorMatches([]byte(chunk)) // Match against the single chunk
}
})
}
}
func generateRandomDataWithKeywords(size int, detectors []detectors.Detector) string {
const charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
data := make([]byte, size)
r := rand.New(rand.NewSource(42)) // Seed for reproducibility
for i := range data {
data[i] = charset[r.Intn(len(charset))]
}
totalKeywords := 0
for _, d := range detectors {
totalKeywords += len(d.Keywords())
}
// Target keyword density (keywords per character)
// This ensures that the generated data has a reasonable number of keywords and is consistent
// across different data sizes.
keywordDensity := 0.01
targetKeywords := int(float64(size) * keywordDensity)
for i := 0; i < targetKeywords; i++ {
detectorIndex := r.Intn(len(detectors))
keywordIndex := r.Intn(len(detectors[detectorIndex].Keywords()))
keyword := detectors[detectorIndex].Keywords()[keywordIndex]
insertPosition := r.Intn(size - len(keyword))
copy(data[insertPosition:], keyword)
}
return string(data)
}
func TestEngine_ShouldVerifyChunk(t *testing.T) {
tests := []struct {
name string
detector detectors.Detector
overrideKey config.DetectorID
want func(sourceVerify, detectorVerify bool) bool
}{
{
name: "detector override by exact version",
detector: &gitlab.Scanner{},
overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 2},
want: func(sourceVerify, detectorVerify bool) bool { return detectorVerify },
},
{
name: "detector override by versionless config",
detector: &gitlab.Scanner{},
overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 0},
want: func(sourceVerify, detectorVerify bool) bool { return detectorVerify },
},
{
name: "no detector override because of detector type mismatch",
detector: &gitlab.Scanner{},
overrideKey: config.DetectorID{ID: detectorspb.DetectorType_NpmToken, Version: 2},
want: func(sourceVerify, detectorVerify bool) bool { return sourceVerify },
},
{
name: "no detector override because of detector version mismatch",
detector: &gitlab.Scanner{},
overrideKey: config.DetectorID{ID: detectorspb.DetectorType_Gitlab, Version: 1},
want: func(sourceVerify, detectorVerify bool) bool { return sourceVerify },
},
}
booleanChoices := [2]bool{true, false}
engine := &Engine{verify: true}
for _, tt := range tests {
for _, sourceVerify := range booleanChoices {
for _, detectorVerify := range booleanChoices {
t.Run(fmt.Sprintf("%s (source verify = %v, detector verify = %v)", tt.name, sourceVerify, detectorVerify), func(t *testing.T) {
overrides := map[config.DetectorID]bool{
tt.overrideKey: detectorVerify,
}
want := tt.want(sourceVerify, detectorVerify)
got := engine.shouldVerifyChunk(sourceVerify, tt.detector, overrides)
assert.Equal(t, want, got)
})
}
}
}
}