2022-01-13 20:02:24 +00:00
|
|
|
package engine
|
|
|
|
|
|
|
|
import (
|
2022-03-22 16:27:15 +00:00
|
|
|
"bytes"
|
2023-07-11 20:48:00 +00:00
|
|
|
"fmt"
|
2023-02-09 22:55:19 +00:00
|
|
|
"reflect"
|
2022-01-13 20:02:24 +00:00
|
|
|
"runtime"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
2022-01-19 06:24:56 +00:00
|
|
|
"sync/atomic"
|
2022-01-13 20:02:24 +00:00
|
|
|
"time"
|
|
|
|
|
2023-07-25 00:09:57 +00:00
|
|
|
ahocorasick "github.com/BobuSumisu/aho-corasick"
|
2023-07-31 18:12:08 +00:00
|
|
|
lru "github.com/hashicorp/golang-lru"
|
2023-01-10 17:35:44 +00:00
|
|
|
"google.golang.org/protobuf/proto"
|
2022-01-13 20:02:24 +00:00
|
|
|
|
2022-08-29 18:45:37 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
2023-04-18 15:36:00 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/config"
|
2022-08-29 18:45:37 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
|
2022-02-10 18:54:33 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/decoders"
|
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
|
2023-07-31 18:12:08 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/output"
|
2022-10-06 18:55:07 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
|
2022-04-04 04:13:39 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
|
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
|
2022-02-10 18:54:33 +00:00
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
2022-01-13 20:02:24 +00:00
|
|
|
)
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// Metrics for the scan engine for external consumption.
type Metrics struct {
	// BytesScanned is the total number of chunk bytes processed.
	BytesScanned uint64
	// ChunksScanned is the total number of chunks processed.
	ChunksScanned uint64
	// VerifiedSecretsFound counts results whose secrets were verified.
	VerifiedSecretsFound uint64
	// UnverifiedSecretsFound counts results that were not verified.
	UnverifiedSecretsFound uint64
	// AvgDetectorTime maps a detector name to its average scan duration.
	AvgDetectorTime map[string]time.Duration

	// scanStartTime records when the scan began; used by getScanDuration
	// to report elapsed time while the scan is still running.
	scanStartTime time.Time
	// ScanDuration is the total elapsed scan time, populated once the scan
	// finishes (see Finish).
	ScanDuration time.Duration
}
|
|
|
|
|
|
|
|
// runtimeMetrics for the scan engine for internal use by the engine.
type runtimeMetrics struct {
	// mu guards reads and writes of the embedded Metrics fields that are
	// not updated atomically (e.g. ScanDuration).
	mu sync.RWMutex
	Metrics
	// detectorAvgTime maps detector name (string) to its recorded scan
	// durations ([]time.Duration); written from many detector workers.
	detectorAvgTime sync.Map
}
|
|
|
|
|
|
|
|
// Printer is used to format found results and output them to the user. Ex JSON, plain text, etc.
// Please note printer implementations SHOULD BE thread safe.
type Printer interface {
	// Print formats r and writes it to the printer's output destination.
	Print(ctx context.Context, r *detectors.ResultWithMetadata) error
}
|
|
|
|
|
|
|
|
// Engine drives a scan: it pulls chunks from sources, decodes them, runs
// detectors against them, and notifies results to the configured printer.
type Engine struct {
	// CLI flags.
	concurrency uint8
	decoders    []decoders.Decoder
	// detectors maps whether verification is enabled (the map key) to the
	// detectors configured with that setting.
	detectors map[bool][]detectors.Detector
	// filterUnverified is used to reduce the number of unverified results.
	// If there are multiple unverified results for the same chunk for the same detector,
	// only the first one will be kept.
	filterUnverified     bool
	onlyVerified         bool
	printAvgDetectorTime bool

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// Engine synchronization primitives.
	sourceManager        *sources.SourceManager
	results              chan detectors.ResultWithMetadata
	detectableChunksChan chan detectableChunk
	workersWg            sync.WaitGroup
	wgDetectorWorkers    sync.WaitGroup
	WgNotifier           sync.WaitGroup

	// Runtime information.
	metrics runtimeMetrics
	// numFoundResults is used to keep track of the number of results found.
	// Accessed atomically; see HasFoundResults.
	numFoundResults uint32

	// printer provides a method for formatting and outputting search results.
	// The specific implementation (e.g., JSON, plain text)
	// should be set during initialization based on user preference or program requirements.
	printer Printer

	// dedupeCache is used to deduplicate results by comparing the
	// detector type, raw result, and source metadata
	dedupeCache *lru.Cache
}
|
|
|
|
|
|
|
|
// EngineOption configures an Engine during construction via Start.
type EngineOption func(*Engine)
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
func WithConcurrency(concurrency uint8) EngineOption {
|
2022-01-13 20:02:24 +00:00
|
|
|
return func(e *Engine) {
|
|
|
|
e.concurrency = concurrency
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-06-29 13:45:56 +00:00
|
|
|
// ignoreTag is the in-line marker that, when present on the same line as a
// finding, causes the result to be suppressed (see FragmentLineOffset).
const ignoreTag = "trufflehog:ignore"
|
|
|
|
|
2022-01-13 20:02:24 +00:00
|
|
|
func WithDetectors(verify bool, d ...detectors.Detector) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
if e.detectors == nil {
|
|
|
|
e.detectors = make(map[bool][]detectors.Detector)
|
|
|
|
}
|
|
|
|
if e.detectors[verify] == nil {
|
2022-01-19 06:24:56 +00:00
|
|
|
e.detectors[true] = []detectors.Detector{}
|
|
|
|
e.detectors[false] = []detectors.Detector{}
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
|
|
|
e.detectors[verify] = append(e.detectors[verify], d...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func WithDecoders(decoders ...decoders.Decoder) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
e.decoders = decoders
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-31 16:36:10 +00:00
|
|
|
// WithFilterUnverified sets the filterUnverified flag on the engine. If set to
|
|
|
|
// true, the engine will only return the first unverified result for a chunk for a detector.
|
|
|
|
func WithFilterUnverified(filter bool) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
e.filterUnverified = filter
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// WithOnlyVerified sets the onlyVerified flag on the engine. If set to true,
|
|
|
|
// the engine will only print verified results.
|
|
|
|
func WithOnlyVerified(onlyVerified bool) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
e.onlyVerified = onlyVerified
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// WithPrintAvgDetectorTime sets the printAvgDetectorTime flag on the engine. If set to
|
|
|
|
// true, the engine will print the average time taken by each detector.
|
|
|
|
// This option allows us to measure the time taken for each detector ONLY if
|
|
|
|
// the engine is configured to print the results.
|
|
|
|
// Calculating the average time taken by each detector is an expensive operation
|
|
|
|
// and should be avoided unless specified by the user.
|
|
|
|
func WithPrintAvgDetectorTime(printAvgDetectorTime bool) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
e.printAvgDetectorTime = printAvgDetectorTime
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-27 22:46:45 +00:00
|
|
|
// WithFilterDetectors applies a filter to the configured list of detectors. If
|
|
|
|
// the filterFunc returns true, the detector will be included for scanning.
|
|
|
|
// This option applies to the existing list of detectors configured, so the
|
|
|
|
// order this option appears matters. All filtering happens before scanning.
|
|
|
|
func WithFilterDetectors(filterFunc func(detectors.Detector) bool) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
// If no detectors are configured, do nothing.
|
|
|
|
if e.detectors == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
e.detectors[true] = filterDetectors(filterFunc, e.detectors[true])
|
|
|
|
e.detectors[false] = filterDetectors(filterFunc, e.detectors[false])
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// WithPrinter sets the Printer on the engine.
|
|
|
|
func WithPrinter(printer Printer) EngineOption {
|
|
|
|
return func(e *Engine) {
|
|
|
|
e.printer = printer
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-27 22:46:45 +00:00
|
|
|
func filterDetectors(filterFunc func(detectors.Detector) bool, input []detectors.Detector) []detectors.Detector {
|
|
|
|
var output []detectors.Detector
|
|
|
|
for _, detector := range input {
|
|
|
|
if filterFunc(detector) {
|
|
|
|
output = append(output, detector)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// HasFoundResults returns true if any results are found.
|
|
|
|
func (e *Engine) HasFoundResults() bool {
|
|
|
|
return atomic.LoadUint32(&e.numFoundResults) > 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetMetrics returns a copy of Metrics.
|
|
|
|
// It's safe for concurrent use, and the caller can't modify the original data.
|
|
|
|
func (e *Engine) GetMetrics() Metrics {
|
|
|
|
e.metrics.mu.RLock()
|
|
|
|
defer e.metrics.mu.RUnlock()
|
|
|
|
|
|
|
|
result := e.metrics.Metrics
|
|
|
|
result.AvgDetectorTime = make(map[string]time.Duration, len(e.metrics.AvgDetectorTime))
|
|
|
|
|
|
|
|
for detectorName, durations := range e.DetectorAvgTime() {
|
|
|
|
var total time.Duration
|
|
|
|
for _, d := range durations {
|
|
|
|
total += d
|
|
|
|
}
|
|
|
|
avgDuration := total / time.Duration(len(durations))
|
|
|
|
result.AvgDetectorTime[detectorName] = avgDuration
|
|
|
|
}
|
|
|
|
|
2023-08-02 18:48:29 +00:00
|
|
|
result.ScanDuration = e.metrics.getScanDuration()
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetDetectorsMetrics returns a copy of the average time taken by each detector.
|
|
|
|
func (e *Engine) GetDetectorsMetrics() map[string]time.Duration {
|
|
|
|
e.metrics.mu.RLock()
|
|
|
|
defer e.metrics.mu.RUnlock()
|
|
|
|
|
|
|
|
result := make(map[string]time.Duration, len(DefaultDetectors()))
|
|
|
|
for detectorName, durations := range e.DetectorAvgTime() {
|
|
|
|
var total time.Duration
|
|
|
|
for _, d := range durations {
|
|
|
|
total += d
|
|
|
|
}
|
|
|
|
avgDuration := total / time.Duration(len(durations))
|
|
|
|
result[detectorName] = avgDuration
|
|
|
|
}
|
|
|
|
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2023-08-02 18:48:29 +00:00
|
|
|
// getScanDuration returns the duration of the scan.
|
|
|
|
// If the scan is still running, it returns the time since the scan started.
|
|
|
|
func (m *Metrics) getScanDuration() time.Duration {
|
|
|
|
if m.ScanDuration == 0 {
|
|
|
|
return time.Since(m.scanStartTime)
|
|
|
|
}
|
|
|
|
|
|
|
|
return m.ScanDuration
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// DetectorAvgTime returns the average time taken by each detector.
|
|
|
|
func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
|
|
|
|
logger := context.Background().Logger()
|
|
|
|
avgTime := map[string][]time.Duration{}
|
2023-09-15 11:35:15 +00:00
|
|
|
e.metrics.detectorAvgTime.Range(func(k, v any) bool {
|
2023-07-31 18:12:08 +00:00
|
|
|
key, ok := k.(string)
|
|
|
|
if !ok {
|
|
|
|
logger.Info("expected detectorAvgTime key to be a string")
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
value, ok := v.([]time.Duration)
|
|
|
|
if !ok {
|
|
|
|
logger.Info("expected detectorAvgTime value to be []time.Duration")
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
avgTime[key] = value
|
|
|
|
return true
|
|
|
|
})
|
|
|
|
return avgTime
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start the engine with options.
|
|
|
|
func Start(ctx context.Context, options ...EngineOption) (*Engine, error) {
|
|
|
|
const (
|
|
|
|
defaultChannelBuffer = 1
|
|
|
|
// TODO (ahrav): Determine the optimal cache size.
|
|
|
|
cacheSize = 512 // number of entries in the LRU cache
|
|
|
|
)
|
|
|
|
|
|
|
|
cache, err := lru.New(cacheSize)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to initialize LRU cache: %w", err)
|
|
|
|
}
|
|
|
|
|
2022-01-13 20:02:24 +00:00
|
|
|
e := &Engine{
|
2023-07-31 18:12:08 +00:00
|
|
|
detectableChunksChan: make(chan detectableChunk, defaultChannelBuffer),
|
|
|
|
results: make(chan detectors.ResultWithMetadata, defaultChannelBuffer),
|
|
|
|
dedupeCache: cache,
|
|
|
|
printer: new(output.PlainPrinter), // default printer
|
2023-08-02 18:48:29 +00:00
|
|
|
metrics: runtimeMetrics{Metrics: Metrics{scanStartTime: time.Now()}},
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, option := range options {
|
|
|
|
option(e)
|
|
|
|
}
|
|
|
|
|
2022-04-01 23:47:27 +00:00
|
|
|
// Set defaults.
|
2022-01-13 20:02:24 +00:00
|
|
|
if e.concurrency == 0 {
|
|
|
|
numCPU := runtime.NumCPU()
|
2023-02-09 22:55:19 +00:00
|
|
|
ctx.Logger().Info("No concurrency specified, defaulting to max", "cpu", numCPU)
|
2023-07-31 18:12:08 +00:00
|
|
|
e.concurrency = uint8(numCPU)
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
2023-07-28 21:30:43 +00:00
|
|
|
ctx.Logger().V(3).Info("engine started", "workers", e.concurrency)
|
2022-01-13 20:02:24 +00:00
|
|
|
|
2023-08-03 18:36:30 +00:00
|
|
|
// Create SourceManager.
|
|
|
|
e.sourceManager = sources.NewManager(
|
|
|
|
sources.WithConcurrentSources(int(e.concurrency)),
|
|
|
|
sources.WithConcurrentUnits(int(e.concurrency)),
|
|
|
|
)
|
2023-06-26 16:39:57 +00:00
|
|
|
|
2022-01-13 20:02:24 +00:00
|
|
|
if len(e.decoders) == 0 {
|
|
|
|
e.decoders = decoders.DefaultDecoders()
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(e.detectors) == 0 {
|
|
|
|
e.detectors = map[bool][]detectors.Detector{}
|
|
|
|
e.detectors[true] = DefaultDetectors()
|
2022-01-19 06:24:56 +00:00
|
|
|
e.detectors[false] = []detectors.Detector{}
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
|
|
|
|
2023-03-02 17:32:37 +00:00
|
|
|
// build ahocorasick prefilter for efficient string matching
|
|
|
|
// on keywords
|
|
|
|
keywords := []string{}
|
|
|
|
for _, d := range e.detectors[false] {
|
2023-07-25 22:01:15 +00:00
|
|
|
for _, kw := range d.Keywords() {
|
|
|
|
keywords = append(keywords, strings.ToLower(kw))
|
|
|
|
}
|
2023-03-02 17:32:37 +00:00
|
|
|
}
|
|
|
|
for _, d := range e.detectors[true] {
|
2023-07-25 22:01:15 +00:00
|
|
|
for _, kw := range d.Keywords() {
|
|
|
|
keywords = append(keywords, strings.ToLower(kw))
|
|
|
|
}
|
2023-03-02 17:32:37 +00:00
|
|
|
}
|
2023-07-25 00:09:57 +00:00
|
|
|
e.prefilter = *ahocorasick.NewTrieBuilder().AddStrings(keywords).Build()
|
2023-03-02 17:32:37 +00:00
|
|
|
|
2023-07-28 21:30:43 +00:00
|
|
|
ctx.Logger().V(3).Info("loaded decoders", "count", len(e.decoders))
|
|
|
|
ctx.Logger().V(3).Info("loaded detectors",
|
2023-02-09 22:55:19 +00:00
|
|
|
"total", len(e.detectors[true])+len(e.detectors[false]),
|
|
|
|
"verification_enabled", len(e.detectors[true]),
|
|
|
|
"verification_disabled", len(e.detectors[false]),
|
|
|
|
)
|
2022-01-19 06:24:56 +00:00
|
|
|
|
2023-04-18 15:36:00 +00:00
|
|
|
// Sanity check detectors for duplicate configuration. Only log in case
|
|
|
|
// a detector has been configured in a way that isn't represented by
|
|
|
|
// the DetectorID (type and version).
|
|
|
|
{
|
|
|
|
dets := append(e.detectors[true], e.detectors[false]...)
|
|
|
|
seenDetectors := make(map[config.DetectorID]struct{}, len(dets))
|
|
|
|
for _, det := range dets {
|
|
|
|
id := config.GetDetectorID(det)
|
2023-06-13 19:49:21 +00:00
|
|
|
if _, ok := seenDetectors[id]; ok && id.ID != detectorspb.DetectorType_CustomRegex {
|
2023-04-18 15:36:00 +00:00
|
|
|
ctx.Logger().Info("possible duplicate detector configured", "detector", id)
|
|
|
|
}
|
|
|
|
seenDetectors[id] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
ctx.Logger().V(2).Info("starting scanner workers", "count", e.concurrency)
|
|
|
|
// Run the Secret scanner workers and Notifier pipelines.
|
|
|
|
for worker := uint64(0); worker < uint64(e.concurrency); worker++ {
|
2022-05-25 16:35:44 +00:00
|
|
|
e.workersWg.Add(1)
|
|
|
|
go func() {
|
2023-07-31 18:12:08 +00:00
|
|
|
ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5))
|
2023-07-24 14:34:43 +00:00
|
|
|
defer common.Recover(ctx)
|
2022-05-25 16:35:44 +00:00
|
|
|
defer e.workersWg.Done()
|
|
|
|
e.detectorWorker(ctx)
|
|
|
|
}()
|
|
|
|
}
|
2022-01-19 06:24:56 +00:00
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
const detectorWorkerMultiplier = 50
|
|
|
|
ctx.Logger().V(2).Info("starting detector workers", "count", e.concurrency*detectorWorkerMultiplier)
|
|
|
|
for worker := uint64(0); worker < uint64(e.concurrency*detectorWorkerMultiplier); worker++ {
|
|
|
|
e.wgDetectorWorkers.Add(1)
|
|
|
|
go func() {
|
|
|
|
ctx := context.WithValue(ctx, "detector_worker_id", common.RandomID(5))
|
|
|
|
defer common.Recover(ctx)
|
|
|
|
defer e.wgDetectorWorkers.Done()
|
|
|
|
e.detectChunks(ctx)
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
// We want 1/4th of the notifier workers as the number of scanner workers.
|
|
|
|
const notifierWorkerRatio = 4
|
|
|
|
maxNotifierWorkers := 1
|
|
|
|
if numWorkers := e.concurrency / notifierWorkerRatio; numWorkers > 0 {
|
|
|
|
maxNotifierWorkers = int(numWorkers)
|
|
|
|
}
|
|
|
|
ctx.Logger().V(2).Info("starting notifier workers", "count", maxNotifierWorkers)
|
|
|
|
for worker := 0; worker < maxNotifierWorkers; worker++ {
|
|
|
|
e.WgNotifier.Add(1)
|
|
|
|
go func() {
|
|
|
|
ctx := context.WithValue(ctx, "notifier_worker_id", common.RandomID(5))
|
|
|
|
defer common.Recover(ctx)
|
|
|
|
defer e.WgNotifier.Done()
|
|
|
|
e.notifyResults(ctx)
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
return e, nil
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
|
|
|
|
2022-05-25 16:35:44 +00:00
|
|
|
// Finish waits for running sources to complete and workers to finish scanning
// chunks before closing their respective channels. Once Finish is called, no
// more sources may be scanned by the engine.
//
// The shutdown order matters: each channel is closed only after every
// goroutine that sends on it has exited, so each stage drains fully before
// the next is torn down.
func (e *Engine) Finish(ctx context.Context) error {
	defer common.RecoverWithExit(ctx)
	// Wait for the sources to finish putting chunks onto the chunks channel.
	err := e.sourceManager.Wait()

	e.workersWg.Wait() // Wait for the workers to finish scanning chunks.
	close(e.detectableChunksChan)
	e.wgDetectorWorkers.Wait() // Wait for the detector workers to finish detecting chunks.

	close(e.results)    // Detector workers are done, close the results channel and call it a day.
	e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results.

	// Record total scan time now that the entire pipeline has drained.
	e.metrics.ScanDuration = time.Since(e.metrics.scanStartTime)

	return err
}
|
|
|
|
|
2023-08-03 18:36:30 +00:00
|
|
|
func (e *Engine) ChunksChan() <-chan *sources.Chunk {
|
|
|
|
return e.sourceManager.Chunks()
|
2022-01-13 20:02:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (e *Engine) ResultsChan() chan detectors.ResultWithMetadata {
|
|
|
|
return e.results
|
|
|
|
}
|
|
|
|
|
2023-08-16 23:09:23 +00:00
|
|
|
// ScanChunk injects a chunk into the output stream of chunks to be scanned.
|
|
|
|
// This method should rarely be used. TODO: Remove when dependencies no longer
|
|
|
|
// rely on this functionality.
|
|
|
|
func (e *Engine) ScanChunk(chunk *sources.Chunk) {
|
|
|
|
e.sourceManager.ScanChunk(chunk)
|
|
|
|
}
|
|
|
|
|
2023-07-31 18:12:08 +00:00
|
|
|
// detectableChunk is a decoded chunk that is ready to be scanned by its detector.
type detectableChunk struct {
	// detector is the single detector to run against this chunk.
	detector detectors.Detector
	// chunk holds the decoded data and its source metadata.
	chunk sources.Chunk
	// decoder identifies which decoder produced the chunk's data.
	decoder detectorspb.DecoderType
	// wgDoneFn must be called exactly once when detection of this chunk
	// completes; it releases the producing worker's WaitGroup count.
	wgDoneFn func()
}
|
|
|
|
|
2022-01-13 20:02:24 +00:00
|
|
|
// detectorWorker consumes chunks from the source manager, decodes each one
// with every configured decoder, prefilters detectors by keyword match, and
// enqueues matching (chunk, detector) pairs for the detector workers.
// It blocks until the chunks channel is closed and all enqueued work is done.
func (e *Engine) detectorWorker(ctx context.Context) {
	// Tracks in-flight detectable chunks produced by this worker; each
	// enqueued detectableChunk carries wgDetect.Done as its completion hook.
	var wgDetect sync.WaitGroup

	for originalChunk := range e.ChunksChan() {
		// Re-chunk oversized inputs into scan-sized pieces.
		for chunk := range sources.Chunker(originalChunk) {
			matchedKeywords := make(map[string]struct{})
			atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data)))
			for _, decoder := range e.decoders {
				// Record which decoder produced the data so results can
				// report it.
				var decoderType detectorspb.DecoderType
				switch decoder.(type) {
				case *decoders.UTF8:
					decoderType = detectorspb.DecoderType_PLAIN
				case *decoders.Base64:
					decoderType = detectorspb.DecoderType_BASE64
				case *decoders.UTF16:
					decoderType = detectorspb.DecoderType_UTF16
				default:
					ctx.Logger().Info("unknown decoder type", "type", reflect.TypeOf(decoder).String())
					decoderType = detectorspb.DecoderType_UNKNOWN
				}

				decoded := decoder.FromChunk(chunk)

				// A nil result means this decoder could not decode the chunk.
				if decoded == nil {
					continue
				}

				// build a map of all keywords that were matched in the chunk
				for _, m := range e.prefilter.MatchString(strings.ToLower(string(decoded.Data))) {
					matchedKeywords[strings.ToLower(m.MatchString())] = struct{}{}
				}

				for verify, detectorsSet := range e.detectors {
					for _, detector := range detectorsSet {
						// Only dispatch a detector if at least one of its
						// keywords appeared in the decoded data.
						chunkContainsKeyword := false
						for _, kw := range detector.Keywords() {
							if _, ok := matchedKeywords[strings.ToLower(kw)]; ok {
								chunkContainsKeyword = true
								break
							}
						}

						if !chunkContainsKeyword {
							continue
						}

						decoded.Verify = verify
						wgDetect.Add(1)
						e.detectableChunksChan <- detectableChunk{
							chunk:    *decoded,
							detector: detector,
							decoder:  decoderType,
							wgDoneFn: wgDetect.Done,
						}
					}
				}
			}
			atomic.AddUint64(&e.metrics.ChunksScanned, 1)
		}
	}
	// Do not return until every chunk this worker enqueued has been detected.
	wgDetect.Wait()
}
|
|
|
|
|
|
|
|
func (e *Engine) detectChunks(ctx context.Context) {
|
|
|
|
for data := range e.detectableChunksChan {
|
|
|
|
e.detectChunk(ctx, data)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
|
|
|
|
var start time.Time
|
|
|
|
if e.printAvgDetectorTime {
|
|
|
|
start = time.Now()
|
|
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
|
|
|
|
defer common.Recover(ctx)
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
results, err := data.detector.FromData(ctx, data.chunk.Verify, data.chunk.Data)
|
|
|
|
if err != nil {
|
|
|
|
ctx.Logger().Error(err, "error scanning chunk")
|
|
|
|
}
|
|
|
|
if e.printAvgDetectorTime && len(results) > 0 {
|
|
|
|
elapsed := time.Since(start)
|
|
|
|
detectorName := results[0].DetectorType.String()
|
|
|
|
avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName)
|
|
|
|
var avgTime []time.Duration
|
|
|
|
if ok {
|
|
|
|
avgTime, ok = avgTimeI.([]time.Duration)
|
|
|
|
if !ok {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
avgTime = append(avgTime, elapsed)
|
|
|
|
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
|
|
|
|
}
|
|
|
|
|
|
|
|
if e.filterUnverified {
|
|
|
|
results = detectors.CleanResults(results)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, res := range results {
|
|
|
|
e.processResult(data, res)
|
|
|
|
}
|
|
|
|
data.wgDoneFn()
|
|
|
|
}
|
|
|
|
|
|
|
|
// processResult attaches line-number metadata to the result when the source
// type supports it, drops results whose line carries the ignore tag, and
// publishes the remainder to the results channel.
func (e *Engine) processResult(data detectableChunk, res detectors.Result) {
	ignoreLinePresent := false
	if SupportsLineNumbers(data.chunk.SourceType) {
		// Work on a copy with cloned metadata so the shared chunk's
		// SourceMetadata is never mutated in place.
		copyChunk := data.chunk
		copyMetaDataClone := proto.Clone(data.chunk.SourceMetadata)
		if copyMetaData, ok := copyMetaDataClone.(*source_metadatapb.MetaData); ok {
			copyChunk.SourceMetadata = copyMetaData
		}
		fragStart, mdLine := FragmentFirstLine(&copyChunk)
		ignoreLinePresent = SetResultLineNumber(&copyChunk, &res, fragStart, mdLine)
		data.chunk = copyChunk
	}
	if ignoreLinePresent {
		// The matched line contains the ignore tag; suppress this result.
		return
	}

	secret := detectors.CopyMetadata(&data.chunk, res)
	secret.DecoderType = data.decoder
	e.results <- secret
}
|
|
|
|
|
|
|
|
// notifyResults consumes the results channel, deduplicates findings, updates
// the verified/unverified counters, and hands each surviving result to the
// configured printer. It exits when the results channel is closed by Finish.
func (e *Engine) notifyResults(ctx context.Context) {
	for r := range e.ResultsChan() {
		if e.onlyVerified && !r.Verified {
			continue
		}
		atomic.AddUint32(&e.numFoundResults, 1)

		// Dedupe results by comparing the detector type, raw result, and source metadata.
		// We want to avoid duplicate results with different decoder types, but we also
		// want to include duplicate results with the same decoder type.
		// Duplicate results with the same decoder type SHOULD have their own entry in the
		// results list, this would happen if the same secret is found multiple times.
		key := fmt.Sprintf("%s%s%s%+v", r.DetectorType.String(), r.Raw, r.RawV2, r.SourceMetadata)
		if val, ok := e.dedupeCache.Get(key); ok {
			// Skip only when the cached entry came from a DIFFERENT decoder;
			// a repeat from the same decoder is a genuine separate finding.
			if res, ok := val.(detectorspb.DecoderType); ok && res != r.DecoderType {
				continue
			}
		}
		e.dedupeCache.Add(key, r.DecoderType)

		if r.Verified {
			atomic.AddUint64(&e.metrics.VerifiedSecretsFound, 1)
		} else {
			atomic.AddUint64(&e.metrics.UnverifiedSecretsFound, 1)
		}

		if err := e.printer.Print(ctx, &r); err != nil {
			ctx.Logger().Error(err, "error printing result")
		}
	}
}
|
2022-03-22 16:27:15 +00:00
|
|
|
|
2023-07-12 22:47:43 +00:00
|
|
|
// SupportsLineNumbers determines if a line number can be found for a source type.
|
|
|
|
func SupportsLineNumbers(sourceType sourcespb.SourceType) bool {
|
|
|
|
switch sourceType {
|
|
|
|
case sourcespb.SourceType_SOURCE_TYPE_GIT,
|
2022-03-22 16:27:15 +00:00
|
|
|
sourcespb.SourceType_SOURCE_TYPE_GITHUB,
|
|
|
|
sourcespb.SourceType_SOURCE_TYPE_GITLAB,
|
|
|
|
sourcespb.SourceType_SOURCE_TYPE_BITBUCKET,
|
|
|
|
sourcespb.SourceType_SOURCE_TYPE_GERRIT,
|
|
|
|
sourcespb.SourceType_SOURCE_TYPE_GITHUB_UNAUTHENTICATED_ORG,
|
|
|
|
sourcespb.SourceType_SOURCE_TYPE_PUBLIC_GIT,
|
2023-07-12 22:47:43 +00:00
|
|
|
sourcespb.SourceType_SOURCE_TYPE_FILESYSTEM:
|
|
|
|
return true
|
|
|
|
default:
|
|
|
|
return false
|
2022-03-22 16:27:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-04 21:11:10 +00:00
|
|
|
// FragmentLineOffset sets the line number for a provided source chunk with a given detector result.
|
2023-06-29 13:45:56 +00:00
|
|
|
func FragmentLineOffset(chunk *sources.Chunk, result *detectors.Result) (int64, bool) {
|
2023-08-14 17:51:41 +00:00
|
|
|
before, after, found := bytes.Cut(chunk.Data, result.Raw)
|
|
|
|
if !found {
|
|
|
|
return 0, false
|
|
|
|
}
|
|
|
|
lineNumber := int64(bytes.Count(before, []byte("\n")))
|
|
|
|
// If the line contains the ignore tag, we should ignore the result.
|
|
|
|
endLine := bytes.Index(after, []byte("\n"))
|
|
|
|
if endLine == -1 {
|
|
|
|
endLine = len(after)
|
|
|
|
}
|
|
|
|
if bytes.Contains(after[:endLine], []byte(ignoreTag)) {
|
|
|
|
return lineNumber, true
|
2022-05-04 21:11:10 +00:00
|
|
|
}
|
2023-08-14 17:51:41 +00:00
|
|
|
return lineNumber, false
|
2022-05-04 21:11:10 +00:00
|
|
|
}
|
|
|
|
|
2022-12-06 23:31:15 +00:00
|
|
|
// FragmentFirstLine returns the first line number of a fragment along with a pointer to the value to update in the
|
2022-05-04 21:11:10 +00:00
|
|
|
// chunk metadata.
|
2022-12-06 23:31:15 +00:00
|
|
|
func FragmentFirstLine(chunk *sources.Chunk) (int64, *int64) {
|
2022-05-04 21:11:10 +00:00
|
|
|
var fragmentStart *int64
|
2022-03-22 16:27:15 +00:00
|
|
|
switch metadata := chunk.SourceMetadata.GetData().(type) {
|
|
|
|
case *source_metadatapb.MetaData_Git:
|
2022-05-04 21:11:10 +00:00
|
|
|
fragmentStart = &metadata.Git.Line
|
2022-03-22 16:27:15 +00:00
|
|
|
case *source_metadatapb.MetaData_Github:
|
2022-05-04 21:11:10 +00:00
|
|
|
fragmentStart = &metadata.Github.Line
|
2022-03-22 16:27:15 +00:00
|
|
|
case *source_metadatapb.MetaData_Gitlab:
|
2022-05-04 21:11:10 +00:00
|
|
|
fragmentStart = &metadata.Gitlab.Line
|
2022-03-22 16:27:15 +00:00
|
|
|
case *source_metadatapb.MetaData_Bitbucket:
|
2022-05-04 21:11:10 +00:00
|
|
|
fragmentStart = &metadata.Bitbucket.Line
|
2022-03-22 16:27:15 +00:00
|
|
|
case *source_metadatapb.MetaData_Gerrit:
|
2022-05-04 21:11:10 +00:00
|
|
|
fragmentStart = &metadata.Gerrit.Line
|
2023-05-09 15:02:34 +00:00
|
|
|
case *source_metadatapb.MetaData_Filesystem:
|
|
|
|
fragmentStart = &metadata.Filesystem.Line
|
2022-05-04 22:45:12 +00:00
|
|
|
default:
|
|
|
|
return 0, nil
|
2022-03-22 16:27:15 +00:00
|
|
|
}
|
2022-05-04 21:11:10 +00:00
|
|
|
return *fragmentStart, fragmentStart
|
2022-03-22 16:27:15 +00:00
|
|
|
}
|
2022-12-06 23:31:15 +00:00
|
|
|
|
|
|
|
// SetResultLineNumber sets the line number in the provided result.
|
2023-06-29 13:45:56 +00:00
|
|
|
func SetResultLineNumber(chunk *sources.Chunk, result *detectors.Result, fragStart int64, mdLine *int64) bool {
|
|
|
|
offset, skip := FragmentLineOffset(chunk, result)
|
2023-01-10 17:35:44 +00:00
|
|
|
*mdLine = fragStart + offset
|
2023-06-29 13:45:56 +00:00
|
|
|
return skip
|
2022-12-06 23:31:15 +00:00
|
|
|
}
|