trufflehog/pkg/detectors/detectors.go

package detectors

import (
	"context"
	"crypto/rand"
	"math/big"
	"net/url"
	"strings"
	"unicode"

	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// Detector defines an interface for scanning for and verifying secrets.
// It is the contract every secret detector fulfils: find candidate secrets
// in a blob of bytes, advertise the keywords used to pre-filter input, and
// report which detector type it is.
type Detector interface {
	// FromData will scan bytes for results, and optionally verify them.
	// It returns one Result per finding in data; verify selects whether the
	// findings should also be verified.
	FromData(ctx context.Context, verify bool, data []byte) ([]Result, error)
	// Keywords are used for efficiently pre-filtering chunks using substring operations.
	// Use unique identifiers that are part of the secret if you can, or the provider name.
	Keywords() []string
	// Type returns the DetectorType number from detectors.proto for the given detector.
	Type() detectorspb.DetectorType
}

// Versioner is an optional interface that a detector can implement to
// differentiate instances of the same detector type.
type Versioner interface {
	// Version returns the number that tells this detector apart from other
	// detectors reporting the same detector type.
	Version() int
}

// EndpointCustomizer is an optional interface that a detector can implement to
// support verifying against user-supplied endpoints.
type EndpointCustomizer interface {
	// SetEndpoints hands the user-supplied endpoints to the detector and
	// returns an error if they cannot be accepted.
	// NOTE(review): whether these replace or extend the default endpoint is
	// decided by each implementation — confirm before relying on either.
	SetEndpoints(...string) error
	// DefaultEndpoint returns the detector's default endpoint.
	// NOTE(review): presumably the endpoint used when none are supplied — confirm.
	DefaultEndpoint() string
}

// Result is a single finding returned by Detector.FromData.
//
// Verified records whether the finding was verified. ExtraData and
// StructuredData carry additional information attached by the detector
// (NOTE(review): their exact contents are detector-specific — see the
// individual detectors).
type Result struct {
	// DetectorType is the type of Detector.
	DetectorType detectorspb.DetectorType
	// DetectorName is the name of the Detector. Used for custom detectors.
	DetectorName string
	// DecoderType is the type of Decoder.
	DecoderType detectorspb.DecoderType
	Verified    bool
	// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.
	Raw []byte
	// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
	// This is used for secrets that are multi part and could have the same ID. Ex: AWS credentials
	RawV2 []byte
	// Redacted contains the redacted version of the raw secret identification data for display purposes.
	// A secret ID should be used if available.
	// CleanResults also uses this value as its de-duplication key for verified results.
	Redacted       string
	ExtraData      map[string]string
	StructuredData *detectorspb.StructuredData

	// VerificationError should only be populated if the verification process itself failed in a way that provides no
	// information about the verification status of the candidate secret, such as if the verification request timed out.
	VerificationError error
}

// ResultWithMetadata is a Result combined with the details of the source
// chunk it was found in. CopyMetadata builds one from a chunk and a Result.
type ResultWithMetadata struct {
	// SourceMetadata contains source-specific contextual information.
	SourceMetadata *source_metadatapb.MetaData
	// SourceID is the ID of the source that the API uses to map secrets to specific sources.
	SourceID sources.SourceID
	// SourceType is the type of Source.
	SourceType sourcespb.SourceType
	// SourceName is the name of the Source.
	SourceName string
	// Result is the detector finding itself; it is embedded, so its fields
	// are accessible directly on ResultWithMetadata.
	Result
	// Data from the sources.Chunk which this result was emitted for.
	// It holds the chunk's own byte slice rather than a copy.
	Data []byte
}

// CopyMetadata wraps result in a ResultWithMetadata, filling in the source
// metadata, ID, type, name and data taken from chunk.
//
// chunk must be non-nil. The chunk's Data slice is shared with the returned
// value, not copied.
func CopyMetadata(chunk *sources.Chunk, result Result) ResultWithMetadata {
	var withMeta ResultWithMetadata

	withMeta.SourceMetadata = chunk.SourceMetadata
	withMeta.SourceID = chunk.SourceID
	withMeta.SourceType = chunk.SourceType
	withMeta.SourceName = chunk.SourceName
	withMeta.Result = result
	withMeta.Data = chunk.Data

	return withMeta
}

// CleanResults reduces a detector's findings to the ones worth reporting.
//
// If results contains verified findings, only those are returned,
// de-duplicated by their Redacted value: when several verified findings share
// the same Redacted string, the last one in results is kept. The returned
// findings appear in the order in which each Redacted value was first seen,
// so the output is deterministic for a given input.
//
// If nothing was verified, just the first (unverified) finding is returned.
// An empty or nil input is returned unchanged.
//
// The input slice is never modified. In the unverified case the returned
// slice aliases results; otherwise it is freshly allocated.
func CleanResults(results []Result) []Result {
	if len(results) == 0 {
		return results
	}

	// position maps a Redacted value to the index of its entry in verified,
	// so duplicates can be overwritten in place without losing the order.
	position := make(map[string]int)
	var verified []Result

	for _, r := range results {
		if !r.Verified {
			continue
		}
		if i, seen := position[r.Redacted]; seen {
			// Same Redacted value as an earlier verified finding: last one wins.
			verified[i] = r
			continue
		}
		position[r.Redacted] = len(verified)
		verified = append(verified, r)
	}

	if len(verified) == 0 {
		return results[:1]
	}

	return verified
}

// PrefixRegex builds a regular-expression prefix requiring that one of the
// given keywords appears shortly before whatever pattern is appended to it.
//
// The returned pattern is case-insensitive and consists of a non-capturing
// alternation of keywords followed by up to 40 arbitrary characters (newlines
// and carriage returns included). Placing it in front of a capturing group
// helps prevent false positives.
//
// Keywords are inserted verbatim, so any regex metacharacters they contain
// keep their regex meaning.
func PrefixRegex(keywords []string) string {
	const (
		open  = `(?i)(?:`
		close = `)(?:.|[\n\r]){0,40}`
	)

	var pattern strings.Builder
	pattern.WriteString(open)
	pattern.WriteString(strings.Join(keywords, "|"))
	pattern.WriteString(close)
	return pattern.String()
}

// KeyIsRandom reports whether key contains at least one Unicode digit.
// It is a low-cost heuristic for reducing false positives: candidate keys
// without any digit are treated as not random.
// Go's regexp package has no lookaheads, so this check has to be made in a
// separate call rather than inside the detector's pattern.
// TODO improve checks. Shannon entropy did not work well.
func KeyIsRandom(key string) bool {
	firstDigit := strings.IndexFunc(key, unicode.IsDigit)
	return firstDigit >= 0
}

// MustGetBenchmarkData returns random payloads for benchmarks, keyed by a
// size label: "xsmall" (10 bytes), "small" (100 bytes), "medium" (1KB),
// "large" (10KB), "xlarge" (100KB) and "xxlarge" (1MB).
//
// Every byte is drawn from crypto/rand, so the contents differ on each call.
// It panics if the random source returns an error.
func MustGetBenchmarkData() map[string][]byte {
	type payloadSpec struct {
		label  string
		length int
	}

	specs := []payloadSpec{
		{label: "xsmall", length: 10},           // 10 bytes
		{label: "small", length: 100},           // 100 bytes
		{label: "medium", length: 1024},         // 1KB
		{label: "large", length: 10 * 1024},     // 10KB
		{label: "xlarge", length: 100 * 1024},   // 100KB
		{label: "xxlarge", length: 1024 * 1024}, // 1MB
	}

	// Exclusive upper bound for one random byte value (0..255).
	byteRange := big.NewInt(256)

	payloads := make(map[string][]byte)
	for _, spec := range specs {
		buf := make([]byte, spec.length)
		for idx := range buf {
			value, err := rand.Int(rand.Reader, byteRange)
			if err != nil {
				panic(err)
			}
			buf[idx] = byte(value.Int64())
		}
		payloads[spec.label] = buf
	}

	return payloads
}

// RedactURL returns the string form of u with its password replaced by
// "********", so the URL can be displayed without exposing the secret.
//
// u is received by value, so the caller's URL is not modified.
//
//   - If u carries no user information there is nothing to redact and the URL
//     is returned as is; no placeholder credentials are added.
//   - If u has a username but no password, the placeholder password is still
//     added after the username.
//   - Only the user-information part of the URL is rewritten; percent-encoded
//     characters in the host, path, query and fragment are left untouched.
//
// Leading and trailing white space is trimmed from the result.
func RedactURL(u url.URL) string {
	if u.User == nil {
		return strings.TrimSpace(u.String())
	}

	u.User = url.UserPassword(u.User.Username(), "********")
	redacted := u.String()

	// URL.String percent-encodes '*' inside the user information as "%2A".
	// Undo that for readability, but only up to the '@' that terminates the
	// user information: '@' itself is always escaped inside it, so the first
	// '@' is that terminator. Any "%2A" appearing later in the URL belongs to
	// the original URL and must be preserved.
	if at := strings.IndexByte(redacted, '@'); at >= 0 {
		redacted = strings.ReplaceAll(redacted[:at], "%2A", "*") + redacted[at:]
	}

	return strings.TrimSpace(redacted)
}
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`package detectors`

			`import (`
			`"context"`
[chore] - update benchmarks. (#1641) * update benchmarks. * remove dupe timer reset. 2023-08-23 21:34:10 +00:00			`"crypto/rand"`
			`"math/big"`
Update sqlserver redaction, deduplication, and URI redaction (#1369) * Update sqlserver redaction, deduplication, and URI redaction * don't use pointer 2023-06-09 18:06:54 +00:00			`"net/url"`
more detectors 2022-01-19 06:24:42 +00:00			`"strings"`
			`"unicode"`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00
module v3 2022-02-10 18:54:33 +00:00			`"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"`
			`"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"`
			`"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"`
			`"github.com/trufflesecurity/trufflehog/v3/pkg/sources"`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`)`

[THOG-128] Code cleanup/ OSS onboarding (#117) * Small amount of code clean up. * Rename sem to concurrency for better readability and to remove an extra comment. * fix stashing issue. Co-authored-by: Ahrav Dutta <ahrav.dutta@trufflesec.com> 2022-04-01 23:47:27 +00:00			`// Detector defines an interface for scanning for and verifying secrets.`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`type Detector interface {`
			`// FromData will scan bytes for results, and optionally verify them.`
			`FromData(ctx context.Context, verify bool, data []byte) ([]Result, error)`
			`// Keywords are used for efficiently pre-filtering chunks using substring operations.`
			`// Use unique identifiers that are part of the secret if you can, or the provider name.`
			`Keywords() []string`
Add Type() to detector interface (#1088) * Add Type() to detector interface The goal here is to allow the detector type information to be used without the need for reflection. This could possibly allow us to more easily inject information into detectors or filter them out if necessary. Co-authored-by: ahmed <ahmed.zahran@trufflesec.com> * remove test detector --------- Co-authored-by: ahmed <ahmed.zahran@trufflesec.com> 2023-02-09 22:46:03 +00:00			`// Type returns the DetectorType number from detectors.proto for the given detector.`
			`Type() detectorspb.DetectorType`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`

Support filtering detectors by version (#1150) * Adjust types to use DetectorID struct * Parse versions with detector include and exclude input * Update detectors filter to use version Co-authored-by: steeeve <steve@trufflesec.com> * Implement Versioner for github, gitlab, and npm detectors Co-authored-by: steeeve <steve@trufflesec.com> --------- Co-authored-by: steeeve <steve@trufflesec.com> 2023-03-02 22:33:56 +00:00			`// Versioner is an optional interface that a detector can implement to`
			`// differentiate instances of the same detector type.`
			`type Versioner interface {`
			`Version() int`
			`}`

Implement EndpointCustomizer (#1291) * Implement EndpointCustomizer Add the EndpointCustomizer interface and EndpointSetter convenience struct, implement EndpointCustomizer for github and gitlab detectors, and add parsing, verification, and applying user-supplied configuration. * Check error from SetEndpoints * Rename variable for clarity 2023-04-27 17:23:50 +00:00			`// EndpointCustomizer is an optional interface that a detector can implement to`
			`// support verifying against user-supplied endpoints.`
			`type EndpointCustomizer interface {`
			`SetEndpoints(...string) error`
			`DefaultEndpoint() string`
			`}`

Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`type Result struct {`
			`// DetectorType is the type of Detector.`
			`DetectorType detectorspb.DetectorType`
Add DetectorName to Result (#1223) * Add DetectorName to Result * Use GetName method instead of Name 2023-03-30 16:40:05 +00:00			`// DetectorName is the name of the Detector. Used for custom detectors.`
			`DetectorName string`
Add decoder type to results. (#835) 2022-10-06 18:55:07 +00:00			`// DecoderType is the type of Decoder.`
			`DecoderType detectorspb.DecoderType`
			`Verified bool`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.`
			`Raw []byte`
[Thog-628] update detector results hash v2 (#710) * Start updating detectors that have two part creds to record the raw result as ID + secret. * Add more detectors. * More detectors. * More detectors. * remove comment out imports. 2022-08-12 21:53:37 +00:00			`// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.`
			`// This is used for secrets that are multi part and could have the same ID. Ex: AWS credentials`
			`RawV2 []byte`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`// Redacted contains the redacted version of the raw secret identification data for display purposes.`
			`// A secret ID should be used if available.`
			`Redacted string`
			`ExtraData map[string]string`
			`StructuredData *detectorspb.StructuredData`
Add new verification error message field (#1463) 2023-07-10 15:15:40 +00:00
			`// This field should only be populated if the verification process itself failed in a way that provides no`
			`// information about the verification status of the candidate secret, such as if the verification request timed out.`
			`VerificationError error`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`

			`type ResultWithMetadata struct {`
[THOG-128] Code cleanup/ OSS onboarding (#117) * Small amount of code clean up. * Rename sem to concurrency for better readability and to remove an extra comment. * fix stashing issue. Co-authored-by: Ahrav Dutta <ahrav.dutta@trufflesec.com> 2022-04-01 23:47:27 +00:00			`// SourceMetadata contains source-specific contextual information.`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`SourceMetadata *source_metadatapb.MetaData`
			`// SourceID is the ID of the source that the API uses to map secrets to specific sources.`
Update Source interface to use SourceID and JobID types (#1774) The previous implementation used int64 for both, which can be mixed up easily. Using distinct types adds a layer of type safety checked by the compiler. 2023-09-14 18:28:24 +00:00			`SourceID sources.SourceID`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`// SourceType is the type of Source.`
			`SourceType sourcespb.SourceType`
			`// SourceName is the name of the Source.`
			`SourceName string`
			`Result`
Add Data member to ResultsMetadata struct. (#1358) When a Result is emitted, it should include the `chunk.Data []byte` so that we can utilize the blob of data which caused the result. This makes it so something catching the results does not have to maintain a collection of chunks to correlate the two together. 2023-05-24 16:21:41 +00:00			`// Data from the sources.Chunk which this result was emitted for`
			`Data []byte`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`

[THOG-128] Code cleanup/ OSS onboarding (#117) * Small amount of code clean up. * Rename sem to concurrency for better readability and to remove an extra comment. * fix stashing issue. Co-authored-by: Ahrav Dutta <ahrav.dutta@trufflesec.com> 2022-04-01 23:47:27 +00:00			`// CopyMetadata returns a detector result with included metadata from the source chunk.`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`func CopyMetadata(chunk *sources.Chunk, result Result) ResultWithMetadata {`
			`return ResultWithMetadata{`
			`SourceMetadata: chunk.SourceMetadata,`
			`SourceID: chunk.SourceID,`
			`SourceType: chunk.SourceType,`
			`SourceName: chunk.SourceName,`
			`Result: result,`
Add Data member to ResultsMetadata struct. (#1358) When a Result is emitted, it should include the `chunk.Data []byte` so that we can utilize the blob of data which caused the result. This makes it so something catching the results does not have to maintain a collection of chunks to correlate the two together. 2023-05-24 16:21:41 +00:00			`Data: chunk.Data,`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`
			`}`

			`// CleanResults returns all verified secrets, and if there are no verified secrets,`
			`// just one unverified secret if there are any.`
			`func CleanResults(results []Result) []Result {`
			`if len(results) == 0 {`
			`return results`
			`}`

more detectors 2022-01-19 06:24:42 +00:00			`var cleaned = make(map[string]Result, 0)`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00
			`for _, s := range results {`
			`if s.Verified {`
more detectors 2022-01-19 06:24:42 +00:00			`cleaned[s.Redacted] = s`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`
			`}`

			`if len(cleaned) == 0 {`
			`return results[:1]`
			`}`

more detectors 2022-01-19 06:24:42 +00:00			`results = results[:0]`
			`for _, r := range cleaned {`
			`results = append(results, r)`
			`}`

			`return results`
			`}`

[THOG-128] Code cleanup/ OSS onboarding (#117) * Small amount of code clean up. * Rename sem to concurrency for better readability and to remove an extra comment. * fix stashing issue. Co-authored-by: Ahrav Dutta <ahrav.dutta@trufflesec.com> 2022-04-01 23:47:27 +00:00			`// PrefixRegex ensures that at least one of the given keywords is within`
more detectors 2022-01-19 06:24:42 +00:00			`// 20 characters of the capturing group that follows.`
			`// This can help prevent false positives.`
			`func PrefixRegex(keywords []string) string {`
			pre := `(?i)(?:`
			`middle := strings.Join(keywords, "\|")`
[THOG-234] Update security trails detector's regex and keywords. (#429) * Update detectors PrefixRegex to allow for new line and carriage returns. Add additional keyword for security trails. Add additional unit tests for security trails and PrefixRegex * Update catpure group. 2022-04-18 22:09:50 +00:00			post := `)(?:.\|[\n\r]){0,40}`
more detectors 2022-01-19 06:24:42 +00:00			`return pre + middle + post`
			`}`

[Thog-628] update detector results hash v2 (#710) * Start updating detectors that have two part creds to record the raw result as ID + secret. * Add more detectors. * More detectors. * More detectors. * remove comment out imports. 2022-08-12 21:53:37 +00:00			`// KeyIsRandom is a Low cost check to make sure that 'keys' include a number to reduce FPs.`
fix spelling errors (#1413) 2023-06-21 14:15:28 +00:00			`// Golang doesn't support regex lookaheads, so must be done in separate calls.`
[Thog-628] update detector results hash v2 (#710) * Start updating detectors that have two part creds to record the raw result as ID + secret. * Add more detectors. * More detectors. * More detectors. * remove comment out imports. 2022-08-12 21:53:37 +00:00			`// TODO improve checks. Shannon entropy did not work well.`
more detectors 2022-01-19 06:24:42 +00:00			`func KeyIsRandom(key string) bool {`
			`for _, ch := range key {`
			`if unicode.IsDigit(ch) {`
			`return true`
			`}`
			`}`

			`return false`
			`}`

			`func MustGetBenchmarkData() map[string][]byte {`
[chore] - update benchmarks. (#1641) * update benchmarks. * remove dupe timer reset. 2023-08-23 21:34:10 +00:00			`sizes := map[string]int{`
			`"xsmall": 10, // 10 bytes`
			`"small": 100, // 100 bytes`
			`"medium": 1024, // 1KB`
			`"large": 10 * 1024, // 10KB`
			`"xlarge": 100 * 1024, // 100KB`
			`"xxlarge": 1024 * 1024, // 1MB`
more detectors 2022-01-19 06:24:42 +00:00			`}`
[chore] - update benchmarks. (#1641) * update benchmarks. * remove dupe timer reset. 2023-08-23 21:34:10 +00:00			`data := make(map[string][]byte)`

			`for key, size := range sizes {`
			`// Generating a byte slice of a specific size with random data.`
			`content := make([]byte, size)`
			`for i := 0; i < size; i++ {`
			`randomByte, err := rand.Int(rand.Reader, big.NewInt(256))`
			`if err != nil {`
			`panic(err)`
			`}`
			`content[i] = byte(randomByte.Int64())`
			`}`
			`data[key] = content`
more detectors 2022-01-19 06:24:42 +00:00			`}`

[chore] - update benchmarks. (#1641) * update benchmarks. * remove dupe timer reset. 2023-08-23 21:34:10 +00:00			`return data`
Initial CLI w/ partially implemented Git source and demo detector (#1) 2022-01-13 20:02:24 +00:00			`}`
Update sqlserver redaction, deduplication, and URI redaction (#1369) * Update sqlserver redaction, deduplication, and URI redaction * don't use pointer 2023-06-09 18:06:54 +00:00
			`func RedactURL(u url.URL) string {`
			`u.User = url.UserPassword(u.User.Username(), "********")`
			`return strings.TrimSpace(strings.Replace(u.String(), "%2A", "*", -1))`
			`}`