trufflehog/pkg/detectors/detectors.go
ahrav 96c9c5bf7c
[THOG-234] Update security trails detector's regex and keywords. (#429)
* Update detectors PrefixRegex to allow for new line and carriage returns.
Add additional keyword for security trails.
Add additional unit tests for security trails and PrefixRegex

* Update capture group.
2022-04-18 15:09:50 -07:00

130 lines
3.6 KiB
Go

package detectors
import (
"context"
"os"
"path/filepath"
"runtime"
"strings"
"unicode"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// Detector defines an interface for scanning for and verifying secrets.
type Detector interface {
	// FromData will scan bytes for results, and optionally verify them.
	// The verify flag selects whether verification is attempted; data is the
	// bytes to scan. It returns the results found, or an error.
	FromData(ctx context.Context, verify bool, data []byte) ([]Result, error)
	// Keywords are used for efficiently pre-filtering chunks using substring operations.
	// Use unique identifiers that are part of the secret if you can, or the provider name.
	Keywords() []string
}
// Result is the element type returned by Detector.FromData: one finding
// reported by a detector.
type Result struct {
	// DetectorType is the type of Detector.
	DetectorType detectorspb.DetectorType
	// Verified marks the result as a verified secret. CleanResults keeps
	// verified results in preference to unverified ones.
	Verified bool
	// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.
	Raw []byte
	// Redacted contains the redacted version of the raw secret identification data for display purposes.
	// A secret ID should be used if available.
	// CleanResults also uses this value as the key when deduplicating verified results.
	Redacted string
	// ExtraData holds additional string key/value pairs attached to the result.
	// NOTE(review): the expected keys are not defined in this file — confirm with the individual detectors.
	ExtraData map[string]string
	// StructuredData points to a protobuf StructuredData message for the result.
	// NOTE(review): presumably nil when a detector has nothing structured to report — confirm.
	StructuredData *detectorspb.StructuredData
}
// ResultWithMetadata is a Result together with fields describing the source
// it was found in. CopyMetadata fills these fields from a sources.Chunk.
type ResultWithMetadata struct {
	// SourceMetadata contains source-specific contextual information.
	SourceMetadata *source_metadatapb.MetaData
	// SourceID is the ID of the source that the API uses to map secrets to specific sources.
	SourceID int64
	// SourceType is the type of Source.
	SourceType sourcespb.SourceType
	// SourceName is the name of the Source.
	SourceName string
	// Result is embedded, so its fields are promoted onto ResultWithMetadata.
	Result
}
// CopyMetadata wraps result in a ResultWithMetadata whose source fields
// (metadata, ID, type and name) are taken from the given chunk.
func CopyMetadata(chunk *sources.Chunk, result Result) ResultWithMetadata {
	var enriched ResultWithMetadata

	// The detector's finding is carried over unchanged.
	enriched.Result = result

	// Everything else describes where the chunk came from.
	enriched.SourceMetadata = chunk.SourceMetadata
	enriched.SourceID = chunk.SourceID
	enriched.SourceType = chunk.SourceType
	enriched.SourceName = chunk.SourceName

	return enriched
}
// CleanResults reduces a detector's findings to the ones worth reporting.
//
// If at least one result is verified, it returns every verified result,
// deduplicated by Redacted: each distinct Redacted value keeps the position of
// its first verified occurrence, and a later verified result with the same
// Redacted value replaces the earlier one. If no result is verified, it
// returns only the first result. Empty input is returned as is.
//
// The output order is deterministic (it follows the input order rather than
// map iteration order), and the elements of the input slice are never
// overwritten: verified results are collected into a separate slice.
func CleanResults(results []Result) []Result {
	if len(results) == 0 {
		return results
	}

	// position maps a Redacted value to the index of its entry in verified,
	// so duplicates can be replaced in place without disturbing the order.
	position := make(map[string]int)
	var verified []Result
	for _, r := range results {
		if !r.Verified {
			continue
		}
		if i, seen := position[r.Redacted]; seen {
			// Same Redacted value seen before: the latest result wins.
			verified[i] = r
			continue
		}
		position[r.Redacted] = len(verified)
		verified = append(verified, r)
	}

	if len(verified) == 0 {
		// Nothing verified: report a single unverified result.
		return results[:1]
	}
	return verified
}
// PrefixRegex builds a case-insensitive regex prefix requiring that one of the
// given keywords appears at most 40 characters (newlines and carriage returns
// included) before the pattern the caller appends, typically the capturing
// group for the secret. This can help prevent false positives.
// The keywords are joined with "|" exactly as given; they are not escaped.
func PrefixRegex(keywords []string) string {
	alternatives := strings.Join(keywords, "|")
	return `(?i)(?:` + alternatives + `)(?:.|[\n\r]){0,40}`
}
// KeyIsRandom is a low-cost check that reports whether key contains at least
// one Unicode digit, which helps reduce false positives.
// Go's regexp package does not support lookaheads, so this has to be done as a
// separate call instead of inside the pattern.
// TODO: improve checks. Shannon entropy did not work well.
func KeyIsRandom(key string) bool {
	hasDigit := false
	for _, r := range key {
		if !unicode.IsDigit(r) {
			continue
		}
		hasDigit = true
		break
	}
	return hasDigit
}
// MustGetBenchmarkData returns benchmark inputs of three sizes, keyed by
// "small", "medium" and "big": an empty byte slice, the contents of the
// detectors.go file in the same directory as this source file, and those
// contents repeated 25 times.
// It panics if the source file location cannot be determined or if the file
// cannot be read.
func MustGetBenchmarkData() map[string][]byte {
	// Number of times the medium payload is repeated to build the big one.
	const bigRepeats = 25

	// runtime.Caller(0) reports the path of the file containing this function;
	// the data file is looked up next to it.
	_, filename, _, ok := runtime.Caller(0)
	if !ok {
		panic("detectors: unable to determine the source file location")
	}

	medium, err := os.ReadFile(filepath.Join(filepath.Dir(filename), "detectors.go"))
	if err != nil {
		panic(err)
	}

	// The final size is known up front, so allocate once instead of growing.
	big := make([]byte, 0, bigRepeats*len(medium))
	for i := 0; i < bigRepeats; i++ {
		big = append(big, medium...)
	}

	return map[string][]byte{
		"small":  make([]byte, 0),
		"medium": medium,
		"big":    big,
	}
}