trufflehog/pkg/custom_detectors/custom_detectors.go
Cody Rose 2f7029bc4d
Expose detector-specific false positive logic (#2743)
This PR:

Creates an optional interface that detectors can use to customize their false positive detection
Implements this interface on detectors that have custom logic
In most cases this "custom logic" is simply a no-op because the detector does not participate in false positive detection
Eliminates inline (old-style) false positive exclusion in a few detectors that #2643 missed
2024-04-30 16:10:26 -04:00

255 lines
7 KiB
Go

package custom_detectors
import (
"bytes"
"context"
"encoding/json"
"net/http"
"regexp"
"strings"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"golang.org/x/sync/errgroup"
)
// The maximum number of matches from one chunk. This const is used when
// permutating each regex match to protect the scanner from doing too much work
// for poorly defined regexps.
const maxTotalMatches = 100
// CustomRegexWebhook is a CustomRegex with webhook validation that is
// guaranteed to be valid (assuming the data is not changed after
// initialization).
type CustomRegexWebhook struct {
*custom_detectorspb.CustomRegex
}
// Ensure the Scanner satisfies the interface at compile time.
var _ detectors.Detector = (*CustomRegexWebhook)(nil)
var _ detectors.CustomFalsePositiveChecker = (*CustomRegexWebhook)(nil)
// NewWebhookCustomRegex initializes and validates a CustomRegexWebhook. An
// unexported type is intentionally returned here to ensure the values have
// been validated.
func NewWebhookCustomRegex(pb *custom_detectorspb.CustomRegex) (*CustomRegexWebhook, error) {
// TODO: Return all validation errors.
if err := ValidateKeywords(pb.Keywords); err != nil {
return nil, err
}
if err := ValidateRegex(pb.Regex); err != nil {
return nil, err
}
for _, verify := range pb.Verify {
if err := ValidateVerifyEndpoint(verify.Endpoint, verify.Unsafe); err != nil {
return nil, err
}
if err := ValidateVerifyHeaders(verify.Headers); err != nil {
return nil, err
}
}
// TODO: Copy only necessary data out of pb.
return &CustomRegexWebhook{pb}, nil
}
var httpClient = common.SaneHttpClient()
func (c *CustomRegexWebhook) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
dataStr := string(data)
regexMatches := make(map[string][][]string, len(c.GetRegex()))
// Find all submatches for each regex.
for name, regex := range c.GetRegex() {
regex, err := regexp.Compile(regex)
if err != nil {
// This will only happen if the regex is invalid.
return nil, err
}
regexMatches[name] = regex.FindAllStringSubmatch(dataStr, -1)
}
// Permutate each individual match.
// {
// "foo": [["match1"]]
// "bar": [["match2"], ["match3"]]
// }
// becomes
// [
// {"foo": ["match1"], "bar": ["match2"]},
// {"foo": ["match1"], "bar": ["match3"]},
// ]
matches := permutateMatches(regexMatches)
g := new(errgroup.Group)
// Create result object and test for verification.
resultsCh := make(chan detectors.Result, maxTotalMatches)
for _, match := range matches {
match := match
g.Go(func() error {
return c.createResults(ctx, match, verify, resultsCh)
})
}
// Ignore any errors and collect as many of the results as we can.
_ = g.Wait()
close(resultsCh)
for result := range resultsCh {
// NOTE: I don't believe this is being set anywhere else, hence the map assignment.
result.ExtraData = map[string]string{
"name": c.GetName(),
}
results = append(results, result)
}
return results, nil
}
func (c *CustomRegexWebhook) IsFalsePositive(_ detectors.Result) bool {
return false
}
func (c *CustomRegexWebhook) createResults(ctx context.Context, match map[string][]string, verify bool, results chan<- detectors.Result) error {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return ctx.Err()
}
var raw string
for _, values := range match {
// values[0] contains the entire regex match.
raw += values[0]
}
result := detectors.Result{
DetectorType: detectorspb.DetectorType_CustomRegex,
DetectorName: c.GetName(),
Raw: []byte(raw),
}
if !verify {
select {
case <-ctx.Done():
return ctx.Err()
case results <- result:
return nil
}
}
// Verify via webhook.
jsonBody, err := json.Marshal(map[string]map[string][]string{
c.GetName(): match,
})
if err != nil {
// This should never happen, but if it does, return nil to not
// disrupt other verification.
return nil
}
// Try each config until we successfully verify.
for _, verifyConfig := range c.GetVerify() {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return ctx.Err()
}
req, err := http.NewRequestWithContext(ctx, "POST", verifyConfig.GetEndpoint(), bytes.NewReader(jsonBody))
if err != nil {
continue
}
for _, header := range verifyConfig.GetHeaders() {
key, value, found := strings.Cut(header, ":")
if !found {
// Should be unreachable due to validation.
continue
}
req.Header.Add(key, strings.TrimLeft(value, "\t\n\v\f\r "))
}
res, err := httpClient.Do(req)
if err != nil {
continue
}
// TODO: Read response body.
res.Body.Close()
if res.StatusCode == http.StatusOK {
result.Verified = true
break
}
}
select {
case <-ctx.Done():
return ctx.Err()
case results <- result:
return nil
}
}
func (c *CustomRegexWebhook) Keywords() []string {
return c.GetKeywords()
}
// productIndices produces a permutation of indices for each length. Example:
// productIndices(3, 2) -> [[0 0] [1 0] [2 0] [0 1] [1 1] [2 1]]. It returns
// a slice of length no larger than maxTotalMatches.
func productIndices(lengths ...int) [][]int {
count := 1
for _, l := range lengths {
count *= l
}
if count == 0 {
return nil
}
if count > maxTotalMatches {
count = maxTotalMatches
}
results := make([][]int, count)
for i := 0; i < count; i++ {
j := 1
result := make([]int, 0, len(lengths))
for _, l := range lengths {
result = append(result, (i/j)%l)
j *= l
}
results[i] = result
}
return results
}
// permutateMatches converts the list of all regex matches into all possible
// permutations selecting one from each named entry in the map. For example:
// {"foo": [matchA, matchB], "bar": [matchC]} becomes
//
// [{"foo": matchA, "bar": matchC}, {"foo": matchB, "bar": matchC}]
func permutateMatches(regexMatches map[string][][]string) []map[string][]string {
// Get a consistent order for names and their matching lengths.
// The lengths are used in calculating the permutation so order matters.
names := make([]string, 0, len(regexMatches))
lengths := make([]int, 0, len(regexMatches))
for key, value := range regexMatches {
names = append(names, key)
lengths = append(lengths, len(value))
}
// Permutate all the indices for each match. For example, if "foo" has
// [matchA, matchB] and "bar" has [matchC], we will get indices [0 0] [1 0].
permutationIndices := productIndices(lengths...)
// Build {"foo": matchA, "bar": matchC} and {"foo": matchB, "bar": matchC}
// from the indices.
var matches []map[string][]string
for _, permutation := range permutationIndices {
candidate := make(map[string][]string, len(permutationIndices))
for i, name := range names {
candidate[name] = regexMatches[name][permutation[i]]
}
matches = append(matches, candidate)
}
return matches
}
func (c *CustomRegexWebhook) Type() detectorspb.DetectorType {
return detectorspb.DetectorType_CustomRegex
}