trufflehog/pkg/custom_detectors/custom_detectors.go
2024-06-14 08:26:05 -07:00

256 lines
7 KiB
Go

package custom_detectors
import (
"bytes"
"context"
"encoding/json"
"net/http"
"regexp"
"strings"
"golang.org/x/sync/errgroup"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// The maximum number of matches from one chunk. This const is used when
// permutating each regex match to protect the scanner from doing too much work
// for poorly defined regexps.
const maxTotalMatches = 100
// CustomRegexWebhook is a CustomRegex with webhook validation that is
// guaranteed to be valid (assuming the data is not changed after
// initialization).
type CustomRegexWebhook struct {
*custom_detectorspb.CustomRegex
}
// Ensure the Scanner satisfies the interface at compile time.
var _ detectors.Detector = (*CustomRegexWebhook)(nil)
var _ detectors.CustomFalsePositiveChecker = (*CustomRegexWebhook)(nil)
// NewWebhookCustomRegex initializes and validates a CustomRegexWebhook. An
// unexported type is intentionally returned here to ensure the values have
// been validated.
func NewWebhookCustomRegex(pb *custom_detectorspb.CustomRegex) (*CustomRegexWebhook, error) {
// TODO: Return all validation errors.
if err := ValidateKeywords(pb.Keywords); err != nil {
return nil, err
}
if err := ValidateRegex(pb.Regex); err != nil {
return nil, err
}
for _, verify := range pb.Verify {
if err := ValidateVerifyEndpoint(verify.Endpoint, verify.Unsafe); err != nil {
return nil, err
}
if err := ValidateVerifyHeaders(verify.Headers); err != nil {
return nil, err
}
}
// TODO: Copy only necessary data out of pb.
return &CustomRegexWebhook{pb}, nil
}
var httpClient = common.SaneHttpClient()
func (c *CustomRegexWebhook) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
dataStr := string(data)
regexMatches := make(map[string][][]string, len(c.GetRegex()))
// Find all submatches for each regex.
for name, regex := range c.GetRegex() {
regex, err := regexp.Compile(regex)
if err != nil {
// This will only happen if the regex is invalid.
return nil, err
}
regexMatches[name] = regex.FindAllStringSubmatch(dataStr, -1)
}
// Permutate each individual match.
// {
// "foo": [["match1"]]
// "bar": [["match2"], ["match3"]]
// }
// becomes
// [
// {"foo": ["match1"], "bar": ["match2"]},
// {"foo": ["match1"], "bar": ["match3"]},
// ]
matches := permutateMatches(regexMatches)
g := new(errgroup.Group)
// Create result object and test for verification.
resultsCh := make(chan detectors.Result, maxTotalMatches)
for _, match := range matches {
match := match
g.Go(func() error {
return c.createResults(ctx, match, verify, resultsCh)
})
}
// Ignore any errors and collect as many of the results as we can.
_ = g.Wait()
close(resultsCh)
for result := range resultsCh {
// NOTE: I don't believe this is being set anywhere else, hence the map assignment.
result.ExtraData = map[string]string{
"name": c.GetName(),
}
results = append(results, result)
}
return results, nil
}
func (c *CustomRegexWebhook) IsFalsePositive(_ detectors.Result) (bool, string) {
return false, ""
}
func (c *CustomRegexWebhook) createResults(ctx context.Context, match map[string][]string, verify bool, results chan<- detectors.Result) error {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return ctx.Err()
}
var raw string
for _, values := range match {
// values[0] contains the entire regex match.
raw += values[0]
}
result := detectors.Result{
DetectorType: detectorspb.DetectorType_CustomRegex,
DetectorName: c.GetName(),
Raw: []byte(raw),
}
if !verify {
select {
case <-ctx.Done():
return ctx.Err()
case results <- result:
return nil
}
}
// Verify via webhook.
jsonBody, err := json.Marshal(map[string]map[string][]string{
c.GetName(): match,
})
if err != nil {
// This should never happen, but if it does, return nil to not
// disrupt other verification.
return nil
}
// Try each config until we successfully verify.
for _, verifyConfig := range c.GetVerify() {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return ctx.Err()
}
req, err := http.NewRequestWithContext(ctx, "POST", verifyConfig.GetEndpoint(), bytes.NewReader(jsonBody))
if err != nil {
continue
}
for _, header := range verifyConfig.GetHeaders() {
key, value, found := strings.Cut(header, ":")
if !found {
// Should be unreachable due to validation.
continue
}
req.Header.Add(key, strings.TrimLeft(value, "\t\n\v\f\r "))
}
res, err := httpClient.Do(req)
if err != nil {
continue
}
// TODO: Read response body.
res.Body.Close()
if res.StatusCode == http.StatusOK {
result.Verified = true
break
}
}
select {
case <-ctx.Done():
return ctx.Err()
case results <- result:
return nil
}
}
func (c *CustomRegexWebhook) Keywords() []string {
return c.GetKeywords()
}
// productIndices produces a permutation of indices for each length. Example:
// productIndices(3, 2) -> [[0 0] [1 0] [2 0] [0 1] [1 1] [2 1]]. It returns
// a slice of length no larger than maxTotalMatches.
func productIndices(lengths ...int) [][]int {
count := 1
for _, l := range lengths {
count *= l
}
if count == 0 {
return nil
}
if count > maxTotalMatches {
count = maxTotalMatches
}
results := make([][]int, count)
for i := 0; i < count; i++ {
j := 1
result := make([]int, 0, len(lengths))
for _, l := range lengths {
result = append(result, (i/j)%l)
j *= l
}
results[i] = result
}
return results
}
// permutateMatches converts the list of all regex matches into all possible
// permutations selecting one from each named entry in the map. For example:
// {"foo": [matchA, matchB], "bar": [matchC]} becomes
//
// [{"foo": matchA, "bar": matchC}, {"foo": matchB, "bar": matchC}]
func permutateMatches(regexMatches map[string][][]string) []map[string][]string {
// Get a consistent order for names and their matching lengths.
// The lengths are used in calculating the permutation so order matters.
names := make([]string, 0, len(regexMatches))
lengths := make([]int, 0, len(regexMatches))
for key, value := range regexMatches {
names = append(names, key)
lengths = append(lengths, len(value))
}
// Permutate all the indices for each match. For example, if "foo" has
// [matchA, matchB] and "bar" has [matchC], we will get indices [0 0] [1 0].
permutationIndices := productIndices(lengths...)
// Build {"foo": matchA, "bar": matchC} and {"foo": matchB, "bar": matchC}
// from the indices.
var matches []map[string][]string
for _, permutation := range permutationIndices {
candidate := make(map[string][]string, len(permutationIndices))
for i, name := range names {
candidate[name] = regexMatches[name][permutation[i]]
}
matches = append(matches, candidate)
}
return matches
}
func (c *CustomRegexWebhook) Type() detectorspb.DetectorType {
return detectorspb.DetectorType_CustomRegex
}