mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
Add an option to filter unverified results using shannon entropy (#1875)
* Add an option to filter unverified results using shannon entropy * lint * add test, update test, and optimize
This commit is contained in:
parent
f09bce3f75
commit
52ed87edb7
5 changed files with 103 additions and 5 deletions
2
main.go
2
main.go
|
@ -48,6 +48,7 @@ var (
|
|||
noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool()
|
||||
onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool()
|
||||
filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool()
|
||||
filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64()
|
||||
configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile()
|
||||
// rules = cli.Flag("rules", "Path to file with custom rules.").String()
|
||||
printAvgDetectorTime = cli.Flag("print-avg-detector-time", "Print the average time spent on each detector.").Bool()
|
||||
|
@ -370,6 +371,7 @@ func run(state overseer.State) {
|
|||
engine.WithOnlyVerified(*onlyVerified),
|
||||
engine.WithPrintAvgDetectorTime(*printAvgDetectorTime),
|
||||
engine.WithPrinter(printer),
|
||||
engine.WithFilterEntropy(*filterEntropy),
|
||||
)
|
||||
if err != nil {
|
||||
logFatal(err, "error initializing engine")
|
||||
|
|
|
@ -12,15 +12,15 @@ func TestPrefixRegex(t *testing.T) {
|
|||
}{
|
||||
{
|
||||
keywords: []string{"securitytrails"},
|
||||
expected: `(?i)(?:securitytrails).|(?:[\n\r]){0,40}`,
|
||||
expected: `(?i)(?:securitytrails)(?:.|[\n\r]){0,40}`,
|
||||
},
|
||||
{
|
||||
keywords: []string{"zipbooks"},
|
||||
expected: `(?i)(?:zipbooks).|(?:[\n\r]){0,40}`,
|
||||
expected: `(?i)(?:zipbooks)(?:.|[\n\r]){0,40}`,
|
||||
},
|
||||
{
|
||||
keywords: []string{"wrike"},
|
||||
expected: `(?i)(?:wrike).|(?:[\n\r]){0,40}`,
|
||||
expected: `(?i)(?:wrike)(?:.|[\n\r]){0,40}`,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
|
|
|
@ -2,6 +2,7 @@ package detectors
|
|||
|
||||
import (
|
||||
_ "embed"
|
||||
"math"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
@ -90,3 +91,39 @@ func bytesToCleanWordList(data []byte) []string {
|
|||
}
|
||||
return words
|
||||
}
|
||||
|
||||
// StringShannonEntropy returns the Shannon entropy of input in bits per
// character, where a character is one Unicode code point (rune).
//
// The result is 0 for an empty string and for a string made of a single
// repeated character; it grows as the characters become more evenly
// distributed over a larger alphabet.
func StringShannonEntropy(input string) float64 {
	chars := make(map[rune]float64)

	// Count runes while tallying them. len(input) is a byte count and would
	// understate every probability for input containing multi-byte characters.
	total := 0
	for _, char := range input {
		chars[char]++
		total++
	}
	if total == 0 {
		// No characters means no information; also avoids dividing by zero.
		return 0
	}

	inverseTotal := 1 / float64(total) // precompute the inverse

	entropy := 0.0
	for _, count := range chars {
		probability := count * inverseTotal
		entropy += probability * math.Log2(probability)
	}

	return -entropy
}
|
||||
|
||||
// FilterResultsWithEntropy filters out determinately unverified results that have a shannon entropy below the given value.
|
||||
func FilterResultsWithEntropy(results []Result, entropy float64) []Result {
|
||||
filteredResults := []Result{}
|
||||
for _, result := range results {
|
||||
if !result.Verified && result.VerificationError == nil {
|
||||
if result.RawV2 != nil {
|
||||
if StringShannonEntropy(string(result.RawV2)) >= entropy {
|
||||
filteredResults = append(filteredResults, result)
|
||||
}
|
||||
} else {
|
||||
if StringShannonEntropy(string(result.Raw)) >= entropy {
|
||||
filteredResults = append(filteredResults, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return filteredResults
|
||||
}
|
||||
|
|
|
@ -3,7 +3,10 @@
|
|||
|
||||
package detectors
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
_ "embed"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestIsFalsePositive(t *testing.T) {
|
||||
type args struct {
|
||||
|
@ -40,3 +43,43 @@ func TestIsFalsePositive(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStringShannonEntropy(t *testing.T) {
|
||||
type args struct {
|
||||
input string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want float64
|
||||
}{
|
||||
{
|
||||
name: "entropy 1",
|
||||
args: args{
|
||||
input: "aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
},
|
||||
want: 0,
|
||||
},
|
||||
{
|
||||
name: "entropy 2",
|
||||
args: args{
|
||||
input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
|
||||
},
|
||||
want: 0.22228483068568816,
|
||||
},
|
||||
{
|
||||
name: "entropy 3",
|
||||
args: args{
|
||||
input: "aaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaab",
|
||||
},
|
||||
want: 0.22228483068568816,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := StringShannonEntropy(tt.args.input); got != tt.want {
|
||||
t.Errorf("StringShannonEntropy() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -60,7 +60,9 @@ type Engine struct {
|
|||
// filterUnverified is used to reduce the number of unverified results.
|
||||
// If there are multiple unverified results for the same chunk for the same detector,
|
||||
// only the first one will be kept.
|
||||
filterUnverified bool
|
||||
filterUnverified bool
|
||||
// filterEntropy is used to filter out unverified results using Shannon entropy.
|
||||
filterEntropy *float64
|
||||
onlyVerified bool
|
||||
printAvgDetectorTime bool
|
||||
|
||||
|
@ -128,6 +130,15 @@ func WithFilterUnverified(filter bool) EngineOption {
|
|||
}
|
||||
}
|
||||
|
||||
// WithFilterEntropy filters out unverified results using Shannon entropy.
|
||||
func WithFilterEntropy(entropy float64) EngineOption {
|
||||
return func(e *Engine) {
|
||||
if entropy > 0 {
|
||||
e.filterEntropy = &entropy
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WithOnlyVerified sets the onlyVerified flag on the engine. If set to true,
|
||||
// the engine will only print verified results.
|
||||
func WithOnlyVerified(onlyVerified bool) EngineOption {
|
||||
|
@ -513,6 +524,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
|
|||
if err != nil {
|
||||
ctx.Logger().Error(err, "error scanning chunk")
|
||||
}
|
||||
|
||||
if e.printAvgDetectorTime && len(results) > 0 {
|
||||
elapsed := time.Since(start)
|
||||
detectorName := results[0].DetectorType.String()
|
||||
|
@ -532,6 +544,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
|
|||
results = detectors.CleanResults(results)
|
||||
}
|
||||
|
||||
if e.filterEntropy != nil {
|
||||
results = detectors.FilterResultsWithEntropy(results, *e.filterEntropy)
|
||||
}
|
||||
|
||||
for _, res := range results {
|
||||
e.processResult(ctx, data, res)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue