Add an option to filter unverified results using shannon entropy (#1875)

* Add an option to filter unverified results using shannon entropy

* lint

* add test, update test, and optimize
This commit is contained in:
Dustin Decker 2023-10-08 22:52:28 -04:00 committed by GitHub
parent f09bce3f75
commit 52ed87edb7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 103 additions and 5 deletions

View file

@ -48,6 +48,7 @@ var (
noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool()
onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool()
filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool()
filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64()
configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile()
// rules = cli.Flag("rules", "Path to file with custom rules.").String()
printAvgDetectorTime = cli.Flag("print-avg-detector-time", "Print the average time spent on each detector.").Bool()
@ -370,6 +371,7 @@ func run(state overseer.State) {
engine.WithOnlyVerified(*onlyVerified),
engine.WithPrintAvgDetectorTime(*printAvgDetectorTime),
engine.WithPrinter(printer),
engine.WithFilterEntropy(*filterEntropy),
)
if err != nil {
logFatal(err, "error initializing engine")

View file

@ -12,15 +12,15 @@ func TestPrefixRegex(t *testing.T) {
}{
{
keywords: []string{"securitytrails"},
expected: `(?i)(?:securitytrails).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:securitytrails)(?:.|[\n\r]){0,40}`,
},
{
keywords: []string{"zipbooks"},
expected: `(?i)(?:zipbooks).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:zipbooks)(?:.|[\n\r]){0,40}`,
},
{
keywords: []string{"wrike"},
expected: `(?i)(?:wrike).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:wrike)(?:.|[\n\r]){0,40}`,
},
}
for _, tt := range tests {

View file

@ -2,6 +2,7 @@ package detectors
import (
_ "embed"
"math"
"strings"
"unicode"
)
@ -90,3 +91,39 @@ func bytesToCleanWordList(data []byte) []string {
}
return words
}
// StringShannonEntropy returns the Shannon entropy of input in bits per
// character, where a character is one rune as produced by ranging over the
// string.
//
// The empty string has an entropy of 0.
func StringShannonEntropy(input string) float64 {
	if input == "" {
		return 0
	}

	// Count occurrences per rune and the total number of runes in one pass.
	// The total must be the rune count, not len(input): len counts bytes, so
	// using it would make the probabilities of multi-byte input sum to less
	// than 1 and skew the result.
	counts := make(map[rune]float64)
	total := 0
	for _, char := range input {
		counts[char]++
		total++
	}
	inverseTotal := 1 / float64(total) // precompute the inverse

	entropy := 0.0
	for _, count := range counts {
		probability := count * inverseTotal
		// Subtracting each (non-positive) term keeps the result at +0 rather
		// than -0 when the input consists of a single repeated character.
		entropy -= probability * math.Log2(probability)
	}
	return entropy
}
// FilterResultsWithEntropy removes determinately unverified results whose
// Shannon entropy is below the given value.
//
// A result is determinately unverified when it is not verified and carries no
// verification error. Verified results, and results whose verification ended
// in an error, are returned unchanged. For the remaining results the entropy
// is computed over RawV2 when it is non-nil and over Raw otherwise; a result
// is kept when that entropy is greater than or equal to entropy.
//
// The returned slice is always non-nil and preserves the input order.
func FilterResultsWithEntropy(results []Result, entropy float64) []Result {
	filteredResults := make([]Result, 0, len(results))
	for _, result := range results {
		// Only determinately unverified results are subject to the entropy
		// threshold; everything else must pass through untouched, otherwise
		// enabling the filter would silently discard verified findings.
		if result.Verified || result.VerificationError != nil {
			filteredResults = append(filteredResults, result)
			continue
		}

		var secret string
		if result.RawV2 != nil {
			secret = string(result.RawV2)
		} else {
			secret = string(result.Raw)
		}

		if StringShannonEntropy(secret) >= entropy {
			filteredResults = append(filteredResults, result)
		}
	}
	return filteredResults
}

View file

@ -3,7 +3,10 @@
package detectors
import "testing"
import (
_ "embed"
"testing"
)
func TestIsFalsePositive(t *testing.T) {
type args struct {
@ -40,3 +43,43 @@ func TestIsFalsePositive(t *testing.T) {
})
}
}
// TestStringShannonEntropy runs StringShannonEntropy over a table of inputs
// and compares each result with the expected entropy value.
func TestStringShannonEntropy(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  float64
	}{
		{
			name:  "entropy 1",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
			want:  0,
		},
		{
			name:  "entropy 2",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
			want:  0.22228483068568816,
		},
		{
			// Same symbol proportions as "entropy 2", at twice the length.
			name:  "entropy 3",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaab",
			want:  0.22228483068568816,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := StringShannonEntropy(tc.input)
			if got == tc.want {
				return
			}
			t.Errorf("StringShannonEntropy() = %v, want %v", got, tc.want)
		})
	}
}

View file

@ -61,6 +61,8 @@ type Engine struct {
// If there are multiple unverified results for the same chunk for the same detector,
// only the first one will be kept.
filterUnverified bool
// filterEntropy is used to filter out unverified results using Shannon entropy.
filterEntropy *float64
onlyVerified bool
printAvgDetectorTime bool
@ -128,6 +130,15 @@ func WithFilterUnverified(filter bool) EngineOption {
}
}
// WithFilterEntropy returns an EngineOption that sets the engine's Shannon
// entropy threshold for filtering unverified results. The threshold is only
// set when entropy is greater than zero; otherwise the option leaves the
// engine unchanged.
func WithFilterEntropy(entropy float64) EngineOption {
	return func(e *Engine) {
		// Written as a negated "greater than" so that NaN is also rejected.
		if !(entropy > 0) {
			return
		}
		e.filterEntropy = &entropy
	}
}
// WithOnlyVerified sets the onlyVerified flag on the engine. If set to true,
// the engine will only print verified results.
func WithOnlyVerified(onlyVerified bool) EngineOption {
@ -513,6 +524,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
if err != nil {
ctx.Logger().Error(err, "error scanning chunk")
}
if e.printAvgDetectorTime && len(results) > 0 {
elapsed := time.Since(start)
detectorName := results[0].DetectorType.String()
@ -532,6 +544,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
results = detectors.CleanResults(results)
}
if e.filterEntropy != nil {
results = detectors.FilterResultsWithEntropy(results, *e.filterEntropy)
}
for _, res := range results {
e.processResult(ctx, data, res)
}