mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 15:14:38 +00:00
[chore] Speedup IsKnownFalsePositive using sets (#2090)
Also check that the match is a valid UTF-8 string.
This commit is contained in:
parent
3b9ecaa704
commit
600903f391
2 changed files with 22 additions and 21 deletions
|
@ -5,6 +5,7 @@ import (
|
|||
"math"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var DefaultFalsePositives = []FalsePositive{"example", "xxxxxx", "aaaaaa", "abcde", "00000", "sample", "www"}
|
||||
|
@ -21,9 +22,9 @@ var wordList []byte
|
|||
var programmingBookWords []byte
|
||||
|
||||
type Wordlists struct {
|
||||
wordList []string
|
||||
badList []string
|
||||
programmingBookWords []string
|
||||
wordList map[string]struct{}
|
||||
badList map[string]struct{}
|
||||
programmingBookWords map[string]struct{}
|
||||
}
|
||||
|
||||
var FalsePositiveWordlists = Wordlists{
|
||||
|
@ -36,36 +37,29 @@ var FalsePositiveWordlists = Wordlists{
|
|||
// Currently that includes: no number, English word in key, or matches common example patterns.
|
||||
// Only the secret key material should be passed into this function
|
||||
func IsKnownFalsePositive(match string, falsePositives []FalsePositive, wordCheck bool) bool {
|
||||
|
||||
if !utf8.ValidString(match) {
|
||||
return true
|
||||
}
|
||||
lower := strings.ToLower(match)
|
||||
for _, fp := range falsePositives {
|
||||
if strings.Contains(strings.ToLower(match), string(fp)) {
|
||||
if strings.Contains(lower, string(fp)) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if wordCheck {
|
||||
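// check against common substring badlist (NOTE: the map lookup below matches only when the entire lowercased match equals a list entry, not when it merely contains one; the same applies to the wordList and programmingBookWords lookups that follow)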
|
||||
if hasDictWord(FalsePositiveWordlists.badList, match) {
|
||||
if _, ok := FalsePositiveWordlists.badList[lower]; ok {
|
||||
return true
|
||||
}
|
||||
|
||||
// check for dictionary word substrings
|
||||
if hasDictWord(FalsePositiveWordlists.wordList, match) {
|
||||
if _, ok := FalsePositiveWordlists.wordList[lower]; ok {
|
||||
return true
|
||||
}
|
||||
|
||||
// check for programming book token substrings
|
||||
if hasDictWord(FalsePositiveWordlists.programmingBookWords, match) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func hasDictWord(wordList []string, token string) bool {
|
||||
lower := strings.ToLower(token)
|
||||
for _, word := range wordList {
|
||||
if strings.Contains(lower, word) {
|
||||
if _, ok := FalsePositiveWordlists.programmingBookWords[lower]; ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
@ -82,11 +76,11 @@ func HasDigit(key string) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
func bytesToCleanWordList(data []byte) []string {
|
||||
words := []string{}
|
||||
func bytesToCleanWordList(data []byte) map[string]struct{} {
|
||||
words := make(map[string]struct{})
|
||||
for _, word := range strings.Split(string(data), "\n") {
|
||||
if strings.TrimSpace(word) != "" {
|
||||
words = append(words, strings.TrimSpace(strings.ToLower(word)))
|
||||
words[strings.TrimSpace(strings.ToLower(word))] = struct{}{}
|
||||
}
|
||||
}
|
||||
return words
|
||||
|
|
|
@ -90,3 +90,10 @@ func TestStringShannonEntropy(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkDefaultIsKnownFalsePositive times IsKnownFalsePositive when called
// with DefaultFalsePositives and wordCheck enabled on a fixed 24-character
// input, repeating the call b.N times. The return value is discarded.
func BenchmarkDefaultIsKnownFalsePositive(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
// Use a string that won't be found in any dictionary for the worst case check.
|
||||
IsKnownFalsePositive("aoeuaoeuaoeuaoeuaoeuaoeu", DefaultFalsePositives, true)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue