mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
feat(decoders): HTML entities
This commit is contained in:
parent
980d783ac9
commit
1180b27b44
3 changed files with 274 additions and 0 deletions
|
@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder {
|
|||
&Base64{},
|
||||
&UTF16{},
|
||||
&EscapedUnicode{},
|
||||
&HtmlEntity{},
|
||||
}
|
||||
}
|
||||
|
||||
|
|
168
pkg/decoders/html_entity.go
Normal file
168
pkg/decoders/html_entity.go
Normal file
|
@ -0,0 +1,168 @@
|
|||
package decoders
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
)
|
||||
|
||||
// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities.
|
||||
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
|
||||
type HtmlEntity struct{}
|
||||
|
||||
var _ Decoder = (*HtmlEntity)(nil)
|
||||
|
||||
func (d *HtmlEntity) FromChunk(chunk *sources.Chunk) *DecodableChunk {
|
||||
if chunk == nil || len(chunk.Data) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
matched := false
|
||||
if namedEntityPat.Match(chunk.Data) {
|
||||
matched = true
|
||||
chunk.Data = decodeNamedEntities(chunk.Data)
|
||||
}
|
||||
if decimalEntityPat.Match(chunk.Data) {
|
||||
matched = true
|
||||
chunk.Data = decodeHtmlDecimal(chunk.Data)
|
||||
}
|
||||
if hexEntityPat.Match(chunk.Data) {
|
||||
matched = true
|
||||
chunk.Data = decodeHtmlHex(chunk.Data)
|
||||
}
|
||||
|
||||
if matched {
|
||||
decodableChunk := &DecodableChunk{
|
||||
DecoderType: detectorspb.DecoderType_ESCAPED_UNICODE,
|
||||
Chunk: chunk,
|
||||
}
|
||||
return decodableChunk
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// decimalEntityPat matches decimal character references, e.g. `&#65;` = `A`.
// Up to seven digits are accepted, which is enough for the largest Unicode
// code point (1114111).
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,7});`)

// decodeHtmlDecimal returns a copy of input in which every decimal character
// reference has been replaced by the UTF-8 encoding of the code point it names.
//
// References that do not name a valid Unicode scalar value (surrogates, or
// anything above U+10FFFF) are left in the output exactly as they were written.
func decodeHtmlDecimal(input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]

		// Validate before touching the output: skipping here leaves lastIndex
		// alone, so the untouched entity is copied verbatim with the text that
		// follows it and nothing is emitted twice.
		num, err := strconv.Atoi(string(input[match[2]:match[3]]))
		if err != nil || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF) {
			continue
		}

		// Copy the part of the input that precedes the entity.
		decoded = append(decoded, input[lastIndex:startIndex]...)

		// Append the code point as UTF-8. A plain byte(num) conversion would
		// silently truncate values above 255 and emit invalid UTF-8 for
		// 128-255.
		decoded = append(decoded, string(rune(num))...)

		lastIndex = endIndex
	}

	// Append whatever follows the last decoded entity.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}
|
||||
|
||||
// hexEntityPat matches hexadecimal character references, e.g. `&#x41;` = `A`.
// Both the `x` marker and the digits are matched case-insensitively, and up to
// six digits are accepted so zero-padded forms such as `&#x0041;` and code
// points up to U+10FFFF are recognised.
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,6});`)

// decodeHtmlHex returns a copy of input in which every hexadecimal character
// reference has been replaced by the UTF-8 encoding of the code point it names.
//
// References that do not name a valid Unicode scalar value (surrogates, or
// anything above U+10FFFF) are left in the output exactly as they were written.
func decodeHtmlHex(input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]

		// Parse and validate before touching the output: skipping here leaves
		// lastIndex alone, so the untouched entity is copied verbatim with the
		// text that follows it and nothing is emitted twice.
		num, err := strconv.ParseInt(string(input[match[2]:match[3]]), 16, 32)
		if err != nil || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF) {
			continue
		}

		// Copy the part of the input that precedes the entity.
		decoded = append(decoded, input[lastIndex:startIndex]...)

		// Append the code point as UTF-8 rather than as a single raw byte,
		// which would be invalid UTF-8 for anything above 0x7F.
		decoded = append(decoded, string(rune(num))...)

		lastIndex = endIndex
	}

	// Append whatever follows the last decoded entity.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}
|
||||
|
||||
var (
|
||||
// https://www.compart.com/en/unicode/html
|
||||
namedEntityMap = map[string][]byte{
|
||||
"&tab;": []byte(" "),
|
||||
"&newline;": []byte("\n"),
|
||||
"!": []byte("!"),
|
||||
""": []byte(`"`),
|
||||
"#": []byte("#"),
|
||||
"$": []byte("$"),
|
||||
"%": []byte("%"),
|
||||
"&": []byte("&"),
|
||||
"'": []byte("'"),
|
||||
"(": []byte("("),
|
||||
")": []byte(")"),
|
||||
"*": []byte("*"),
|
||||
"+": []byte("+"),
|
||||
",": []byte(","),
|
||||
".": []byte("."),
|
||||
"/": []byte("/"),
|
||||
":": []byte(":"),
|
||||
";": []byte(";"),
|
||||
"<": []byte("<"),
|
||||
"=": []byte("="),
|
||||
">": []byte(">"),
|
||||
"?": []byte("?"),
|
||||
"@": []byte("@"),
|
||||
"[": []byte("["),
|
||||
"\": []byte("\\"),
|
||||
"]": []byte("]"),
|
||||
"&hat;": []byte("^"),
|
||||
"&underbar;": []byte("_"),
|
||||
"&diacriticalgrave;": []byte("`"),
|
||||
"{": []byte("{"),
|
||||
"&verticalline;": []byte("|"),
|
||||
"}": []byte("}"),
|
||||
"&nonbreakingspace;": []byte(" "),
|
||||
}
|
||||
namedEntityPat = func() *regexp.Regexp {
|
||||
return regexp.MustCompile(
|
||||
"(?i)(" + strings.Join(maps.Keys(namedEntityMap), "|") + ")")
|
||||
}()
|
||||
)
|
||||
|
||||
func decodeNamedEntities(input []byte) []byte {
|
||||
return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte {
|
||||
m := strings.ToLower(string(match))
|
||||
if replacement, ok := namedEntityMap[m]; ok {
|
||||
return replacement
|
||||
}
|
||||
return match
|
||||
})
|
||||
}
|
105
pkg/decoders/html_entity_test.go
Normal file
105
pkg/decoders/html_entity_test.go
Normal file
|
@ -0,0 +1,105 @@
|
|||
package decoders
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/kylelemons/godebug/pretty"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
)
|
||||
|
||||
func TestHtmlEntity_FromChunk(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
chunk *sources.Chunk
|
||||
want *sources.Chunk
|
||||
wantErr bool
|
||||
}{
|
||||
// 
|
||||
{
|
||||
name: "[decimal] all encoded",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte("token: "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""),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte("token: \"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\""),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "[decimal] mixed content",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte(`token: "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"`),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte(`token: "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"`),
|
||||
},
|
||||
},
|
||||
// 
|
||||
{
|
||||
name: "[hex] all encoded",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte("token: "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""),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte(`token: "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"`),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "[hex] mixed content",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte(`token: "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"`),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte(`token: "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"`),
|
||||
},
|
||||
},
|
||||
// "
|
||||
{
|
||||
name: "[named] all encoded",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte("	
!"#$%&'()*+,./:;<=>?@[\]^_`{|} "),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "[named] mixed content",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte("\t
!"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
|
||||
},
|
||||
want: &sources.Chunk{
|
||||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
|
||||
},
|
||||
},
|
||||
|
||||
// nothing
|
||||
{
|
||||
name: "no escaped",
|
||||
chunk: &sources.Chunk{
|
||||
Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312
|
||||
+//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`),
|
||||
},
|
||||
want: nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
d := &HtmlEntity{}
|
||||
got := d.FromChunk(tt.chunk)
|
||||
if tt.want != nil {
|
||||
if got == nil {
|
||||
t.Fatal("got nil, did not want nil")
|
||||
}
|
||||
if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" {
|
||||
t.Errorf("HtmlEntity.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff)
|
||||
}
|
||||
} else {
|
||||
if got != nil {
|
||||
t.Error("Expected nil chunk")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue