trufflehog/pkg/decoders/escaped_unicode.go
Richard Gomez cbc0f0f48e
Create basic escaped unicode decoder (#2456)
* feat(decoders): basic escaped unicode

* wip: handle unicode notation
Experimenting with this.. might remove
2024-03-02 11:27:44 -08:00

127 lines
3.6 KiB
Go

package decoders
import (
"regexp"
"strconv"
"unicode/utf8"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
// EscapedUnicode is a decoder that rewrites escaped Unicode sequences found in
// chunk data (`U+XXXX` code points and `\uXXXX` escapes) as UTF-8 bytes.
// It carries no state.
type EscapedUnicode struct{}
// Compile-time assertion that *EscapedUnicode satisfies the Decoder interface.
var _ Decoder = (*EscapedUnicode)(nil)
// Patterns used to locate escaped Unicode sequences. Both capture exactly four
// hexadecimal digits as submatch 1.
// It might be advantageous to limit these to a subset of acceptable characters, similar to base64.
// https://dencode.com/en/string/unicode-escape
var (
// Standard Unicode notation: `U+` followed by four hex digits, e.g. `U+0041`.
// The leading `\b` requires a word boundary before the `U`. The trailing `.?`
// additionally consumes one following character (any rune except a newline)
// when present, so the match may extend one rune past the hex digits.
// https://unicode.org/standard/principles.html
codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)
// Common escape sequence used in programming languages: one or two
// backslashes, a case-insensitive `u`, then four hex digits, e.g. `\u0041`
// or `\\u0041`.
escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`)
)
// FromChunk decodes escaped Unicode sequences in chunk.Data.
//
// Code point notation (`U+XXXX`) is decoded first; `\uXXXX` escapes are then
// searched for in the already-updated data. Whenever a pattern matches,
// chunk.Data is reassigned to the decoded bytes, so the caller's chunk is
// modified.
//
// It returns nil when chunk is nil, when its data is empty, or when neither
// pattern matched. Otherwise it returns a DecodableChunk that wraps the same
// chunk pointer and is tagged with the ESCAPED_UNICODE decoder type.
func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}

	decoded := false
	if codePointPat.Match(chunk.Data) {
		chunk.Data = decodeCodePoint(chunk.Data)
		decoded = true
	}
	// Evaluated against the data as left by the previous step.
	if escapePat.Match(chunk.Data) {
		chunk.Data = decodeEscaped(chunk.Data)
		decoded = true
	}

	if !decoded {
		return nil
	}
	return &DecodableChunk{
		Chunk:       chunk,
		DecoderType: detectorspb.DecoderType_ESCAPED_UNICODE,
	}
}
const (
	// maxBytesPerRune is the size of the scratch buffer used when encoding a
	// rune: Unicode characters are encoded as 1 to 4 bytes per rune in UTF-8.
	maxBytesPerRune = 4

	// spaceChar is the ASCII space character as a single byte.
	spaceChar = byte(' ')
)
// decodeCodePoint replaces every standard-notation code point (`U+XXXX`, as
// matched by codePointPat) in input with its UTF-8 encoding.
//
// A single space directly after a code point is treated as a separator and is
// removed together with it, so `U+0041 U+0042` becomes `AB`. Any other
// character following the hex digits is preserved untouched.
//
// When input contains no code points it is returned as-is. Otherwise a newly
// allocated slice is returned and input itself is left unmodified.
func decodeCodePoint(input []byte) []byte {
	indices := codePointPat.FindAllSubmatchIndex(input, -1)
	if len(indices) == 0 {
		return input
	}

	var (
		// Every replacement is shorter than the text it replaces (at least
		// six bytes become at most three), so len(input) is enough capacity.
		decoded = make([]byte, 0, len(input))
		runeBuf [maxBytesPerRune]byte
		// copied is the offset in input up to which bytes have been emitted.
		copied int
	)
	for _, match := range indices {
		startIndex, endIndex := match[0], match[1]
		hexStartIndex, hexEndIndex := match[2], match[3]

		// The pattern may have consumed one character after the hex digits.
		// Only a space belongs to the sequence. The extra character can be a
		// multi-byte rune, so cut at the end of the hex digits instead of
		// stepping back a single byte, which would split that rune.
		if endIndex != hexEndIndex && input[endIndex-1] != spaceChar {
			endIndex = hexEndIndex
		}

		unicodeInt, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
		if err != nil {
			// Leave a sequence that cannot be parsed exactly as it was.
			continue
		}

		// Emit the untouched text before the match, then the decoded rune.
		decoded = append(decoded, input[copied:startIndex]...)
		n := utf8.EncodeRune(runeBuf[:], rune(unicodeInt))
		decoded = append(decoded, runeBuf[:n]...)
		copied = endIndex
	}
	return append(decoded, input[copied:]...)
}
// decodeEscaped replaces every escape sequence matched by escapePat (`\uXXXX`
// or `\\uXXXX`, with a case-insensitive `u`) in input with its UTF-8 encoding.
//
// Two directly adjacent escapes that form a UTF-16 surrogate pair (a high
// surrogate followed by a low surrogate, e.g. `\ud83d\ude00`) are combined into
// the single supplementary code point they represent. A surrogate that is not
// part of such a pair is encoded as U+FFFD by utf8.EncodeRune.
//
// When input contains no escapes it is returned as-is. Otherwise a newly
// allocated slice is returned and input itself is left unmodified.
func decodeEscaped(input []byte) []byte {
	indices := escapePat.FindAllSubmatchIndex(input, -1)
	if len(indices) == 0 {
		return input
	}

	// UTF-16 surrogate ranges and the first code point beyond the BMP.
	const (
		highSurrogateMin = 0xD800
		highSurrogateMax = 0xDBFF
		lowSurrogateMin  = 0xDC00
		lowSurrogateMax  = 0xDFFF
		supplementaryMin = 0x10000
	)

	var (
		// Every replacement is shorter than the text it replaces, so
		// len(input) is enough capacity.
		decoded = make([]byte, 0, len(input))
		runeBuf [maxBytesPerRune]byte
		// copied is the offset in input up to which bytes have been emitted.
		copied int
	)
	for i := 0; i < len(indices); i++ {
		match := indices[i]
		startIndex, endIndex := match[0], match[3]

		unicodeInt, err := strconv.ParseInt(string(input[match[2]:match[3]]), 16, 32)
		if err != nil {
			// Leave a sequence that cannot be parsed exactly as it was.
			continue
		}
		r := rune(unicodeInt)

		// A high surrogate immediately followed by an escaped low surrogate
		// encodes one code point above U+FFFF; decode both escapes together.
		if r >= highSurrogateMin && r <= highSurrogateMax &&
			i+1 < len(indices) && indices[i+1][0] == endIndex {
			next := indices[i+1]
			low, lowErr := strconv.ParseInt(string(input[next[2]:next[3]]), 16, 32)
			if lowErr == nil && low >= lowSurrogateMin && low <= lowSurrogateMax {
				r = supplementaryMin + ((r - highSurrogateMin) << 10) + (rune(low) - lowSurrogateMin)
				endIndex = next[3]
				i++ // The low surrogate has been consumed as well.
			}
		}

		// Emit the untouched text before the match, then the decoded rune.
		decoded = append(decoded, input[copied:startIndex]...)
		n := utf8.EncodeRune(runeBuf[:], r)
		decoded = append(decoded, runeBuf[:n]...)
		copied = endIndex
	}
	return append(decoded, input[copied:]...)
}