Use heuristic to choose the most likely UTF-16 decoded string (#1381)

* Use heuristic to choose the most likely UTF-16 decoded string

* Assume ASCII and include valid BE and LE bytes

* Remove unused code

* Assume ASCII and return nil when not utf16

---------

Co-authored-by: bill-rich <bill.rich@gmail.com>
This commit is contained in:
Miccah 2023-06-13 19:00:40 -05:00 committed by GitHub
parent 3d395497cf
commit fb76eaf17b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 45 deletions

View file

@ -3,7 +3,6 @@ package decoders
import (
"bytes"
"encoding/binary"
"fmt"
"unicode/utf8"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
@ -17,6 +16,9 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
}
if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil {
if len(utf16Data) == 0 {
return nil
}
chunk.Data = utf16Data
return chunk
}
@ -26,43 +28,19 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
// utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
func utf16ToUTF8(b []byte) ([]byte, error) {
endianness, err := guessUTF16Endianness(b)
if err != nil {
return nil, err
}
buf := &bytes.Buffer{}
for i := 0; i < len(b); i += 2 {
r := rune(endianness.Uint16(b[i:]))
if utf8.ValidRune(r) {
buf.WriteRune(r)
var bufBE, bufLE bytes.Buffer
for i := 0; i < len(b)-1; i += 2 {
if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
bufBE.WriteRune(r)
}
}
if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
bufLE.WriteRune(r)
}
}
}
return buf.Bytes(), nil
}
func guessUTF16Endianness(b []byte) (binary.ByteOrder, error) {
if len(b) < 2 || len(b)%2 != 0 {
return nil, fmt.Errorf("input length must be even and at least 2 bytes long")
}
var evenNullBytes, oddNullBytes int
for i := 0; i < len(b); i += 2 {
if b[i] == 0 {
oddNullBytes++
}
if b[i+1] == 0 {
evenNullBytes++
}
}
if evenNullBytes > oddNullBytes {
return binary.LittleEndian, nil
}
if oddNullBytes > evenNullBytes {
return binary.BigEndian, nil
}
return nil, fmt.Errorf("could not determine endianness")
return append(bufLE.Bytes(), bufBE.Bytes()...), nil
}

BIN
pkg/decoders/utf16_test.dll Normal file

Binary file not shown.

View file

@ -2,6 +2,7 @@ package decoders
import (
"bytes"
"os"
"testing"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
@ -35,8 +36,8 @@ func TestUTF16Decoder(t *testing.T) {
{
name: "Invalid UTF-16 input (odd length)",
input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0},
expected: nil,
expectNil: true,
expected: []byte("Hello Worl"),
expectNil: false,
},
}
@ -57,12 +58,32 @@ func TestUTF16Decoder(t *testing.T) {
return
}
if !bytes.Equal(decodedChunk.Data, tc.expected) {
t.Errorf("Expected decoded data: %v, got: %v", tc.expected, decodedChunk.Data)
t.Errorf("Expected decoded data: %s, got: %s", tc.expected, decodedChunk.Data)
}
})
}
}
func TestDLL(t *testing.T) {
data, err := os.ReadFile("utf16_test.dll")
if err != nil {
t.Errorf("Failed to read test data: %v", err)
return
}
chunk := &sources.Chunk{Data: data}
decoder := &UTF16{}
decodedChunk := decoder.FromChunk(chunk)
if decodedChunk == nil {
t.Errorf("Expected chunk with data, got nil")
return
}
if !bytes.Contains(decodedChunk.Data, []byte("aws_secret_access_key")) {
t.Errorf("Expected chunk to have aws_secret_access_key")
return
}
}
func BenchmarkUtf16ToUtf8(b *testing.B) {
// Example UTF-16LE encoded data
data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0}

View file

@ -26,11 +26,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
// extracting contiguous portions of printable characters that we care
// about from some bytes
func extractSubstrings(b []byte) []byte {
isValidByte := func(c byte) bool {
// https://www.rapidtables.com/code/text/ascii-table.html
// split on anything that is not ascii space through tilde
return c > 31 && c < 127
}
field := make([]byte, len(b))
fieldLen := 0
@ -53,3 +48,9 @@ func extractSubstrings(b []byte) []byte {
return buf.Bytes()
}
// isValidByte reports whether c is a printable ASCII character, i.e. one in
// the inclusive range from space (' ') through tilde ('~').
func isValidByte(c byte) bool {
	// https://www.rapidtables.com/code/text/ascii-table.html
	// Bytes outside the ASCII space-through-tilde range are rejected.
	return c >= ' ' && c <= '~'
}