Use heuristic to choose the most likely UTF-16 decoded string (#1381)

* Use heuristic to choose the most likely UTF-16 decoded string

* Assume ASCII and include valid BE and LE bytes

* Remove unused code

* Assume ASCII and return nil when not utf16

---------

Co-authored-by: bill-rich <bill.rich@gmail.com>
This commit is contained in:
Miccah 2023-06-13 19:00:40 -05:00 committed by GitHub
parent 3d395497cf
commit fb76eaf17b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 45 deletions

View file

@ -3,7 +3,6 @@ package decoders
import ( import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"fmt"
"unicode/utf8" "unicode/utf8"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" "github.com/trufflesecurity/trufflehog/v3/pkg/sources"
@ -17,6 +16,9 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
} }
if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil { if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil {
if len(utf16Data) == 0 {
return nil
}
chunk.Data = utf16Data chunk.Data = utf16Data
return chunk return chunk
} }
@ -26,43 +28,19 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
// utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice. // utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
func utf16ToUTF8(b []byte) ([]byte, error) { func utf16ToUTF8(b []byte) ([]byte, error) {
endianness, err := guessUTF16Endianness(b) var bufBE, bufLE bytes.Buffer
if err != nil { for i := 0; i < len(b)-1; i += 2 {
return nil, err if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
} if isValidByte(byte(r)) {
bufBE.WriteRune(r)
buf := &bytes.Buffer{} }
for i := 0; i < len(b); i += 2 { }
r := rune(endianness.Uint16(b[i:])) if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
if utf8.ValidRune(r) { if isValidByte(byte(r)) {
buf.WriteRune(r) bufLE.WriteRune(r)
}
} }
} }
return buf.Bytes(), nil return append(bufLE.Bytes(), bufBE.Bytes()...), nil
}
func guessUTF16Endianness(b []byte) (binary.ByteOrder, error) {
if len(b) < 2 || len(b)%2 != 0 {
return nil, fmt.Errorf("input length must be even and at least 2 bytes long")
}
var evenNullBytes, oddNullBytes int
for i := 0; i < len(b); i += 2 {
if b[i] == 0 {
oddNullBytes++
}
if b[i+1] == 0 {
evenNullBytes++
}
}
if evenNullBytes > oddNullBytes {
return binary.LittleEndian, nil
}
if oddNullBytes > evenNullBytes {
return binary.BigEndian, nil
}
return nil, fmt.Errorf("could not determine endianness")
} }

BIN
pkg/decoders/utf16_test.dll Normal file

Binary file not shown.

View file

@ -2,6 +2,7 @@ package decoders
import ( import (
"bytes" "bytes"
"os"
"testing" "testing"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" "github.com/trufflesecurity/trufflehog/v3/pkg/sources"
@ -35,8 +36,8 @@ func TestUTF16Decoder(t *testing.T) {
{ {
name: "Invalid UTF-16 input (odd length)", name: "Invalid UTF-16 input (odd length)",
input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0}, input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0},
expected: nil, expected: []byte("Hello Worl"),
expectNil: true, expectNil: false,
}, },
} }
@ -57,12 +58,32 @@ func TestUTF16Decoder(t *testing.T) {
return return
} }
if !bytes.Equal(decodedChunk.Data, tc.expected) { if !bytes.Equal(decodedChunk.Data, tc.expected) {
t.Errorf("Expected decoded data: %v, got: %v", tc.expected, decodedChunk.Data) t.Errorf("Expected decoded data: %s, got: %s", tc.expected, decodedChunk.Data)
} }
}) })
} }
} }
func TestDLL(t *testing.T) {
data, err := os.ReadFile("utf16_test.dll")
if err != nil {
t.Errorf("Failed to read test data: %v", err)
return
}
chunk := &sources.Chunk{Data: data}
decoder := &UTF16{}
decodedChunk := decoder.FromChunk(chunk)
if decodedChunk == nil {
t.Errorf("Expected chunk with data, got nil")
return
}
if !bytes.Contains(decodedChunk.Data, []byte("aws_secret_access_key")) {
t.Errorf("Expected chunk to have aws_secret_access_key")
return
}
}
func BenchmarkUtf16ToUtf8(b *testing.B) { func BenchmarkUtf16ToUtf8(b *testing.B) {
// Example UTF-16LE encoded data // Example UTF-16LE encoded data
data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0} data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0}

View file

@ -26,11 +26,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
// extracting contiguous portions of printable characters that we care // extracting contiguous portions of printable characters that we care
// about from some bytes // about from some bytes
func extractSubstrings(b []byte) []byte { func extractSubstrings(b []byte) []byte {
isValidByte := func(c byte) bool {
// https://www.rapidtables.com/code/text/ascii-table.html
// split on anything that is not ascii space through tilde
return c > 31 && c < 127
}
field := make([]byte, len(b)) field := make([]byte, len(b))
fieldLen := 0 fieldLen := 0
@ -53,3 +48,9 @@ func extractSubstrings(b []byte) []byte {
return buf.Bytes() return buf.Bytes()
} }
func isValidByte(c byte) bool {
// https://www.rapidtables.com/code/text/ascii-table.html
// split on anything that is not ascii space through tilde
return c > 31 && c < 127
}