mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
Use heuristic to choose the most likely UTF-16 decoded string (#1381)
* Use heuristic to choose the most likely UTF-16 decoded string
* Assume ASCII and include valid BE and LE bytes
* Remove unused code
* Assume ASCII and return nil when not utf16

---------

Co-authored-by: bill-rich <bill.rich@gmail.com>
This commit is contained in:
parent
3d395497cf
commit
fb76eaf17b
4 changed files with 45 additions and 45 deletions
|
@ -3,7 +3,6 @@ package decoders
|
|||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
|
@ -17,6 +16,9 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
|
|||
}
|
||||
|
||||
if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil {
|
||||
if len(utf16Data) == 0 {
|
||||
return nil
|
||||
}
|
||||
chunk.Data = utf16Data
|
||||
return chunk
|
||||
}
|
||||
|
@ -26,43 +28,19 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
|
|||
|
||||
// utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
|
||||
func utf16ToUTF8(b []byte) ([]byte, error) {
|
||||
endianness, err := guessUTF16Endianness(b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
var bufBE, bufLE bytes.Buffer
|
||||
for i := 0; i < len(b)-1; i += 2 {
|
||||
if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
|
||||
if isValidByte(byte(r)) {
|
||||
bufBE.WriteRune(r)
|
||||
}
|
||||
}
|
||||
if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
|
||||
if isValidByte(byte(r)) {
|
||||
bufLE.WriteRune(r)
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
for i := 0; i < len(b); i += 2 {
|
||||
r := rune(endianness.Uint16(b[i:]))
|
||||
if utf8.ValidRune(r) {
|
||||
buf.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
func guessUTF16Endianness(b []byte) (binary.ByteOrder, error) {
|
||||
if len(b) < 2 || len(b)%2 != 0 {
|
||||
return nil, fmt.Errorf("input length must be even and at least 2 bytes long")
|
||||
}
|
||||
|
||||
var evenNullBytes, oddNullBytes int
|
||||
|
||||
for i := 0; i < len(b); i += 2 {
|
||||
if b[i] == 0 {
|
||||
oddNullBytes++
|
||||
}
|
||||
if b[i+1] == 0 {
|
||||
evenNullBytes++
|
||||
}
|
||||
}
|
||||
|
||||
if evenNullBytes > oddNullBytes {
|
||||
return binary.LittleEndian, nil
|
||||
}
|
||||
if oddNullBytes > evenNullBytes {
|
||||
return binary.BigEndian, nil
|
||||
}
|
||||
return nil, fmt.Errorf("could not determine endianness")
|
||||
return append(bufLE.Bytes(), bufBE.Bytes()...), nil
|
||||
}
|
||||
|
|
BIN
pkg/decoders/utf16_test.dll
Normal file
BIN
pkg/decoders/utf16_test.dll
Normal file
Binary file not shown.
|
@ -2,6 +2,7 @@ package decoders
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
|
@ -35,8 +36,8 @@ func TestUTF16Decoder(t *testing.T) {
|
|||
{
|
||||
name: "Invalid UTF-16 input (odd length)",
|
||||
input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0},
|
||||
expected: nil,
|
||||
expectNil: true,
|
||||
expected: []byte("Hello Worl"),
|
||||
expectNil: false,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -57,12 +58,32 @@ func TestUTF16Decoder(t *testing.T) {
|
|||
return
|
||||
}
|
||||
if !bytes.Equal(decodedChunk.Data, tc.expected) {
|
||||
t.Errorf("Expected decoded data: %v, got: %v", tc.expected, decodedChunk.Data)
|
||||
t.Errorf("Expected decoded data: %s, got: %s", tc.expected, decodedChunk.Data)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDLL(t *testing.T) {
|
||||
data, err := os.ReadFile("utf16_test.dll")
|
||||
if err != nil {
|
||||
t.Errorf("Failed to read test data: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
chunk := &sources.Chunk{Data: data}
|
||||
decoder := &UTF16{}
|
||||
decodedChunk := decoder.FromChunk(chunk)
|
||||
if decodedChunk == nil {
|
||||
t.Errorf("Expected chunk with data, got nil")
|
||||
return
|
||||
}
|
||||
if !bytes.Contains(decodedChunk.Data, []byte("aws_secret_access_key")) {
|
||||
t.Errorf("Expected chunk to have aws_secret_access_key")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkUtf16ToUtf8(b *testing.B) {
|
||||
// Example UTF-16LE encoded data
|
||||
data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0}
|
||||
|
|
|
@ -26,11 +26,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
|
|||
// extracting contiguous portions of printable characters that we care
|
||||
// about from some bytes
|
||||
func extractSubstrings(b []byte) []byte {
|
||||
isValidByte := func(c byte) bool {
|
||||
// https://www.rapidtables.com/code/text/ascii-table.html
|
||||
// split on anything that is not ascii space through tilde
|
||||
return c > 31 && c < 127
|
||||
}
|
||||
|
||||
field := make([]byte, len(b))
|
||||
fieldLen := 0
|
||||
|
@ -53,3 +48,9 @@ func extractSubstrings(b []byte) []byte {
|
|||
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// isValidByte reports whether c is a printable ASCII character, that is, a
// byte in the inclusive range from space (0x20) through tilde (0x7E).
func isValidByte(c byte) bool {
	// https://www.rapidtables.com/code/text/ascii-table.html
	// Control characters (below space), DEL (0x7F) and every byte above the
	// ASCII range are rejected.
	return c >= ' ' && c <= '~'
}
|
||||
|
|
Loading…
Reference in a new issue