trufflehog/pkg/sources/chunker.go
Miccah 0d451aa806
Fix bug in chunker that surfaces with a flaky passed in io.Reader (#1838)
* Fix bug in chunker that surfaces with a flaky passed in io.Reader

The chunker was previously expecting the passed in io.Reader to always
successfully read a full buffer of data, however it's valid for a Reader
to return less data than requested. When this happens, the chunker would
peek the same data that it then reads in the next iteration of the loop,
causing the same data to be scanned twice.

Co-authored-by: ahrav <ahravdutta02@gmail.com>

* Fix EOF error check

* Use io.ReadFull in Chunker

---------

Co-authored-by: ahrav <ahravdutta02@gmail.com>
2023-10-02 09:38:23 -07:00

173 lines
4.2 KiB
Go

package sources
import (
"bufio"
"bytes"
"errors"
"io"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
)
const (
// ChunkSize is the maximum size of a chunk.
ChunkSize = 10 * 1024
// PeekSize is the size of the peek into the previous chunk.
PeekSize = 3 * 1024
// TotalChunkSize is the total size of a chunk with peek data.
TotalChunkSize = ChunkSize + PeekSize
)
// Chunker takes a chunk and splits it into chunks of ChunkSize.
func Chunker(originalChunk *Chunk) chan *Chunk {
chunkChan := make(chan *Chunk, 1)
go func() {
defer close(chunkChan)
if len(originalChunk.Data) <= TotalChunkSize {
chunkChan <- originalChunk
return
}
r := bytes.NewReader(originalChunk.Data)
reader := bufio.NewReaderSize(bufio.NewReader(r), ChunkSize)
for {
chunkBytes := make([]byte, TotalChunkSize)
chunk := *originalChunk
chunkBytes = chunkBytes[:ChunkSize]
n, err := io.ReadFull(reader, chunkBytes)
if n > 0 {
peekData, _ := reader.Peek(TotalChunkSize - n)
chunkBytes = append(chunkBytes[:n], peekData...)
chunk.Data = chunkBytes
chunkChan <- &chunk
}
if err != nil {
break
}
}
}()
return chunkChan
}
type chunkReaderConfig struct {
chunkSize int
totalSize int
peekSize int
}
// ConfigOption is a function that configures a chunker.
type ConfigOption func(*chunkReaderConfig)
// WithChunkSize sets the chunk size.
func WithChunkSize(size int) ConfigOption {
return func(c *chunkReaderConfig) {
c.chunkSize = size
}
}
// WithPeekSize sets the peek size.
func WithPeekSize(size int) ConfigOption {
return func(c *chunkReaderConfig) {
c.peekSize = size
}
}
// ChunkResult is the output unit of a ChunkReader,
// it contains the data and error of a chunk.
type ChunkResult struct {
data []byte
err error
}
// Bytes for a ChunkResult.
func (cr ChunkResult) Bytes() []byte {
return cr.data
}
// Error for a ChunkResult.
func (cr ChunkResult) Error() error {
return cr.err
}
// ChunkReader reads chunks from a reader and returns a channel of chunks and a channel of errors.
// The channel of chunks is closed when the reader is closed.
// This should be used whenever a large amount of data is read from a reader.
// Ex: reading attachments, archives, etc.
type ChunkReader func(ctx context.Context, reader io.Reader) <-chan ChunkResult
// NewChunkReader returns a ChunkReader with the given options.
func NewChunkReader(opts ...ConfigOption) ChunkReader {
config := applyOptions(opts)
return createReaderFn(config)
}
func applyOptions(opts []ConfigOption) *chunkReaderConfig {
// Set defaults.
config := &chunkReaderConfig{
chunkSize: ChunkSize, // default
peekSize: PeekSize, // default
}
for _, opt := range opts {
opt(config)
}
config.totalSize = config.chunkSize + config.peekSize
return config
}
func createReaderFn(config *chunkReaderConfig) ChunkReader {
return func(ctx context.Context, reader io.Reader) <-chan ChunkResult {
return readInChunks(ctx, reader, config)
}
}
func readInChunks(ctx context.Context, reader io.Reader, config *chunkReaderConfig) <-chan ChunkResult {
const channelSize = 1
chunkReader := bufio.NewReaderSize(reader, config.chunkSize)
chunkResultChan := make(chan ChunkResult, channelSize)
go func() {
defer close(chunkResultChan)
for {
chunkRes := ChunkResult{}
chunkBytes := make([]byte, config.totalSize)
chunkBytes = chunkBytes[:config.chunkSize]
n, err := io.ReadFull(chunkReader, chunkBytes)
if n > 0 {
peekData, _ := chunkReader.Peek(config.totalSize - n)
chunkBytes = append(chunkBytes[:n], peekData...)
chunkRes.data = chunkBytes
}
// If there is an error other than EOF, or if we have read some bytes, send the chunk.
// io.ReadFull will only return io.EOF when n == 0.
if isErrAndNotEOF(err) {
ctx.Logger().Error(err, "error reading chunk")
chunkRes.err = err
chunkResultChan <- chunkRes
} else if n > 0 {
chunkResultChan <- chunkRes
}
// Return on any type of error.
if err != nil {
return
}
}
}()
return chunkResultChan
}
// reportableErr checks whether the error is one we are interested in flagging.
func isErrAndNotEOF(err error) bool {
if err == nil {
return false
}
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
return false
}
return true
}