2022-10-24 20:57:27 +00:00
|
|
|
package sources
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"errors"
|
|
|
|
"io"
|
2023-08-03 13:27:33 +00:00
|
|
|
|
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
|
2022-10-24 20:57:27 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Default sizing used when no ConfigOption overrides it.
const (
	// ChunkSize is the default number of bytes read into one chunk (10 KiB),
	// not counting peek data.
	ChunkSize = 10 << 10
	// PeekSize is the default number of bytes (3 KiB) peeked from the data
	// that follows a chunk.
	PeekSize = 3 << 10
	// TotalChunkSize is the default size of a chunk together with its peek data.
	TotalChunkSize = ChunkSize + PeekSize
)
|
|
|
|
|
2023-08-03 13:27:33 +00:00
|
|
|
// chunkReaderConfig holds the sizing parameters of a chunk reader.
type chunkReaderConfig struct {
	// chunkSize is the number of bytes read per chunk, before peek data.
	chunkSize int
	// peekSize is the number of bytes peeked past the end of each chunk.
	peekSize int
	// totalSize is chunkSize + peekSize; applyOptions computes it once all
	// options have been applied.
	totalSize int
}
|
|
|
|
|
|
|
|
// ConfigOption is a function that configures a chunker.
|
|
|
|
type ConfigOption func(*chunkReaderConfig)
|
|
|
|
|
|
|
|
// WithChunkSize sets the chunk size.
|
|
|
|
func WithChunkSize(size int) ConfigOption {
|
|
|
|
return func(c *chunkReaderConfig) {
|
|
|
|
c.chunkSize = size
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// WithPeekSize sets the peek size.
|
|
|
|
func WithPeekSize(size int) ConfigOption {
|
|
|
|
return func(c *chunkReaderConfig) {
|
|
|
|
c.peekSize = size
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-07 19:55:28 +00:00
|
|
|
// ChunkResult is what a ChunkReader emits for every chunk: the bytes that
// were read and the error, if any, hit while reading them.
type ChunkResult struct {
	data []byte
	err  error
}

// Bytes returns the chunk's data.
func (r ChunkResult) Bytes() []byte { return r.data }

// Error returns the error recorded for the chunk, or nil if there was none.
func (r ChunkResult) Error() error { return r.err }
|
|
|
|
|
2023-08-03 13:27:33 +00:00
|
|
|
// ChunkReader reads chunks from a reader and returns a channel of chunks and a channel of errors.
|
|
|
|
// The channel of chunks is closed when the reader is closed.
|
|
|
|
// This should be used whenever a large amount of data is read from a reader.
|
|
|
|
// Ex: reading attachments, archives, etc.
|
2023-08-07 19:55:28 +00:00
|
|
|
type ChunkReader func(ctx context.Context, reader io.Reader) <-chan ChunkResult
|
2023-08-03 13:27:33 +00:00
|
|
|
|
|
|
|
// NewChunkReader returns a ChunkReader with the given options.
|
|
|
|
func NewChunkReader(opts ...ConfigOption) ChunkReader {
|
|
|
|
config := applyOptions(opts)
|
|
|
|
return createReaderFn(config)
|
|
|
|
}
|
|
|
|
|
|
|
|
func applyOptions(opts []ConfigOption) *chunkReaderConfig {
|
|
|
|
// Set defaults.
|
|
|
|
config := &chunkReaderConfig{
|
|
|
|
chunkSize: ChunkSize, // default
|
|
|
|
peekSize: PeekSize, // default
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, opt := range opts {
|
|
|
|
opt(config)
|
|
|
|
}
|
|
|
|
|
|
|
|
config.totalSize = config.chunkSize + config.peekSize
|
|
|
|
|
|
|
|
return config
|
|
|
|
}
|
|
|
|
|
|
|
|
func createReaderFn(config *chunkReaderConfig) ChunkReader {
|
2023-08-07 19:55:28 +00:00
|
|
|
return func(ctx context.Context, reader io.Reader) <-chan ChunkResult {
|
2023-08-03 13:27:33 +00:00
|
|
|
return readInChunks(ctx, reader, config)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-07 19:55:28 +00:00
|
|
|
func readInChunks(ctx context.Context, reader io.Reader, config *chunkReaderConfig) <-chan ChunkResult {
|
2024-07-17 20:52:18 +00:00
|
|
|
const channelSize = 64
|
2023-08-03 13:27:33 +00:00
|
|
|
chunkReader := bufio.NewReaderSize(reader, config.chunkSize)
|
2023-08-07 19:55:28 +00:00
|
|
|
chunkResultChan := make(chan ChunkResult, channelSize)
|
2023-08-03 13:27:33 +00:00
|
|
|
|
|
|
|
go func() {
|
2023-08-07 19:55:28 +00:00
|
|
|
defer close(chunkResultChan)
|
2023-08-03 13:27:33 +00:00
|
|
|
|
|
|
|
for {
|
2023-08-07 19:55:28 +00:00
|
|
|
chunkRes := ChunkResult{}
|
2023-08-03 13:27:33 +00:00
|
|
|
chunkBytes := make([]byte, config.totalSize)
|
|
|
|
chunkBytes = chunkBytes[:config.chunkSize]
|
2023-10-02 16:38:23 +00:00
|
|
|
n, err := io.ReadFull(chunkReader, chunkBytes)
|
2023-08-03 13:27:33 +00:00
|
|
|
if n > 0 {
|
|
|
|
peekData, _ := chunkReader.Peek(config.totalSize - n)
|
|
|
|
chunkBytes = append(chunkBytes[:n], peekData...)
|
2023-08-07 19:55:28 +00:00
|
|
|
chunkRes.data = chunkBytes
|
2022-10-24 20:57:27 +00:00
|
|
|
}
|
2023-07-25 02:30:29 +00:00
|
|
|
|
2023-08-07 19:55:28 +00:00
|
|
|
// If there is an error other than EOF, or if we have read some bytes, send the chunk.
|
2023-10-02 16:38:23 +00:00
|
|
|
// io.ReadFull will only return io.EOF when n == 0.
|
2023-12-21 05:09:05 +00:00
|
|
|
switch {
|
|
|
|
case isErrAndNotEOF(err):
|
2023-10-02 16:38:23 +00:00
|
|
|
ctx.Logger().Error(err, "error reading chunk")
|
|
|
|
chunkRes.err = err
|
2023-12-21 05:09:05 +00:00
|
|
|
case n > 0:
|
|
|
|
chunkRes.err = nil
|
|
|
|
default:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
case chunkResultChan <- chunkRes:
|
2023-08-07 19:55:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if err != nil {
|
2023-08-03 13:27:33 +00:00
|
|
|
return
|
|
|
|
}
|
2022-10-24 20:57:27 +00:00
|
|
|
}
|
|
|
|
}()
|
2023-08-07 19:55:28 +00:00
|
|
|
return chunkResultChan
|
2022-10-24 20:57:27 +00:00
|
|
|
}
|
2023-10-02 16:38:23 +00:00
|
|
|
|
|
|
|
// isErrAndNotEOF reports whether err is worth flagging: it is non-nil and is
// neither io.EOF nor io.ErrUnexpectedEOF (wrapped forms included).
func isErrAndNotEOF(err error) bool {
	switch {
	case err == nil:
		return false
	case errors.Is(err, io.EOF), errors.Is(err, io.ErrUnexpectedEOF):
		return false
	default:
		return true
	}
}
|