mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
[perf] - Optimize MIME Type Detection to Reduce Allocations (#3048)
* Streaming file handling. * cleanup * update tests * lint * defer close on input io.ReadCloser's * remove redundant mime type detection * Reduce allocations * fix test * update comment * fix seek bug * address comment * undo
This commit is contained in:
parent
f865482025
commit
42b3a9d999
5 changed files with 159 additions and 58 deletions
|
@ -83,7 +83,12 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, archi
|
|||
fileSize := arEntry.Size
|
||||
fileCtx := logContext.WithValues(ctx, "filename", arEntry.Name, "size", fileSize)
|
||||
|
||||
if err := h.handleNonArchiveContent(fileCtx, arEntry.Data, archiveChan); err != nil {
|
||||
rdr, err := newMimeTypeReader(arEntry.Data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating mime-type reader: %w", err)
|
||||
}
|
||||
|
||||
if err := h.handleNonArchiveContent(fileCtx, rdr, archiveChan); err != nil {
|
||||
fileCtx.Logger().Error(err, "error handling archive content in AR")
|
||||
h.metrics.incErrors()
|
||||
}
|
||||
|
|
|
@ -87,8 +87,11 @@ func (h *archiveHandler) openArchive(ctx logContext.Context, depth int, reader f
|
|||
return ErrMaxDepthReached
|
||||
}
|
||||
|
||||
if reader.format == nil && depth > 0 {
|
||||
return h.handleNonArchiveContent(ctx, reader, archiveChan)
|
||||
if reader.format == nil {
|
||||
if depth > 0 {
|
||||
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), archiveChan)
|
||||
}
|
||||
return fmt.Errorf("unknown archive format")
|
||||
}
|
||||
|
||||
switch archive := reader.format.(type) {
|
||||
|
@ -117,7 +120,7 @@ func (h *archiveHandler) openArchive(ctx logContext.Context, depth int, reader f
|
|||
}
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("unknown archive type: %s", reader.mimeType)
|
||||
return fmt.Errorf("unknown archive type: %s", reader.format.Name())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,15 +1,10 @@
|
|||
package handlers
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
||||
logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
|
||||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
|
||||
|
@ -49,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) (c
|
|||
h.metrics.incFilesProcessed()
|
||||
}()
|
||||
|
||||
if err = h.handleNonArchiveContent(ctx, input, dataChan); err != nil {
|
||||
if err = h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataChan); err != nil {
|
||||
ctx.Logger().Error(err, "error handling non-archive content.")
|
||||
}
|
||||
}()
|
||||
|
@ -76,26 +71,17 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(start time.Time, err erro
|
|||
// on the type, particularly for binary files. It manages reading file chunks and writing them to the archive channel,
|
||||
// effectively collecting the final bytes for further processing. This function is a key component in ensuring that all
|
||||
// file content, regardless of being an archive or not, is handled appropriately.
|
||||
func (h *defaultHandler) handleNonArchiveContent(ctx logContext.Context, reader io.Reader, archiveChan chan []byte) error {
|
||||
bufReader := bufio.NewReaderSize(reader, defaultBufferSize)
|
||||
// A buffer of 512 bytes is used since many file formats store their magic numbers within the first 512 bytes.
|
||||
// If fewer bytes are read, MIME type detection may still succeed.
|
||||
buffer, err := bufReader.Peek(defaultBufferSize)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return fmt.Errorf("unable to read file for MIME type detection: %w", err)
|
||||
}
|
||||
func (h *defaultHandler) handleNonArchiveContent(ctx logContext.Context, reader mimeTypeReader, archiveChan chan []byte) error {
|
||||
mimeExt := reader.mimeExt
|
||||
|
||||
mime := mimetype.Detect(buffer)
|
||||
mimeT := mimeType(mime.String())
|
||||
|
||||
if common.SkipFile(mime.Extension()) || common.IsBinary(mime.Extension()) {
|
||||
ctx.Logger().V(5).Info("skipping file", "ext", mimeT)
|
||||
if common.SkipFile(mimeExt) || common.IsBinary(mimeExt) {
|
||||
ctx.Logger().V(5).Info("skipping file", "ext", mimeExt)
|
||||
h.metrics.incFilesSkipped()
|
||||
return nil
|
||||
}
|
||||
|
||||
chunkReader := sources.NewChunkReader()
|
||||
for data := range chunkReader(ctx, bufReader) {
|
||||
for data := range chunkReader(ctx, reader) {
|
||||
if err := data.Error(); err != nil {
|
||||
ctx.Logger().Error(err, "error reading chunk")
|
||||
h.metrics.incErrors()
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package handlers
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
|
@ -30,7 +31,7 @@ import (
|
|||
// random access to the file content.
|
||||
type fileReader struct {
|
||||
format archiver.Format
|
||||
mimeType mimeType
|
||||
mime *mimetype.MIME
|
||||
isGenericArchive bool
|
||||
|
||||
*iobuf.BufferedReadSeeker
|
||||
|
@ -38,50 +39,92 @@ type fileReader struct {
|
|||
|
||||
var ErrEmptyReader = errors.New("reader is empty")
|
||||
|
||||
// mimeTypeReader wraps an io.Reader with MIME type information.
|
||||
// This type is used to pass content through the processing pipeline
|
||||
// while carrying its detected MIME type, avoiding redundant type detection.
|
||||
type mimeTypeReader struct {
|
||||
// mimeExt is the file extension reported for the detected MIME type (populated from mime.Extension()).
// handleNonArchiveContent uses it to decide whether the content is skipped.
mimeExt string
|
||||
// mimeName is the detected MIME type string (populated from mime.String()).
mimeName mimeType
|
||||
// Reader is the content stream: either a fileReader's BufferedReadSeeker or the
// bufio.Reader that newMimeTypeReader peeked into for detection.
io.Reader
|
||||
}
|
||||
|
||||
// newMimeTypeReaderFromFileReader creates a new mimeTypeReader from a fileReader.
// It copies the extension and MIME string from the MIME value already stored on the
// fileReader (r.mime) instead of running detection again, and exposes the fileReader's
// BufferedReadSeeker as the content stream.
// NOTE(review): r.mime is a pointer and is dereferenced here without a nil check —
// presumably every fileReader reaching this point was built by newFileReader, which sets it; confirm.
|
||||
func newMimeTypeReaderFromFileReader(r fileReader) mimeTypeReader {
|
||||
return mimeTypeReader{
|
||||
mimeExt: r.mime.Extension(),
|
||||
mimeName: mimeType(r.mime.String()),
|
||||
Reader: r.BufferedReadSeeker,
|
||||
}
|
||||
}
|
||||
|
||||
// newMimeTypeReader creates a new mimeTypeReader from an io.Reader.
|
||||
// It uses a bufio.Reader to perform MIME type detection on the input reader
|
||||
// by peeking at up to defaultMinBufferSize (3072) bytes, so the inspected bytes remain readable.
|
||||
// The returned mimeTypeReader wraps that bufio.Reader rather than r itself; callers must read
// through the returned value, because the peeked bytes have already been pulled out of r.
|
||||
// This function is particularly useful for specialized archive handlers
|
||||
// that need to pass extracted content to the default handler together with its detected MIME type.
// It returns an error only when the peek fails with something other than io.EOF.
|
||||
func newMimeTypeReader(r io.Reader) (mimeTypeReader, error) {
|
||||
const defaultMinBufferSize = 3072
|
||||
bufReader := bufio.NewReaderSize(r, defaultMinBufferSize)
|
||||
// Peek at up to defaultMinBufferSize bytes for detection; Peek does not advance the reader.
// NOTE(review): 3072 presumably mirrors the mimetype library's default read limit — confirm.
|
||||
// Inputs shorter than that make Peek return io.EOF together with the bytes it did read;
// that case is tolerated below and detection runs on the shorter slice.
|
||||
buffer, err := bufReader.Peek(defaultMinBufferSize)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return mimeTypeReader{}, fmt.Errorf("unable to read file for MIME type detection: %w", err)
|
||||
}
|
||||
|
||||
mime := mimetype.Detect(buffer)
|
||||
|
||||
return mimeTypeReader{mimeExt: mime.Extension(), mimeName: mimeType(mime.String()), Reader: bufReader}, nil
|
||||
}
|
||||
|
||||
// newFileReader creates a fileReader from an io.Reader, optionally using BufferedFileWriter for certain formats.
|
||||
func newFileReader(r io.Reader) (fileReader, error) {
|
||||
var reader fileReader
|
||||
var fReader fileReader
|
||||
|
||||
bufReader := iobuf.NewBufferedReaderSeeker(r)
|
||||
fReader.BufferedReadSeeker = iobuf.NewBufferedReaderSeeker(r)
|
||||
|
||||
mime, err := mimetype.DetectReader(bufReader)
|
||||
// Disable buffering after initial reads.
|
||||
// This optimization ensures we don't continue writing to the buffer after the initial reads.
|
||||
defer fReader.DisableBuffering()
|
||||
|
||||
mime, err := mimetype.DetectReader(fReader)
|
||||
if err != nil {
|
||||
return reader, fmt.Errorf("unable to detect MIME type: %w", err)
|
||||
return fReader, fmt.Errorf("unable to detect MIME type: %w", err)
|
||||
}
|
||||
reader.mimeType = mimeType(mime.String())
|
||||
fReader.mime = mime
|
||||
|
||||
// Reset the reader to the beginning because DetectReader consumes the reader.
|
||||
if _, err := bufReader.Seek(0, io.SeekStart); err != nil {
|
||||
return reader, fmt.Errorf("error resetting reader after MIME detection: %w", err)
|
||||
if _, err := fReader.Seek(0, io.SeekStart); err != nil {
|
||||
return fReader, fmt.Errorf("error resetting reader after MIME detection: %w", err)
|
||||
}
|
||||
|
||||
format, _, err := archiver.Identify("", bufReader)
|
||||
// If a MIME type is known to not be an archive type, we might as well return here rather than
|
||||
// paying the I/O penalty of an archiver.Identify() call that won't identify anything.
|
||||
if _, ok := skipArchiverMimeTypes[mimeType(mime.String())]; ok {
|
||||
return fReader, nil
|
||||
}
|
||||
|
||||
format, _, err := archiver.Identify("", fReader)
|
||||
switch {
|
||||
case err == nil:
|
||||
reader.isGenericArchive = true
|
||||
reader.mimeType = mimeType(format.Name())
|
||||
reader.format = format
|
||||
fReader.isGenericArchive = true
|
||||
fReader.format = format
|
||||
|
||||
case errors.Is(err, archiver.ErrNoMatch):
|
||||
// Not an archive handled by archiver.
|
||||
// Continue with the default reader.
|
||||
default:
|
||||
return reader, fmt.Errorf("error identifying archive: %w", err)
|
||||
return fReader, fmt.Errorf("error identifying archive: %w", err)
|
||||
}
|
||||
|
||||
// Reset the reader to the beginning again to allow the handler to read from the start.
|
||||
// This is necessary because Identify consumes the reader.
|
||||
if _, err := bufReader.Seek(0, io.SeekStart); err != nil {
|
||||
return reader, fmt.Errorf("error resetting reader after archive identification: %w", err)
|
||||
if _, err := fReader.Seek(0, io.SeekStart); err != nil {
|
||||
return fReader, fmt.Errorf("error resetting reader after archive identification: %w", err)
|
||||
}
|
||||
|
||||
// Disable buffering after initial reads.
|
||||
// This optimization ensures we don't continue writing to the buffer after the initial reads.
|
||||
bufReader.DisableBuffering()
|
||||
|
||||
reader.BufferedReadSeeker = bufReader
|
||||
|
||||
return reader, nil
|
||||
return fReader, nil
|
||||
}
|
||||
|
||||
// FileHandler represents a handler for files.
|
||||
|
@ -123,14 +166,72 @@ const (
|
|||
type mimeType string
|
||||
|
||||
const (
|
||||
rpmMime mimeType = "application/x-rpm"
|
||||
cpioMime mimeType = "application/cpio"
|
||||
unixArMime mimeType = "application/x-unix-archive"
|
||||
arMime mimeType = "application/x-archive"
|
||||
debMime mimeType = "application/vnd.debian.binary-package"
|
||||
rpmMime mimeType = "application/x-rpm"
|
||||
cpioMime mimeType = "application/cpio"
|
||||
unixArMime mimeType = "application/x-unix-archive"
|
||||
arMime mimeType = "application/x-archive"
|
||||
debMime mimeType = "application/vnd.debian.binary-package"
|
||||
textMime mimeType = "text/plain; charset=utf-8"
|
||||
xmlMime mimeType = "text/xml"
|
||||
jsonMime mimeType = "application/json"
|
||||
csvMime mimeType = "text/csv"
|
||||
tsvMime mimeType = "text/tab-separated-values"
|
||||
geoJSONMine mimeType = "application/vnd.geo+json"
|
||||
ndjsonMime mimeType = "application/x-ndjson"
|
||||
htmlMime mimeType = "text/html"
|
||||
phpTextMime mimeType = "text/x-php"
|
||||
rtfTextMime mimeType = "text/rtf"
|
||||
jsAppMime mimeType = "application/javascript"
|
||||
jsTextMime mimeType = "text/javascript"
|
||||
jsMime mimeType = "application/x-javascript"
|
||||
srtMime mimeType = "application/x-subrip"
|
||||
srtXMime mimeType = "application/x-srt"
|
||||
srtTextMime mimeType = "text/x-srt"
|
||||
vttMime mimeType = "text/vtt"
|
||||
luaMime mimeType = "text/x-lua"
|
||||
perlMime mimeType = "text/x-perl"
|
||||
pythonMime mimeType = "text/x-python"
|
||||
pyAppMime mimeType = "application/x-python"
|
||||
pyScriptMime mimeType = "application/x-script.python"
|
||||
tclTextMime mimeType = "text/x-tcl"
|
||||
tclMime mimeType = "application/x-tcl"
|
||||
)
|
||||
|
||||
// selectHandler dynamically selects and configures a FileHandler based on the provided fileReader.
|
||||
// skipArchiverMimeTypes is a set of MIME types that should bypass archiver library processing because they are either
|
||||
// text-based or archives not supported by the library.
|
||||
// Membership is tested with the exact MIME string, so each key must match the detector's output verbatim.
var skipArchiverMimeTypes = map[mimeType]struct{}{
|
||||
// Archive formats routed to dedicated handlers (ar/deb and rpm/cpio) instead of the archiver library.
arMime: {},
|
||||
unixArMime: {},
|
||||
debMime: {},
|
||||
rpmMime: {},
|
||||
cpioMime: {},
|
||||
// Text-based formats.
// NOTE(review): textMime carries a "; charset=utf-8" suffix, so text/plain reported with any
// other charset will not match this entry — confirm that is intended.
textMime: {},
|
||||
xmlMime: {},
|
||||
jsonMime: {},
|
||||
csvMime: {},
|
||||
tsvMime: {},
|
||||
// NOTE(review): "geoJSONMine" looks like a typo for "geoJSONMime"; renaming requires touching the const declaration too.
geoJSONMine: {},
|
||||
ndjsonMime: {},
|
||||
htmlMime: {},
|
||||
phpTextMime: {},
|
||||
rtfTextMime: {},
|
||||
jsAppMime: {},
|
||||
jsTextMime: {},
|
||||
jsMime: {},
|
||||
srtMime: {},
|
||||
srtXMime: {},
|
||||
srtTextMime: {},
|
||||
vttMime: {},
|
||||
luaMime: {},
|
||||
perlMime: {},
|
||||
pythonMime: {},
|
||||
pyAppMime: {},
|
||||
pyScriptMime: {},
|
||||
tclTextMime: {},
|
||||
tclMime: {},
|
||||
}
|
||||
|
||||
// selectHandler dynamically selects and configures a FileHandler based on the provided MIME type and archive flag.
|
||||
// mimeT is the detected MIME type of the file, and isGenericArchive reports whether the archiver library identified it as an archive.
|
||||
// This method uses specialized handlers for specific file types:
|
||||
// - arHandler is used for Unix archives and Debian packages ('arMime', 'unixArMime', and 'debMime').
|
||||
|
@ -138,14 +239,14 @@ const (
|
|||
// - archiveHandler is used for common archive formats supported by the archiver library (.zip, .tar, .gz, etc.).
|
||||
// - defaultHandler is used for non-archive files.
|
||||
// The selected handler is then returned, ready to handle the file according to its specific format and requirements.
|
||||
func selectHandler(file fileReader) FileHandler {
|
||||
switch file.mimeType {
|
||||
func selectHandler(mimeT mimeType, isGenericArchive bool) FileHandler {
|
||||
switch mimeT {
|
||||
case arMime, unixArMime, debMime:
|
||||
return newARHandler()
|
||||
case rpmMime, cpioMime:
|
||||
return newRPMHandler()
|
||||
default:
|
||||
if file.isGenericArchive {
|
||||
if isGenericArchive {
|
||||
return newArchiveHandler()
|
||||
}
|
||||
return newDefaultHandler(defaultHandlerType)
|
||||
|
@ -181,13 +282,14 @@ func HandleFile(
|
|||
return fmt.Errorf("error creating custom reader: %w", err)
|
||||
}
|
||||
|
||||
mimeT := mimeType(rdr.mime.String())
|
||||
config := newFileHandlingConfig(options...)
|
||||
if config.skipArchives && rdr.isGenericArchive {
|
||||
ctx.Logger().V(5).Info("skipping archive file", "mime", rdr.mimeType)
|
||||
ctx.Logger().V(5).Info("skipping archive file", "mime", mimeT)
|
||||
return nil
|
||||
}
|
||||
|
||||
handler := selectHandler(rdr)
|
||||
handler := selectHandler(mimeT, rdr.isGenericArchive)
|
||||
archiveChan, err := handler.HandleFile(ctx, rdr) // Delegate to the specific handler to process the file.
|
||||
if err != nil {
|
||||
return fmt.Errorf("error handling file: %w", err)
|
||||
|
|
|
@ -90,7 +90,12 @@ func (h *rpmHandler) processRPMFiles(ctx logContext.Context, reader rpmutils.Pay
|
|||
fileSize := fileInfo.Size()
|
||||
fileCtx := logContext.WithValues(ctx, "filename", fileInfo.Name, "size", fileSize)
|
||||
|
||||
if err := h.handleNonArchiveContent(fileCtx, reader, archiveChan); err != nil {
|
||||
rdr, err := newMimeTypeReader(reader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating mime-type reader: %w", err)
|
||||
}
|
||||
|
||||
if err := h.handleNonArchiveContent(fileCtx, rdr, archiveChan); err != nil {
|
||||
fileCtx.Logger().Error(err, "error handling archive content in RPM")
|
||||
h.metrics.incErrors()
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue