[perf] - Optimize MIME Type Detection to Reduce Allocations (#3048)

* Streaming file handling.

* cleanup

* update tests

* lint

* defer close on input io.ReadCloser's

* remove redundant mime type detection

* Reduce allocations

* fix test

* update comment

* fix seek bug

* address comment

* undo
This commit is contained in:
ahrav 2024-07-17 14:04:29 -07:00 committed by GitHub
parent f865482025
commit 42b3a9d999
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 159 additions and 58 deletions

View file

@ -83,7 +83,12 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, archi
fileSize := arEntry.Size
fileCtx := logContext.WithValues(ctx, "filename", arEntry.Name, "size", fileSize)
if err := h.handleNonArchiveContent(fileCtx, arEntry.Data, archiveChan); err != nil {
rdr, err := newMimeTypeReader(arEntry.Data)
if err != nil {
return fmt.Errorf("error creating mime-type reader: %w", err)
}
if err := h.handleNonArchiveContent(fileCtx, rdr, archiveChan); err != nil {
fileCtx.Logger().Error(err, "error handling archive content in AR")
h.metrics.incErrors()
}

View file

@ -87,8 +87,11 @@ func (h *archiveHandler) openArchive(ctx logContext.Context, depth int, reader f
return ErrMaxDepthReached
}
if reader.format == nil && depth > 0 {
return h.handleNonArchiveContent(ctx, reader, archiveChan)
if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), archiveChan)
}
return fmt.Errorf("unknown archive format")
}
switch archive := reader.format.(type) {
@ -117,7 +120,7 @@ func (h *archiveHandler) openArchive(ctx logContext.Context, depth int, reader f
}
return nil
default:
return fmt.Errorf("unknown archive type: %s", reader.mimeType)
return fmt.Errorf("unknown archive type: %s", reader.format.Name())
}
}

View file

@ -1,15 +1,10 @@
package handlers
import (
"bufio"
"context"
"errors"
"fmt"
"io"
"time"
"github.com/gabriel-vasile/mimetype"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
@ -49,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) (c
h.metrics.incFilesProcessed()
}()
if err = h.handleNonArchiveContent(ctx, input, dataChan); err != nil {
if err = h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataChan); err != nil {
ctx.Logger().Error(err, "error handling non-archive content.")
}
}()
@ -76,26 +71,17 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(start time.Time, err erro
// on the type, particularly for binary files. It manages reading file chunks and writing them to the archive channel,
// effectively collecting the final bytes for further processing. This function is a key component in ensuring that all
// file content, regardless of being an archive or not, is handled appropriately.
func (h *defaultHandler) handleNonArchiveContent(ctx logContext.Context, reader io.Reader, archiveChan chan []byte) error {
bufReader := bufio.NewReaderSize(reader, defaultBufferSize)
// A buffer of 512 bytes is used since many file formats store their magic numbers within the first 512 bytes.
// If fewer bytes are read, MIME type detection may still succeed.
buffer, err := bufReader.Peek(defaultBufferSize)
if err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("unable to read file for MIME type detection: %w", err)
}
func (h *defaultHandler) handleNonArchiveContent(ctx logContext.Context, reader mimeTypeReader, archiveChan chan []byte) error {
mimeExt := reader.mimeExt
mime := mimetype.Detect(buffer)
mimeT := mimeType(mime.String())
if common.SkipFile(mime.Extension()) || common.IsBinary(mime.Extension()) {
ctx.Logger().V(5).Info("skipping file", "ext", mimeT)
if common.SkipFile(mimeExt) || common.IsBinary(mimeExt) {
ctx.Logger().V(5).Info("skipping file", "ext", mimeExt)
h.metrics.incFilesSkipped()
return nil
}
chunkReader := sources.NewChunkReader()
for data := range chunkReader(ctx, bufReader) {
for data := range chunkReader(ctx, reader) {
if err := data.Error(); err != nil {
ctx.Logger().Error(err, "error reading chunk")
h.metrics.incErrors()

View file

@ -1,6 +1,7 @@
package handlers
import (
"bufio"
"errors"
"fmt"
"io"
@ -30,7 +31,7 @@ import (
// random access to the file content.
type fileReader struct {
format archiver.Format
mimeType mimeType
mime *mimetype.MIME
isGenericArchive bool
*iobuf.BufferedReadSeeker
@ -38,50 +39,92 @@ type fileReader struct {
var ErrEmptyReader = errors.New("reader is empty")
// mimeTypeReader wraps an io.Reader with MIME type information.
// This type is used to pass content through the processing pipeline
// while carrying its detected MIME type, avoiding redundant type detection.
type mimeTypeReader struct {
// mimeExt is the file extension reported by the MIME detector for the content;
// handleNonArchiveContent checks it to decide whether the file is skipped.
mimeExt string
// mimeName is the full MIME type string of the content.
mimeName mimeType
// Reader is the underlying content stream, embedded so the wrapper can be
// passed anywhere an io.Reader is expected.
io.Reader
}
// newMimeTypeReaderFromFileReader builds a mimeTypeReader on top of an existing fileReader.
// The extension and MIME name are taken from the MIME value already stored on the
// fileReader, so no additional detection is performed here. The fileReader's
// BufferedReadSeeker becomes the underlying content stream.
func newMimeTypeReaderFromFileReader(r fileReader) mimeTypeReader {
	var wrapped mimeTypeReader
	wrapped.mimeExt = r.mime.Extension()
	wrapped.mimeName = mimeType(r.mime.String())
	wrapped.Reader = r.BufferedReadSeeker
	return wrapped
}
// newMimeTypeReader creates a new mimeTypeReader from an io.Reader.
// It wraps r in a bufio.Reader and peeks at up to the first 3072 bytes to detect the
// MIME type without consuming any input, so the returned reader still yields the
// content starting from its first byte.
// The returned mimeTypeReader carries the buffered reader together with the detected
// MIME extension and name. This is particularly useful for specialized archive handlers
// that need to pass extracted content to the default handler.
//
// An error is returned only when peeking fails for a reason other than io.EOF;
// inputs shorter than the peek window are detected from the bytes that are available.
func newMimeTypeReader(r io.Reader) (mimeTypeReader, error) {
	// Number of bytes inspected for MIME detection. It is also the bufio buffer size,
	// so Peek never asks for more than the buffer can hold.
	// NOTE(review): 3072 appears to match the mimetype library's default read limit — confirm.
	const defaultMinBufferSize = 3072
	bufReader := bufio.NewReaderSize(r, defaultMinBufferSize)

	// Peek reports io.EOF when the input holds fewer than defaultMinBufferSize bytes.
	// That is expected for small files: the partial header is still handed to the detector.
	buffer, err := bufReader.Peek(defaultMinBufferSize)
	if err != nil && !errors.Is(err, io.EOF) {
		return mimeTypeReader{}, fmt.Errorf("unable to read file for MIME type detection: %w", err)
	}

	mime := mimetype.Detect(buffer)

	return mimeTypeReader{mimeExt: mime.Extension(), mimeName: mimeType(mime.String()), Reader: bufReader}, nil
}
// newFileReader creates a fileReader from an io.Reader, optionally using BufferedFileWriter for certain formats.
func newFileReader(r io.Reader) (fileReader, error) {
var reader fileReader
var fReader fileReader
bufReader := iobuf.NewBufferedReaderSeeker(r)
fReader.BufferedReadSeeker = iobuf.NewBufferedReaderSeeker(r)
mime, err := mimetype.DetectReader(bufReader)
// Disable buffering after initial reads.
// This optimization ensures we don't continue writing to the buffer after the initial reads.
defer fReader.DisableBuffering()
mime, err := mimetype.DetectReader(fReader)
if err != nil {
return reader, fmt.Errorf("unable to detect MIME type: %w", err)
return fReader, fmt.Errorf("unable to detect MIME type: %w", err)
}
reader.mimeType = mimeType(mime.String())
fReader.mime = mime
// Reset the reader to the beginning because DetectReader consumes the reader.
if _, err := bufReader.Seek(0, io.SeekStart); err != nil {
return reader, fmt.Errorf("error resetting reader after MIME detection: %w", err)
if _, err := fReader.Seek(0, io.SeekStart); err != nil {
return fReader, fmt.Errorf("error resetting reader after MIME detection: %w", err)
}
format, _, err := archiver.Identify("", bufReader)
// If a MIME type is known to not be an archive type, we might as well return here rather than
// paying the I/O penalty of an archiver.Identify() call that won't identify anything.
if _, ok := skipArchiverMimeTypes[mimeType(mime.String())]; ok {
return fReader, nil
}
format, _, err := archiver.Identify("", fReader)
switch {
case err == nil:
reader.isGenericArchive = true
reader.mimeType = mimeType(format.Name())
reader.format = format
fReader.isGenericArchive = true
fReader.format = format
case errors.Is(err, archiver.ErrNoMatch):
// Not an archive handled by archiver.
// Continue with the default reader.
default:
return reader, fmt.Errorf("error identifying archive: %w", err)
return fReader, fmt.Errorf("error identifying archive: %w", err)
}
// Reset the reader to the beginning again to allow the handler to read from the start.
// This is necessary because Identify consumes the reader.
if _, err := bufReader.Seek(0, io.SeekStart); err != nil {
return reader, fmt.Errorf("error resetting reader after archive identification: %w", err)
if _, err := fReader.Seek(0, io.SeekStart); err != nil {
return fReader, fmt.Errorf("error resetting reader after archive identification: %w", err)
}
// Disable buffering after initial reads.
// This optimization ensures we don't continue writing to the buffer after the initial reads.
bufReader.DisableBuffering()
reader.BufferedReadSeeker = bufReader
return reader, nil
return fReader, nil
}
// FileHandler represents a handler for files.
@ -123,14 +166,72 @@ const (
type mimeType string
const (
rpmMime mimeType = "application/x-rpm"
cpioMime mimeType = "application/cpio"
unixArMime mimeType = "application/x-unix-archive"
arMime mimeType = "application/x-archive"
debMime mimeType = "application/vnd.debian.binary-package"
rpmMime mimeType = "application/x-rpm"
cpioMime mimeType = "application/cpio"
unixArMime mimeType = "application/x-unix-archive"
arMime mimeType = "application/x-archive"
debMime mimeType = "application/vnd.debian.binary-package"
textMime mimeType = "text/plain; charset=utf-8"
xmlMime mimeType = "text/xml"
jsonMime mimeType = "application/json"
csvMime mimeType = "text/csv"
tsvMime mimeType = "text/tab-separated-values"
geoJSONMine mimeType = "application/vnd.geo+json"
ndjsonMime mimeType = "application/x-ndjson"
htmlMime mimeType = "text/html"
phpTextMime mimeType = "text/x-php"
rtfTextMime mimeType = "text/rtf"
jsAppMime mimeType = "application/javascript"
jsTextMime mimeType = "text/javascript"
jsMime mimeType = "application/x-javascript"
srtMime mimeType = "application/x-subrip"
srtXMime mimeType = "application/x-srt"
srtTextMime mimeType = "text/x-srt"
vttMime mimeType = "text/vtt"
luaMime mimeType = "text/x-lua"
perlMime mimeType = "text/x-perl"
pythonMime mimeType = "text/x-python"
pyAppMime mimeType = "application/x-python"
pyScriptMime mimeType = "application/x-script.python"
tclTextMime mimeType = "text/x-tcl"
tclMime mimeType = "application/x-tcl"
)
// selectHandler dynamically selects and configures a FileHandler based on the provided fileReader.
// skipArchiverMimeTypes is the set of MIME types for which newFileReader returns
// before calling archiver.Identify. Members are either text-based formats or
// archive formats that the archiver library does not support.
var skipArchiverMimeTypes = map[mimeType]struct{}{
	// Archive formats that selectHandler routes to the dedicated AR and RPM handlers.
	arMime:     {},
	unixArMime: {},
	debMime:    {},
	rpmMime:    {},
	cpioMime:   {},

	// Plain text and markup.
	textMime:    {},
	htmlMime:    {},
	xmlMime:     {},
	rtfTextMime: {},

	// Structured and tabular data.
	jsonMime:    {},
	geoJSONMine: {},
	ndjsonMime:  {},
	csvMime:     {},
	tsvMime:     {},

	// Source code and scripts.
	jsMime:       {},
	jsAppMime:    {},
	jsTextMime:   {},
	phpTextMime:  {},
	luaMime:      {},
	perlMime:     {},
	pythonMime:   {},
	pyAppMime:    {},
	pyScriptMime: {},
	tclMime:      {},
	tclTextMime:  {},

	// Subtitle formats.
	srtMime:     {},
	srtXMime:    {},
	srtTextMime: {},
	vttMime:     {},
}
// selectHandler dynamically selects and configures a FileHandler based on the provided MIME type and archive flag.
// mimeT is the detected MIME type, and isGenericArchive reports whether the archiver library identified the file as an archive.
// This method uses specialized handlers for specific file types:
// - arHandler is used for Unix archives and Debian packages ('arMime', 'unixArMime', and 'debMime').
@ -138,14 +239,14 @@ const (
// - archiveHandler is used for common archive formats supported by the archiver library (.zip, .tar, .gz, etc.).
// - defaultHandler is used for non-archive files.
// The selected handler is then returned, ready to handle the file according to its specific format and requirements.
func selectHandler(file fileReader) FileHandler {
switch file.mimeType {
func selectHandler(mimeT mimeType, isGenericArchive bool) FileHandler {
switch mimeT {
case arMime, unixArMime, debMime:
return newARHandler()
case rpmMime, cpioMime:
return newRPMHandler()
default:
if file.isGenericArchive {
if isGenericArchive {
return newArchiveHandler()
}
return newDefaultHandler(defaultHandlerType)
@ -181,13 +282,14 @@ func HandleFile(
return fmt.Errorf("error creating custom reader: %w", err)
}
mimeT := mimeType(rdr.mime.String())
config := newFileHandlingConfig(options...)
if config.skipArchives && rdr.isGenericArchive {
ctx.Logger().V(5).Info("skipping archive file", "mime", rdr.mimeType)
ctx.Logger().V(5).Info("skipping archive file", "mime", mimeT)
return nil
}
handler := selectHandler(rdr)
handler := selectHandler(mimeT, rdr.isGenericArchive)
archiveChan, err := handler.HandleFile(ctx, rdr) // Delegate to the specific handler to process the file.
if err != nil {
return fmt.Errorf("error handling file: %w", err)

View file

@ -90,7 +90,12 @@ func (h *rpmHandler) processRPMFiles(ctx logContext.Context, reader rpmutils.Pay
fileSize := fileInfo.Size()
fileCtx := logContext.WithValues(ctx, "filename", fileInfo.Name, "size", fileSize)
if err := h.handleNonArchiveContent(fileCtx, reader, archiveChan); err != nil {
rdr, err := newMimeTypeReader(reader)
if err != nil {
return fmt.Errorf("error creating mime-type reader: %w", err)
}
if err := h.handleNonArchiveContent(fileCtx, rdr, archiveChan); err != nil {
fileCtx.Logger().Error(err, "error handling archive content in RPM")
h.metrics.incErrors()
}