mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
[not-fixup] - Reduce memory consumption for Buffered File Writer (#2377)
* correctly use the buffered file writer * use value from source * reorder fields * use only the DetectorKey as a map field * correctly use the buffered file writer * use value from source * reorder fields * add tests and update * Fix issue with buffer slices growing * fix test * fix * add singleton * use shared pool * optimize * rename and cleanup * use correct calculation to grow buffer * only grow if needed * address comments * remove unused * remove * rip out Grow * address comment * use 2k default buffer * update comment allow large buffers to be garbage collected
This commit is contained in:
parent
8104611d6e
commit
843334222c
3 changed files with 295 additions and 36 deletions
|
@ -449,7 +449,6 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
|
|||
}
|
||||
// Create a new currentDiff and currentCommit
|
||||
currentDiff = diff()
|
||||
// currentDiff = NewDiff(withCustomContentWriter(c.contentWriter()))
|
||||
currentCommit = &Commit{Message: strings.Builder{}}
|
||||
// Check that the commit line contains a hash and set it.
|
||||
if len(line) >= 47 {
|
||||
|
|
|
@ -13,12 +13,59 @@ import (
|
|||
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
|
||||
)
|
||||
|
||||
// bufferPool is used to store buffers for reuse.
|
||||
var bufferPool = sync.Pool{
|
||||
// TODO: Consider growing the buffer before returning it if we can find an optimal size.
|
||||
// Ideally the size would cover the majority of cases without being too large.
|
||||
// This would avoid the need to grow the buffer when writing to it, reducing allocations.
|
||||
New: func() any { return new(bytes.Buffer) },
|
||||
type bufPoolOpt func(pool *bufferPool)
|
||||
|
||||
type bufferPool struct {
|
||||
bufferSize uint32
|
||||
*sync.Pool
|
||||
}
|
||||
|
||||
const defaultBufferSize = 2 << 10 // 2KB
|
||||
func newBufferPool(opts ...bufPoolOpt) *bufferPool {
|
||||
pool := &bufferPool{bufferSize: defaultBufferSize}
|
||||
|
||||
for _, opt := range opts {
|
||||
opt(pool)
|
||||
}
|
||||
pool.Pool = &sync.Pool{
|
||||
New: func() any {
|
||||
buf := new(bytes.Buffer)
|
||||
buf.Grow(int(pool.bufferSize))
|
||||
return buf
|
||||
},
|
||||
}
|
||||
|
||||
return pool
|
||||
}
|
||||
|
||||
// sharedBufferPool is the shared buffer pool used by all BufferedFileWriters.
|
||||
// This allows for efficient reuse of buffers across multiple writers.
|
||||
var sharedBufferPool *bufferPool
|
||||
|
||||
func init() { sharedBufferPool = newBufferPool() }
|
||||
|
||||
func (bp *bufferPool) get(ctx context.Context) *bytes.Buffer {
|
||||
buf, ok := bp.Pool.Get().(*bytes.Buffer)
|
||||
if !ok {
|
||||
ctx.Logger().Error(fmt.Errorf("buffer pool returned unexpected type"), "using new buffer")
|
||||
buf = bytes.NewBuffer(make([]byte, 0, bp.bufferSize))
|
||||
}
|
||||
|
||||
return buf
|
||||
}
|
||||
|
||||
func (bp *bufferPool) put(buf *bytes.Buffer) {
|
||||
// If the buffer is more than twice the default size, release it for garbage collection.
|
||||
// This prevents us from returning very large buffers to the pool.
|
||||
const maxAllowedCapacity = 2 * defaultBufferSize
|
||||
if buf.Cap() > maxAllowedCapacity {
|
||||
buf = nil // Release the large buffer for garbage collection.
|
||||
} else {
|
||||
// Reset the buffer to clear any existing data.
|
||||
buf.Reset()
|
||||
}
|
||||
|
||||
bp.Put(buf)
|
||||
}
|
||||
|
||||
// state represents the current mode of BufferedFileWriter.
|
||||
|
@ -39,7 +86,8 @@ type BufferedFileWriter struct {
|
|||
|
||||
state state // Current state of the writer. (writeOnly or readOnly)
|
||||
|
||||
buf bytes.Buffer // Buffer for storing data under the threshold in memory.
|
||||
bufPool *bufferPool // Pool for storing buffers for reuse.
|
||||
buf *bytes.Buffer // Buffer for storing data under the threshold in memory.
|
||||
filename string // Name of the temporary file.
|
||||
file io.WriteCloser // File for storing data over the threshold.
|
||||
}
|
||||
|
@ -55,7 +103,11 @@ func WithThreshold(threshold uint64) Option {
|
|||
// New creates a new BufferedFileWriter with the given options.
|
||||
func New(opts ...Option) *BufferedFileWriter {
|
||||
const defaultThreshold = 10 * 1024 * 1024 // 10MB
|
||||
w := &BufferedFileWriter{threshold: defaultThreshold, state: writeOnly}
|
||||
w := &BufferedFileWriter{
|
||||
threshold: defaultThreshold,
|
||||
state: writeOnly,
|
||||
bufPool: sharedBufferPool,
|
||||
}
|
||||
for _, opt := range opts {
|
||||
opt(w)
|
||||
}
|
||||
|
@ -78,17 +130,16 @@ func (w *BufferedFileWriter) String() (string, error) {
|
|||
}
|
||||
defer file.Close()
|
||||
|
||||
// Create a buffer large enough to hold file data and additional buffer data, if any.
|
||||
fileSize := w.size
|
||||
buf := bytes.NewBuffer(make([]byte, 0, fileSize))
|
||||
|
||||
var buf bytes.Buffer
|
||||
// Read the file contents into the buffer.
|
||||
if _, err := io.Copy(buf, file); err != nil {
|
||||
if _, err := io.CopyBuffer(&buf, file, nil); err != nil {
|
||||
return "", fmt.Errorf("failed to read file contents: %w", err)
|
||||
}
|
||||
|
||||
// Append buffer data, if any, to the end of the file contents.
|
||||
buf.Write(w.buf.Bytes())
|
||||
if _, err := w.buf.WriteTo(&buf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
@ -100,33 +151,44 @@ func (w *BufferedFileWriter) Write(ctx context.Context, data []byte) (int, error
|
|||
}
|
||||
|
||||
size := uint64(len(data))
|
||||
|
||||
if w.buf == nil || w.buf.Len() == 0 {
|
||||
w.buf = w.bufPool.get(ctx)
|
||||
}
|
||||
|
||||
bufferLength := w.buf.Len()
|
||||
|
||||
defer func() {
|
||||
w.size += size
|
||||
ctx.Logger().V(4).Info(
|
||||
"write complete",
|
||||
"data_size", size,
|
||||
"content_size", w.buf.Len(),
|
||||
"content_size", bufferLength,
|
||||
"total_size", w.size,
|
||||
)
|
||||
}()
|
||||
|
||||
if w.buf.Len() == 0 {
|
||||
bufPtr, ok := bufferPool.Get().(*bytes.Buffer)
|
||||
if !ok {
|
||||
ctx.Logger().Error(fmt.Errorf("buffer pool returned unexpected type"), "using new buffer")
|
||||
bufPtr = new(bytes.Buffer)
|
||||
}
|
||||
bufPtr.Reset() // Reset the buffer to clear any existing data
|
||||
w.buf = *bufPtr
|
||||
}
|
||||
|
||||
if uint64(w.buf.Len())+size <= w.threshold {
|
||||
totalSizeNeeded := uint64(bufferLength) + uint64(len(data))
|
||||
if totalSizeNeeded <= w.threshold {
|
||||
// If the total size is within the threshold, write to the buffer.
|
||||
ctx.Logger().V(4).Info(
|
||||
"writing to buffer",
|
||||
"data_size", size,
|
||||
"content_size", w.buf.Len(),
|
||||
"content_size", bufferLength,
|
||||
)
|
||||
|
||||
availableSpace := w.buf.Cap() - bufferLength
|
||||
growSize := int(totalSizeNeeded) - bufferLength
|
||||
if growSize > availableSpace {
|
||||
ctx.Logger().V(4).Info(
|
||||
"buffer size exceeded, growing buffer",
|
||||
"current_size", bufferLength,
|
||||
"new_size", totalSizeNeeded,
|
||||
"available_space", availableSpace,
|
||||
"grow_size", growSize,
|
||||
)
|
||||
}
|
||||
|
||||
return w.buf.Write(data)
|
||||
}
|
||||
|
||||
|
@ -143,14 +205,12 @@ func (w *BufferedFileWriter) Write(ctx context.Context, data []byte) (int, error
|
|||
|
||||
// Transfer existing data in buffer to the file, then clear the buffer.
|
||||
// This ensures all the data is in one place - either entirely in the buffer or the file.
|
||||
if w.buf.Len() > 0 {
|
||||
ctx.Logger().V(4).Info("writing buffer to file", "content_size", w.buf.Len())
|
||||
if _, err := w.file.Write(w.buf.Bytes()); err != nil {
|
||||
if bufferLength > 0 {
|
||||
ctx.Logger().V(4).Info("writing buffer to file", "content_size", bufferLength)
|
||||
if _, err := w.buf.WriteTo(w.file); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Reset the buffer to clear any existing data and return it to the pool.
|
||||
w.buf.Reset()
|
||||
bufferPool.Put(&w.buf)
|
||||
w.bufPool.put(w.buf)
|
||||
}
|
||||
}
|
||||
ctx.Logger().V(4).Info("writing to file", "data_size", size)
|
||||
|
@ -167,7 +227,7 @@ func (w *BufferedFileWriter) CloseForWriting() error {
|
|||
}
|
||||
|
||||
if w.buf.Len() > 0 {
|
||||
_, err := w.file.Write(w.buf.Bytes())
|
||||
_, err := w.buf.WriteTo(w.file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -199,7 +259,7 @@ func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) {
|
|||
// Data is in memory.
|
||||
return &bufferReadCloser{
|
||||
Reader: bytes.NewReader(w.buf.Bytes()),
|
||||
onClose: func() { bufferPool.Put(&w.buf) },
|
||||
onClose: func() { w.bufPool.put(w.buf) },
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package bufferedfilewriter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -89,12 +90,151 @@ func TestBufferedFileWriterString(t *testing.T) {
|
|||
|
||||
got, err := writer.String()
|
||||
assert.NoError(t, err)
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, tc.expectedStr, got, "String content mismatch")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
smallBuffer = 2 << 5 // 64B
|
||||
mediumBuffer = 2 << 10 // 2KB
|
||||
smallFile = 2 << 25 // 32MB
|
||||
mediumFile = 2 << 28 // 256MB
|
||||
)
|
||||
|
||||
func BenchmarkBufferedFileWriterString_BufferOnly_Small(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), smallBuffer)
|
||||
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterString_BufferOnly_Medium(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), mediumBuffer)
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterString_OnlyFile_Small(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), smallFile)
|
||||
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterString_OnlyFile_Medium(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), mediumFile)
|
||||
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterString_BufferWithFile_Small(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), smallFile)
|
||||
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
// Write again so we also fill up the buffer.
|
||||
_, err = writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterString_BufferWithFile_Medium(b *testing.B) {
|
||||
data := bytes.Repeat([]byte("a"), mediumFile)
|
||||
|
||||
ctx := context.Background()
|
||||
writer := New()
|
||||
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
// Write again so we also fill up the buffer.
|
||||
_, err = writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
|
||||
benchmarkBufferedFileWriterString(b, writer)
|
||||
|
||||
err = writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
|
||||
func benchmarkBufferedFileWriterString(b *testing.B, w *BufferedFileWriter) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, err := w.String()
|
||||
assert.NoError(b, err)
|
||||
}
|
||||
b.StopTimer()
|
||||
}
|
||||
|
||||
func TestBufferedFileWriterLen(t *testing.T) {
|
||||
t.Parallel()
|
||||
tests := []struct {
|
||||
|
@ -306,3 +446,63 @@ func TestBufferedFileWriterWriteInReadOnlyState(t *testing.T) {
|
|||
_, err := writer.Write(context.Background(), []byte("should fail"))
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterWriteLarge(b *testing.B) {
|
||||
ctx := context.Background()
|
||||
data := make([]byte, 1024*1024*10) // 10MB
|
||||
for i := range data {
|
||||
data[i] = byte(i % 256) // Simple pattern to avoid uniform zero data
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
// Threshold is smaller than the data size, data should get flushed to the file.
|
||||
writer := New(WithThreshold(1024))
|
||||
|
||||
b.StartTimer()
|
||||
{
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
}
|
||||
b.StopTimer()
|
||||
|
||||
// Ensure proper cleanup after each write operation, including closing the file
|
||||
err := writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBufferedFileWriterWriteSmall(b *testing.B) {
|
||||
ctx := context.Background()
|
||||
data := make([]byte, 1024*1024) // 1MB
|
||||
for i := range data {
|
||||
data[i] = byte(i % 256) // Simple pattern to avoid uniform zero data
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
// Threshold is the same as the buffer size, data should always be written to the buffer.
|
||||
writer := New(WithThreshold(1024 * 1024))
|
||||
|
||||
b.StartTimer()
|
||||
{
|
||||
_, err := writer.Write(ctx, data)
|
||||
assert.NoError(b, err)
|
||||
}
|
||||
b.StopTimer()
|
||||
|
||||
// Ensure proper cleanup after each write operation, including closing the file.
|
||||
err := writer.CloseForWriting()
|
||||
assert.NoError(b, err)
|
||||
|
||||
rc, err := writer.ReadCloser()
|
||||
assert.NoError(b, err)
|
||||
rc.Close()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue