Merge branch 'main' into test-revert-regex-engine

2024-11-14 08:57:40 +00:00 · 2024-09-10 17:57:42 -07:00 · 2024-09-10 17:57:42 -07:00 · a19bcd7813
commit a19bcd7813
parent e36888cec0 70c6bb5634
11 changed files with 29 additions and 213 deletions
--- a/pkg/decoders/base64.go
+++ b/pkg/decoders/base64.go
@ -27,6 +27,10 @@ func init() {
 	}
 }

+func (d *Base64) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_BASE64
+}
+
 func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
 	encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
@ -67,10 +71,6 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	return nil
 }

-func (d *Base64) Type() detectorspb.DecoderType {
-	return detectorspb.DecoderType_BASE64
-}
-
 func isASCII(b []byte) bool {
 	for i := 0; i < len(b); i++ {
 		if b[i] > unicode.MaxASCII {
--- a/pkg/decoders/escaped_unicode.go
+++ b/pkg/decoders/escaped_unicode.go
@ -24,6 +24,10 @@ var (
 	escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`)
 )

+func (d *EscapedUnicode) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_ESCAPED_UNICODE
+}
+
 func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	if chunk == nil || len(chunk.Data) == 0 {
 		return nil
@ -94,10 +98,6 @@ func decodeCodePoint(input []byte) []byte {
 	return input
 }

-func (d *EscapedUnicode) Type() detectorspb.DecoderType {
-	return detectorspb.DecoderType_ESCAPED_UNICODE
-}
-
 func decodeEscaped(input []byte) []byte {
 	// Find all Unicode escape sequences in the input byte slice
 	indices := escapePat.FindAllSubmatchIndex(input, -1)
--- a/pkg/decoders/utf16.go
+++ b/pkg/decoders/utf16.go
@ -11,6 +11,10 @@ import (

 type UTF16 struct{}

+func (d *UTF16) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_UTF16
+}
+
 func (d *UTF16) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	if chunk == nil || len(chunk.Data) == 0 {
 		return nil
@ -28,10 +32,6 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	return nil
 }

-func (d *UTF16) Type() detectorspb.DecoderType {
-	return detectorspb.DecoderType_UTF16
-}
-
 // utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
 func utf16ToUTF8(b []byte) ([]byte, error) {
 	var bufBE, bufLE bytes.Buffer
--- a/pkg/decoders/utf8.go
+++ b/pkg/decoders/utf8.go
@ -10,6 +10,10 @@ import (

 type UTF8 struct{}

+func (d *UTF8) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_PLAIN
+}
+
 func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	if chunk == nil || len(chunk.Data) == 0 {
 		return nil
@ -25,10 +29,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	return decodableChunk
 }

-func (d *UTF8) Type() detectorspb.DecoderType {
-	return detectorspb.DecoderType_PLAIN
-}
-
 // extractSubstrings performs similarly to the strings binutil,
 // extacting contigous portions of printable characters that we care
 // about from some bytes
--- a/pkg/detectors/aha/aha.go
+++ b/pkg/detectors/aha/aha.go
@ -31,7 +31,7 @@ var (
 // Keywords are used for efficiently pre-filtering chunks.
 // Use identifiers in the secret preferably, or the provider name.
 func (s Scanner) Keywords() []string {
-	return []string{"aha"}
+	return []string{"aha.io"}
 }

 func (s Scanner) getClient() *http.Client {
--- a/pkg/detectors/jiratoken/v2/jiratoken_v2.go
+++ b/pkg/detectors/jiratoken/v2/jiratoken_v2.go
@ -31,7 +31,7 @@ var (
 	// Tokens created after Jan 18 2023 use a variable length
 	tokenPat  = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([A-Za-z0-9+/=_-]+=[A-Za-z0-9]{8})\b`)
 	domainPat = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([a-zA-Z-0-9]{5,24}\.[a-zA-Z-0-9]{3,16}\.[a-zA-Z-0-9]{3,16})\b`)
-	emailPat  = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`)
+	emailPat  = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,})\b`)
 )

 const (
@ -54,11 +54,11 @@ func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (result
 	emails := emailPat.FindAllStringSubmatch(dataStr, -1)

 	for _, email := range emails {
-		email = strings.Split(email[0], " ")
 		if len(email) != 2 {
 			continue
 		}
 		resEmail := strings.TrimSpace(email[1])
+
 		for _, token := range tokens {
 			if len(token) != 2 {
 				continue
--- a/pkg/engine/engine.go
+++ b/pkg/engine/engine.go
@ -771,7 +771,7 @@ func (e *Engine) scannerWorker(ctx context.Context) {
 			decodeLatency.WithLabelValues(decoder.Type().String(), chunk.SourceName).Observe(float64(decodeTime))

 			if decoded == nil {
-				ctx.Logger().V(4).Info("no decoder found for chunk", "chunk", chunk)
+				ctx.Logger().V(4).Info("decoder not applicable for chunk", "decoder", decoder.Type().String(), "chunk", chunk)
 				continue
 			}

@ -797,7 +797,6 @@ func (e *Engine) scannerWorker(ctx context.Context) {
 					wgDoneFn: wgDetect.Done,
 				}
 			}
-			continue
 		}

 		dataSize := float64(len(chunk.Data))
--- a/pkg/feature/feature.go
+++ b/pkg/feature/feature.go
@ -5,4 +5,5 @@ import "sync/atomic"
 var (
 	ForceSkipBinaries  = atomic.Bool{}
 	ForceSkipArchives  = atomic.Bool{}
+	SkipAdditionalRefs = atomic.Bool{}
 )
--- a/pkg/sources/git/git.go
+++ b/pkg/sources/git/git.go
@ -422,6 +422,11 @@ func executeClone(ctx context.Context, params cloneParams) (*git.Repository, err
 		params.clonePath,
 		"--quiet", // https://git-scm.com/docs/git-clone#Documentation/git-clone.txt-code--quietcode
 	}
+	if !feature.SkipAdditionalRefs.Load() {
+		gitArgs = append(gitArgs,
+			"-c",
+			"remote.origin.fetch=+refs/*:refs/remotes/origin/*")
+	}
 	gitArgs = append(gitArgs, params.args...)
 	cloneCmd := exec.Command("git", gitArgs...)

--- a/pkg/writers/buffered_file_writer/bufferedfilewriter.go
+++ b/pkg/writers/buffered_file_writer/bufferedfilewriter.go
@ -118,17 +118,6 @@ func New(opts ...Option) *BufferedFileWriter {
 	return w
 }

-// NewFromReader creates a new instance of BufferedFileWriter and writes the content from the provided reader to the writer.
-func NewFromReader(r io.Reader, opts ...Option) (*BufferedFileWriter, error) {
-	opts = append(opts, WithBufferSize(Large))
-	writer := New(opts...)
-	if _, err := io.Copy(writer, r); err != nil && !errors.Is(err, io.EOF) {
-		return nil, fmt.Errorf("error writing to buffered file writer: %w", err)
-	}
-
-	return writer, nil
-}
-
 // Len returns the number of bytes written to the buffer or file.
 func (w *BufferedFileWriter) Len() int { return int(w.size) }

@ -291,14 +280,7 @@ func (w *BufferedFileWriter) CloseForWriting() error {
 // If the content is stored in memory, it returns a custom reader that handles returning the buffer to the pool.
 // The caller should call Close() on the returned io.Reader when done to ensure resources are properly released.
 // This method can only be used when the BufferedFileWriter is in read-only mode.
-func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) { return w.ReadSeekCloser() }
-
-// ReadSeekCloser returns an io.ReadSeekCloser to read the written content.
-// If the content is stored in a file, it opens the file and returns a file reader.
-// If the content is stored in memory, it returns a custom reader that allows seeking and handles returning
-// the buffer to the pool.
-// This method can only be used when the BufferedFileWriter is in read-only mode.
-func (w *BufferedFileWriter) ReadSeekCloser() (io.ReadSeekCloser, error) {
+func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) {
 	if w.state != readOnly {
 		return nil, fmt.Errorf("BufferedFileWriter must be in read-only mode to read")
 	}
--- a/pkg/writers/buffered_file_writer/bufferedfilewriter_test.go
+++ b/pkg/writers/buffered_file_writer/bufferedfilewriter_test.go
@ -2,11 +2,8 @@ package bufferedfilewriter

 import (
 	"bytes"
-	"crypto/rand"
-	"fmt"
 	"io"
 	"os"
-	"strings"
 	"testing"
 	"time"

@ -498,103 +495,6 @@ func BenchmarkBufferedFileWriterWriteSmall(b *testing.B) {
 	}
 }

-// Create a custom reader that can simulate errors.
-type errorReader struct{}
-
-func (errorReader) Read([]byte) (n int, err error) { return 0, fmt.Errorf("error reading") }
-
-func TestNewFromReader(t *testing.T) {
-	t.Parallel()
-
-	testCases := []struct {
-		name     string
-		reader   io.Reader
-		wantErr  bool
-		wantData string
-	}{
-		{
-			name:     "Success case",
-			reader:   strings.NewReader("hello world"),
-			wantData: "hello world",
-		},
-		{
-			name:   "Empty reader",
-			reader: strings.NewReader(""),
-		},
-		{
-			name:    "Error reader",
-			reader:  errorReader{},
-			wantErr: true,
-		},
-	}
-
-	for _, tc := range testCases {
-		tc := tc
-		t.Run(tc.name, func(t *testing.T) {
-			t.Parallel()
-			bufWriter, err := NewFromReader(tc.reader)
-			if err != nil && tc.wantErr {
-				return
-			}
-
-			assert.NoError(t, err)
-			assert.NotNil(t, bufWriter)
-
-			err = bufWriter.CloseForWriting()
-			assert.NoError(t, err)
-
-			b := new(bytes.Buffer)
-			rdr, err := bufWriter.ReadCloser()
-			if err != nil && tc.wantErr {
-				return
-			}
-			assert.NoError(t, err)
-
-			if rdr == nil {
-				return
-			}
-			defer rdr.Close()
-
-			_, err = b.ReadFrom(rdr)
-			assert.NoError(t, err)
-			assert.Equal(t, tc.wantData, b.String())
-		})
-	}
-}
-
-func TestNewFromReaderThresholdExceeded(t *testing.T) {
-	t.Parallel()
-
-	// Create a large data buffer that exceeds the threshold.
-	largeData := make([]byte, 1024*1024) // 1 MB
-	_, err := rand.Read(largeData)
-	assert.NoError(t, err)
-
-	// Create a BufferedFileWriter with a smaller threshold.
-	threshold := uint64(1024) // 1 KB
-	bufWriter, err := NewFromReader(bytes.NewReader(largeData), WithThreshold(threshold))
-	assert.NoError(t, err)
-
-	err = bufWriter.CloseForWriting()
-	assert.NoError(t, err)
-
-	rdr, err := bufWriter.ReadCloser()
-	assert.NoError(t, err)
-	defer rdr.Close()
-
-	// Verify that the data was written to a file.
-	assert.NotEmpty(t, bufWriter.filename)
-	assert.NotNil(t, bufWriter.file)
-
-	// Read the data from the BufferedFileWriter.
-	readData, err := io.ReadAll(rdr)
-	assert.NoError(t, err)
-	assert.Equal(t, largeData, readData)
-
-	// Verify the size of the data written.
-	assert.Equal(t, uint64(len(largeData)), bufWriter.size)
-}
-
 func TestBufferWriterCloseForWritingWithFile(t *testing.T) {
 	bufPool := pool.NewBufferPool(defaultBufferSize)

@ -700,74 +600,3 @@ func TestBufferedFileWriter_ReadFrom(t *testing.T) {
 		})
 	}
 }
-
-// simpleReader wraps a string, allowing it to be read as an io.Reader without implementing io.WriterTo.
-type simpleReader struct {
-	data   []byte
-	offset int
-}
-
-func newSimpleReader(s string) *simpleReader { return &simpleReader{data: []byte(s)} }
-
-// Read implements the io.Reader interface.
-func (sr *simpleReader) Read(p []byte) (n int, err error) {
-	if sr.offset >= len(sr.data) {
-		return 0, io.EOF // no more data to read
-	}
-	n = copy(p, sr.data[sr.offset:]) // copy data to p
-	sr.offset += n                   // move offset for next read
-	return
-}
-
-func TestNewFromReaderThresholdExceededSimpleReader(t *testing.T) {
-	t.Parallel()
-
-	// Create a large data buffer that exceeds the threshold.
-	largeData := strings.Repeat("a", 1024*1024) // 1 MB
-
-	// Create a BufferedFileWriter with a smaller threshold.
-	threshold := uint64(1024) // 1 KB
-	bufWriter, err := NewFromReader(newSimpleReader(largeData), WithThreshold(threshold))
-	assert.NoError(t, err)
-
-	err = bufWriter.CloseForWriting()
-	assert.NoError(t, err)
-
-	rdr, err := bufWriter.ReadCloser()
-	assert.NoError(t, err)
-	defer rdr.Close()
-
-	// Verify that the data was written to a file.
-	assert.NotEmpty(t, bufWriter.filename)
-	assert.NotNil(t, bufWriter.file)
-
-	// Read the data from the BufferedFileWriter.
-	readData, err := io.ReadAll(rdr)
-	assert.NoError(t, err)
-	assert.Equal(t, largeData, string(readData))
-
-	// Verify the size of the data written.
-	assert.Equal(t, uint64(len(largeData)), bufWriter.size)
-}
-
-func BenchmarkNewFromReader(b *testing.B) {
-	largeData := strings.Repeat("a", 1024*1024) // 1 MB
-
-	b.ResetTimer()
-
-	for i := 0; i < b.N; i++ {
-		reader := newSimpleReader(largeData)
-
-		b.StartTimer()
-		bufWriter, err := NewFromReader(reader)
-		assert.NoError(b, err)
-		b.StopTimer()
-
-		err = bufWriter.CloseForWriting()
-		assert.NoError(b, err)
-
-		rdr, err := bufWriter.ReadCloser()
-		assert.NoError(b, err)
-		rdr.Close()
-	}
-}