Merge branch 'main' into test-revert-regex-engine

This commit is contained in:
Ahrav Dutta 2024-09-10 17:57:42 -07:00
commit a19bcd7813
11 changed files with 29 additions and 213 deletions

View file

@ -27,6 +27,10 @@ func init() {
}
}
// Type identifies this decoder by returning detectorspb.DecoderType_BASE64.
func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}
func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
@ -67,10 +71,6 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return nil
}
// Type identifies this decoder by returning detectorspb.DecoderType_BASE64.
func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}
func isASCII(b []byte) bool {
for i := 0; i < len(b); i++ {
if b[i] > unicode.MaxASCII {

View file

@ -24,6 +24,10 @@ var (
escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`)
)
// Type identifies this decoder by returning detectorspb.DecoderType_ESCAPED_UNICODE.
func (d *EscapedUnicode) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_ESCAPED_UNICODE
}
func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
@ -94,10 +98,6 @@ func decodeCodePoint(input []byte) []byte {
return input
}
// Type identifies this decoder by returning detectorspb.DecoderType_ESCAPED_UNICODE.
func (d *EscapedUnicode) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_ESCAPED_UNICODE
}
func decodeEscaped(input []byte) []byte {
// Find all Unicode escape sequences in the input byte slice
indices := escapePat.FindAllSubmatchIndex(input, -1)

View file

@ -11,6 +11,10 @@ import (
type UTF16 struct{}
// Type identifies this decoder by returning detectorspb.DecoderType_UTF16.
func (d *UTF16) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_UTF16
}
func (d *UTF16) FromChunk(chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
@ -28,10 +32,6 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return nil
}
// Type identifies this decoder by returning detectorspb.DecoderType_UTF16.
func (d *UTF16) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_UTF16
}
// utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
func utf16ToUTF8(b []byte) ([]byte, error) {
var bufBE, bufLE bytes.Buffer

View file

@ -10,6 +10,10 @@ import (
type UTF8 struct{}
// Type identifies this decoder. Note that the UTF8 decoder reports
// detectorspb.DecoderType_PLAIN rather than a UTF8-named value.
func (d *UTF8) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_PLAIN
}
func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
@ -25,10 +29,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return decodableChunk
}
// Type identifies this decoder. Note that the UTF8 decoder reports
// detectorspb.DecoderType_PLAIN rather than a UTF8-named value.
func (d *UTF8) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_PLAIN
}
// extractSubstrings performs similarly to the strings binutil,
// extracting contiguous portions of printable characters that we care
// about from some bytes

View file

@ -31,7 +31,7 @@ var (
// Keywords are used for efficiently pre-filtering chunks.
// Use identifiers in the secret preferably, or the provider name.
//
// NOTE(review): two return statements appear back to back below; read
// literally, the second one is unreachable. They look like the before and
// after versions of a single changed line in this hunk - confirm which
// keyword ("aha" or "aha.io") is the current one.
func (s Scanner) Keywords() []string {
return []string{"aha"}
return []string{"aha.io"}
}
func (s Scanner) getClient() *http.Client {

View file

@ -31,7 +31,7 @@ var (
// Tokens created after Jan 18 2023 use a variable length
tokenPat = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([A-Za-z0-9+/=_-]+=[A-Za-z0-9]{8})\b`)
domainPat = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([a-zA-Z-0-9]{5,24}\.[a-zA-Z-0-9]{3,16}\.[a-zA-Z-0-9]{3,16})\b`)
emailPat = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`)
emailPat = regexp.MustCompile(detectors.PrefixRegex([]string{"jira"}) + `\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,})\b`)
)
const (
@ -54,11 +54,11 @@ func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (result
emails := emailPat.FindAllStringSubmatch(dataStr, -1)
for _, email := range emails {
email = strings.Split(email[0], " ")
if len(email) != 2 {
continue
}
resEmail := strings.TrimSpace(email[1])
for _, token := range tokens {
if len(token) != 2 {
continue

View file

@ -771,7 +771,7 @@ func (e *Engine) scannerWorker(ctx context.Context) {
decodeLatency.WithLabelValues(decoder.Type().String(), chunk.SourceName).Observe(float64(decodeTime))
if decoded == nil {
ctx.Logger().V(4).Info("no decoder found for chunk", "chunk", chunk)
ctx.Logger().V(4).Info("decoder not applicable for chunk", "decoder", decoder.Type().String(), "chunk", chunk)
continue
}
@ -797,7 +797,6 @@ func (e *Engine) scannerWorker(ctx context.Context) {
wgDoneFn: wgDetect.Done,
}
}
continue
}
dataSize := float64(len(chunk.Data))

View file

@ -5,4 +5,5 @@ import "sync/atomic"
var (
ForceSkipBinaries = atomic.Bool{}
ForceSkipArchives = atomic.Bool{}
SkipAdditionalRefs = atomic.Bool{}
)

View file

@ -422,6 +422,11 @@ func executeClone(ctx context.Context, params cloneParams) (*git.Repository, err
params.clonePath,
"--quiet", // https://git-scm.com/docs/git-clone#Documentation/git-clone.txt-code--quietcode
}
if !feature.SkipAdditionalRefs.Load() {
gitArgs = append(gitArgs,
"-c",
"remote.origin.fetch=+refs/*:refs/remotes/origin/*")
}
gitArgs = append(gitArgs, params.args...)
cloneCmd := exec.Command("git", gitArgs...)

View file

@ -118,17 +118,6 @@ func New(opts ...Option) *BufferedFileWriter {
return w
}
// NewFromReader creates a new BufferedFileWriter and copies everything read from r into it.
//
// WithBufferSize(Large) is appended after the caller's opts before New is called.
// If the copy fails with an error other than io.EOF, that error is wrapped and
// returned together with a nil writer. Otherwise the populated writer is returned;
// CloseForWriting is not called here.
func NewFromReader(r io.Reader, opts ...Option) (*BufferedFileWriter, error) {
// Added last, after any caller-supplied options.
opts = append(opts, WithBufferSize(Large))
writer := New(opts...)
// io.Copy reports a clean end of input as a nil error, so the io.EOF
// comparison below only acts as a safeguard.
if _, err := io.Copy(writer, r); err != nil && !errors.Is(err, io.EOF) {
return nil, fmt.Errorf("error writing to buffered file writer: %w", err)
}
return writer, nil
}
// Len returns the number of bytes written to the buffer or file,
// i.e. the value of w.size converted to int.
func (w *BufferedFileWriter) Len() int { return int(w.size) }
@ -291,14 +280,7 @@ func (w *BufferedFileWriter) CloseForWriting() error {
// If the content is stored in memory, it returns a custom reader that handles returning the buffer to the pool.
// The caller should call Close() on the returned io.Reader when done to ensure resources are properly released.
// This method can only be used when the BufferedFileWriter is in read-only mode.
func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) { return w.ReadSeekCloser() }
// ReadSeekCloser returns an io.ReadSeekCloser to read the written content.
// If the content is stored in a file, it opens the file and returns a file reader.
// If the content is stored in memory, it returns a custom reader that allows seeking and handles returning
// the buffer to the pool.
// This method can only be used when the BufferedFileWriter is in read-only mode.
func (w *BufferedFileWriter) ReadSeekCloser() (io.ReadSeekCloser, error) {
func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) {
if w.state != readOnly {
return nil, fmt.Errorf("BufferedFileWriter must be in read-only mode to read")
}

View file

@ -2,11 +2,8 @@ package bufferedfilewriter
import (
"bytes"
"crypto/rand"
"fmt"
"io"
"os"
"strings"
"testing"
"time"
@ -498,103 +495,6 @@ func BenchmarkBufferedFileWriterWriteSmall(b *testing.B) {
}
}
// errorReader is a stand-in reader whose Read always fails; it is used to simulate read errors.
type errorReader struct{}
// Read reports zero bytes read and a non-nil "error reading" error on every call.
func (errorReader) Read([]byte) (n int, err error) { return 0, fmt.Errorf("error reading") }
// TestNewFromReader runs NewFromReader against a table of readers (normal
// content, empty content, and a reader that always errors) and, for the cases
// that do not bail out early, closes the writer for writing, reads the content
// back through ReadCloser, and compares it with wantData.
//
// NOTE(review): when wantErr is true the subtest returns early only if an error
// actually occurred. If no error is returned, none of the assertions below flags
// the missing error, so the "Error reader" case cannot fail for that reason.
func TestNewFromReader(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
reader io.Reader
wantErr bool
wantData string
}{
{
name: "Success case",
reader: strings.NewReader("hello world"),
wantData: "hello world",
},
{
// wantData is left at its zero value, the empty string.
name: "Empty reader",
reader: strings.NewReader(""),
},
{
name: "Error reader",
reader: errorReader{},
wantErr: true,
},
}
for _, tc := range testCases {
// Per-iteration copy for the parallel closure below (required before Go 1.22).
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
bufWriter, err := NewFromReader(tc.reader)
// An expected error ends the subtest successfully.
if err != nil && tc.wantErr {
return
}
assert.NoError(t, err)
assert.NotNil(t, bufWriter)
// Reading requires read-only mode, so finish the write phase first.
err = bufWriter.CloseForWriting()
assert.NoError(t, err)
b := new(bytes.Buffer)
rdr, err := bufWriter.ReadCloser()
if err != nil && tc.wantErr {
return
}
assert.NoError(t, err)
// Nothing to read back or close when no reader was returned.
if rdr == nil {
return
}
defer rdr.Close()
_, err = b.ReadFrom(rdr)
assert.NoError(t, err)
assert.Equal(t, tc.wantData, b.String())
})
}
}
// TestNewFromReaderThresholdExceeded feeds 1 MB of random bytes through
// NewFromReader with a 1 KB threshold and checks that the writer recorded a
// filename and file, that the content reads back unchanged, and that size
// equals the input length.
func TestNewFromReaderThresholdExceeded(t *testing.T) {
t.Parallel()
// Create a large data buffer that exceeds the threshold.
largeData := make([]byte, 1024*1024) // 1 MB
_, err := rand.Read(largeData)
assert.NoError(t, err)
// Create a BufferedFileWriter with a smaller threshold.
threshold := uint64(1024) // 1 KB
bufWriter, err := NewFromReader(bytes.NewReader(largeData), WithThreshold(threshold))
assert.NoError(t, err)
// Switch to read-only mode before requesting a reader.
err = bufWriter.CloseForWriting()
assert.NoError(t, err)
rdr, err := bufWriter.ReadCloser()
assert.NoError(t, err)
defer rdr.Close()
// Verify that the data was written to a file.
assert.NotEmpty(t, bufWriter.filename)
assert.NotNil(t, bufWriter.file)
// Read the data from the BufferedFileWriter.
readData, err := io.ReadAll(rdr)
assert.NoError(t, err)
assert.Equal(t, largeData, readData)
// Verify the size of the data written.
assert.Equal(t, uint64(len(largeData)), bufWriter.size)
}
func TestBufferWriterCloseForWritingWithFile(t *testing.T) {
bufPool := pool.NewBufferPool(defaultBufferSize)
@ -700,74 +600,3 @@ func TestBufferedFileWriter_ReadFrom(t *testing.T) {
})
}
}
// simpleReader holds the bytes of a string and exposes them through Read only,
// allowing the content to be consumed as an io.Reader without implementing io.WriterTo.
type simpleReader struct {
// data is the full content to serve.
data []byte
// offset is the index in data of the next unread byte.
offset int
}
// newSimpleReader returns a simpleReader over the bytes of s, positioned at offset zero.
func newSimpleReader(s string) *simpleReader { return &simpleReader{data: []byte(s)} }
// Read implements the io.Reader interface. It copies as many unread bytes as
// fit into p, advances the offset by that amount, and returns io.EOF (with
// n == 0) once every byte of data has been consumed.
func (sr *simpleReader) Read(p []byte) (n int, err error) {
if sr.offset >= len(sr.data) {
return 0, io.EOF // everything has already been handed out
}
n = copy(p, sr.data[sr.offset:]) // copy copies min(len(p), remaining) bytes
sr.offset += n // the next call resumes after the bytes just returned
return
}
// TestNewFromReaderThresholdExceededSimpleReader passes 1 MB of "a" characters
// through NewFromReader via a simpleReader, with a 1 KB threshold, and checks
// that the writer recorded a filename and file, that the content reads back
// unchanged, and that size equals the input length.
func TestNewFromReaderThresholdExceededSimpleReader(t *testing.T) {
t.Parallel()
// Create a large data buffer that exceeds the threshold.
largeData := strings.Repeat("a", 1024*1024) // 1 MB
// Create a BufferedFileWriter with a smaller threshold.
threshold := uint64(1024) // 1 KB
bufWriter, err := NewFromReader(newSimpleReader(largeData), WithThreshold(threshold))
assert.NoError(t, err)
// Switch to read-only mode before requesting a reader.
err = bufWriter.CloseForWriting()
assert.NoError(t, err)
rdr, err := bufWriter.ReadCloser()
assert.NoError(t, err)
defer rdr.Close()
// Verify that the data was written to a file.
assert.NotEmpty(t, bufWriter.filename)
assert.NotNil(t, bufWriter.file)
// Read the data from the BufferedFileWriter.
readData, err := io.ReadAll(rdr)
assert.NoError(t, err)
assert.Equal(t, largeData, string(readData))
// Verify the size of the data written.
assert.Equal(t, uint64(len(largeData)), bufWriter.size)
}
// BenchmarkNewFromReader measures NewFromReader on a 1 MB input served by a
// simpleReader. The timer is started right before the NewFromReader call and
// stopped right after it, so the close/read cleanup is excluded from timing.
//
// NOTE(review): ResetTimer leaves the timer running, so on the first iteration
// the newSimpleReader call appears to be timed as well; later iterations start
// with the timer stopped - confirm whether that matters for the results.
func BenchmarkNewFromReader(b *testing.B) {
largeData := strings.Repeat("a", 1024*1024) // 1 MB
b.ResetTimer()
for i := 0; i < b.N; i++ {
// A fresh reader per iteration, since each one is consumed by the copy.
reader := newSimpleReader(largeData)
b.StartTimer()
bufWriter, err := NewFromReader(reader)
assert.NoError(b, err)
b.StopTimer()
// Untimed cleanup: finish writing, then obtain and close a reader.
err = bufWriter.CloseForWriting()
assert.NoError(b, err)
rdr, err := bufWriter.ReadCloser()
assert.NoError(b, err)
rdr.Close()
}
}