[feat] - Make skipping binaries configurable (#2226)

* Make skipping binaries configurable

* remove ioutil

* fix

* address comments

* address comments

* use multi-reader

* remove print

* use const

* fix test

* fix my stupidness
ahrav 2023-12-15 11:46:27 -08:00 committed by GitHub
parent 78b5a95342
commit 5c6ce693c1
16 changed files with 767 additions and 475 deletions


@ -201,7 +201,9 @@ func main() {
},
},
}
})
},
true,
)
logger.Info("scanning repo", "repo", r)
err = s.ScanRepo(ctx, repo, path, git.NewScanOptions(), sources.ChanReporter{Ch: chunksChan})


@ -36,17 +36,49 @@ var (
"gif",
"tiff",
"fnt", // Windows font file
"fon", // Generic font file
"ttf", // TrueType font
"otf", // OpenType font
"woff", // Web Open Font Format
"woff2", // Web Open Font Format 2
"eot", // Embedded OpenType font
"svgz", // Compressed Scalable Vector Graphics file
"icns", // Apple icon image file
"ico", // Icon file
}
binaryExtensions = map[string]struct{}{
// binaries
// These can theoretically contain secrets, but need decoding for users to make sense of them, and we don't have
// any such decoders right now.
"class",
"dll",
"xsb",
"jdo",
"jks",
"ser",
"idx",
"hprof",
"class": {}, // Java bytecode class file
"dll": {}, // Dynamic Link Library, Windows
"jdo": {}, // Java Data Object, Java serialization format
"jks": {}, // Java Key Store, Java keystore format
"ser": {}, // Java serialization format
"idx": {}, // Index file, often binary
"hprof": {}, // Java heap dump format
"exe": {}, // Executable, Windows
"bin": {}, // Binary, often used for compiled source code
"so": {}, // Shared object, Unix/Linux
"o": {}, // Object file from compilation/ intermediate object file
"a": {}, // Static library, Unix/Linux
"dylib": {}, // Dynamic library, macOS
"lib": {}, // Library, Unix/Linux
"obj": {}, // Object file, typically from compiled source code
"pdb": {}, // Program Database, Microsoft Visual Studio debugging format
"dat": {}, // Generic data file, often binary but not always
"elf": {}, // Executable and Linkable Format, common in Unix/Linux
"dmg": {}, // Disk Image for macOS
"iso": {}, // ISO image (optical disk image)
"img": {}, // Disk image files
"out": {}, // Common output file from compiled executable in Unix/Linux
"com": {}, // DOS command file, executable
"sys": {}, // Windows system file, often a driver
"vxd": {}, // Virtual device driver in Windows
"sfx": {}, // Self-extracting archive
"bundle": {}, // Mac OS X application bundle
}
)
@ -58,3 +90,9 @@ func SkipFile(filename string) bool {
}
return false
}
// IsBinary returns true if the file extension is in the binaryExtensions list.
func IsBinary(filename string) bool {
_, ok := binaryExtensions[strings.ToLower(strings.TrimPrefix(filepath.Ext(filename), "."))]
return ok
}
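For orientation, a minimal sketch of how a caller might combine the two checks once the skip-binaries option is enabled; the shouldProcess helper and the trufflehog import path are assumptions for illustration, not part of this commit:

package main

import (
	"fmt"

	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
)

// shouldProcess is an illustrative helper: files with ignored extensions are
// always dropped, and binary extensions are dropped only when skipBinaries is set.
func shouldProcess(filename string, skipBinaries bool) bool {
	if common.SkipFile(filename) {
		return false
	}
	if skipBinaries && common.IsBinary(filename) {
		return false
	}
	return true
}

func main() {
	fmt.Println(shouldProcess("font.woff2", true)) // false: "woff2" is in the ignored-extension list
	fmt.Println(shouldProcess("app.exe", false))   // true: binary extensions pass when the flag is off
	fmt.Println(shouldProcess("app.exe", true))    // false: "exe" is in binaryExtensions
}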


@ -34,3 +34,36 @@ func BenchmarkSkipFile(b *testing.B) {
SkipFile("test.mp4")
}
}
func TestIsBinary(t *testing.T) {
type testCase struct {
file string
want bool
}
// Add a test case for each binary extension.
testCases := make([]testCase, 0, len(binaryExtensions)+1)
for ext := range binaryExtensions {
testCases = append(testCases, testCase{
file: "test." + ext,
want: true,
})
}
// Add a test case for a file that should not be skipped.
testCases = append(testCases, testCase{file: "test.txt", want: false})
for _, tt := range testCases {
t.Run(tt.file, func(t *testing.T) {
if got := IsBinary(tt.file); got != tt.want {
t.Errorf("IsBinary(%v) got %v, want %v", tt.file, got, tt.want)
}
})
}
}
func BenchmarkIsBinary(b *testing.B) {
for i := 0; i < b.N; i++ {
IsBinary("test.exe")
}
}


@ -23,6 +23,7 @@ func (e *Engine) ScanGit(ctx context.Context, c sources.GitConfig) error {
IncludePathsFile: c.IncludePathsFile,
ExcludePathsFile: c.ExcludePathsFile,
MaxDepth: int64(c.MaxDepth),
SkipBinaries: c.SkipBinaries,
}
var conn anypb.Any
if err := anypb.MarshalFrom(&conn, connection, proto.MarshalOptions{}); err != nil {


@ -24,6 +24,7 @@ func (e *Engine) ScanGitHub(ctx context.Context, c sources.GithubConfig) error {
IncludeIssueComments: c.IncludeIssueComments,
IncludePullRequestComments: c.IncludePullRequestComments,
IncludeGistComments: c.IncludeGistComments,
SkipBinaries: c.SkipBinaries,
}
if len(c.Token) > 0 {
connection.Credential = &sourcespb.GitHub_Token{


@ -24,7 +24,7 @@ func (e *Engine) ScanGitLab(ctx context.Context, c sources.GitlabConfig) error {
}
scanOptions := git.NewScanOptions(opts...)
connection := &sourcespb.GitLab{}
connection := &sourcespb.GitLab{SkipBinaries: c.SkipBinaries}
switch {
case len(c.Token) > 0:


@ -1,6 +1,7 @@
package handlers
import (
"bufio"
"bytes"
"context"
"errors"
@ -12,6 +13,7 @@ import (
"strings"
"time"
"github.com/gabriel-vasile/mimetype"
"github.com/h2non/filetype"
"github.com/mholt/archiver/v4"
@ -43,11 +45,14 @@ var _ SpecializedHandler = (*Archive)(nil)
type Archive struct {
size int
currentDepth int
skipBinaries bool
}
// New sets a default maximum size and current size counter.
func (a *Archive) New() {
a.size = 0
// New creates a new Archive handler with the provided options.
func (a *Archive) New(opts ...Option) {
for _, opt := range opts {
opt(a)
}
}
// SetArchiveMaxSize sets the maximum size of the archive.
@ -114,9 +119,34 @@ func (a *Archive) openArchive(ctx logContext.Context, depth int, reader io.Reade
}
}
const mimeTypeBufferSize = 512
func (a *Archive) handleNonArchiveContent(ctx logContext.Context, reader io.Reader, archiveChan chan []byte) error {
bufReader := bufio.NewReaderSize(reader, mimeTypeBufferSize)
// A buffer of 512 bytes is used since many file formats store their magic numbers within the first 512 bytes.
// If fewer bytes are read, MIME type detection may still succeed.
buffer, err := bufReader.Peek(mimeTypeBufferSize)
if err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("unable to read file for MIME type detection: %w", err)
}
mime := mimetype.Detect(buffer)
mimeT := mimeType(mime.String())
if common.SkipFile(mime.Extension()) {
ctx.Logger().V(5).Info("skipping file", "ext", mimeT)
return nil
}
if a.skipBinaries {
if common.IsBinary(mime.Extension()) || mimeT == machOType || mimeT == octetStream {
ctx.Logger().V(5).Info("skipping binary file", "ext", mimeT)
return nil
}
}
chunkReader := sources.NewChunkReader()
chunkResChan := chunkReader(ctx, reader)
chunkResChan := chunkReader(ctx, bufReader)
for data := range chunkResChan {
if err := data.Error(); err != nil {
ctx.Logger().Error(err, "error reading chunk")
@ -166,6 +196,11 @@ func (a *Archive) extractorHandler(archiveChan chan []byte) func(context.Context
return nil
}
if a.skipBinaries && common.IsBinary(f.Name()) {
lCtx.Logger().V(5).Info("skipping binary file", "filename", f.Name())
return nil
}
fileBytes, err := a.ReadToMax(lCtx, fReader)
if err != nil {
return err
@ -222,6 +257,8 @@ type mimeType string
const (
arMimeType mimeType = "application/x-unix-archive"
rpmMimeType mimeType = "application/x-rpm"
machOType mimeType = "application/x-mach-binary"
octetStream mimeType = "application/octet-stream"
)
// mimeTools maps MIME types to the necessary command-line tools to handle them.
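The peek-then-reuse pattern in handleNonArchiveContent is the key change in this hunk: the first 512 bytes are inspected for MIME detection without being consumed, so the same reader still feeds the chunker. A standalone sketch of that pattern, independent of the Archive handler and using only bufio and the mimetype library imported above (detectAndReuse is an illustrative name):

package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strings"

	"github.com/gabriel-vasile/mimetype"
)

const mimeTypeBufferSize = 512 // most formats keep their magic numbers in the first 512 bytes

// detectAndReuse peeks at the head of the stream for MIME detection without
// consuming it; the returned reader still yields the full stream, peeked bytes included.
func detectAndReuse(r io.Reader) (string, io.Reader, error) {
	br := bufio.NewReaderSize(r, mimeTypeBufferSize)
	head, err := br.Peek(mimeTypeBufferSize)
	if err != nil && !errors.Is(err, io.EOF) {
		return "", nil, fmt.Errorf("unable to read file for MIME type detection: %w", err)
	}
	return mimetype.Detect(head).String(), br, nil
}

func main() {
	mt, r, err := detectAndReuse(strings.NewReader("%PDF-1.7 not really a PDF, but enough for detection"))
	if err != nil {
		panic(err)
	}
	rest, _ := io.ReadAll(r)
	fmt.Println(mt, len(rest)) // the full input is still readable after detection
}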


@ -1,9 +1,12 @@
package handlers
import (
"archive/tar"
"bytes"
"context"
"encoding/binary"
"io"
"math/rand"
"net/http"
"os"
"regexp"
@ -129,6 +132,82 @@ func TestHandleFile(t *testing.T) {
assert.Equal(t, 1, len(reporter.Ch))
}
func TestHandleFileSkipBinaries(t *testing.T) {
filename := createBinaryArchive(t)
defer os.Remove(filename)
file, err := os.Open(filename)
assert.NoError(t, err)
ctx, cancel := logContext.WithTimeout(logContext.Background(), 5*time.Second)
defer cancel()
sourceChan := make(chan *sources.Chunk, 1)
go func() {
defer close(sourceChan)
HandleFile(ctx, file, &sources.Chunk{}, sources.ChanReporter{Ch: sourceChan}, WithSkipBinaries(true))
}()
count := 0
for range sourceChan {
count++
}
// The binary archive should not be scanned.
assert.Equal(t, 0, count)
}
func createBinaryArchive(t *testing.T) string {
t.Helper()
f, err := os.CreateTemp("", "testbinary")
assert.NoError(t, err)
defer os.Remove(f.Name())
r := rand.New(rand.NewSource(time.Now().UnixNano()))
randomBytes := make([]byte, 1024)
_, err = r.Read(randomBytes)
assert.NoError(t, err)
_, err = f.Write(randomBytes)
assert.NoError(t, err)
// Create and write some structured binary data (e.g., integers, floats)
for i := 0; i < 10; i++ {
err = binary.Write(f, binary.LittleEndian, int32(rand.Intn(1000)))
assert.NoError(t, err)
err = binary.Write(f, binary.LittleEndian, rand.Float64())
assert.NoError(t, err)
}
tarFile, err := os.Create("example.tar")
if err != nil {
t.Fatal(err)
}
defer tarFile.Close()
// Create a new tar archive.
tarWriter := tar.NewWriter(tarFile)
defer tarWriter.Close()
fileInfo, err := f.Stat()
assert.NoError(t, err)
header, err := tar.FileInfoHeader(fileInfo, "")
assert.NoError(t, err)
err = tarWriter.WriteHeader(header)
assert.NoError(t, err)
fileContent, err := os.ReadFile(f.Name())
assert.NoError(t, err)
_, err = tarWriter.Write(fileContent)
assert.NoError(t, err)
return tarFile.Name()
}
func TestReadToMax(t *testing.T) {
tests := []struct {
name string


@ -24,10 +24,22 @@ type SpecializedHandler interface {
HandleSpecialized(logContext.Context, io.Reader) (io.Reader, bool, error)
}
// Option is a function type that applies a configuration to a Handler.
type Option func(Handler)
// WithSkipBinaries returns an Option that configures whether to skip binary files.
func WithSkipBinaries(skip bool) Option {
return func(h Handler) {
if a, ok := h.(*Archive); ok {
a.skipBinaries = skip
}
}
}
type Handler interface {
FromFile(logContext.Context, io.Reader) chan []byte
IsFiletype(logContext.Context, io.Reader) (io.Reader, bool)
New()
New(...Option)
}
// HandleFile processes a given file by selecting an appropriate handler from DefaultHandlers.
@ -36,9 +48,9 @@ type Handler interface {
// packages them in the provided chunk skeleton, and reports them to the chunk reporter.
// The function returns true if processing was successful and false otherwise.
// Context is used for cancellation, and the caller is responsible for canceling it if needed.
func HandleFile(ctx logContext.Context, file io.Reader, chunkSkel *sources.Chunk, reporter sources.ChunkReporter) bool {
func HandleFile(ctx logContext.Context, file io.Reader, chunkSkel *sources.Chunk, reporter sources.ChunkReporter, opts ...Option) bool {
for _, h := range DefaultHandlers() {
h.New()
h.New(opts...)
// The re-reader is used to reset the file reader after checking if the handler implements SpecializedHandler.
// This is necessary because the archive pkg doesn't correctly determine the file type when using

File diff suppressed because it is too large


@ -585,6 +585,8 @@ func (m *Bitbucket) validate(all bool) error {
errors = append(errors, err)
}
// no validation rules for SkipBinaries
switch m.Credential.(type) {
case *Bitbucket_Token:
@ -1802,6 +1804,8 @@ func (m *Git) validate(all bool) error {
// no validation rules for Uri
// no validation rules for SkipBinaries
switch m.Credential.(type) {
case *Git_BasicAuth:
@ -2009,6 +2013,8 @@ func (m *GitLab) validate(all bool) error {
errors = append(errors, err)
}
// no validation rules for SkipBinaries
switch m.Credential.(type) {
case *GitLab_Token:
@ -2202,6 +2208,8 @@ func (m *GitHub) validate(all bool) error {
// no validation rules for IncludeGistComments
// no validation rules for SkipBinaries
switch m.Credential.(type) {
case *GitHub_GithubApp:
@ -3584,6 +3592,8 @@ func (m *Gerrit) validate(all bool) error {
errors = append(errors, err)
}
// no validation rules for SkipBinaries
switch m.Credential.(type) {
case *Gerrit_BasicAuth:


@ -58,6 +58,7 @@ type Git struct {
verify bool
metrics metrics
concurrency *semaphore.Weighted
skipBinaries bool
}
type metrics struct {
@ -65,7 +66,7 @@ type metrics struct {
}
func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sources.SourceID, sourceName string, verify bool, concurrency int,
sourceMetadataFunc func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData,
sourceMetadataFunc func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData, skipBinaries bool,
) *Git {
return &Git{
sourceType: sourceType,
@ -75,6 +76,7 @@ func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sourc
sourceMetadataFunc: sourceMetadataFunc,
verify: verify,
concurrency: semaphore.NewWeighted(int64(concurrency)),
skipBinaries: skipBinaries,
}
}
@ -175,7 +177,9 @@ func (s *Source) Init(aCtx context.Context, name string, jobId sources.JobID, so
},
},
}
})
},
conn.GetSkipBinaries(),
)
return nil
}
@ -512,7 +516,7 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string
SourceMetadata: metadata,
Verify: s.verify,
}
if err := handleBinary(ctx, gitDir, reporter, chunkSkel, commitHash, fileName); err != nil {
if err := s.handleBinary(ctx, gitDir, reporter, chunkSkel, commitHash, fileName); err != nil {
logger.V(1).Info("error handling binary file", "error", err, "filename", fileName, "commit", commitHash, "file", diff.PathB)
}
continue
@ -676,7 +680,7 @@ func (s *Git) ScanStaged(ctx context.Context, repo *git.Repository, path string,
SourceMetadata: metadata,
Verify: s.verify,
}
if err := handleBinary(ctx, gitDir, reporter, chunkSkel, commitHash, fileName); err != nil {
if err := s.handleBinary(ctx, gitDir, reporter, chunkSkel, commitHash, fileName); err != nil {
logger.V(1).Info("error handling binary file", "error", err, "filename", fileName)
}
continue
@ -993,15 +997,24 @@ func getSafeRemoteURL(repo *git.Repository, preferred string) string {
return safeURL
}
func handleBinary(ctx context.Context, gitDir string, reporter sources.ChunkReporter, chunkSkel *sources.Chunk, commitHash plumbing.Hash, path string) error {
func (s *Git) handleBinary(ctx context.Context, gitDir string, reporter sources.ChunkReporter, chunkSkel *sources.Chunk, commitHash plumbing.Hash, path string) error {
fileCtx := context.WithValues(ctx, "commit", commitHash.String(), "path", path)
fileCtx.Logger().V(5).Info("handling binary file")
if common.SkipFile(path) {
fileCtx.Logger().V(5).Info("skipping binary file")
fileCtx.Logger().V(5).Info("file contains ignored extension")
return nil
}
var handlerOpts []handlers.Option
if s.skipBinaries {
handlerOpts = append(handlerOpts, handlers.WithSkipBinaries(true))
if common.IsBinary(path) {
fileCtx.Logger().V(5).Info("skipping binary file")
return nil
}
}
const maxSize = 1 * 1024 * 1024 * 1024 // 1GB
cmd := exec.Command("git", "-C", gitDir, "cat-file", "blob", commitHash.String()+":"+path)
@ -1057,7 +1070,7 @@ func handleBinary(ctx context.Context, gitDir string, reporter sources.ChunkRepo
defer reader.Close()
if handlers.HandleFile(fileCtx, reader, chunkSkel, reporter) {
if handlers.HandleFile(fileCtx, reader, chunkSkel, reporter, handlerOpts...) {
return nil
}


@ -275,7 +275,9 @@ func (s *Source) Init(aCtx context.Context, name string, jobID sources.JobID, so
},
},
}
})
},
conn.GetSkipBinaries(),
)
return nil
}


@ -136,7 +136,9 @@ func (s *Source) Init(_ context.Context, name string, jobId sources.JobID, sourc
},
},
}
})
},
conn.GetSkipBinaries(),
)
return nil
}


@ -171,48 +171,54 @@ type GitConfig struct {
// ExcludeGlobs is a list of comma separated globs to exclude from the scan.
// This differs from the Filter exclusions as ExcludeGlobs is applied at the `git log -p` level
ExcludeGlobs string
// SkipBinaries allows skipping binary files from the scan.
SkipBinaries bool
}
// GithubConfig defines the optional configuration for a github source.
type GithubConfig struct {
// Endpoint is the endpoint of the source.
Endpoint,
Endpoint string
// Token is the token to use to authenticate with the source.
Token string
// IncludeForks indicates whether to include forks in the scan.
IncludeForks,
IncludeForks bool
// IncludeMembers indicates whether to include members in the scan.
IncludeMembers bool
// Concurrency is the number of concurrent workers to use to scan the source.
Concurrency int
// Repos is the list of repositories to scan.
Repos,
Repos []string
// Orgs is the list of organizations to scan.
Orgs,
Orgs []string
// ExcludeRepos is a list of repositories to exclude from the scan.
ExcludeRepos,
ExcludeRepos []string
// IncludeRepos is a list of repositories to include in the scan.
IncludeRepos []string
// Filter is the filter to use to scan the source.
Filter *common.Filter
// IncludeIssueComments indicates whether to include GitHub issue comments in the scan.
IncludeIssueComments,
IncludeIssueComments bool
// IncludePullRequestComments indicates whether to include GitHub pull request comments in the scan.
IncludePullRequestComments,
IncludePullRequestComments bool
// IncludeGistComments indicates whether to include GitHub gist comments in the scan.
IncludeGistComments bool
// SkipBinaries allows skipping binary files from the scan.
SkipBinaries bool
}
// GitlabConfig defines the optional configuration for a gitlab source.
type GitlabConfig struct {
// Endpoint is the endpoint of the source.
Endpoint,
Endpoint string
// Token is the token to use to authenticate with the source.
Token string
// Repos is the list of repositories to scan.
Repos []string
// Filter is the filter to use to scan the source.
Filter *common.Filter
// SkipBinaries allows skipping binary files from the scan.
SkipBinaries bool
}
// FilesystemConfig defines the optional configuration for a filesystem source.
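For reference, a sketch of how a caller sets the new flag before these configs reach Engine.ScanGitHub / Engine.ScanGitLab; the field names come from the structs above, the token values are placeholders, and the import path assumes the usual pkg/sources location:

package main

import (
	"fmt"

	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

func main() {
	ghCfg := sources.GithubConfig{
		Endpoint:     "https://api.github.com",
		Token:        "<github token>",
		Repos:        []string{"https://github.com/org/repo.git"},
		Concurrency:  8,
		SkipBinaries: true, // new: binary files are skipped during the scan
	}
	glCfg := sources.GitlabConfig{
		Token:        "<gitlab token>",
		SkipBinaries: true,
	}
	fmt.Println(ghCfg.SkipBinaries, glCfg.SkipBinaries)
}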


@ -95,6 +95,7 @@ message Bitbucket {
}
repeated string repositories = 5;
repeated string ignore_repos = 6;
bool skip_binaries = 7;
}
message CircleCI {
@ -194,6 +195,7 @@ message Git {
// Passing a single repository via the uri field also allows for additional options to be specified
// like head, base, bare, etc.
string uri = 13; // repository URL. https://, file://, or ssh://
bool skip_binaries = 14;
}
message GitLab {
@ -205,6 +207,7 @@ message GitLab {
}
repeated string repositories = 5;
repeated string ignore_repos = 6;
bool skip_binaries = 7;
}
message GitHub {
@ -226,6 +229,7 @@ message GitHub {
bool includePullRequestComments = 14;
bool includeIssueComments = 15;
bool includeGistComments = 16;
bool skip_binaries = 17;
}
message GoogleDrive {
@ -296,6 +300,7 @@ message Gerrit {
credentials.Unauthenticated unauthenticated = 3;
}
repeated string projects = 4;
bool skip_binaries = 5;
}
message Jenkins {
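The generated Go types surface these fields as SkipBinaries, which is how the engine hunks earlier construct their source connections. A minimal sketch of building and marshaling one such connection, mirroring the anypb.MarshalFrom call shown in ScanGit (the sourcespb import path is an assumption; all other fields are elided):

package main

import (
	"fmt"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/anypb"

	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
)

func main() {
	connection := &sourcespb.GitHub{SkipBinaries: true} // skip_binaries = 17 in the proto above
	var conn anypb.Any
	if err := anypb.MarshalFrom(&conn, connection, proto.MarshalOptions{}); err != nil {
		fmt.Println("failed to marshal connection:", err)
		return
	}
	fmt.Println(conn.TypeUrl) // e.g. type.googleapis.com/sources.GitHub
}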