Add skip archive support (#2257)

This commit is contained in:
Dustin Decker 2023-12-22 11:55:23 -08:00 committed by GitHub
parent f699f60e89
commit 7d93adc1d0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 593 additions and 459 deletions

View file

@@ -203,6 +203,7 @@ func main() {
} }
}, },
true, true,
false,
) )
logger.Info("scanning repo", "repo", r) logger.Info("scanning repo", "repo", r)

View file

@@ -46,6 +46,7 @@ type Archive struct {
size int size int
currentDepth int currentDepth int
skipBinaries bool skipBinaries bool
skipArchives bool
} }
// New creates a new Archive handler with the provided options. // New creates a new Archive handler with the provided options.
@@ -72,6 +73,10 @@ func SetArchiveMaxTimeout(timeout time.Duration) {
// FromFile extracts the files from an archive. // FromFile extracts the files from an archive.
func (a *Archive) FromFile(originalCtx logContext.Context, data io.Reader) chan []byte { func (a *Archive) FromFile(originalCtx logContext.Context, data io.Reader) chan []byte {
if a.skipArchives {
return nil
}
archiveChan := make(chan []byte, defaultBufferSize) archiveChan := make(chan []byte, defaultBufferSize)
go func() { go func() {
ctx, cancel := logContext.WithTimeout(originalCtx, maxTimeout) ctx, cancel := logContext.WithTimeout(originalCtx, maxTimeout)

View file

@@ -311,6 +311,31 @@ func TestExtractDebContent(t *testing.T) {
assert.Equal(t, expectedLength, len(string(content))) assert.Equal(t, expectedLength, len(string(content)))
} }
// TestSkipArchive verifies that when archive handling is disabled via
// WithSkipArchives(true), HandleFile declines to process an archive file
// (returns false) and emits no chunks on the reporter channel.
func TestSkipArchive(t *testing.T) {
	file, err := os.Open("testdata/test.tgz")
	// Use assert.NoError consistently for error checks (clearer failure
	// output than assert.Nil, and matches the error-assertion below).
	assert.NoError(t, err)
	defer file.Close()

	reader, err := diskbufferreader.New(file)
	assert.NoError(t, err)

	ctx := logContext.Background()
	chunkCh := make(chan *sources.Chunk)
	go func() {
		defer close(chunkCh)
		// With archives skipped, HandleFile should report that it did not
		// handle the file.
		ok := HandleFile(ctx, reader, &sources.Chunk{}, sources.ChanReporter{Ch: chunkCh}, WithSkipArchives(true))
		assert.False(t, ok)
	}()

	// A skipped archive must produce zero chunks.
	wantCount := 0
	count := 0
	for range chunkCh {
		count++
	}
	assert.Equal(t, wantCount, count)
}
func TestExtractTarContent(t *testing.T) { func TestExtractTarContent(t *testing.T) {
file, err := os.Open("testdata/test.tgz") file, err := os.Open("testdata/test.tgz")
assert.Nil(t, err) assert.Nil(t, err)

View file

@@ -36,6 +36,15 @@ func WithSkipBinaries(skip bool) Option {
} }
} }
// WithSkipArchives returns an Option that configures whether archive files
// are skipped. It only affects *Archive handlers; any other Handler
// implementation is left unchanged.
func WithSkipArchives(skip bool) Option {
	return func(h Handler) {
		archiveHandler, isArchive := h.(*Archive)
		if !isArchive {
			return
		}
		archiveHandler.skipArchives = skip
	}
}
type Handler interface { type Handler interface {
FromFile(logContext.Context, io.Reader) chan []byte FromFile(logContext.Context, io.Reader) chan []byte
IsFiletype(logContext.Context, io.Reader) (io.Reader, bool) IsFiletype(logContext.Context, io.Reader) (io.Reader, bool)
@@ -84,6 +93,10 @@ func processHandler(ctx logContext.Context, h Handler, reReader *diskbufferreade
} }
func handleChunks(ctx logContext.Context, handlerChan chan []byte, chunkSkel *sources.Chunk, reporter sources.ChunkReporter) bool { func handleChunks(ctx logContext.Context, handlerChan chan []byte, chunkSkel *sources.Chunk, reporter sources.ChunkReporter) bool {
if handlerChan == nil {
return false
}
for { for {
select { select {
case data, open := <-handlerChan: case data, open := <-handlerChan:

File diff suppressed because it is too large Load diff

View file

@@ -587,6 +587,8 @@ func (m *Bitbucket) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *Bitbucket_Token: case *Bitbucket_Token:
@@ -1806,6 +1808,8 @@ func (m *Git) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *Git_BasicAuth: case *Git_BasicAuth:
@@ -2015,6 +2019,8 @@ func (m *GitLab) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *GitLab_Token: case *GitLab_Token:
@@ -2210,6 +2216,8 @@ func (m *GitHub) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *GitHub_GithubApp: case *GitHub_GithubApp:
@@ -3594,6 +3602,8 @@ func (m *Gerrit) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *Gerrit_BasicAuth: case *Gerrit_BasicAuth:
@@ -4681,6 +4691,8 @@ func (m *AzureRepos) validate(all bool) error {
// no validation rules for SkipBinaries // no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) { switch m.Credential.(type) {
case *AzureRepos_Token: case *AzureRepos_Token:

View file

@@ -58,6 +58,7 @@ type Git struct {
metrics metrics metrics metrics
concurrency *semaphore.Weighted concurrency *semaphore.Weighted
skipBinaries bool skipBinaries bool
skipArchives bool
} }
type metrics struct { type metrics struct {
@@ -66,6 +67,7 @@ type metrics struct {
func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sources.SourceID, sourceName string, verify bool, concurrency int, func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sources.SourceID, sourceName string, verify bool, concurrency int,
sourceMetadataFunc func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData, skipBinaries bool, sourceMetadataFunc func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData, skipBinaries bool,
skipArchives bool,
) *Git { ) *Git {
return &Git{ return &Git{
sourceType: sourceType, sourceType: sourceType,
@@ -76,6 +78,7 @@ func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sourc
verify: verify, verify: verify,
concurrency: semaphore.NewWeighted(int64(concurrency)), concurrency: semaphore.NewWeighted(int64(concurrency)),
skipBinaries: skipBinaries, skipBinaries: skipBinaries,
skipArchives: skipArchives,
} }
} }
@@ -178,6 +181,7 @@ func (s *Source) Init(aCtx context.Context, name string, jobId sources.JobID, so
} }
}, },
conn.GetSkipBinaries(), conn.GetSkipBinaries(),
conn.GetSkipArchives(),
) )
return nil return nil
} }
@@ -1014,6 +1018,10 @@ func (s *Git) handleBinary(ctx context.Context, gitDir string, reporter sources.
} }
} }
if s.skipArchives {
handlerOpts = append(handlerOpts, handlers.WithSkipArchives(true))
}
cmd := exec.Command("git", "-C", gitDir, "cat-file", "blob", commitHash.String()+":"+path) cmd := exec.Command("git", "-C", gitDir, "cat-file", "blob", commitHash.String()+":"+path)
var stderr bytes.Buffer var stderr bytes.Buffer

View file

@@ -277,6 +277,7 @@ func (s *Source) Init(aCtx context.Context, name string, jobID sources.JobID, so
} }
}, },
conn.GetSkipBinaries(), conn.GetSkipBinaries(),
conn.GetSkipArchives(),
) )
return nil return nil

View file

@@ -138,6 +138,7 @@ func (s *Source) Init(_ context.Context, name string, jobId sources.JobID, sourc
} }
}, },
conn.GetSkipBinaries(), conn.GetSkipBinaries(),
conn.GetSkipArchives(),
) )
return nil return nil

View file

@@ -96,6 +96,7 @@ message Bitbucket {
repeated string repositories = 5; repeated string repositories = 5;
repeated string ignore_repos = 6; repeated string ignore_repos = 6;
bool skip_binaries = 7; bool skip_binaries = 7;
bool skip_archives = 8;
} }
message CircleCI { message CircleCI {
@@ -196,6 +197,7 @@ message Git {
// like head, base, bare, etc. // like head, base, bare, etc.
string uri = 13; // repository URL. https://, file://, or ssh:// string uri = 13; // repository URL. https://, file://, or ssh://
bool skip_binaries = 14; bool skip_binaries = 14;
bool skip_archives = 15;
} }
message GitLab { message GitLab {
@@ -208,6 +210,7 @@ message GitLab {
repeated string repositories = 5; repeated string repositories = 5;
repeated string ignore_repos = 6; repeated string ignore_repos = 6;
bool skip_binaries = 7; bool skip_binaries = 7;
bool skip_archives = 8;
} }
message GitHub { message GitHub {
@@ -230,6 +233,7 @@ message GitHub {
bool include_issue_comments = 15; bool include_issue_comments = 15;
bool include_gist_comments = 16; bool include_gist_comments = 16;
bool skip_binaries = 17; bool skip_binaries = 17;
bool skip_archives = 18;
} }
message GoogleDrive { message GoogleDrive {
@@ -301,6 +305,7 @@ message Gerrit {
} }
repeated string projects = 4; repeated string projects = 4;
bool skip_binaries = 5; bool skip_binaries = 5;
bool skip_archives = 6;
} }
message Jenkins { message Jenkins {
@@ -369,4 +374,5 @@ message AzureRepos {
repeated string include_projects = 10; repeated string include_projects = 10;
repeated string ignore_projects = 11; repeated string ignore_projects = 11;
bool skip_binaries = 12; bool skip_binaries = 12;
bool skip_archives = 13;
} }