Add skip archive support (#2257)

This commit is contained in:
Dustin Decker 2023-12-22 11:55:23 -08:00 committed by GitHub
parent f699f60e89
commit 7d93adc1d0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 593 additions and 459 deletions

View file

@ -203,6 +203,7 @@ func main() {
}
},
true,
false,
)
logger.Info("scanning repo", "repo", r)

View file

@ -46,6 +46,7 @@ type Archive struct {
size int
currentDepth int
skipBinaries bool
skipArchives bool
}
// New creates a new Archive handler with the provided options.
@ -72,6 +73,10 @@ func SetArchiveMaxTimeout(timeout time.Duration) {
// FromFile extracts the files from an archive.
func (a *Archive) FromFile(originalCtx logContext.Context, data io.Reader) chan []byte {
if a.skipArchives {
return nil
}
archiveChan := make(chan []byte, defaultBufferSize)
go func() {
ctx, cancel := logContext.WithTimeout(originalCtx, maxTimeout)

View file

@ -311,6 +311,31 @@ func TestExtractDebContent(t *testing.T) {
assert.Equal(t, expectedLength, len(string(content)))
}
// TestSkipArchive verifies that HandleFile emits no chunks and returns false
// for an archive input when the WithSkipArchives(true) option is supplied.
func TestSkipArchive(t *testing.T) {
	file, err := os.Open("testdata/test.tgz")
	// Stop immediately if the fixture cannot be opened; continuing with a nil
	// file would only produce misleading follow-on failures.
	if !assert.NoError(t, err) {
		return
	}
	defer file.Close()

	reader, err := diskbufferreader.New(file)
	if !assert.NoError(t, err) {
		return
	}

	ctx := logContext.Background()

	chunkCh := make(chan *sources.Chunk)
	go func() {
		// Closing the channel lets the counting loop below terminate once
		// HandleFile has returned.
		defer close(chunkCh)
		ok := HandleFile(ctx, reader, &sources.Chunk{}, sources.ChanReporter{Ch: chunkCh}, WithSkipArchives(true))
		assert.False(t, ok)
	}()

	// With archives skipped, nothing should be reported on the channel.
	wantCount := 0
	count := 0
	for range chunkCh {
		count++
	}
	assert.Equal(t, wantCount, count)
}
func TestExtractTarContent(t *testing.T) {
file, err := os.Open("testdata/test.tgz")
assert.Nil(t, err)

View file

@ -36,6 +36,15 @@ func WithSkipBinaries(skip bool) Option {
}
}
// WithSkipArchives returns an Option that sets whether archive files are
// skipped. The option only takes effect when applied to an *Archive handler;
// any other Handler implementation is left untouched.
func WithSkipArchives(skip bool) Option {
	return func(h Handler) {
		archive, isArchive := h.(*Archive)
		if !isArchive {
			return
		}
		archive.skipArchives = skip
	}
}
type Handler interface {
FromFile(logContext.Context, io.Reader) chan []byte
IsFiletype(logContext.Context, io.Reader) (io.Reader, bool)
@ -84,6 +93,10 @@ func processHandler(ctx logContext.Context, h Handler, reReader *diskbufferreade
}
func handleChunks(ctx logContext.Context, handlerChan chan []byte, chunkSkel *sources.Chunk, reporter sources.ChunkReporter) bool {
if handlerChan == nil {
return false
}
for {
select {
case data, open := <-handlerChan:

File diff suppressed because it is too large Load diff

View file

@ -587,6 +587,8 @@ func (m *Bitbucket) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *Bitbucket_Token:
@ -1806,6 +1808,8 @@ func (m *Git) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *Git_BasicAuth:
@ -2015,6 +2019,8 @@ func (m *GitLab) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *GitLab_Token:
@ -2210,6 +2216,8 @@ func (m *GitHub) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *GitHub_GithubApp:
@ -3594,6 +3602,8 @@ func (m *Gerrit) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *Gerrit_BasicAuth:
@ -4681,6 +4691,8 @@ func (m *AzureRepos) validate(all bool) error {
// no validation rules for SkipBinaries
// no validation rules for SkipArchives
switch m.Credential.(type) {
case *AzureRepos_Token:

View file

@ -58,6 +58,7 @@ type Git struct {
metrics metrics
concurrency *semaphore.Weighted
skipBinaries bool
skipArchives bool
}
type metrics struct {
@ -66,6 +67,7 @@ type metrics struct {
func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sources.SourceID, sourceName string, verify bool, concurrency int,
sourceMetadataFunc func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData, skipBinaries bool,
skipArchives bool,
) *Git {
return &Git{
sourceType: sourceType,
@ -76,6 +78,7 @@ func NewGit(sourceType sourcespb.SourceType, jobID sources.JobID, sourceID sourc
verify: verify,
concurrency: semaphore.NewWeighted(int64(concurrency)),
skipBinaries: skipBinaries,
skipArchives: skipArchives,
}
}
@ -178,6 +181,7 @@ func (s *Source) Init(aCtx context.Context, name string, jobId sources.JobID, so
}
},
conn.GetSkipBinaries(),
conn.GetSkipArchives(),
)
return nil
}
@ -1014,6 +1018,10 @@ func (s *Git) handleBinary(ctx context.Context, gitDir string, reporter sources.
}
}
if s.skipArchives {
handlerOpts = append(handlerOpts, handlers.WithSkipArchives(true))
}
cmd := exec.Command("git", "-C", gitDir, "cat-file", "blob", commitHash.String()+":"+path)
var stderr bytes.Buffer

View file

@ -277,6 +277,7 @@ func (s *Source) Init(aCtx context.Context, name string, jobID sources.JobID, so
}
},
conn.GetSkipBinaries(),
conn.GetSkipArchives(),
)
return nil

View file

@ -138,6 +138,7 @@ func (s *Source) Init(_ context.Context, name string, jobId sources.JobID, sourc
}
},
conn.GetSkipBinaries(),
conn.GetSkipArchives(),
)
return nil

View file

@ -96,6 +96,7 @@ message Bitbucket {
repeated string repositories = 5;
repeated string ignore_repos = 6;
bool skip_binaries = 7;
bool skip_archives = 8;
}
message CircleCI {
@ -196,6 +197,7 @@ message Git {
// like head, base, bare, etc.
string uri = 13; // repository URL. https://, file://, or ssh://
bool skip_binaries = 14;
bool skip_archives = 15;
}
message GitLab {
@ -208,6 +210,7 @@ message GitLab {
repeated string repositories = 5;
repeated string ignore_repos = 6;
bool skip_binaries = 7;
bool skip_archives = 8;
}
message GitHub {
@ -230,6 +233,7 @@ message GitHub {
bool include_issue_comments = 15;
bool include_gist_comments = 16;
bool skip_binaries = 17;
bool skip_archives = 18;
}
message GoogleDrive {
@ -301,6 +305,7 @@ message Gerrit {
}
repeated string projects = 4;
bool skip_binaries = 5;
bool skip_archives = 6;
}
message Jenkins {
@ -369,4 +374,5 @@ message AzureRepos {
repeated string include_projects = 10;
repeated string ignore_projects = 11;
bool skip_binaries = 12;
bool skip_archives = 13;
}