From 77dc2720a8b9b7fc75db4d92c2ba73a59bc4c90b Mon Sep 17 00:00:00 2001 From: Miccah Date: Wed, 18 Sep 2024 14:30:10 -0700 Subject: [PATCH] Update GitHub enumeration to report unique filtered values (#3292) The reported values should match the values populated in s.repos. --- pkg/sources/github/github.go | 47 +++++++++++++++++++++++++------ pkg/sources/github/github_test.go | 17 +++++++++-- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/pkg/sources/github/github.go b/pkg/sources/github/github.go index 10f8f7fad..c98031627 100644 --- a/pkg/sources/github/github.go +++ b/pkg/sources/github/github.go @@ -331,7 +331,14 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, tar githubSecondsSpentRateLimited.WithLabelValues(s.name).Set(0) githubReposScanned.WithLabelValues(s.name).Set(0) - err := s.enumerate(ctx) + // We don't care about handling enumerated values as they happen during + // the normal Chunks flow because we enumerate and scan in two steps. + noopReporter := sources.VisitorReporter{ + VisitUnit: func(context.Context, sources.SourceUnit) error { + return nil + }, + } + err := s.enumerate(ctx, noopReporter) if err != nil { return fmt.Errorf("error enumerating: %w", err) } @@ -339,30 +346,52 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, tar return s.scan(ctx, chunksReporter) } -func (s *Source) enumerate(ctx context.Context) error { - // Create a reporter that does nothing for now. - noopReporter := sources.VisitorReporter{ +// enumerate enumerates the GitHub source based on authentication method and +// user configuration. It populates s.filteredRepoCache, s.repoInfoCache, +// s.memberCache, s.totalRepoSize, s.orgsCache, and s.repos. Additionally, +// repositories and gists are reported to the provided UnitReporter. +func (s *Source) enumerate(ctx context.Context, reporter sources.UnitReporter) error { + seenUnits := make(map[sources.SourceUnit]struct{}) + // Wrapper reporter to deduplicate and filter found units. + dedupeReporter := sources.VisitorReporter{ VisitUnit: func(ctx context.Context, su sources.SourceUnit) error { - return nil + // Only report units that passed the user configured filter. + name := su.Display() + if !s.filteredRepoCache.Exists(name) { + return ctx.Err() + } + // Only report a unit once. + if _, ok := seenUnits[su]; ok { + return ctx.Err() + } + seenUnits[su] = struct{}{} + return reporter.UnitOk(ctx, su) }, + VisitErr: reporter.UnitErr, } + // Report any values that were already configured. + for _, name := range s.filteredRepoCache.Keys() { + url, _ := s.filteredRepoCache.Get(name) + _ = dedupeReporter.UnitOk(ctx, RepoUnit{name: name, url: url}) + } + // I'm not wild about switching on the connector type here (as opposed to dispatching to the connector itself) but // this felt like a compromise that allowed me to isolate connection logic without rewriting the entire source. switch c := s.connector.(type) { case *appConnector: - if err := s.enumerateWithApp(ctx, c.InstallationClient(), noopReporter); err != nil { + if err := s.enumerateWithApp(ctx, c.InstallationClient(), dedupeReporter); err != nil { return err } case *basicAuthConnector: - if err := s.enumerateBasicAuth(ctx, noopReporter); err != nil { + if err := s.enumerateBasicAuth(ctx, dedupeReporter); err != nil { return err } case *tokenConnector: - if err := s.enumerateWithToken(ctx, c.IsGithubEnterprise(), noopReporter); err != nil { + if err := s.enumerateWithToken(ctx, c.IsGithubEnterprise(), dedupeReporter); err != nil { return err } case *unauthenticatedConnector: - s.enumerateUnauthenticated(ctx, noopReporter) + s.enumerateUnauthenticated(ctx, dedupeReporter) } s.repos = make([]string, 0, s.filteredRepoCache.Count()) diff --git a/pkg/sources/github/github_test.go b/pkg/sources/github/github_test.go index 8e986a5cb..d659ca014 100644 --- a/pkg/sources/github/github_test.go +++ b/pkg/sources/github/github_test.go @@ -11,6 +11,7 @@ import ( "net/http" "net/url" "reflect" + "slices" "strconv" "testing" "time" @@ -566,8 +567,18 @@ func TestEnumerate(t *testing.T) { s.cacheRepoInfo(repo) s.filteredRepoCache.Set(repo.GetFullName(), repo.GetCloneURL()) + var reportedRepos []string + reporter := sources.VisitorReporter{ + VisitUnit: func(ctx context.Context, su sources.SourceUnit) error { + url, _ := su.SourceUnitID() + reportedRepos = append(reportedRepos, url) + return nil + }, + } + // Act - err := s.enumerate(context.Background()) + err := s.enumerate(context.Background(), reporter) + slices.Sort(reportedRepos) // Assert assert.Nil(t, err) @@ -576,6 +587,8 @@ func TestEnumerate(t *testing.T) { assert.True(t, s.filteredRepoCache.Exists("super-secret-user/super-secret-repo")) assert.True(t, s.filteredRepoCache.Exists("cached-user/cached-repo")) assert.True(t, s.filteredRepoCache.Exists("2801a2b0523099d0614a951579d99ba9")) + assert.Equal(t, 3, len(s.repos)) + assert.Equal(t, s.repos, reportedRepos) // Enumeration cached all repos. assert.Equal(t, 3, len(s.repoInfoCache.cache)) _, ok := s.repoInfoCache.get("https://github.com/super-secret-user/super-secret-repo.git") @@ -640,7 +653,7 @@ func BenchmarkEnumerate(b *testing.B) { setupMocks(b) b.StartTimer() - _ = s.enumerate(context.Background()) + _ = s.enumerate(context.Background(), noopReporter()) } }