Download files when reverifying (#3252)

The previous implementation of targeted file scanning pulled patches out of commit data, which didn't work for binary files (because GitHub doesn't return patches for them). This PR changes the system to always just download the requested file and scan it, which means we get binary file support.
This commit is contained in:
Cody Rose 2024-08-29 16:10:11 -04:00 committed by GitHub
parent 247b56ad0b
commit dbc1464c63
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 38 additions and 74 deletions

View file

@ -16,6 +16,7 @@ import (
"github.com/go-logr/logr"
"github.com/gobwas/glob"
"github.com/google/go-github/v63/github"
"github.com/trufflesecurity/trufflehog/v3/pkg/handlers"
"golang.org/x/exp/rand"
"golang.org/x/sync/errgroup"
"google.golang.org/protobuf/proto"
@ -1392,35 +1393,34 @@ func (s *Source) scanTarget(ctx context.Context, target sources.ChunkingTarget,
return fmt.Errorf("invalid GitHub URL")
}
qry := commitQuery{
repo: segments[2],
owner: segments[1],
sha: meta.GetCommit(),
filename: meta.GetFile(),
readCloser, resp, err := s.connector.APIClient().Repositories.DownloadContents(
ctx,
segments[1],
segments[2],
meta.GetFile(),
&github.RepositoryContentGetOptions{Ref: meta.GetCommit()})
// As of this writing, if the returned readCloser is not nil, it's just the Body of the returned github.Response, so
// there's no need to independently close it.
if resp != nil && resp.Body != nil {
defer resp.Body.Close()
}
res, err := s.getDiffForFileInCommit(ctx, qry)
if err != nil {
return err
return fmt.Errorf("could not download file for scan: %w", err)
}
chunk := &sources.Chunk{
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected HTTP response status when trying to download file for scan: %v", resp.Status)
}
reporter := sources.ChanReporter{Ch: chunksChan}
chunkSkel := sources.Chunk{
SourceType: s.Type(),
SourceName: s.name,
SourceID: s.SourceID(),
JobID: s.JobID(),
SecretID: target.SecretID,
Data: []byte(stripLeadingPlusMinus(res)),
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{Github: meta},
},
Verify: s.verify,
}
return common.CancellableWrite(ctx, chunksChan, chunk)
}
// stripLeadingPlusMinus removes leading + and - characters from lines in a diff string. These characters exist in the
// diffs returned when performing a targeted scan and need to be removed so that detectors are operating on the correct
// text.
func stripLeadingPlusMinus(diff string) string {
return strings.ReplaceAll(strings.ReplaceAll(diff, "\n+", "\n"), "\n-", "\n")
Verify: s.verify}
return handlers.HandleFile(ctx, readCloser, &chunkSkel, reporter)
}

View file

@ -758,6 +758,24 @@ func TestSource_Chunks_TargetedScan(t *testing.T) {
},
wantChunks: 1,
},
{
name: "targeted scan, binary file",
init: init{
name: "test source",
connection: &sourcespb.GitHub{Credential: &sourcespb.GitHub_Token{Token: githubToken}},
queryCriteria: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{
Github: &source_metadatapb.Github{
Repository: "https://github.com/truffle-sandbox/test-secrets.git",
Link: "https://github.com/truffle-sandbox/test-secrets/blob/70bef8590f87257c0992eecc7db529827a12b801/null_text_w_ptp.ipynb",
Commit: "70bef8590f87257c0992eecc7db529827a12b801",
File: "null_text_w_ptp.ipynb",
},
},
},
},
wantChunks: 607,
},
{
name: "no file in commit",
init: init{

View file

@ -281,60 +281,6 @@ func (s *Source) wikiIsReachable(ctx context.Context, repoURL string) bool {
return wikiURL == res.Request.URL.String()
}
// commitQuery represents the details required to fetch a commit.
type commitQuery struct {
repo string
owner string
sha string
filename string
}
// getDiffForFileInCommit retrieves the diff for a specified file in a commit.
// If the file or its diff is not found, it returns an error.
func (s *Source) getDiffForFileInCommit(ctx context.Context, query commitQuery) (string, error) {
var (
commit *github.RepositoryCommit
err error
)
for {
commit, _, err = s.connector.APIClient().Repositories.GetCommit(ctx, query.owner, query.repo, query.sha, nil)
if s.handleRateLimit(err) {
continue
}
if err != nil {
return "", fmt.Errorf("error fetching commit %s: %w", query.sha, err)
}
break
}
if len(commit.Files) == 0 {
return "", fmt.Errorf("commit %s does not contain any files", query.sha)
}
res := new(strings.Builder)
// Only return the diff if the file is in the commit.
for _, file := range commit.Files {
if *file.Filename != query.filename {
continue
}
if file.Patch == nil {
return "", fmt.Errorf("commit %s file %s does not have a diff", query.sha, query.filename)
}
if _, err := res.WriteString(*file.Patch); err != nil {
return "", fmt.Errorf("buffer write error for commit %s file %s: %w", query.sha, query.filename, err)
}
res.WriteString("\n")
}
if res.Len() == 0 {
return "", fmt.Errorf("commit %s does not contain patch for file %s", query.sha, query.filename)
}
return res.String(), nil
}
func (s *Source) normalizeRepo(repo string) (string, error) {
// If there's a '/', assume it's a URL and try to normalize it.
if strings.ContainsRune(repo, '/') {