Support scanning binary files in git sources (#684)

* Scan binary files for git sources

* Create data chunks in for loop

* Linter feedback and newline commit result

* Use disk buffered reader and chunker function
This commit is contained in:
Bill Rich 2022-08-10 16:10:45 -07:00 committed by GitHub
parent a473b9aa99
commit 4a93e49eea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 4 deletions

2
go.mod
View file

@ -4,7 +4,7 @@ go 1.18
replace github.com/jpillora/overseer => github.com/trufflesecurity/overseer v1.1.7-custom5
replace github.com/zricethezav/gitleaks/v8 => github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom9
replace github.com/zricethezav/gitleaks/v8 => github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom10
replace github.com/gitleaks/go-gitdiff => github.com/trufflesecurity/go-gitdiff v0.7.6-zombies2

4
go.sum
View file

@ -400,8 +400,8 @@ github.com/tailscale/depaware v0.0.0-20210622194025-720c4b409502 h1:34icjjmqJ2HP
github.com/tailscale/depaware v0.0.0-20210622194025-720c4b409502/go.mod h1:p9lPsd+cx33L3H9nNoecRRxPssFKUwwI50I3pZ0yT+8=
github.com/therootcompany/xz v1.0.1 h1:CmOtsn1CbtmyYiusbfmhmkpAAETj0wBIH6kCYaX+xzw=
github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0BWbMn8qNMY=
github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom9 h1:OvS9aj6Fasot5FaTpSyCV4WNq/8SMov9/bNUMoZFwEI=
github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom9/go.mod h1:2iZpX4Epnmx7VK2atbIMEjHW9rivie5RRe0ZhPWUFvM=
github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom10 h1:QuGZ5bJcQpVz+3sfvKKPDkQwdUueiBg0V+2eMHzkryo=
github.com/trufflesecurity/gitleaks/v8 v8.6.1-custom10/go.mod h1:2iZpX4Epnmx7VK2atbIMEjHW9rivie5RRe0ZhPWUFvM=
github.com/trufflesecurity/go-gitdiff v0.7.6-zombies2 h1:srCJzbE3b44+ZIPcgJSfvinHCOQlkMwVghtKf23un6o=
github.com/trufflesecurity/go-gitdiff v0.7.6-zombies2/go.mod h1:pKz0X4YzCKZs30BL+weqBIG7mx0jl4tF1uXV9ZyNvrA=
github.com/trufflesecurity/overseer v1.1.7-custom5 h1:xu+Fg6fkSRifUPzUCl7N8HmobJ6WGOkIApGnM7mJS6w=

View file

@ -14,6 +14,7 @@ import (
"strings"
"time"
diskbufferreader "github.com/bill-rich/disk-buffer-reader"
"github.com/gitleaks/go-gitdiff/gitdiff"
"github.com/go-errors/errors"
"github.com/go-git/go-git/v5"
@ -27,6 +28,8 @@ import (
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/anypb"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/handlers"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sanitizer"
@ -340,6 +343,23 @@ func (s *Git) ScanCommits(repo *git.Repository, path string, scanOptions *ScanOp
when = file.PatchHeader.AuthorDate.String()
}
// Handle binary files by reading the entire file rather than using the diff.
if file.IsBinary {
commitHash := plumbing.NewHash(hash)
metadata := s.sourceMetadataFunc(fileName, email, hash, when, urlMetadata, 0)
chunkSkel := &sources.Chunk{
SourceName: s.sourceName,
SourceID: s.sourceID,
SourceType: s.sourceType,
SourceMetadata: metadata,
Verify: s.verify,
}
if err = handleBinary(repo, chunksChan, chunkSkel, commitHash, fileName); err != nil {
log.WithError(err).Error("Error handling binary file")
}
continue
}
for _, frag := range file.TextFragments {
var sb strings.Builder
newLineNumber := frag.NewPosition
@ -615,3 +635,45 @@ func getSafeRemoteURL(repo *git.Repository, preferred string) string {
}
return safeURL
}
// handleBinary scans the full contents of the binary file at path as it exists
// in the commit identified by commitHash, rather than scanning a diff.
//
// The file contents are wrapped in a disk-buffered reader and first offered to
// handlers.HandleFile. If no handler accepts the file, the reader is rewound
// and the raw bytes are split by common.ChunkReader; each piece is sent on
// chunksChan as a copy of chunkSkel with its Data field set.
//
// An error is returned if the commit, the file, or its reader cannot be
// obtained, or if the buffered reader cannot be created or reset.
func handleBinary(repo *git.Repository, chunksChan chan *sources.Chunk, chunkSkel *sources.Chunk, commitHash plumbing.Hash, path string) error {
	log.WithField("path", path).Trace("Binary file found in repository.")
	commit, err := repo.CommitObject(commitHash)
	if err != nil {
		return err
	}

	file, err := commit.File(path)
	if err != nil {
		return err
	}

	fileReader, err := file.Reader()
	if err != nil {
		return err
	}
	defer fileReader.Close()

	reader, err := diskbufferreader.New(fileReader)
	if err != nil {
		return err
	}
	// Release the buffered reader on every return path; without this its
	// backing resources (presumably a temporary on-disk buffer) would leak
	// for each binary file scanned.
	defer reader.Close()

	if handlers.HandleFile(reader, chunkSkel, chunksChan) {
		return nil
	}

	log.WithField("path", path).Trace("Binary file is not recognized by file handlers. Chunking raw.")
	// The handlers may have consumed part of the stream; rewind before
	// chunking so the raw scan starts from the beginning of the file.
	if err := reader.Reset(); err != nil {
		return err
	}
	// NOTE(review): Stop presumably ends disk buffering since no further
	// rewinds are needed past this point; confirm against the library.
	reader.Stop()

	for chunkData := range common.ChunkReader(reader) {
		// Copy the skeleton so each emitted chunk owns its own Data field.
		chunk := *chunkSkel
		chunk.Data = chunkData
		chunksChan <- &chunk
	}
	return nil
}

View file

@ -229,7 +229,7 @@ func TestSource_Chunks_Integration(t *testing.T) {
"2f251b8c1e72135a375b659951097ec7749d4af9-bump": {B: []byte(" \n")},
"e6c8bbabd8796ea3cd85bfc2e55b27e0a491747f-bump": {B: []byte("oops \n")},
"735b52b0eb40610002bb1088e902bd61824eb305-bump": {B: []byte("oops\n")},
//"ce62d79908803153ef6e145e042d3e80488ef747-bump": {B: []byte("\n")},
"ce62d79908803153ef6e145e042d3e80488ef747-bump": {B: []byte("\n")},
// Normally we might expect to see this commit, and we may in the future.
// But at the moment we're ignoring any commit unless it contains at least one non-space character.
"27fbead3bf883cdb7de9d7825ed401f28f9398f1-slack": {B: []byte("yup, just did that\n\ngithub_lol: \"ffc7e0f9400fb6300167009e42d2f842cd7956e2\"\n\noh, goodness. there's another one!")},