CFOR Commit Scanner (#3145)

* alpha feature for scanning hidden commits on github

* improvements re: git operations

* lint updates

* updating with exec block due to no gh token

* reworked logic into new source

* fixed collisions threshold flag input

* fixed IOutil issues

* removed additions from GH config

---------

Co-authored-by: Joe Leon <joe.leon@trufflesec.com>
This commit is contained in:
joeleonjr 2024-08-01 23:04:20 -04:00 committed by GitHub
parent 38e844f968
commit 7d606e2480
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 2546 additions and 1222 deletions

21
main.go
View file

@ -150,6 +150,16 @@ var (
githubScanPRComments = githubScan.Flag("pr-comments", "Include pull request descriptions and comments in scan.").Bool()
githubScanGistComments = githubScan.Flag("gist-comments", "Include gist comments in scan.").Bool()
// GitHub Cross Fork Object Reference Experimental Feature
githubExperimentalScan = cli.Command("github-experimental", "Run an experimental GitHub scan. Must specify at least one experimental sub-module to run: object-discovery.")
// GitHub Experimental SubModules
githubExperimentalObjectDiscovery = githubExperimentalScan.Flag("object-discovery", "Discover hidden data objects in GitHub repositories.").Bool()
// GitHub Experimental Options
githubExperimentalToken = githubExperimentalScan.Flag("token", "GitHub token. Can be provided with environment variable GITHUB_TOKEN.").Envar("GITHUB_TOKEN").String()
githubExperimentalRepo = githubExperimentalScan.Flag("repo", "GitHub repository to scan. Example: https://github.com/<user>/<repo>.git").Required().String()
githubExperimentalCollisionThreshold = githubExperimentalScan.Flag("collision-threshold", "Threshold for short-sha collisions in object-discovery submodule. Default is 1.").Default("1").Int()
githubExperimentalDeleteCache = githubExperimentalScan.Flag("delete-cached-data", "Delete cached data after object-discovery secret scanning.").Bool()
gitlabScan = cli.Command("gitlab", "Find credentials in GitLab repositories.")
// TODO: Add more GitLab options
gitlabScanEndpoint = gitlabScan.Flag("endpoint", "GitLab endpoint.").Default("https://gitlab.com").String()
@ -667,6 +677,17 @@ func runSingleScan(ctx context.Context, cmd string, cfg engine.Config) (metrics,
if err := eng.ScanGitHub(ctx, cfg); err != nil {
return scanMetrics, fmt.Errorf("failed to scan Github: %v", err)
}
case githubExperimentalScan.FullCommand():
cfg := sources.GitHubExperimentalConfig{
Token: *githubExperimentalToken,
Repository: *githubExperimentalRepo,
ObjectDiscovery: *githubExperimentalObjectDiscovery,
CollisionThreshold: *githubExperimentalCollisionThreshold,
DeleteCachedData: *githubExperimentalDeleteCache,
}
if err := eng.ScanGitHubExperimental(ctx, cfg); err != nil {
return scanMetrics, fmt.Errorf("failed to scan using Github Experimental: %v", err)
}
case gitlabScan.FullCommand():
filter, err := common.FilterFromFiles(*gitlabScanIncludePaths, *gitlabScanExcludePaths)
if err != nil {

View file

@ -0,0 +1,65 @@
package engine
import (
"fmt"
"runtime"
gogit "github.com/go-git/go-git/v5"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/anypb"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/git"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/github"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/github_experimental"
)
// ScanGitHubExperimental scans GitHub using an experimental feature. Consider all functionality to be in an alpha release here.
//
// The configuration is validated first: at least one experimental sub-module
// (currently only object-discovery) must be enabled and a token must be
// supplied. The connection is then marshalled, a github_experimental source is
// initialized with verification enabled and runtime.NumCPU() concurrency, and
// the source is handed to the source manager. The error from the first failing
// step is returned.
func (e *Engine) ScanGitHubExperimental(ctx context.Context, c sources.GitHubExperimentalConfig) error {
	// Check at least one experimental sub-module is being used.
	// Add to this list as more experimental sub-modules are added.
	if !c.ObjectDiscovery {
		return fmt.Errorf("at least one experimental submodule must be enabled")
	}
	if len(c.Token) == 0 {
		return fmt.Errorf("token is required for github experimental")
	}

	connection := sourcespb.GitHubExperimental{
		Repository:         c.Repository,
		ObjectDiscovery:    c.ObjectDiscovery,
		CollisionThreshold: int64(c.CollisionThreshold),
		DeleteCachedData:   c.DeleteCachedData,
		Credential:         &sourcespb.GitHubExperimental_Token{Token: c.Token},
	}

	var conn anypb.Any
	if err := anypb.MarshalFrom(&conn, &connection, proto.MarshalOptions{}); err != nil {
		ctx.Logger().Error(err, "failed to marshal github experimental connection")
		return err
	}

	logOptions := &gogit.LogOptions{}
	opts := []git.ScanOption{
		git.ScanOptionLogOptions(logOptions),
	}
	scanOptions := git.NewScanOptions(opts...)

	sourceName := "trufflehog - github experimental (alpha release)"
	// NOTE(review): the IDs are requested for github.SourceType although the
	// source itself reports github_experimental.SourceType - confirm intended.
	sourceID, jobID, err := e.sourceManager.GetIDs(ctx, sourceName, github.SourceType)
	if err != nil {
		// Previously this error was silently dropped. Surface it, but keep the
		// existing best-effort behavior of continuing with the returned IDs.
		ctx.Logger().Error(err, "failed to get source and job IDs for github experimental source")
	}

	githubExperimentalSource := &github_experimental.Source{}
	if err := githubExperimentalSource.Init(ctx, sourceName, jobID, sourceID, true, &conn, runtime.NumCPU()); err != nil {
		return err
	}
	githubExperimentalSource.WithScanOptions(scanOptions)

	_, err = e.sourceManager.Run(ctx, sourceName, githubExperimentalSource)
	return err
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -2759,6 +2759,133 @@ var _ interface {
ErrorName() string
} = GitHubValidationError{}
// Validate checks the field values on GitHubExperimental with the rules
// defined in the proto definition for this message. If any rules are
// violated, the first error encountered is returned, or nil if there are no violations.
//
// It delegates to validate with all=false, so checking stops at the first
// violation. A nil receiver is reported as valid.
func (m *GitHubExperimental) Validate() error {
return m.validate(false)
}
// ValidateAll checks the field values on GitHubExperimental with the rules
// defined in the proto definition for this message. If any rules are
// violated, the result is a list of violation errors wrapped in
// GitHubExperimentalMultiError, or nil if none found.
//
// It delegates to validate with all=true, so every violation is collected
// instead of stopping at the first one.
func (m *GitHubExperimental) ValidateAll() error {
return m.validate(true)
}
// validate is the shared implementation behind Validate and ValidateAll.
//
// When all is false it returns the first violation it finds; when all is true
// it collects every violation and returns them as a
// GitHubExperimentalMultiError. A nil message is treated as valid.
// The only check performed is that a Credential holding the Token variant is
// not a typed-nil pointer; the scalar fields carry no rules.
func (m *GitHubExperimental) validate(all bool) error {
// A nil message has nothing to validate.
if m == nil {
return nil
}
var errors []error
// no validation rules for Repository
// no validation rules for ObjectDiscovery
// no validation rules for CollisionThreshold
// no validation rules for DeleteCachedData
switch v := m.Credential.(type) {
case *GitHubExperimental_Token:
// The interface is non-nil but wraps a nil *GitHubExperimental_Token.
if v == nil {
err := GitHubExperimentalValidationError{
field: "Credential",
reason: "oneof value cannot be a typed-nil",
}
if !all {
return err
}
errors = append(errors, err)
}
// no validation rules for Token
default:
_ = v // ensures v is used
}
if len(errors) > 0 {
return GitHubExperimentalMultiError(errors)
}
return nil
}
// GitHubExperimentalMultiError is an error wrapping multiple validation errors
// returned by GitHubExperimental.ValidateAll() if the designated constraints
// aren't met.
type GitHubExperimentalMultiError []error

// Error returns a concatenation of all the error messages it wraps,
// separated by "; ".
func (m GitHubExperimentalMultiError) Error() string {
	parts := make([]string, len(m))
	for i := range m {
		parts[i] = m[i].Error()
	}
	return strings.Join(parts, "; ")
}

// AllErrors returns a list of validation violation errors.
func (m GitHubExperimentalMultiError) AllErrors() []error {
	return m
}
// GitHubExperimentalValidationError is the validation error returned by
// GitHubExperimental.Validate if the designated constraints aren't met.
type GitHubExperimentalValidationError struct {
	field  string
	reason string
	cause  error
	key    bool
}

// Field function returns field value.
func (e GitHubExperimentalValidationError) Field() string {
	return e.field
}

// Reason function returns reason value.
func (e GitHubExperimentalValidationError) Reason() string {
	return e.reason
}

// Cause function returns cause value.
func (e GitHubExperimentalValidationError) Cause() error {
	return e.cause
}

// Key function returns key value.
func (e GitHubExperimentalValidationError) Key() bool {
	return e.key
}

// ErrorName returns error name.
func (e GitHubExperimentalValidationError) ErrorName() string {
	return "GitHubExperimentalValidationError"
}

// Error satisfies the builtin error interface. The message names the field
// and reason, is prefixed with "key for " when the key flag is set, and has
// the cause appended when one is present.
func (e GitHubExperimentalValidationError) Error() string {
	var keyPrefix, causeSuffix string
	if e.key {
		keyPrefix = "key for "
	}
	if e.cause != nil {
		causeSuffix = fmt.Sprintf(" | caused by: %v", e.cause)
	}
	return fmt.Sprintf("invalid %sGitHubExperimental.%s: %s%s", keyPrefix, e.field, e.reason, causeSuffix)
}

// Compile-time checks that the error type provides the expected method set.
var _ error = GitHubExperimentalValidationError{}

var _ interface {
	Field() string
	Reason() string
	Key() bool
	Cause() error
	ErrorName() string
} = GitHubExperimentalValidationError{}
// Validate checks the field values on GoogleDrive with the rules defined in
// the proto definition for this message. If any rules are violated, the first
// error encountered is returned, or nil if there are no violations.

View file

@ -310,7 +310,7 @@ func (s *Source) Validate(ctx context.Context) []error {
errs = append(errs, fmt.Errorf("error creating GitHub client: %+v", err))
}
default:
errs = append(errs, fmt.Errorf("Invalid configuration given for source. Name: %s, Type: %s", s.name, s.Type()))
errs = append(errs, fmt.Errorf("invalid configuration given for source. Name: %s, Type: %s", s.name, s.Type()))
}
// Run a simple query to check if the client is actually valid
@ -400,7 +400,7 @@ func (s *Source) enumerate(ctx context.Context, apiEndpoint string) (*github.Cli
}
default:
// TODO: move this error to Init
return nil, fmt.Errorf("Invalid configuration given for source. Name: %s, Type: %s", s.name, s.Type())
return nil, fmt.Errorf("invalid configuration given for source. Name: %s, Type: %s", s.name, s.Type())
}
s.repos = make([]string, 0, s.filteredRepoCache.Count())

View file

@ -0,0 +1,224 @@
package github_experimental
import (
"fmt"
"net/http"
"strings"
"github.com/go-logr/logr"
"github.com/google/go-github/v63/github"
"golang.org/x/sync/errgroup"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/anypb"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/giturl"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sanitizer"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/git"
)
const (
// SourceType is the source type reported by Source.Type for the
// experimental GitHub source.
SourceType = sourcespb.SourceType_SOURCE_TYPE_GITHUB_EXPERIMENTAL
// The constants below are disabled; nothing in this file references them.
// unauthGithubOrgRateLimt = 30
// defaultPagination = 100
// membersAppPagination = 500
)
// Source is the experimental GitHub source. Its fields are populated by Init;
// scan options and the custom content writer flag are set through the With*
// methods.
type Source struct {
// name is the source name passed to Init.
name string
// Disabled fields (a mutex guarding the user and token, plus the values themselves).
//userMu sync.Mutex
//githubUser string
//githubToken string
// sourceID, jobID and verify are stored by Init and forwarded to the git config.
sourceID sources.SourceID
jobID sources.JobID
verify bool
// Disabled caches.
//orgsCache cache.Cache[string]
//memberCache map[string]struct{}
//repos []string
//filteredRepoCache *filteredRepoCache
// repoInfoCache is consulted by visibilityOf to resolve repository visibility.
repoInfoCache repoInfoCache
//totalRepoSize int // total size of all repos in kb
// useCustomContentWriter is forwarded to git.Config.UseCustomContentWriter.
useCustomContentWriter bool
// git is the scanner created in Init from the git config.
git *git.Git
// scanOptions is set via WithScanOptions.
scanOptions *git.ScanOptions
// httpClient backs apiClient; both are created in Init.
httpClient *http.Client
// log is the logger taken from the context passed to Init.
log logr.Logger
// conn is the unmarshalled connection configuration.
conn *sourcespb.GitHubExperimental
// jobPool is an errgroup limited to the concurrency passed to Init.
jobPool *errgroup.Group
apiClient *github.Client
sources.Progress
sources.CommonSourceUnitUnmarshaller
}
// WithCustomContentWriter sets the useCustomContentWriter flag on the source.
// Init copies the flag into the git config, so it only takes effect when set
// before Init is called.
func (s *Source) WithCustomContentWriter() { s.useCustomContentWriter = true }
// WithScanOptions stores the git scan options the source passes to the git
// scanner when it scans the repository.
func (s *Source) WithScanOptions(scanOptions *git.ScanOptions) {
s.scanOptions = scanOptions
}
// Compile-time checks that *Source implements sources.Source and
// sources.SourceUnitUnmarshaller.
var _ sources.Source = (*Source)(nil)
var _ sources.SourceUnitUnmarshaller = (*Source)(nil)
// Type returns the type of source.
// It is used for matching source types in configuration and job input.
// The value is always the package-level SourceType constant.
func (s *Source) Type() sourcespb.SourceType {
return SourceType
}
// SourceID returns the source ID that was passed to Init.
func (s *Source) SourceID() sources.SourceID {
return s.sourceID
}
// JobID returns the job ID that was passed to Init.
func (s *Source) JobID() sources.JobID {
return s.jobID
}
// Init returns an initialized GitHubExperimental source.
//
// It verifies the git command is usable, stores the identifiers and
// verification flag, builds the HTTP and GitHub API clients, unmarshals the
// connection, normalizes the repository URL and creates the git scanner.
// When the connection carries a token, the API client is configured to send
// it; without this the repository lookup done during object discovery would
// be unauthenticated.
//
// An error is returned if the git check, the unmarshalling or the repository
// normalization fails.
func (s *Source) Init(aCtx context.Context, name string, jobID sources.JobID, sourceID sources.SourceID, verify bool, connection *anypb.Any, concurrency int) error {
	err := git.CmdCheck()
	if err != nil {
		return err
	}

	s.log = aCtx.Logger()
	s.name = name
	s.sourceID = sourceID
	s.jobID = jobID
	s.verify = verify
	s.jobPool = &errgroup.Group{}
	s.jobPool.SetLimit(concurrency)

	s.httpClient = common.RetryableHTTPClientTimeout(60)
	s.apiClient = github.NewClient(s.httpClient)

	var conn sourcespb.GitHubExperimental
	err = anypb.UnmarshalTo(connection, &conn, proto.UnmarshalOptions{})
	if err != nil {
		return fmt.Errorf("error unmarshalling connection: %w", err)
	}
	s.conn = &conn

	// Authenticate API calls with the configured token so private repositories
	// resolve and the authenticated rate limit applies.
	if token := s.conn.GetToken(); token != "" {
		s.apiClient = s.apiClient.WithAuthToken(token)
	}

	s.conn.Repository, err = s.normalizeRepo(s.conn.Repository)
	if err != nil {
		return fmt.Errorf("error normalizing repo: %w", err)
	}

	s.repoInfoCache = newRepoInfoCache()

	cfg := &git.Config{
		SourceName:   s.name,
		JobID:        s.jobID,
		SourceID:     s.sourceID,
		SourceType:   s.Type(),
		Verify:       s.verify,
		SkipBinaries: false,
		SkipArchives: false,
		Concurrency:  concurrency,
		// Builds the GitHub metadata attached to each chunk; visibility is
		// resolved from the repo info cache at call time.
		SourceMetadataFunc: func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData {
			return &source_metadatapb.MetaData{
				Data: &source_metadatapb.MetaData_Github{
					Github: &source_metadatapb.Github{
						Commit:     sanitizer.UTF8(commit),
						File:       sanitizer.UTF8(file),
						Email:      sanitizer.UTF8(email),
						Repository: sanitizer.UTF8(repository),
						Link:       giturl.GenerateLink(repository, commit, file, line),
						Timestamp:  sanitizer.UTF8(timestamp),
						Line:       line,
						Visibility: s.visibilityOf(aCtx, repository),
					},
				},
			}
		},
		UseCustomContentWriter: s.useCustomContentWriter,
	}
	s.git = git.NewGit(cfg)

	return nil
}
// visibilityOf returns the cached visibility for repoURL, or
// Visibility_unknown (after logging an error) when the cache has no entry.
func (s *Source) visibilityOf(ctx context.Context, repoURL string) source_metadatapb.Visibility {
	// A wiki has no visibility of its own, so the lookup is redirected to the
	// repository the wiki belongs to.
	const wikiSuffix = ".wiki.git"
	lookupURL := repoURL
	if strings.HasSuffix(lookupURL, wikiSuffix) {
		lookupURL = strings.TrimSuffix(lookupURL, wikiSuffix) + ".git"
	}

	if info, found := s.repoInfoCache.get(lookupURL); found {
		return info.visibility
	}

	// This should never happen.
	ctx.Logger().Error(fmt.Errorf("no repoInfo for URL: %s", lookupURL), "failed to get repository visibility")
	return source_metadatapb.Visibility_unknown
}
// Chunks emits chunks of bytes over a channel.
// Object discovery is the only sub-module wired in: when it is enabled the
// call is forwarded to EnumerateAndScanAllObjects, otherwise nothing is
// scanned and nil is returned.
func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, targets ...sources.ChunkingTarget) error {
	if !s.conn.ObjectDiscovery {
		return nil
	}
	return s.EnumerateAndScanAllObjects(ctx, chunksChan)
}
// getRepoURLParts parses a repository or gist URL (ssh or https) and returns
// the URL string with user information removed, together with its
// "/"-separated segments (host first) after stripping the scheme and a
// trailing ".git".
//
// Accepted shapes:
//   - 2 segments: gist.github.com/<gist_id>
//   - 3 segments: github.com/<user>/<repo>, gist.github.com/<user>/<gist_id>,
//     github.company.org/<user>/<repo>, github.company.org/gist/<gist_id>
//   - 4 segments: github.company.org/gist/<user>/<gist_id>
//
// Any other shape yields an error.
func getRepoURLParts(repoURLString string) (string, []string, error) {
	// Support ssh and https URLs.
	parsed, err := git.GitURLParse(repoURLString)
	if err != nil {
		return "", nil, err
	}

	// Drop the user information, e.g. `git@github.com` -> `github.com`.
	parsed.User = nil

	fullURL := parsed.String()
	hostAndPath := strings.TrimSuffix(strings.TrimPrefix(fullURL, parsed.Scheme+"://"), ".git")
	segments := strings.Split(hostAndPath, "/")

	switch n := len(segments); {
	case n == 2:
		if !strings.EqualFold(segments[0], "gist.github.com") {
			return "", nil, fmt.Errorf("failed to parse repository or gist URL (%s): 2 path segments are only expected if the host is 'gist.github.com' ('gist.github.com', '<gist_id>')", fullURL)
		}
	case n == 3:
		// Every three-segment shape is accepted as is.
	case n == 4:
		// Only valid for a non-github.com host whose path starts with "gist".
		if !strings.EqualFold(segments[1], "gist") || strings.EqualFold(segments[0], "github.com") {
			return "", nil, fmt.Errorf("failed to parse repository or gist URL (%s): 4 path segments are only expected if the host isn't 'github.com' and the path starts with 'gist' ('github.example.com', 'gist', '<owner>', '<gist_id>')", fullURL)
		}
	default:
		return "", nil, fmt.Errorf("invalid repository or gist URL (%s): length of URL segments should be between 2 and 4, not %d (%v)", fullURL, len(segments), segments)
	}

	return fullURL, segments, nil
}

View file

@ -0,0 +1,654 @@
package github_experimental
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"strings"
"time"
"github.com/google/go-github/v63/github"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/git"
"golang.org/x/oauth2"
)
// secondaryRateLimitSleep is the pause, in seconds, taken after GitHub reports
// a secondary rate limit.
// Assumption: sleeping for 60 seconds is enough to reset the secondary rate limit
// see https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api#secondary-rate-limits
const secondaryRateLimitSleep = 60
// forkCommitMultiplier is the fraction of extra objects each fork is assumed to add.
// Assumption: on average, a fork contributes 0.1% additional commits
const forkCommitMultiplier = 0.001
// collisionThreshold is the threshold for estimated short SHA-1 hash collisions
// (default to 1...so basically none), as calculated using the Birthday Paradox.
// Adjust this to a higher value if you're willing to accept more collisions (and shorter runtime).
// It is package-level state assigned from the connection's CollisionThreshold
// at the start of EnumerateAndScanAllObjects.
var collisionThreshold float64
// Starting character length (4 is the minimum required by git)
const startingCharLen = 4
// Max character length (6 is the default maximum)
// 6 chars == 16M possibilities --> which will take 18k-55k queries.
// That's really the max that's tolerable since it will take a long time to run.
// If you increase this to accommodate a MASSIVE repository, it will take a long time to run.
const maxCharLen = 6
// GraphQL query chunk sizes (number of candidate hashes per query).
// maxChunkSize: the max that worked was 900.
// initialChunkSize: 350 is a safe starting point.
const maxChunkSize = 900
const initialChunkSize = 350
// Max number of commits to fetch from the repository in one command
// ex: git fetch origin <commit1> <commit2> ... <commit1000>
const gitFetchMax = 1000
// Constants for commit types. They are used both as result-map keys and as
// the base names of the cache files written to disk.
const (
invalidCommit = "invalid"
validHiddenCommit = "valid_hidden"
)
// backoff is an adaptive value that shrinks by a percentage on every error
// and grows by a percentage after a run of consecutive successes.
type backoff struct {
// value is the current value; getValue truncates it to an int.
value float64
// decreasePercentage is the percentage removed from value on an error.
decreasePercentage float64
// increasePercentage is the percentage added to value once enough successes accumulate.
increasePercentage float64
// successThreshold is the number of successes required before value grows.
successThreshold int
// successCount counts successes since the last error or increase.
successCount int
}
// newBackoff builds a backoff that starts at initialValue, with the given
// decrease/increase percentages and success threshold. The success counter
// starts at zero.
func newBackoff(initialValue, decreasePercentage, increasePercentage float64, successThreshold int) *backoff {
	b := new(backoff)
	b.value = initialValue
	b.decreasePercentage = decreasePercentage
	b.increasePercentage = increasePercentage
	b.successThreshold = successThreshold
	return b
}
// errorOccurred records a failure: the success counter is reset and the value
// is reduced by decreasePercentage, but never below 100. The new value is
// returned.
func (b *backoff) errorOccurred() float64 {
	const floor = 100

	b.successCount = 0 // an error interrupts any run of successes

	reduced := b.value - b.value*(b.decreasePercentage/100)
	if reduced < floor {
		reduced = floor
	}
	b.value = reduced
	return b.value
}
// successOccurred records a success. Once successThreshold successes have
// accumulated the value grows by increasePercentage and the counter restarts.
// The value is capped at maxChunkSize and the resulting value is returned.
func (b *backoff) successOccurred() float64 {
	b.successCount++

	thresholdReached := b.successCount >= b.successThreshold
	if thresholdReached {
		b.successCount = 0 // start counting towards the next increase
		b.value += b.value * (b.increasePercentage / 100)
	}

	if b.value > maxChunkSize {
		b.value = maxChunkSize
	}
	return b.value
}
// getValue returns the current value converted to an int (the fractional part
// is discarded by the conversion).
func (b *backoff) getValue() int {
return int(b.value)
}
// ghToken is the GitHub token used by the helpers in this file for API and
// GraphQL requests. It is package-level state assigned from the connection at
// the start of EnumerateAndScanAllObjects and is empty until then.
var ghToken = ""
// getForksCount asks the GitHub API for the repository owner/repoName,
// authenticating with the package-level ghToken, and returns its fork count.
// On an API error it returns 0 and the error.
func getForksCount(owner, repoName string) (int, error) {
	bgCtx := context.Background()
	tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: ghToken})
	ghClient := github.NewClient(oauth2.NewClient(bgCtx, tokenSource))

	repository, _, err := ghClient.Repositories.Get(bgCtx, owner, repoName)
	if err != nil {
		return 0, err
	}
	return repository.GetForksCount(), nil
}
// getGitHubUser returns the login of the user that the package-level ghToken
// belongs to (the API is queried with an empty user name). On an API error it
// returns an empty string and the error.
func getGitHubUser() (string, error) {
	bgCtx := context.Background()
	tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: ghToken})
	ghClient := github.NewClient(oauth2.NewClient(bgCtx, tokenSource))

	authenticated, _, err := ghClient.Users.Get(bgCtx, "")
	if err != nil {
		return "", err
	}
	return authenticated.GetLogin(), nil
}
// runGitCommand runs git with the given arguments and returns its combined
// standard output and standard error together with the execution error, if any.
func runGitCommand(args []string) ([]byte, error) {
	return exec.Command("git", args...).CombinedOutput()
}
// getExistingHashes runs `git cat-file --batch-check --batch-all-objects` in
// the repository at path and returns the first whitespace-separated field of
// every non-blank output line (presumably the object ID - see the git
// cat-file documentation). If the git command fails, nil and the error are
// returned.
func getExistingHashes(path string) ([]string, error) {
	raw, err := runGitCommand([]string{
		"-C", path,
		"--work-tree", path,
		"cat-file",
		"--batch-check",
		"--batch-all-objects",
	})
	if err != nil {
		return nil, err
	}

	var objectIDs []string
	for _, entry := range strings.Split(string(raw), "\n") {
		// Fields returns nothing for empty or blank lines, which skips them.
		if fields := strings.Fields(entry); len(fields) > 0 {
			objectIDs = append(objectIDs, fields[0])
		}
	}
	return objectIDs, nil
}
// calculateUsedKeySet estimates the total used key set, meaning how many
// hashes are in use across the repository and its forks: the known count plus
// forkCommitMultiplier times that count for every fork. The result is
// truncated to an int.
func calculateUsedKeySet(commitCount, forksCount int) int {
	base := float64(commitCount)
	return int(base + (base * forkCommitMultiplier * float64(forksCount)))
}
// estimateCollisions estimates the number of collisions using the Birthday
// Paradox approximation n*(n-1) / (2*k), where n is knownKeySet and k is
// keySpace.
func estimateCollisions(keySpace, knownKeySet int) float64 {
	n := float64(knownKeySet)
	k := float64(keySpace)
	return (n * (n - 1)) / (2 * k)
}
// getShortShaLen returns the short SHA-1 length to enumerate for a repository
// with an estimated knownKeySet used hashes.
//
// Starting at startingCharLen, the length grows one hex character at a time
// while the estimated number of collisions for the corresponding key space
// (16^length) exceeds collisionThreshold, and stops at maxCharLen at the
// latest.
//
// The function does not print anything: the earlier debug output went to
// stdout, bypassing the logger.
func getShortShaLen(knownKeySet int) int {
	shortShaLen := startingCharLen
	for shortShaLen < maxCharLen {
		// Each hex character contributes 4 bits to the key space.
		keySpace := 1 << (shortShaLen * 4)
		if estimateCollisions(keySpace, knownKeySet) <= collisionThreshold {
			break
		}
		shortShaLen++
	}
	return shortShaLen
}
// generateShortSHAStrings returns every lowercase hexadecimal string of
// exactly charLen characters, in ascending order ("00...0" first, "ff...f"
// last). A charLen of 0 yields a single empty string.
func generateShortSHAStrings(charLen int) []string {
	const hexDigits = "0123456789abcdef"

	total := 1 << (4 * charLen) // 16^charLen combinations
	combos := make([]string, 0, total)
	digits := make([]byte, charLen)

	for n := 0; n < total; n++ {
		// Write n as a zero-padded hex number, least significant digit last.
		rest := n
		for pos := charLen - 1; pos >= 0; pos-- {
			digits[pos] = hexDigits[rest&0xf]
			rest >>= 4
		}
		combos = append(combos, string(digits))
	}
	return combos
}
// writeCommitsToDisk appends commits, one per line, to
// <folder>/<commitsType>.txt, creating the file if it doesn't exist. The file
// is created even when commits is empty.
//
// All lines are written with a single write call, and an error from closing
// the file is returned when the write itself succeeded, so a failed flush is
// not silently lost.
func writeCommitsToDisk(commits []string, commitsType, folder string) (err error) {
	filename := fmt.Sprintf("%s/%s.txt", folder, commitsType)

	// Open file in append mode, create if it doesn't exist
	file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()

	var lines strings.Builder
	for _, commit := range commits {
		lines.WriteString(commit)
		lines.WriteByte('\n')
	}
	_, err = file.WriteString(lines.String())
	return err
}
// readCommitsFromDisk reads <folder>/<commitsType>.txt and returns its
// non-blank lines, trimmed and de-duplicated via removeNewlineAndUnique.
//
// A missing file is not an error: nil, nil is returned. Lines that are empty
// or contain only whitespace are skipped, so they can no longer surface as
// empty-string commits. The file is read directly instead of being stat'ed
// first, which avoids a check-then-use race.
func readCommitsFromDisk(commitsType, folder string) ([]string, error) {
	filename := fmt.Sprintf("%s/%s.txt", folder, commitsType)

	data, err := os.ReadFile(filename)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, err
	}

	var commits []string
	for _, line := range strings.Split(string(data), "\n") {
		if commit := strings.TrimSpace(line); commit != "" {
			commits = append(commits, commit)
		}
	}
	return removeNewlineAndUnique(commits), nil
}
// removeNewlineAndUnique trims surrounding whitespace (including newlines)
// from every commit and removes duplicates.
//
// The result keeps the order in which each distinct commit first appears, so
// repeated runs over the same input produce the same output (collecting the
// result from a map iteration made the order random). An empty input yields a
// nil slice.
func removeNewlineAndUnique(commits []string) []string {
	seen := make(map[string]struct{}, len(commits))
	var uniqueCommits []string
	for _, commit := range commits {
		cleanCommit := strings.TrimSpace(commit)
		if _, dup := seen[cleanCommit]; dup {
			continue
		}
		seen[cleanCommit] = struct{}{}
		uniqueCommits = append(uniqueCommits, cleanCommit)
	}
	return uniqueCommits
}
// removeByShortSHA returns the entries of newCommits that do not appear
// verbatim in existingCommits. The relative order of newCommits is kept, and
// the result is nil when nothing remains.
func removeByShortSHA(existingCommits, newCommits []string) []string {
	known := make(map[string]struct{})
	for i := range existingCommits {
		known[existingCommits[i]] = struct{}{}
	}

	var remaining []string
	for i := range newCommits {
		candidate := newCommits[i]
		if _, present := known[candidate]; present {
			continue
		}
		remaining = append(remaining, candidate)
	}
	return remaining
}
// removeBySHA returns the entries of newCommits whose first charLen
// characters do not match the first charLen characters of any entry in
// existingCommits. Entries shorter than charLen are compared in full. The
// relative order of newCommits is kept, and the result is nil when nothing
// remains.
func removeBySHA(existingCommits, newCommits []string, charLen int) []string {
	// prefixOf cuts a hash down to the comparison length.
	prefixOf := func(sha string) string {
		if len(sha) > charLen {
			return sha[:charLen]
		}
		return sha
	}

	takenPrefixes := make(map[string]struct{})
	for i := range existingCommits {
		takenPrefixes[prefixOf(existingCommits[i])] = struct{}{}
	}

	var remaining []string
	for i := range newCommits {
		if _, taken := takenPrefixes[prefixOf(newCommits[i])]; taken {
			continue
		}
		remaining = append(remaining, newCommits[i])
	}
	return remaining
}
// processCommits checks every candidate hash in needsProcessing against the
// repository owner/repo and records the outcome on disk under path: hashes
// that resolve to a commit are appended to the valid-hidden file, all others
// to the invalid file.
//
// Candidates are sent in chunks whose size adapts through a backoff: a failed
// query puts its chunk back in the queue and shrinks the chunk size, a
// successful one grows it. After a secondary rate limit response the function
// waits secondaryRateLimitSleep seconds before retrying.
//
// The loop stops as soon as ctx is done, including during the rate-limit
// wait; previously it could only end once every candidate had been processed,
// so a persistent error (e.g. bad credentials) retried forever.
func processCommits(ctx context.Context, needsProcessing []string, owner, repo, path string) {
	repoCtx := context.WithValue(ctx, "repo", repo)

	startingSize := float64(len(needsProcessing))
	queryChunkSize := newBackoff(initialChunkSize, 10, 10, 1)

	for len(needsProcessing) > 0 {
		if err := ctx.Err(); err != nil {
			repoCtx.Logger().V(2).Info("Stopping commit guessing because the context is done", "error", err, "needs_processing", len(needsProcessing))
			return
		}

		// Never ask for more candidates than are left.
		if len(needsProcessing) < queryChunkSize.getValue() {
			queryChunkSize.value = float64(len(needsProcessing))
		}
		chunkSize := queryChunkSize.getValue()
		chunk := needsProcessing[:chunkSize]
		needsProcessing = needsProcessing[chunkSize:]

		commitData, err := checkHashes(owner, repo, chunk)
		if err != nil {
			repoCtx.Logger().V(2).Info("Temporary error occurred in guessing commits", "error", err)
			// Re-queue the chunk and retry with a smaller chunk size.
			needsProcessing = append(needsProcessing, chunk...)
			queryChunkSize.errorOccurred()
			if strings.Contains(err.Error(), "You have exceeded a secondary rate limit") {
				repoCtx.Logger().V(2).Info("Reached secondary GitHub Rate Limit. Sleeping for 60 seconds.")
				select {
				case <-ctx.Done():
					// Handled by the check at the top of the loop.
				case <-time.After(secondaryRateLimitSleep * time.Second):
				}
			}
			continue
		}

		percentCompleted := (1 - (float64(len(needsProcessing)) / startingSize)) * 100
		repoCtx.Logger().V(2).Info("Progress", "percent_completed", percentCompleted, "needs_processing", len(needsProcessing))
		queryChunkSize.successOccurred()

		// Persist results right away so an interrupted run can resume.
		err = writeCommitsToDisk(commitData[validHiddenCommit], validHiddenCommit, path)
		if err != nil {
			repoCtx.Logger().V(2).Info("Failed to write valid hidden commits to disk", "error", err)
		}
		err = writeCommitsToDisk(commitData[invalidCommit], invalidCommit, path)
		if err != nil {
			repoCtx.Logger().V(2).Info("Failed to write invalid commits to disk", "error", err)
		}
	}
}
// commitData is one aliased object entry in the GraphQL response. OID stays
// empty when the response entry carries no "oid" value.
type commitData struct {
OID string `json:"oid"`
}
// responseData is the decoded body of the GraphQL request sent by checkHashes.
type responseData struct {
Data struct {
// Repository maps each query alias ("commit<hash>") to its result.
Repository map[string]commitData `json:"repository"`
} `json:"data"`
// Errors holds the messages of the "errors" array in the response.
Errors []struct {
Message string `json:"message"`
} `json:"errors"`
// Message is the top-level "message" field of the response, if present.
Message string `json:"message"`
}
// checkHashes asks the GitHub GraphQL API, in a single query, whether each of
// the given hashes resolves to a commit in owner/repo.
//
// The returned map has two keys: validHiddenCommit holds the full OIDs of the
// hashes that resolved to a commit, invalidCommit holds the hashes that did
// not. An error is returned when the request cannot be built or sent, when
// the response is not valid JSON, or when GitHub reports an error; in the
// last case the error text is the first sentence of GitHub's message followed
// by " (GitHub Request Error)".
//
// The request authenticates with the package-level ghToken and is bounded by
// a 60 second client timeout so a stalled connection cannot hang the scan.
func checkHashes(owner, repo string, hashes []string) (map[string][]string, error) {
	// Build one aliased object lookup per hash.
	var testCases strings.Builder
	for _, h := range hashes {
		fmt.Fprintf(&testCases, `
commit%s: object(expression: "%s") {
... on Commit {
oid
}
}
`, h, h)
	}

	query := fmt.Sprintf(`
query {
repository(owner: "%s", name: "%s") {
%s
}
}
`, owner, repo, testCases.String())

	requestBody, err := json.Marshal(map[string]string{"query": query})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request body: %w", err)
	}

	req, err := http.NewRequest("POST", "https://api.github.com/graphql", bytes.NewBuffer(requestBody))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+ghToken)
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Github-Verified-Fetch", "true")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
	req.Header.Set("Priority", "u=1, i")

	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	var data responseData
	if err := json.Unmarshal(body, &data); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response (HTTP status %d): %w", resp.StatusCode, err)
	}

	// Callers match on the first sentence of GitHub's message (e.g. the
	// secondary rate limit notice), so keep this format stable.
	if len(data.Errors) > 0 {
		return nil, fmt.Errorf("%s (GitHub Request Error)", strings.Split(data.Errors[0].Message, ".")[0])
	}
	if data.Message != "" {
		return nil, fmt.Errorf("%s (GitHub Request Error)", strings.Split(data.Message, ".")[0])
	}

	validCFOR := make([]string, 0, len(data.Data.Repository))
	invalid := make([]string, 0, len(data.Data.Repository))
	for alias, value := range data.Data.Repository {
		if value.OID == "{}" || value.OID == "" {
			// Strip the "commit" alias prefix to recover the queried hash.
			invalid = append(invalid, strings.TrimPrefix(alias, "commit"))
		} else {
			validCFOR = append(validCFOR, value.OID)
		}
	}

	return map[string][]string{
		validHiddenCommit: validCFOR,
		invalidCommit:     invalid,
	}, nil
}
// createBatches divides a slice into batches of a specified size and returns
// them on a channel that is already filled and closed.
//
// The channel is buffered to hold every batch, so a consumer that stops
// reading early does not leave a producer goroutine blocked forever. The
// input is copied first, so later changes to items do not affect the batches,
// and each batch is capacity-limited so appending to one cannot overwrite the
// next. A batchSize below 1 yields a single batch holding all items (the
// unguarded loop never terminated for 0 and panicked for negative sizes). An
// empty input yields a closed, empty channel.
func createBatches(items []string, batchSize int) <-chan []string {
	itemsCopy := append([]string(nil), items...)
	total := len(itemsCopy)
	if batchSize < 1 {
		batchSize = total
	}

	batchCount := 0
	if total > 0 {
		batchCount = (total-1)/batchSize + 1
	}

	out := make(chan []string, batchCount)
	for start := 0; start < total; {
		end := total
		if batchSize < total-start {
			end = start + batchSize
		}
		out <- itemsCopy[start:end:end]
		start = end
	}
	close(out)
	return out
}
// downloadPatches fetches and checks out cfor commits.
//
// The commits are fetched from origin in batches of at most gitFetchMax, then
// each one is checked out into its own branch named "_<commit>" in the
// repository at path. The first failing git command aborts the operation;
// the returned error wraps the execution error and includes git's output,
// which was previously discarded and left failures undiagnosable.
func downloadPatches(valid_cfor []string, path string) error {
	// Note: path and worktree are needed or else git will do something funny with the actual cwd
	baseArgs := []string{"-C", path, "--work-tree", path}

	// Download all patches
	for batch := range createBatches(valid_cfor, gitFetchMax) {
		gitArgs := append([]string{}, baseArgs...)
		gitArgs = append(gitArgs, "fetch", "--quiet", "origin")
		gitArgs = append(gitArgs, batch...)
		if out, err := runGitCommand(gitArgs); err != nil {
			return fmt.Errorf("failed to fetch %d commits: %w (git output: %s)", len(batch), err, strings.TrimSpace(string(out)))
		}
	}

	// Checkout each commit
	for _, commit := range valid_cfor {
		branchName := fmt.Sprintf("_%s", commit)
		gitArgs := append([]string{}, baseArgs...)
		gitArgs = append(gitArgs, "checkout", "--quiet", "-b", branchName, commit)
		if out, err := runGitCommand(gitArgs); err != nil {
			return fmt.Errorf("failed to checkout commit %s: %w (git output: %s)", commit, err, strings.TrimSpace(string(out)))
		}
	}
	return nil
}
// EnumerateAndScanAllObjects scans hidden data (and non-hidden data) for
// secrets in a GitHub repository.
//
// Steps:
//  1. Parse the repository URL and cache the repository's metadata.
//  2. Clone the repository and estimate how many hashes are in use, which
//     determines the short SHA length to enumerate.
//  3. Generate every short SHA of that length, drop the ones already known
//     (objects in the clone, results cached by earlier runs) and query GitHub
//     for the rest. Results are cached under ~/.trufflehog/<owner>/<repo>.
//  4. Fetch every valid hidden commit, check each out into its own branch and
//     scan the repository.
//  5. Optionally delete the cached results.
//
// The list of valid hidden commits is re-read from disk after step 3. Without
// that, only commits cached by a previous run were fetched and the commits
// discovered in the current run were never scanned.
//
// Note: this method assigns the package-level ghToken and collisionThreshold
// from the connection before doing anything else.
func (s *Source) EnumerateAndScanAllObjects(ctx context.Context, chunksChan chan *sources.Chunk) error {
	// assign github token to global variable
	ghToken = s.conn.GetToken()

	// set collision threshold to user input
	collisionThreshold = float64(s.conn.CollisionThreshold)

	// parse the repo URL
	repoURL, urlParts, err := getRepoURLParts(s.conn.Repository)
	if err != nil {
		return fmt.Errorf("failed to get repo URL parts: %w", err)
	}

	// Two-segment URLs (gist.github.com/<gist_id>) pass URL validation but
	// carry no owner/repo pair; reject them instead of indexing out of range.
	if len(urlParts) < 3 {
		return fmt.Errorf("unsupported repository URL (%s): expected <host>/<owner>/<repo>", repoURL)
	}

	// read in the owner and repo name
	owner := urlParts[1]
	repoName := urlParts[2]

	// get repo metadata and store in cacheRepoInfo
	repoCtx := context.WithValue(ctx, "repo", owner+"/"+repoName)
	ghRepo, _, err := s.apiClient.Repositories.Get(repoCtx, owner, repoName)
	if err != nil {
		return fmt.Errorf("failed to fetch repository: %w", err)
	}
	s.cacheRepoInfo(ghRepo)

	// Create a folder housing the repo and commit data
	userHomeDir, err := os.UserHomeDir()
	if err != nil {
		return fmt.Errorf("failed to get user home directory: %w", err)
	}
	folderPath := userHomeDir + "/.trufflehog/" + owner + "/" + repoName
	err = os.MkdirAll(folderPath, 0755)
	if err != nil {
		return fmt.Errorf("failed to create .trufflehog folder in user's home directory: %w", err)
	}

	// Get GitHub User tied to token
	ghUser, err := getGitHubUser()
	if err != nil {
		return fmt.Errorf("failed to get GitHub user details: %w", err)
	}

	// get the number of forks
	forksCount, err := getForksCount(owner, repoName)
	if err != nil {
		return fmt.Errorf("failed to get forks count: %w", err)
	}

	// download the repo
	path, repo, err := git.CloneRepoUsingToken(ctx, ghToken, repoURL, ghUser)
	if err != nil {
		return fmt.Errorf("failed to clone the repository: %w", err)
	}
	defer os.RemoveAll(path)

	// count total valid hashes
	validHashes, err := getExistingHashes(path)
	if err != nil {
		return fmt.Errorf("failed to enumerate existing commit object hashes: %w", err)
	}

	// Calculate estimated used key set
	estimatedUsedKeySet := calculateUsedKeySet(len(validHashes), forksCount)

	// Calculate Short SHA-1 Length for Unambiguous Commit Identifiers
	shortShaLen := getShortShaLen(estimatedUsedKeySet)

	// Log stats
	repoCtx.Logger().V(2).Info("Estimated used keys", "count", estimatedUsedKeySet)
	repoCtx.Logger().V(2).Info("Target Short SHA-1 length", "length", shortShaLen)
	repoCtx.Logger().V(2).Info("Estimated collisions", "count", estimateCollisions(1<<(shortShaLen*4), estimatedUsedKeySet))

	// Read in existing commits (if any)
	validHiddenCommits, err := readCommitsFromDisk(validHiddenCommit, folderPath)
	if err != nil {
		return fmt.Errorf("failed to read valid hidden commits from disk: %w", err)
	}
	invalidCommits, err := readCommitsFromDisk(invalidCommit, folderPath)
	if err != nil {
		return fmt.Errorf("failed to read invalid commits from disk: %w", err)
	}

	// Generate all possible commit hashes using the short SHA-1 length
	possibleCommits := generateShortSHAStrings(shortShaLen)

	// Remove commits that are already used by the repo or previously calculated (on restart)
	possibleCommits = removeBySHA(validHashes, possibleCommits, shortShaLen)
	possibleCommits = removeBySHA(validHiddenCommits, possibleCommits, shortShaLen)
	possibleCommits = removeByShortSHA(invalidCommits, possibleCommits)

	// Guess all possible commit hashes
	processCommits(ctx, possibleCommits, owner, repoName, folderPath)

	// Read in the commits again: processCommits appended this run's
	// discoveries to the cache file, and the list loaded above predates them.
	validHiddenCommits, err = readCommitsFromDisk(validHiddenCommit, folderPath)
	if err != nil {
		return fmt.Errorf("failed to read valid hidden commits from disk: %w", err)
	}

	// Download commit hashes and checkout into branches (only way scanner will pick them up)
	err = downloadPatches(validHiddenCommits, path)
	if err != nil {
		return fmt.Errorf("failed to download patches: %w", err)
	}

	// Scan git for secrets
	repoCtx.Logger().V(2).Info("scanning for secrets in repo", "repo_url", repoURL)
	start := time.Now()
	err = s.git.ScanRepo(ctx, repo, path, s.scanOptions, sources.ChanReporter{Ch: chunksChan})
	if err != nil {
		return fmt.Errorf("failed to scan repo: %w", err)
	}
	duration := time.Since(start)
	repoCtx.Logger().V(2).Info("scanned 1 repo for hidden data", "duration_seconds", duration)

	// Remove the folder if user requests
	if s.conn.DeleteCachedData {
		err = os.RemoveAll(folderPath)
		if err != nil {
			return fmt.Errorf("failed to delete cached data: %w", err)
		}
	}

	return nil
}

View file

@ -0,0 +1,71 @@
package github_experimental
import (
"fmt"
"strings"
"sync"
"github.com/google/go-github/v63/github"
"github.com/trufflesecurity/trufflehog/v3/pkg/giturl"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
)
// repoInfoCache is a mutex-guarded, in-memory map from a repository URL to the
// repoInfo recorded for that repository.
type repoInfoCache struct {
// mu guards cache: put takes the write lock, get takes the read lock.
mu sync.RWMutex
// cache maps the repository URL passed to put to its repoInfo.
cache map[string]repoInfo
}
// newRepoInfoCache returns an empty repoInfoCache whose backing map is already
// allocated, so entries can be stored in it straight away.
func newRepoInfoCache() repoInfoCache {
	return repoInfoCache{cache: map[string]repoInfo{}}
}
// put stores info under repoURL, replacing any entry already cached for that
// URL. The write lock is held for the duration of the update.
//
// The backing map is allocated on first use, so a zero-value repoInfoCache
// (one that was not built by newRepoInfoCache) can be written to without
// panicking.
func (r *repoInfoCache) put(repoURL string, info repoInfo) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.cache == nil {
		// Assigning into a nil map panics; allocate lazily while holding the lock.
		r.cache = make(map[string]repoInfo)
	}
	r.cache[repoURL] = info
}
// get looks up the repoInfo cached for repoURL under the read lock. The boolean
// result reports whether an entry was present; when it is false the returned
// repoInfo is the zero value.
func (r *repoInfoCache) get(repoURL string) (repoInfo, bool) {
	r.mu.RLock()
	cached, found := r.cache[repoURL]
	r.mu.RUnlock()
	return cached, found
}
// repoInfo is the per-repository metadata kept in repoInfoCache. It is filled
// in from a *github.Repository by cacheRepoInfo.
type repoInfo struct {
// owner is the login of the repository's owner.
owner string
// name is the repository's name.
name string
// fullName is the repository's full name as reported by the GitHub API.
fullName string
hasWiki bool // the repo is _likely_ to have a wiki (see the comment on wikiIsReachable func).
// size is the repository size as reported by the GitHub API.
size int
// visibility records whether the repository is private or public.
visibility source_metadatapb.Visibility
}
// cacheRepoInfo records the metadata of r in the source's repoInfoCache, keyed
// by the repository's clone URL. Visibility is derived from the repository's
// private flag: private repositories are stored as Visibility_private and
// everything else as Visibility_public.
func (s *Source) cacheRepoInfo(r *github.Repository) {
	visibility := source_metadatapb.Visibility_public
	if r.GetPrivate() {
		visibility = source_metadatapb.Visibility_private
	}

	s.repoInfoCache.put(r.GetCloneURL(), repoInfo{
		owner:      r.GetOwner().GetLogin(),
		name:       r.GetName(),
		fullName:   r.GetFullName(),
		hasWiki:    r.GetHasWiki(),
		size:       r.GetSize(),
		visibility: visibility,
	})
}
// normalizeRepo converts a repository reference into its normalized GitHub
// URL. Only values containing a '/' are treated as URLs and handed to
// giturl.NormalizeGithubRepo; anything else is rejected with an error.
func (s *Source) normalizeRepo(repo string) (string, error) {
	// Without a '/' the value is not treated as a URL, so there is nothing to normalize.
	if !strings.Contains(repo, "/") {
		return "", fmt.Errorf("no repositories found for %s", repo)
	}
	return giturl.NormalizeGithubRepo(repo)
}

View file

@ -238,6 +238,20 @@ type GithubConfig struct {
IncludeWikis bool
}
// GitHubExperimentalConfig defines the optional configuration for an experimental GitHub source.
type GitHubExperimentalConfig struct {
// Repository is the repository to scan, e.g. https://github.com/<user>/<repo>.git.
Repository string
// Token is the token to use to authenticate with the source.
Token string
// ObjectDiscovery indicates whether to discover all commit objects (CFOR, Cross Fork Object
// Reference) in the repository.
ObjectDiscovery bool
// CollisionThreshold is the number of short-sha collisions tolerated during hidden data enumeration.
// The default of 1 is supplied by the CLI flag; the zero value of this struct does not apply it.
CollisionThreshold int
// DeleteCachedData indicates whether to delete cached data after the object-discovery scan.
DeleteCachedData bool
}
// GitlabConfig defines the optional configuration for a gitlab source.
type GitlabConfig struct {
// Endpoint is the endpoint of the source.

View file

@ -49,6 +49,7 @@ enum SourceType {
SOURCE_TYPE_WEBHOOK = 34;
SOURCE_TYPE_ELASTICSEARCH = 35;
SOURCE_TYPE_HUGGINGFACE = 36;
SOURCE_TYPE_GITHUB_EXPERIMENTAL = 37;
}
message LocalSource {
@ -243,6 +244,16 @@ message GitHub {
bool include_wikis = 19;
}
// GitHubExperimental holds the settings for the experimental GitHub source.
// NOTE(review): presumably paired with SOURCE_TYPE_GITHUB_EXPERIMENTAL — confirm.
message GitHubExperimental {
// Repository to scan.
string repository = 1;
// Credential used to authenticate; a token is the only option defined here.
oneof credential {
string token = 2;
}
// Whether to run the object-discovery sub-module.
bool object_discovery = 3;
// Short-SHA collision threshold used during object discovery.
int64 collision_threshold = 4;
// Whether to delete the cached data once scanning has finished.
bool delete_cached_data = 5;
}
message GoogleDrive {
oneof credential {
string refresh_token = 1;