trufflehog/pkg/sources/sources.go
ahrav d334b3075e
move all Git setup into Init method (#2105)
* add proto fields for git

* add uri to proto

* move all git setup into Init method

* fix logic for when to use repoPath
2023-11-16 13:59:53 -08:00

325 lines
13 KiB
Go

package sources
import (
"sync"
"google.golang.org/protobuf/types/known/anypb"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
)
type (
	// SourceID is the unique identifier of a configured source,
	// used for tracking relationships in the DB.
	SourceID int64
	// JobID is the unique identifier of a scan job,
	// used for tracking relationships in the DB.
	JobID int64
)
// Chunk contains data to be decoded and scanned along with context on where it came from.
type Chunk struct {
	// SourceName is the name of the Source that produced the chunk.
	SourceName string
	// SourceID is the ID of the source that the Chunk originated from.
	SourceID SourceID
	// JobID is the ID of the job that the Chunk originated from.
	JobID JobID
	// SourceType is the type of Source that produced the chunk.
	SourceType sourcespb.SourceType
	// SourceMetadata holds the context of where the Chunk was found.
	SourceMetadata *source_metadatapb.MetaData

	// Data is the raw bytes to decode and scan.
	Data []byte
	// Verify specifies whether any secrets in the Chunk should be verified.
	Verify bool
}
// ChunkingTarget specifies criteria for a targeted chunking process.
// Instead of collecting data indiscriminately, this struct allows the caller
// to specify particular subsets of data they're interested in. This becomes
// especially useful when one needs to verify or recheck specific data points
// without processing the entire dataset.
type ChunkingTarget struct {
	// QueryCriteria represents specific parameters or conditions to target
	// the chunking process; its interpretation is source-specific.
	QueryCriteria *source_metadatapb.MetaData
}
// Source defines the interface required to implement a source chunker.
type Source interface {
	// Type returns the source type, used for matching against configuration and jobs.
	Type() sourcespb.SourceType
	// SourceID returns the initialized source ID used for tracking relationships in the DB.
	SourceID() SourceID
	// JobID returns the initialized job ID used for tracking relationships in the DB.
	JobID() JobID
	// Init initializes the source. The connection is a source-specific
	// configuration payload decoded by each implementation.
	Init(ctx context.Context, name string, jobID JobID, sourceID SourceID, verify bool, connection *anypb.Any, concurrency int) error
	// Chunks emits data over a channel which is then decoded and scanned for secrets.
	// By default, data is obtained indiscriminately. However, by providing one or more
	// ChunkingTarget parameters, the caller can direct the function to retrieve
	// specific chunks of data. This targeted approach allows for efficient and
	// intentional data processing, beneficial when verifying or rechecking specific data points.
	Chunks(ctx context.Context, chunksChan chan *Chunk, targets ...ChunkingTarget) error
	// GetProgress is the completion progress (percentage) for Scanned Source.
	GetProgress() *Progress
}
// SourceUnitEnumChunker are the two required interfaces to support enumerating
// and chunking of units.
type SourceUnitEnumChunker interface {
	SourceUnitEnumerator
	SourceUnitChunker
}
// SourceUnitUnmarshaller defines an optional interface a Source can implement
// to support units coming from an external source.
type SourceUnitUnmarshaller interface {
	// UnmarshalSourceUnit deserializes an externally provided byte
	// representation into a SourceUnit the implementation understands.
	UnmarshalSourceUnit(data []byte) (SourceUnit, error)
}
// SourceUnitEnumerator defines an optional interface a Source can implement to
// support enumerating an initialized Source into SourceUnits.
type SourceUnitEnumerator interface {
	// Enumerate creates 0 or more units from an initialized source,
	// reporting them or any errors to the UnitReporter. This method is
	// synchronous but can be called in a goroutine to support concurrent
	// enumeration and chunking. An error should only be returned from this
	// method in the case of context cancellation, fatal source errors, or
	// errors returned by the reporter. All other errors related to unit
	// enumeration are tracked by the UnitReporter.
	Enumerate(ctx context.Context, reporter UnitReporter) error
}
// UnitReporter defines the interface a source will use to report whether a
// unit was found during enumeration. Either method may be called any number of
// times. Implementors of this interface should allow for concurrent calls.
type UnitReporter interface {
	// UnitOk reports a successfully enumerated unit.
	UnitOk(ctx context.Context, unit SourceUnit) error
	// UnitErr reports a non-fatal error encountered during enumeration.
	UnitErr(ctx context.Context, err error) error
}
// SourceUnitChunker defines an optional interface a Source can implement to
// support chunking a single SourceUnit.
type SourceUnitChunker interface {
	// ChunkUnit creates 0 or more chunks from a unit, reporting them or
	// any errors to the ChunkReporter. An error should only be returned
	// from this method in the case of context cancellation, fatal source
	// errors, or errors returned by the reporter. All other errors related
	// to unit chunking are tracked by the ChunkReporter.
	ChunkUnit(ctx context.Context, unit SourceUnit, reporter ChunkReporter) error
}
// ChunkReporter defines the interface a source will use to report whether a
// chunk was found during unit chunking. Either method may be called any number
// of times. Implementors of this interface should allow for concurrent calls.
type ChunkReporter interface {
	// ChunkOk reports a successfully produced chunk.
	ChunkOk(ctx context.Context, chunk Chunk) error
	// ChunkErr reports a non-fatal error encountered while chunking a unit.
	ChunkErr(ctx context.Context, err error) error
}
// SourceUnit is an object that represents a Source's unit of work. This is
// used as the output of enumeration, progress reporting, and job distribution.
type SourceUnit interface {
	// SourceUnitID uniquely identifies a source unit.
	SourceUnitID() string
}
// GCSConfig defines the optional configuration for a GCS source.
type GCSConfig struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred,
	// WithoutAuth is a flag to indicate whether to use authentication.
	WithoutAuth bool
	// ApiKey is the API key to use to authenticate with the source.
	// (Name kept as-is for backward compatibility; Go convention would be APIKey.)
	ApiKey,
	// ProjectID is the project ID to use to authenticate with the source.
	ProjectID,
	// ServiceAccount is the service account to use to authenticate with the source.
	ServiceAccount string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// IncludeBuckets is a list of buckets to include in the scan.
	IncludeBuckets,
	// ExcludeBuckets is a list of buckets to exclude from the scan.
	ExcludeBuckets,
	// IncludeObjects is a list of objects to include in the scan.
	IncludeObjects,
	// ExcludeObjects is a list of objects to exclude from the scan.
	ExcludeObjects []string
}
// GitConfig defines the optional configuration for a git source.
type GitConfig struct {
	// HeadRef is the head reference to use to scan from.
	HeadRef string
	// BaseRef is the base reference to use to scan from.
	BaseRef string
	// MaxDepth is the maximum depth to scan the source.
	MaxDepth int
	// Bare is an indicator to handle bare repositories properly.
	Bare bool
	// URI is the URI of the repository to scan. file://, http://, https:// and ssh:// are supported.
	URI string
	// IncludePathsFile is the path to a file containing a list of regexps to include in the scan.
	IncludePathsFile string
	// ExcludePathsFile is the path to a file containing a list of regexps to exclude from the scan.
	ExcludePathsFile string
	// ExcludeGlobs is a list of comma separated globs to exclude from the scan.
	// This differs from the Filter exclusions as ExcludeGlobs is applied at the `git log -p` level.
	ExcludeGlobs string
}
// GithubConfig defines the optional configuration for a github source.
type GithubConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint,
	// Token is the token to use to authenticate with the source.
	Token string
	// IncludeForks indicates whether to include forks in the scan.
	IncludeForks,
	// IncludeMembers indicates whether to include members in the scan.
	IncludeMembers bool
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// Repos is the list of repositories to scan.
	Repos,
	// Orgs is the list of organizations to scan.
	Orgs,
	// ExcludeRepos is a list of repositories to exclude from the scan.
	ExcludeRepos,
	// IncludeRepos is a list of repositories to include in the scan.
	IncludeRepos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
	// IncludeIssueComments indicates whether to include GitHub issue comments in the scan.
	IncludeIssueComments,
	// IncludePullRequestComments indicates whether to include GitHub pull request comments in the scan.
	IncludePullRequestComments,
	// IncludeGistComments indicates whether to include GitHub gist comments in the scan.
	IncludeGistComments bool
}
// GitlabConfig defines the optional configuration for a gitlab source.
type GitlabConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint,
	// Token is the token to use to authenticate with the source.
	Token string
	// Repos is the list of repositories to scan.
	Repos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
}
// FilesystemConfig defines the optional configuration for a filesystem source.
type FilesystemConfig struct {
	// Paths is the list of files and directories to scan.
	Paths []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
}
// S3Config defines the optional configuration for an S3 source.
type S3Config struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred bool
	// Key is any key to use to authenticate with the source.
	Key,
	// Secret is any secret to use to authenticate with the source.
	Secret,
	// SessionToken is a temporary session token associated with a temporary
	// access key id and secret key.
	SessionToken string
	// Buckets is the list of buckets to scan.
	Buckets []string
	// Roles is the list of Roles to use.
	Roles []string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
}
// SyslogConfig defines the optional configuration for a syslog source.
type SyslogConfig struct {
	// Address used to connect to the source.
	Address,
	// Protocol used to connect to the source.
	Protocol,
	// CertPath is the path to the certificate to use to connect to the source.
	CertPath,
	// Format is the format used to connect to the source.
	Format,
	// KeyPath is the path to the key to use to connect to the source.
	KeyPath string
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
}
// Progress is used to update job completion progress across sources.
// All exported fields are written under mut by the Set* methods; this
// struct contains a Mutex and must not be copied.
type Progress struct {
	// mut serializes updates made by the Set* methods.
	mut sync.Mutex
	// PercentComplete is the 0-100 completion percentage.
	PercentComplete int64
	// Message is the public-facing description of current progress.
	Message string
	// EncodedResumeInfo is an opaque string holding whatever a source needs
	// to resume an interrupted job; may be empty.
	EncodedResumeInfo string
	// SectionsCompleted is the number of top-level items scanned so far.
	SectionsCompleted int32
	// SectionsRemaining holds the total scope of top-level items; it is
	// explicitly zeroed when the scope is unknown (see SetProgressOngoing).
	SectionsRemaining int32
}
// Validator is an interface for validating a source. Sources can optionally implement this interface to validate
// their configuration.
type Validator interface {
	// Validate checks the source's configuration and returns all problems
	// found; an empty or nil slice means the configuration is valid.
	Validate(ctx context.Context) []error
}
// SetProgressComplete sets job progress information for a running job based on the highest level objects in the source.
// i is the current iteration in the loop of target scope
// scope should be the len(scopedItems)
// message is the public facing user information about the current progress
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted
//
// NOTE: SetProgressOngoing should be used when source does not yet know how many items it is scanning (scope)
// and does not want to display a percentage complete
func (p *Progress) SetProgressComplete(i, scope int, message, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
	p.SectionsCompleted = int32(i)
	p.SectionsRemaining = int32(scope)

	// If the iteration and scope are both 0, completion is 100%.
	if i == 0 && scope == 0 {
		p.PercentComplete = 100
		return
	}
	// Guard the division: if scope is 0 (but i is not), float64(i)/0 is +Inf
	// and converting +Inf to int64 is implementation-dependent per the Go
	// spec (a huge negative value on amd64). Leave PercentComplete unchanged
	// rather than publish garbage.
	if scope == 0 {
		return
	}
	p.PercentComplete = int64((float64(i) / float64(scope)) * 100)
}
// SetProgressOngoing sets information about the current running job based on
// the highest level objects in the source.
// message is the public facing user information about the current progress
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted
//
// NOTE: This method should be used over SetProgressComplete when the source does
// not yet know how many items it is scanning and does not want to display a percentage complete.
func (p *Progress) SetProgressOngoing(message string, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	// Zero out SectionsRemaining first: a zero scope tells the frontend
	// that no percentage should be displayed for this job.
	p.SectionsRemaining = 0

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
}
// GetProgress gets job completion percentage for metrics reporting.
//
// NOTE(review): this returns the live *Progress rather than a snapshot, so
// callers read its fields after the lock is released; the lock here only
// serializes with an in-flight Set* call at the moment of return. Confirm
// whether callers expect a consistent snapshot.
func (p *Progress) GetProgress() *Progress {
	p.mut.Lock()
	defer p.mut.Unlock()
	return p
}