mirror of
https://github.com/trufflesecurity/trufflehog.git
synced 2024-11-10 07:04:24 +00:00
311494e86e
* Add stub source and elastic API funcs * Spawn workers and ship chunks * Now successfully detects a credential - Added tests - Added some documentation comments - Threaded the passed context through to all the API requests * Linting fixes * Add integration tests and resolve some bugs they uncovered * Logstash -> Elasticsearch * Add support for --index-pattern * Add support for --query-json * Use structs instead of string building to construct a search body * Support --since-timestamp * Implement additional authentication methods * Fix some small bugs * Refactoring to support --best-effort-scan * Finish implementation of --best-effort-scan * Implement scan catch-up * Finish connecting support for nodes CLI arg * Add some integration tests around the catchup mechanism * go mod tidy * Fix some linting issues * Remove some debugging Prints * Move off of _doc * Remove informational Printf and add informational logging * Remove debugging logging * Copy the index from the outer loop as well * Don't burn up the ES API with rapid requests if there's no work to do in subsequent scans * No need to export UnitOfWork.AddSearch * Use a better name for the range query variable when building the timestamp range clause in searches * Replace some unlocking defers with explicit unlocks to make the synchronized part of the code clearer * found -> ok * Remove superfluous buildElasticClient method --------- Co-authored-by: Charlie Gunyon <charlie@spectral.energy>
409 lines
16 KiB
Go
409 lines
16 KiB
Go
package sources
|
|
|
|
import (
|
|
"sync"
|
|
|
|
"google.golang.org/protobuf/types/known/anypb"
|
|
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
|
|
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
|
|
)
|
|
|
|
type (
	// SourceID is the unique identifier of a Source, used for tracking
	// relationships in the DB.
	SourceID int64
	// JobID is the unique identifier of a scan job, used for tracking
	// relationships in the DB.
	JobID int64
)
|
|
|
|
// Chunk contains data to be decoded and scanned along with context on where it came from.
//
// **Important:** The order of the fields in this struct is specifically designed to optimize
// struct alignment and minimize memory usage. Do not change the field order without carefully considering
// the potential impact on memory consumption.
// Ex: https://go.dev/play/p/Azf4a7O-DhC
type Chunk struct {
	// Data is the raw data to decode and scan for secrets.
	Data []byte

	// SourceName is the name of the Source that produced the chunk.
	SourceName string
	// SourceID is the ID of the source that the Chunk originated from.
	SourceID SourceID
	// JobID is the ID of the job that the Chunk originated from.
	JobID JobID
	// SecretID is the ID of the secret, if it exists.
	// Only secrets that are being reverified will have a SecretID.
	SecretID int64

	// SourceMetadata holds the context of where the Chunk was found.
	SourceMetadata *source_metadatapb.MetaData
	// SourceType is the type of Source that produced the chunk.
	SourceType sourcespb.SourceType

	// Verify specifies whether any secrets in the Chunk should be verified.
	Verify bool
}
|
|
|
|
// ChunkingTarget specifies criteria for a targeted chunking process.
// Instead of collecting data indiscriminately, this struct allows the caller
// to specify particular subsets of data they're interested in. This becomes
// especially useful when one needs to verify or recheck specific data points
// without processing the entire dataset.
type ChunkingTarget struct {
	// QueryCriteria represents specific parameters or conditions to target the chunking process.
	QueryCriteria *source_metadatapb.MetaData
	// SecretID is the ID of the secret being targeted.
	// It is propagated to the resulting Chunk's SecretID field.
	SecretID int64
}
|
|
|
|
// Source defines the interface required to implement a source chunker.
|
|
type Source interface {
|
|
// Type returns the source type, used for matching against configuration and jobs.
|
|
Type() sourcespb.SourceType
|
|
// SourceID returns the initialized source ID used for tracking relationships in the DB.
|
|
SourceID() SourceID
|
|
// JobID returns the initialized job ID used for tracking relationships in the DB.
|
|
JobID() JobID
|
|
// Init initializes the source.
|
|
Init(aCtx context.Context, name string, jobId JobID, sourceId SourceID, verify bool, connection *anypb.Any, concurrency int) error
|
|
// Chunks emits data over a channel which is then decoded and scanned for secrets.
|
|
// By default, data is obtained indiscriminately. However, by providing one or more
|
|
// ChunkingTarget parameters, the caller can direct the function to retrieve
|
|
// specific chunks of data. This targeted approach allows for efficient and
|
|
// intentional data processing, beneficial when verifying or rechecking specific data points.
|
|
Chunks(ctx context.Context, chunksChan chan *Chunk, targets ...ChunkingTarget) error
|
|
// GetProgress is the completion progress (percentage) for Scanned Source.
|
|
GetProgress() *Progress
|
|
}
|
|
|
|
// SourceUnitEnumChunker are the two required interfaces to support enumerating
// and chunking of units. A Source implementing both can have its enumeration
// and chunking phases run independently.
type SourceUnitEnumChunker interface {
	SourceUnitEnumerator
	SourceUnitChunker
}
|
|
|
|
// SourceUnitUnmarshaller defines an optional interface a Source can implement
// to support units coming from an external source.
type SourceUnitUnmarshaller interface {
	// UnmarshalSourceUnit reconstructs a SourceUnit from its raw byte
	// representation.
	UnmarshalSourceUnit(data []byte) (SourceUnit, error)
}
|
|
|
|
// SourceUnitEnumerator defines an optional interface a Source can implement to
// support enumerating an initialized Source into SourceUnits.
type SourceUnitEnumerator interface {
	// Enumerate creates 0 or more units from an initialized source,
	// reporting them or any errors to the UnitReporter. This method is
	// synchronous but can be called in a goroutine to support concurrent
	// enumeration and chunking. An error should only be returned from this
	// method in the case of context cancellation, fatal source errors, or
	// errors returned by the reporter. All other errors related to unit
	// enumeration are tracked by the UnitReporter.
	Enumerate(ctx context.Context, reporter UnitReporter) error
}
|
|
|
|
// UnitReporter defines the interface a source will use to report whether a
// unit was found during enumeration. Either method may be called any number of
// times. Implementors of this interface should allow for concurrent calls.
type UnitReporter interface {
	// UnitOk reports a successfully enumerated unit.
	UnitOk(ctx context.Context, unit SourceUnit) error
	// UnitErr reports a unit-level error encountered during enumeration
	// (fatal errors are instead returned from Enumerate itself).
	UnitErr(ctx context.Context, err error) error
}
|
|
|
|
// SourceUnitChunker defines an optional interface a Source can implement to
// support chunking a single SourceUnit.
type SourceUnitChunker interface {
	// ChunkUnit creates 0 or more chunks from a unit, reporting them or
	// any errors to the ChunkReporter. An error should only be returned
	// from this method in the case of context cancellation, fatal source
	// errors, or errors returned by the reporter. All other errors related
	// to unit chunking are tracked by the ChunkReporter.
	ChunkUnit(ctx context.Context, unit SourceUnit, reporter ChunkReporter) error
}
|
|
|
|
// ChunkReporter defines the interface a source will use to report whether a
// chunk was found during unit chunking. Either method may be called any number
// of times. Implementors of this interface should allow for concurrent calls.
type ChunkReporter interface {
	// ChunkOk reports a successfully produced chunk.
	ChunkOk(ctx context.Context, chunk Chunk) error
	// ChunkErr reports a chunk-level error encountered during unit chunking
	// (fatal errors are instead returned from ChunkUnit itself).
	ChunkErr(ctx context.Context, err error) error
}
|
|
|
|
type SourceUnitKind string
|
|
|
|
// SourceUnit is an object that represents a Source's unit of work. This is
// used as the output of enumeration, progress reporting, and job distribution.
type SourceUnit interface {
	// SourceUnitID uniquely identifies a source unit. It does not need to
	// be human readable or two-way, however, it should be canonical and
	// stable across runs.
	SourceUnitID() (string, SourceUnitKind)

	// Display is the human readable representation of the SourceUnit.
	Display() string
}
|
|
|
|
// DockerConfig defines the optional configuration for a Docker source.
type DockerConfig struct {
	// Images is the list of images to scan.
	Images []string
	// BearerToken is the token to use to authenticate with the source.
	BearerToken string
	// UseDockerKeychain determines whether to use the Docker keychain
	// for authentication instead of an explicit token.
	UseDockerKeychain bool
}
|
|
|
|
// GCSConfig defines the optional configuration for a GCS source.
type GCSConfig struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred,
	// WithoutAuth is a flag to indicate whether to use authentication.
	WithoutAuth bool
	// ApiKey is the API key to use to authenticate with the source.
	// NOTE(review): "ApiKey" breaks Go's initialism convention (APIKey), but
	// renaming an exported field would break existing callers.
	ApiKey,
	// ProjectID is the project ID to use to authenticate with the source.
	ProjectID,
	// ServiceAccount is the service account to use to authenticate with the source.
	ServiceAccount string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// IncludeBuckets is a list of buckets to include in the scan.
	IncludeBuckets,
	// ExcludeBuckets is a list of buckets to exclude from the scan.
	ExcludeBuckets,
	// IncludeObjects is a list of objects to include in the scan.
	IncludeObjects,
	// ExcludeObjects is a list of objects to exclude from the scan.
	ExcludeObjects []string
}
|
|
|
|
// GitConfig defines the optional configuration for a git source.
type GitConfig struct {
	// HeadRef is the head reference to use to scan from.
	HeadRef string
	// BaseRef is the base reference to use to scan from.
	BaseRef string
	// MaxDepth is the maximum depth to scan the source.
	MaxDepth int
	// Bare is an indicator to handle bare repositories properly.
	Bare bool
	// URI is the URI of the repository to scan. file://, http://, https:// and ssh:// are supported.
	URI string
	// IncludePathsFile is the path to a file containing a list of regexps to include in the scan.
	IncludePathsFile string
	// ExcludePathsFile is the path to a file containing a list of regexps to exclude from the scan.
	ExcludePathsFile string
	// ExcludeGlobs is a list of comma separated globs to exclude from the scan.
	// This differs from the Filter exclusions as ExcludeGlobs is applied at the `git log -p` level.
	ExcludeGlobs string
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
}
|
|
|
|
// GithubConfig defines the optional configuration for a github source.
type GithubConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint string
	// Token is the token to use to authenticate with the source.
	Token string
	// IncludeForks indicates whether to include forks in the scan.
	IncludeForks bool
	// IncludeMembers indicates whether to include members in the scan.
	IncludeMembers bool
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// Repos is the list of repositories to scan.
	Repos []string
	// Orgs is the list of organizations to scan.
	Orgs []string
	// ExcludeRepos is a list of repositories to exclude from the scan.
	ExcludeRepos []string
	// IncludeRepos is a list of repositories to include in the scan.
	IncludeRepos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
	// IncludeIssueComments indicates whether to include GitHub issue comments in the scan.
	IncludeIssueComments bool
	// IncludePullRequestComments indicates whether to include GitHub pull request comments in the scan.
	IncludePullRequestComments bool
	// IncludeGistComments indicates whether to include GitHub gist comments in the scan.
	IncludeGistComments bool
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
	// IncludeWikis indicates whether to include repository wikis in the scan.
	IncludeWikis bool
}
|
|
|
|
// GitlabConfig defines the optional configuration for a gitlab source.
type GitlabConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint string
	// Token is the token to use to authenticate with the source.
	Token string
	// Repos is the list of repositories to scan.
	Repos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
}
|
|
|
|
// FilesystemConfig defines the optional configuration for a filesystem source.
type FilesystemConfig struct {
	// Paths is the list of files and directories to scan.
	Paths []string
	// IncludePathsFile is the path to a file containing a list of regexps to include in the scan.
	IncludePathsFile string
	// ExcludePathsFile is the path to a file containing a list of regexps to exclude from the scan.
	ExcludePathsFile string
}
|
|
|
|
// S3Config defines the optional configuration for an S3 source.
type S3Config struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred bool
	// Key is any key to use to authenticate with the source.
	Key,
	// Secret is any secret to use to authenticate with the source.
	Secret,
	// SessionToken is a temporary session token associated with a temporary
	// access key id and secret key.
	SessionToken string
	// Buckets is the list of buckets to scan.
	Buckets []string
	// IgnoreBuckets is the list of buckets to ignore.
	IgnoreBuckets []string
	// Roles is the list of Roles to use.
	Roles []string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
}
|
|
|
|
// SyslogConfig defines the optional configuration for a syslog source.
type SyslogConfig struct {
	// Address used to connect to the source.
	Address,
	// Protocol used to connect to the source.
	Protocol,
	// CertPath is the path to the certificate to use to connect to the source.
	CertPath,
	// Format is the format used to connect to the source.
	Format,
	// KeyPath is the path to the key to use to connect to the source.
	KeyPath string
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
}
|
|
|
|
// PostmanConfig defines the optional configuration for a Postman source.
type PostmanConfig struct {
	// Workspaces is a list of workspace UUIDs or file paths to Postman workspaces (.zip).
	Workspaces []string
	// Collections is a list of collection IDs or file paths to Postman collections (.json).
	Collections []string
	// Environments is a list of environment IDs or file paths to Postman environments (.json).
	Environments []string
	// Token is the token to use to authenticate with the API.
	Token string
	// IncludeCollections is a list of Collections to include in the scan.
	IncludeCollections []string
	// IncludeEnvironments is a list of Environments to include in the scan.
	IncludeEnvironments []string
	// ExcludeCollections is a list of Collections to exclude from the scan.
	ExcludeCollections []string
	// ExcludeEnvironments is a list of Environments to exclude from the scan.
	ExcludeEnvironments []string
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// CollectionPaths is the list of paths to Postman collections.
	CollectionPaths []string
	// WorkspacePaths is the list of paths to Postman workspaces.
	WorkspacePaths []string
	// EnvironmentPaths is the list of paths to Postman environments.
	EnvironmentPaths []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
}
|
|
|
|
// ElasticsearchConfig defines the optional configuration for an Elasticsearch
// source.
type ElasticsearchConfig struct {
	// Nodes is the list of Elasticsearch node addresses to connect to.
	Nodes []string
	// Username and Password authenticate with basic credentials.
	Username string
	Password string
	// CloudID identifies an Elastic Cloud deployment to connect to.
	CloudID string
	// APIKey authenticates using an Elasticsearch API key.
	APIKey string
	// ServiceToken authenticates using an Elasticsearch service token.
	ServiceToken string
	// IndexPattern selects which indices to scan (--index-pattern).
	IndexPattern string
	// QueryJSON is a JSON query used to filter the documents scanned (--query-json).
	QueryJSON string
	// SinceTimestamp restricts the scan using a timestamp range clause in
	// searches (--since-timestamp).
	SinceTimestamp string
	// BestEffortScan enables best-effort scanning (--best-effort-scan).
	// NOTE(review): exact semantics are defined by the elasticsearch source
	// implementation, not visible here.
	BestEffortScan bool
}
|
|
|
|
// Progress is used to update job completion progress across sources.
// All exported fields are guarded by mut within the methods below; callers
// holding a *Progress returned by GetProgress read them unsynchronized (see
// the note on GetProgress).
type Progress struct {
	mut               sync.Mutex
	PercentComplete   int64
	Message           string
	EncodedResumeInfo string
	SectionsCompleted int32
	SectionsRemaining int32
}

// Validator is an interface for validating a source. Sources can optionally implement this interface to validate
// their configuration.
type Validator interface {
	// Validate checks the source's configuration and returns all problems found.
	Validate(ctx context.Context) []error
}

// SetProgressComplete sets job progress information for a running job based on the highest level objects in the source.
// i is the current iteration in the loop of target scope
// scope should be the len(scopedItems)
// message is the public facing user information about the current progress
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted
//
// NOTE: SetProgressOngoing should be used when source does not yet know how many items it is scanning (scope)
// and does not want to display a percentage complete
func (p *Progress) SetProgressComplete(i, scope int, message, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
	p.SectionsCompleted = int32(i)
	p.SectionsRemaining = int32(scope)

	// A zero scope means there is no work to divide by, so report the job as
	// fully complete. This preserves the previous i == 0 && scope == 0 → 100%
	// behavior and additionally fixes i > 0 with scope == 0, which used to
	// divide by zero and convert +Inf to int64 — an implementation-defined
	// conversion yielding a garbage percentage.
	if scope == 0 {
		p.PercentComplete = 100
		return
	}

	p.PercentComplete = int64((float64(i) / float64(scope)) * 100)
}

// SetProgressOngoing sets information about the current running job based on
// the highest level objects in the source.
// message is the public facing user information about the current progress
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted
//
// NOTE: This method should be used over SetProgressComplete when the source does
// not yet know how many items it is scanning and does not want to display a percentage complete.
func (p *Progress) SetProgressOngoing(message string, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
	// Explicitly set SectionsRemaining to 0 so the frontend does not display a percent.
	p.SectionsRemaining = 0
}

// GetProgress gets job completion percentage for metrics reporting.
//
// NOTE(review): this returns the receiver itself, so the lock only serializes
// with an in-flight Set* call; fields read from the returned pointer after
// this method returns are not synchronized. Returning a pointer to a copy is
// not an option because Progress contains a Mutex (copylocks). Callers should
// read promptly or tolerate slightly stale values.
func (p *Progress) GetProgress() *Progress {
	p.mut.Lock()
	defer p.mut.Unlock()
	return p
}
|