trufflehog/pkg/gitparse/gitparse.go

981 lines
28 KiB
Go
Raw Normal View History

package gitparse
import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/go-logr/logr"

	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
	bufferwriter "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffer_writer"
	bufferedfilewriter "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer"
)
// Defaults applied by NewParser when no overriding Option is supplied.
const (
	// defaultDateFormat is the layout used to parse commit dates from git output.
	defaultDateFormat = "Mon Jan 2 15:04:05 2006 -0700"

	// defaultMaxDiffSize is the default upper bound, in bytes, for a single diff (2GB).
	defaultMaxDiffSize = 1 << 31

	// defaultMaxCommitSize is the default upper bound, in bytes, for a single commit (2GB).
	defaultMaxCommitSize = 1 << 31
)
// contentWriter defines a common interface for writing, reading, and managing diff content.
// It abstracts the underlying storage mechanism, allowing flexibility in how content is handled.
// This interface enables the use of different content storage strategies (e.g., in-memory buffer, file-based storage)
// based on performance needs or resource constraints, providing a unified way to interact with different content types.
//
// Within this file it is satisfied by the values returned from bufferwriter.New
// and bufferedfilewriter.New.
type contentWriter interface {
	// Write appends data to the content storage.
	Write(data []byte) (int, error)
	// ReadCloser provides a reader for accessing stored content.
	ReadCloser() (io.ReadCloser, error)
	// CloseForWriting closes the content storage for writing.
	CloseForWriting() error
	// Len returns the current size of the content.
	Len() int
	// String returns the content as a string or an error if the content cannot be converted to a string.
	String() (string, error)
}
// Diff contains the information about a file diff in a commit.
// It abstracts the underlying content representation, allowing for flexible handling of diff content.
// The use of contentWriter enables the management of diff data either in memory or on disk,
// based on its size, optimizing resource usage and performance.
type Diff struct {
	// PathB is the "b/" side path taken from the "+++" or "Binary files" line.
	// It is left empty when that side is /dev/null.
	PathB string
	// LineStart is the start line parsed from the third space-separated field
	// of the "@@" hunk header.
	LineStart int
	// IsBinary is set when the diff was reported through a "Binary files" line.
	IsBinary bool
	// Commit is the commit this diff belongs to.
	Commit *Commit
	// contentWriter stores the diff content collected by the parser.
	contentWriter contentWriter
}
// diffOption configures a Diff while it is being built by newDiff.
type diffOption func(*Diff)
// withPathB returns a diffOption that stores pathB in the Diff's PathB field.
func withPathB(pathB string) diffOption {
	return func(target *Diff) {
		target.PathB = pathB
	}
}
// withCustomContentWriter returns a diffOption that installs cr as the Diff's
// contentWriter, replacing whatever was set before.
func withCustomContentWriter(cr contentWriter) diffOption {
	install := func(target *Diff) {
		target.contentWriter = cr
	}
	return install
}
// newDiff builds a Diff bound to commit and applies opts in order.
// All Diffs must have an associated commit.
// When no option supplied a contentWriter, the Diff falls back to the writer
// returned by bufferwriter.New.
func newDiff(commit *Commit, opts ...diffOption) *Diff {
	d := new(Diff)
	d.Commit = commit

	for i := range opts {
		opts[i](d)
	}

	if d.contentWriter != nil {
		return d
	}
	d.contentWriter = bufferwriter.New()
	return d
}
// Len reports the current size of the diff content as given by the contentWriter.
func (d *Diff) Len() int {
	size := d.contentWriter.Len()
	return size
}
// ReadCloser hands back the contentWriter's reader over the stored diff content.
func (d *Diff) ReadCloser() (io.ReadCloser, error) {
	reader, err := d.contentWriter.ReadCloser()
	return reader, err
}
// write appends p to the diff content; only the error from the contentWriter
// is reported, the byte count is discarded.
func (d *Diff) write(p []byte) error {
	if _, err := d.contentWriter.Write(p); err != nil {
		return err
	}
	return nil
}
// finalize closes the Diff's contentWriter for writing and returns any error
// from that call. It should be called once no more content will be written so
// the writer can release or flush whatever it holds.
func (d *Diff) finalize() error {
	err := d.contentWriter.CloseForWriting()
	return err
}
// Commit contains commit header info and diffs.
type Commit struct {
	// Hash is bytes 7 through 46 of the "commit " line.
	Hash string
	// Author is the trimmed value of the "Author:" header.
	Author string
	// Committer is the trimmed value of the "Commit:" header.
	Committer string
	// Date is parsed from the "AuthorDate:" header.
	Date time.Time
	// Message accumulates the commit message lines, followed by any notes.
	Message strings.Builder
	Size    int // in bytes
	// hasDiffs records whether a diff for this commit has been sent on the channel.
	hasDiffs bool
}
// Parser sets values used in GitParse.
type Parser struct {
	// maxDiffSize is the diff size above which FromReader stops parsing.
	maxDiffSize int
	// maxCommitSize is set by WithMaxCommitSize.
	maxCommitSize int
	// dateFormat is the time layout used to parse "AuthorDate:" values.
	dateFormat string
	// useCustomContentWriter selects bufferedfilewriter instead of bufferwriter
	// for diff content.
	useCustomContentWriter bool
}
// ParseState identifies the kind of line the parser consumed most recently.
// The is*Line predicates use it to decide which kinds of line may come next.
type ParseState int

const (
	Initial ParseState = iota
	CommitLine
	MergeLine
	AuthorLine
	AuthorDateLine
	CommitterLine
	CommitterDateLine
	MessageStartLine
	MessageLine
	MessageEndLine
	NotesStartLine
	NotesLine
	NotesEndLine
	DiffLine
	ModeLine
	IndexLine
	FromFileLine
	ToFileLine
	BinaryFileLine
	HunkLineNumberLine
	HunkContentLine
	ParseFailure
)

// String returns the name of the state as declared above. Values outside the
// declared range are rendered as "ParseState(N)" instead of panicking on an
// out-of-range array index.
func (state ParseState) String() string {
	// The order must mirror the const block exactly.
	names := [...]string{
		"Initial",
		"CommitLine",
		"MergeLine",
		"AuthorLine",
		"AuthorDateLine",
		"CommitterLine",
		"CommitterDateLine",
		"MessageStartLine",
		"MessageLine",
		"MessageEndLine",
		"NotesStartLine",
		"NotesLine",
		"NotesEndLine",
		"DiffLine",
		"ModeLine",
		"IndexLine",
		"FromFileLine",
		"ToFileLine",
		"BinaryFileLine",
		"HunkLineNumberLine",
		"HunkContentLine",
		"ParseFailure",
	}
	if state < 0 || int(state) >= len(names) {
		return "ParseState(" + strconv.Itoa(int(state)) + ")"
	}
	return names[state]
}
// UseCustomContentWriter returns an Option that turns on the Parser's
// useCustomContentWriter flag.
func UseCustomContentWriter() Option {
	enable := func(p *Parser) {
		p.useCustomContentWriter = true
	}
	return enable
}
// WithMaxDiffSize returns an Option that stores maxDiffSize as the Parser's
// diff size limit.
func WithMaxDiffSize(maxDiffSize int) Option {
	apply := func(p *Parser) { p.maxDiffSize = maxDiffSize }
	return apply
}
// WithMaxCommitSize returns an Option that stores maxCommitSize as the
// Parser's commit size limit.
func WithMaxCommitSize(maxCommitSize int) Option {
	apply := func(p *Parser) { p.maxCommitSize = maxCommitSize }
	return apply
}
// Option configures a Parser. Options are applied in order by NewParser.
type Option func(*Parser)
// NewParser returns a Parser initialised with the package defaults for date
// format, max diff size and max commit size, then applies options in order so
// later options win over earlier ones and over the defaults.
func NewParser(options ...Option) *Parser {
	p := new(Parser)
	p.dateFormat = defaultDateFormat
	p.maxDiffSize = defaultMaxDiffSize
	p.maxCommitSize = defaultMaxCommitSize

	for i := range options {
		options[i](p)
	}
	return p
}
// RepoPath parses the output of the `git log` command for the `source` path.
// The Diff chan will return diffs in the order they are parsed from the log.
func (c *Parser) RepoPath(
ctx context.Context,
source string,
head string,
abbreviatedLog bool,
excludedGlobs []string,
isBare bool,
) (chan *Diff, error) {
args := []string{
"-C", source,
"log",
"--patch", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---patch
"--full-history",
"--date=format:%a %b %d %H:%M:%S %Y %z",
"--pretty=fuller", // https://git-scm.com/docs/git-log#_pretty_formats
"--notes", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---notesltrefgt
}
if abbreviatedLog {
args = append(args, "--diff-filter=AM")
}
if head != "" {
args = append(args, head)
} else {
args = append(args, "--all")
}
for _, glob := range excludedGlobs {
args = append(args, "--", ".", ":(exclude)"+glob)
}
cmd := exec.Command("git", args...)
absPath, err := filepath.Abs(source)
if err == nil {
if !isBare {
cmd.Env = append(cmd.Env, "GIT_DIR="+filepath.Join(absPath, ".git"))
} else {
cmd.Env = append(cmd.Env,
"GIT_DIR="+absPath,
)
// We need those variables to handle incoming commits
// while using trufflehog in pre-receive hooks
if dir := os.Getenv("GIT_OBJECT_DIRECTORY"); dir != "" {
cmd.Env = append(cmd.Env, "GIT_OBJECT_DIRECTORY="+dir)
}
if dir := os.Getenv("GIT_ALTERNATE_OBJECT_DIRECTORIES"); dir != "" {
cmd.Env = append(cmd.Env, "GIT_ALTERNATE_OBJECT_DIRECTORIES="+dir)
}
}
}
2023-07-25 22:52:34 +00:00
return c.executeCommand(ctx, cmd, false)
}
// Staged parses the output of the `git diff` command for the `source` path.
func (c *Parser) Staged(ctx context.Context, source string) (chan *Diff, error) {
// Provide the --cached flag to diff to get the diff of the staged changes.
args := []string{"-C", source, "diff", "-p", "--cached", "--full-history", "--diff-filter=AM", "--date=format:%a %b %d %H:%M:%S %Y %z"}
cmd := exec.Command("git", args...)
absPath, err := filepath.Abs(source)
if err == nil {
cmd.Env = append(cmd.Env, "GIT_DIR="+filepath.Join(absPath, ".git"))
}
2023-07-25 22:52:34 +00:00
return c.executeCommand(ctx, cmd, true)
}
// executeCommand runs an exec.Cmd, reads stdout and stderr, and waits for the Cmd to complete.
func (c *Parser) executeCommand(ctx context.Context, cmd *exec.Cmd, isStaged bool) (chan *Diff, error) {
diffChan := make(chan *Diff, 64)
stdOut, err := cmd.StdoutPipe()
if err != nil {
return diffChan, err
}
stdErr, err := cmd.StderrPipe()
if err != nil {
return diffChan, err
}
err = cmd.Start()
if err != nil {
return diffChan, err
}
go func() {
scanner := bufio.NewScanner(stdErr)
for scanner.Scan() {
ctx.Logger().V(2).Info(scanner.Text())
}
}()
go func() {
c.FromReader(ctx, stdOut, diffChan, isStaged)
if err := stdOut.Close(); err != nil {
ctx.Logger().V(2).Info("Error closing git stdout pipe.", "error", err)
}
2022-09-02 18:02:38 +00:00
if err := cmd.Wait(); err != nil {
ctx.Logger().V(2).Info("Error waiting for git command to complete.", "error", err)
2022-09-02 18:02:38 +00:00
}
}()
return diffChan, nil
2022-09-02 18:02:38 +00:00
}
func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan *Diff, isStaged bool) {
2022-09-02 18:02:38 +00:00
outReader := bufio.NewReader(stdOut)
var (
2023-07-25 22:52:34 +00:00
currentCommit *Commit
totalLogSize int
)
2023-07-25 22:52:34 +00:00
var latestState = Initial
diff := func(c *Commit, opts ...diffOption) *Diff {
opts = append(opts, withCustomContentWriter(bufferwriter.New()))
return newDiff(c, opts...)
}
if c.useCustomContentWriter {
diff = func(c *Commit, opts ...diffOption) *Diff {
opts = append(opts, withCustomContentWriter(bufferedfilewriter.New()))
return newDiff(c, opts...)
}
}
currentDiff := diff(currentCommit)
2022-09-02 18:02:38 +00:00
defer common.RecoverWithExit(ctx)
defer close(diffChan)
2022-09-02 18:02:38 +00:00
for {
if common.IsDone(ctx) {
break
}
2023-07-25 22:52:34 +00:00
2022-09-02 18:02:38 +00:00
line, err := outReader.ReadBytes([]byte("\n")[0])
if err != nil && len(line) == 0 {
break
}
2023-07-25 22:52:34 +00:00
2022-09-02 18:02:38 +00:00
switch {
2023-07-25 22:52:34 +00:00
case isCommitLine(isStaged, latestState, line):
latestState = CommitLine
2022-09-02 18:02:38 +00:00
// If there is a currentDiff, add it to currentCommit.
if currentDiff.Len() > 0 || currentDiff.IsBinary {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(
err,
"failed to finalize diff",
"commit", currentCommit.Hash,
"diff", currentDiff.PathB,
"size", currentDiff.Len(),
"latest_state", latestState.String(),
)
}
diffChan <- currentDiff
currentCommit.Size += currentDiff.Len()
currentCommit.hasDiffs = true
2022-09-02 18:02:38 +00:00
}
// If there is a currentCommit, send it to the channel.
if currentCommit != nil {
totalLogSize += currentCommit.Size
if !currentCommit.hasDiffs {
// Initialize an empty Diff instance associated with the given commit.
// Since this diff represents "no changes", we only need to set the commit.
// This is required to ensure commits that have no diffs are still processed.
diffChan <- &Diff{Commit: currentCommit}
}
2022-09-02 18:02:38 +00:00
}
2022-09-02 18:02:38 +00:00
// Create a new currentDiff and currentCommit
currentCommit = &Commit{Message: strings.Builder{}}
currentDiff = diff(currentCommit)
2022-09-02 18:02:38 +00:00
// Check that the commit line contains a hash and set it.
if len(line) >= 47 {
currentCommit.Hash = string(line[7:47])
}
2023-07-27 14:24:49 +00:00
case isMergeLine(isStaged, latestState, line):
latestState = MergeLine
2023-07-25 22:52:34 +00:00
case isAuthorLine(isStaged, latestState, line):
latestState = AuthorLine
currentCommit.Author = strings.TrimSpace(string(line[8:]))
case isAuthorDateLine(isStaged, latestState, line):
latestState = AuthorDateLine
2023-07-25 22:52:34 +00:00
date, err := time.Parse(c.dateFormat, strings.TrimSpace(string(line[12:])))
2022-09-02 18:02:38 +00:00
if err != nil {
ctx.Logger().Error(err, "failed to parse commit date", "commit", currentCommit.Hash, "latestState", latestState.String())
latestState = ParseFailure
continue
2022-09-02 18:02:38 +00:00
}
currentCommit.Date = date
case isCommitterLine(isStaged, latestState, line):
latestState = CommitterLine
currentCommit.Committer = strings.TrimSpace(string(line[8:]))
case isCommitterDateLine(isStaged, latestState, line):
latestState = CommitterDateLine
// NoOp
2023-07-25 22:52:34 +00:00
case isMessageStartLine(isStaged, latestState, line):
latestState = MessageStartLine
// NoOp
case isMessageLine(isStaged, latestState, line):
latestState = MessageLine
currentCommit.Message.Write(line[4:]) // Messages are indented by 4 spaces.
2023-07-25 22:52:34 +00:00
case isMessageEndLine(isStaged, latestState, line):
latestState = MessageEndLine
// NoOp
case isNotesStartLine(isStaged, latestState, line):
latestState = NotesStartLine
currentCommit.Message.WriteString("\n")
currentCommit.Message.Write(line)
case isNotesLine(isStaged, latestState, line):
latestState = NotesLine
currentCommit.Message.Write(line[4:]) // Notes are indented by 4 spaces.
case isNotesEndLine(isStaged, latestState, line):
latestState = NotesEndLine
// NoOp
2023-07-25 22:52:34 +00:00
case isDiffLine(isStaged, latestState, line):
latestState = DiffLine
if currentDiff.Len() > 0 || currentDiff.IsBinary {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err,
"failed to finalize diff",
"commit", currentCommit.Hash,
"diff", currentDiff.PathB,
"size", currentDiff.Len(),
"latest_state", latestState.String(),
)
}
diffChan <- currentDiff
currentCommit.hasDiffs = true
}
// This should never be nil, but check in case the stdin stream is messed up.
if currentCommit == nil {
currentCommit = &Commit{}
2022-09-02 18:02:38 +00:00
}
currentDiff = diff(currentCommit)
case isModeLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = ModeLine
2022-09-02 18:02:38 +00:00
// NoOp
case isIndexLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = IndexLine
2022-09-02 18:02:38 +00:00
// NoOp
case isBinaryLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = BinaryFileLine
path, ok := pathFromBinaryLine(line)
if !ok {
err = fmt.Errorf(`expected line to match 'Binary files a/fileA and b/fileB differ', got "%s"`, line)
2024-03-19 15:50:27 +00:00
ctx.Logger().Error(err, "Failed to parse BinaryFileLine")
latestState = ParseFailure
continue
}
2023-10-26 17:07:02 +00:00
// Don't do anything if the file is deleted. (pathA has file path, pathB is /dev/null)
if path != "" {
currentDiff.PathB = path
2023-10-26 17:07:02 +00:00
currentDiff.IsBinary = true
}
case isFromFileLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = FromFileLine
// NoOp
case isToFileLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = ToFileLine
path, ok := pathFromToFileLine(line)
2024-03-19 15:50:27 +00:00
if !ok {
err = fmt.Errorf(`expected line to match format '+++ b/path/to/file.go', got '%s'`, line)
2024-03-19 15:50:27 +00:00
ctx.Logger().Error(err, "Failed to parse ToFileLine")
latestState = ParseFailure
continue
}
currentDiff.PathB = path
case isHunkLineNumberLine(latestState, line):
2023-07-25 22:52:34 +00:00
latestState = HunkLineNumberLine
if currentDiff.Len() > 0 || currentDiff.IsBinary {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(
err,
"failed to finalize diff",
"commit", currentCommit.Hash,
"diff", currentDiff.PathB,
"size", currentDiff.Len(),
"latest_state", latestState.String(),
)
}
diffChan <- currentDiff
}
currentDiff = diff(currentCommit, withPathB(currentDiff.PathB))
2022-09-02 18:02:38 +00:00
words := bytes.Split(line, []byte(" "))
if len(words) >= 3 {
startSlice := bytes.Split(words[2], []byte(","))
lineStart, err := strconv.Atoi(string(startSlice[0]))
if err == nil {
currentDiff.LineStart = lineStart
}
}
case isHunkContextLine(latestState, line):
2023-07-25 22:52:34 +00:00
if latestState != HunkContentLine {
latestState = HunkContentLine
}
// TODO: Why do we care about this? It creates empty lines in the diff. If there are no plusLines, it's just newlines.
if err := currentDiff.write([]byte("\n")); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
}
case isHunkPlusLine(latestState, line):
2023-07-25 22:52:34 +00:00
if latestState != HunkContentLine {
latestState = HunkContentLine
}
if err := currentDiff.write(line[1:]); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
2023-07-25 22:52:34 +00:00
}
// NoOp. We only care about additions.
case isHunkMinusLine(latestState, line),
isHunkNewlineWarningLine(latestState, line),
isHunkEmptyLine(latestState, line):
2023-07-25 22:52:34 +00:00
if latestState != HunkContentLine {
latestState = HunkContentLine
}
// NoOp
case isCommitSeparatorLine(latestState, line):
2023-07-25 22:52:34 +00:00
// NoOp
default:
// Skip ahead until we find the next diff or commit.
if latestState == ParseFailure {
continue
}
// Here be dragons...
// Build an informative error message.
err := fmt.Errorf(`invalid line "%s" after state "%s"`, line, latestState)
var logger logr.Logger
2023-07-25 22:52:34 +00:00
if currentCommit != nil && currentCommit.Hash != "" {
logger = ctx.Logger().WithValues("commit", currentCommit.Hash)
2023-07-25 22:52:34 +00:00
} else {
logger = ctx.Logger()
2023-07-25 22:52:34 +00:00
}
logger.Error(err, "failed to parse Git input. Recovering at the latest commit or diff...")
2023-07-25 22:52:34 +00:00
latestState = ParseFailure
}
2023-07-25 22:52:34 +00:00
if currentDiff.Len() > c.maxDiffSize {
ctx.Logger().V(2).Info(fmt.Sprintf(
"Diff for %s exceeded MaxDiffSize(%d)", currentDiff.PathB, c.maxDiffSize,
))
break
}
2022-09-02 18:02:38 +00:00
}
cleanupParse(ctx, currentCommit, currentDiff, diffChan, &totalLogSize)
ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize)
}
2023-07-27 14:24:49 +00:00
func isMergeLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != CommitLine {
return false
}
if len(line) > 6 && bytes.Equal(line[:6], []byte("Merge:")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// commit 7a95bbf0199e280a0e42dbb1d1a3f56cdd0f6e05
func isCommitLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || !(latestState == Initial ||
latestState == MessageStartLine ||
latestState == MessageEndLine ||
latestState == ModeLine ||
latestState == IndexLine ||
latestState == BinaryFileLine ||
latestState == ToFileLine ||
latestState == HunkContentLine ||
latestState == ParseFailure) {
return false
2022-09-02 18:02:38 +00:00
}
2023-07-25 22:52:34 +00:00
if len(line) > 7 && bytes.Equal(line[:7], []byte("commit ")) {
return true
2022-09-02 18:02:38 +00:00
}
2023-07-25 22:52:34 +00:00
return false
}
// Author: Bill Rich <bill.rich@trufflesec.com>
func isAuthorLine(isStaged bool, latestState ParseState, line []byte) bool {
2023-07-27 14:24:49 +00:00
if isStaged || !(latestState == CommitLine || latestState == MergeLine) {
2023-07-25 22:52:34 +00:00
return false
}
if len(line) > 8 && bytes.Equal(line[:7], []byte("Author:")) {
return true
}
return false
}
// AuthorDate: Tue Aug 10 15:20:40 2021 +0100
func isAuthorDateLine(isStaged bool, latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if isStaged || latestState != AuthorLine {
return false
}
if len(line) > 10 && bytes.Equal(line[:11], []byte("AuthorDate:")) {
return true
}
return false
}
// isCommitterLine reports whether line is the committer header, e.g.
//
//	Commit: Bill Rich <bill.rich@trufflesec.com>
//
// It is only accepted directly after the author date line and never for
// staged input. The line must be longer than 8 bytes.
func isCommitterLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != AuthorDateLine {
		return false
	}
	return len(line) > 8 && bytes.HasPrefix(line, []byte("Commit:"))
}
// isCommitterDateLine reports whether line is the committer date header, e.g.
//
//	CommitDate: Wed Apr 17 19:59:28 2024 -0400
//
// It is only accepted directly after the committer line and never for staged
// input.
func isCommitterDateLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != CommitterLine {
		return false
	}
	// The prefix is 11 bytes long, so a match implies len(line) > 10.
	return bytes.HasPrefix(line, []byte("CommitDate:"))
}
// Line directly after CommitterDate with only a newline.
2023-07-25 22:52:34 +00:00
func isMessageStartLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != CommitterDateLine {
2023-07-25 22:52:34 +00:00
return false
}
// TODO: Improve the implementation of this and isMessageEndLine
if len(strings.TrimRight(string(line[:]), "\r\n")) == 0 {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// Line that starts with 4 spaces
func isMessageLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || !(latestState == MessageStartLine || latestState == MessageLine) {
return false
}
if len(line) > 4 && bytes.Equal(line[:4], []byte(" ")) {
return true
}
return false
}
// isMessageEndLine reports whether line is the blank line that directly
// follows a message line. A line counts as blank when nothing is left after
// stripping trailing '\r' and '\n'. It never matches staged input.
func isMessageEndLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != MessageLine {
		return false
	}
	return len(bytes.TrimRight(line, "\r\n")) == 0
}
// isNotesStartLine reports whether line opens a notes section, i.e. it begins
// with `Notes` as in `Notes:` or `Notes (context):`, with at least one byte
// after that word. It is only accepted directly after the message end line
// and never for staged input.
// See https://tylercipriani.com/blog/2022/11/19/git-notes-gits-coolest-most-unloved-feature/
func isNotesStartLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != MessageEndLine {
		return false
	}
	return len(line) > 5 && bytes.HasPrefix(line, []byte("Notes"))
}
// isNotesLine reports whether line belongs to a notes section, i.e. it starts
// with the 4-space indent and has at least one more byte after it. It is only
// accepted after the notes start line or another notes line, and never for
// staged input.
func isNotesLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != NotesStartLine && latestState != NotesLine {
		return false
	}
	// The prefix must be exactly four spaces to match the 4-byte indent.
	return len(line) > 4 && bytes.HasPrefix(line, []byte("    "))
}
// isNotesEndLine reports whether line is the blank line that directly follows
// a notes line. A line counts as blank when nothing is left after stripping
// trailing '\r' and '\n'. It never matches staged input.
func isNotesEndLine(isStaged bool, latestState ParseState, line []byte) bool {
	if isStaged {
		return false
	}
	if latestState != NotesLine {
		return false
	}
	return len(bytes.TrimRight(line, "\r\n")) == 0
}
// diff --git a/internal/addrs/move_endpoint_module.go b/internal/addrs/move_endpoint_module.go
2023-07-25 22:52:34 +00:00
func isDiffLine(isStaged bool, latestState ParseState, line []byte) bool {
if !(latestState == MessageStartLine || // Empty commit messages can go from MessageStart->Diff
latestState == MessageEndLine ||
latestState == NotesEndLine ||
2023-07-25 22:52:34 +00:00
latestState == BinaryFileLine ||
latestState == ModeLine ||
2023-07-25 22:52:34 +00:00
latestState == IndexLine ||
latestState == HunkContentLine ||
latestState == ParseFailure) {
if !(isStaged && latestState == Initial) {
2023-07-25 22:52:34 +00:00
return false
}
}
if len(line) > 11 && bytes.Equal(line[:11], []byte("diff --git ")) {
return true
}
return false
}
// old mode 100644
// new mode 100755
// new file mode 100644
// similarity index 100%
// rename from old.txt
// rename to new.txt
// deleted file mode 100644
func isModeLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == DiffLine || latestState == ModeLine) {
return false
}
// This could probably be better written.
if (len(line) > 17 && bytes.Equal(line[:17], []byte("deleted file mode"))) ||
(len(line) > 16 && bytes.Equal(line[:16], []byte("similarity index"))) ||
(len(line) > 13 && bytes.Equal(line[:13], []byte("new file mode"))) ||
(len(line) > 11 && bytes.Equal(line[:11], []byte("rename from"))) ||
(len(line) > 9 && bytes.Equal(line[:9], []byte("rename to"))) ||
(len(line) > 8 && bytes.Equal(line[:8], []byte("old mode"))) ||
(len(line) > 8 && bytes.Equal(line[:8], []byte("new mode"))) {
return true
}
return false
}
// index 1ed6fbee1..aea1e643a 100644
2023-07-25 22:52:34 +00:00
// index 00000000..e69de29b
func isIndexLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == DiffLine || latestState == ModeLine) {
return false
}
if len(line) > 6 && bytes.Equal(line[:6], []byte("index ")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// Binary files /dev/null and b/plugin.sig differ
func isBinaryLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if latestState != IndexLine {
return false
}
if len(line) > 7 && bytes.Equal(line[:6], []byte("Binary")) {
return true
}
return false
}
// pathFromBinaryLine extracts the b/ file path from a line such as
//
//	Binary files a/old.bin and b/new.bin differ
//
// Ignoring the edge case of files having `and /b` in the name for simplicity.
//
// It returns ("", true) when the b side is /dev/null (the file was deleted)
// and ("", false) when the line does not have the expected shape. The
// trailing " differ" marker is matched explicitly rather than removed by a
// fixed byte count, so truncated lines and lines without a trailing newline
// are handled without panicking or mangling the path.
func pathFromBinaryLine(line []byte) (string, bool) {
	if bytes.Contains(line, []byte("and /dev/null")) {
		return "", true
	}

	// The line terminator is not part of the path or the marker.
	line = bytes.TrimRight(line, "\r\n")

	if _, after, ok := bytes.Cut(line, []byte(" and b/")); ok {
		marker := []byte(" differ")
		if !bytes.HasSuffix(after, marker) {
			return "", false
		}
		return string(after[:len(after)-len(marker)]), true
	}

	if _, after, ok := bytes.Cut(line, []byte(` and "b/`)); ok {
		// Edge case where the path is quoted.
		// https://github.com/trufflesecurity/trufflehog/issues/2384
		// Drop the closing `" differ` and handle escaped characters in the path.
		// e.g., "\342\200\224" instead of "—".
		// See https://github.com/trufflesecurity/trufflehog/issues/2418
		marker := []byte(`" differ`)
		if !bytes.HasSuffix(after, marker) {
			return "", false
		}
		path, err := strconv.Unquote(`"` + string(after[:len(after)-len(marker)]) + `"`)
		if err != nil {
			return "", false
		}
		return path, true
	}

	// Unknown format.
	return "", false
}
// --- a/internal/addrs/move_endpoint_module.go
// --- /dev/null
func isFromFileLine(latestState ParseState, line []byte) bool {
if !(latestState == IndexLine || latestState == ModeLine) {
2023-07-25 22:52:34 +00:00
return false
}
if len(line) >= 6 && bytes.Equal(line[:4], []byte("--- ")) {
return true
}
return false
}
// +++ b/internal/addrs/move_endpoint_module.go
func isToFileLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if latestState != FromFileLine {
return false
}
if len(line) >= 6 && bytes.Equal(line[:4], []byte("+++ ")) {
return true
}
return false
}
// pathFromToFileLine extracts the b/ file path from a "+++ b/path" line.
//
// It returns ("", true) for "+++ /dev/null" (the file was deleted) and
// ("", false) when the line does not have a recognised shape, including a
// quoted path that is missing its closing quote.
func pathFromToFileLine(line []byte) (string, bool) {
	// Normalize paths, as they can end in `\n`, `\t\n`, etc.
	// See https://github.com/trufflesecurity/trufflehog/issues/1060
	line = bytes.TrimSpace(line)

	// File was deleted.
	if bytes.Equal(line, []byte("+++ /dev/null")) {
		return "", true
	}

	if _, after, ok := bytes.Cut(line, []byte("+++ b/")); ok {
		return string(after), true
	}

	if _, after, ok := bytes.Cut(line, []byte(`+++ "b/`)); ok {
		// Edge case where the path is quoted.
		// e.g., `+++ "b/C++/1 \320\243\321\200\320\276\320\272/B.c"`
		// Drop the trailing `"` and handle escaped characters in the path
		// e.g., "\342\200\224" instead of "—".
		// See https://github.com/trufflesecurity/trufflehog/issues/2418
		//
		// The closing quote is required; without it the line is malformed and
		// slicing one byte off an empty remainder would panic.
		if !bytes.HasSuffix(after, []byte(`"`)) {
			return "", false
		}
		path, err := strconv.Unquote(`"` + string(after[:len(after)-1]) + `"`)
		if err != nil {
			return "", false
		}
		return path, true
	}

	// Unknown format.
	return "", false
}
2023-07-25 22:52:34 +00:00
// @@ -298 +298 @@ func maxRetryErrorHandler(resp *http.Response, err error, numTries int)
func isHunkLineNumberLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == ToFileLine || latestState == HunkContentLine) {
return false
}
if len(line) >= 8 && bytes.Equal(line[:2], []byte("@@")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// fmt.Println("ok")
// (There's a space before `fmt` that gets removed by the formatter.)
func isHunkContextLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == HunkLineNumberLine || latestState == HunkContentLine) {
return false
}
if len(line) >= 1 && bytes.Equal(line[:1], []byte(" ")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// +fmt.Println("ok")
func isHunkPlusLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == HunkLineNumberLine || latestState == HunkContentLine) {
return false
}
if len(line) >= 1 && bytes.Equal(line[:1], []byte("+")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// -fmt.Println("ok")
func isHunkMinusLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == HunkLineNumberLine || latestState == HunkContentLine) {
return false
}
if len(line) >= 1 && bytes.Equal(line[:1], []byte("-")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// \ No newline at end of file
func isHunkNewlineWarningLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if latestState != HunkContentLine {
return false
}
if len(line) >= 27 && bytes.Equal(line[:27], []byte("\\ No newline at end of file")) {
return true
}
return false
}
2023-07-25 22:52:34 +00:00
// Newline after hunk, or an empty line, e.g.
// +}
//
// commit 00920984e3435057f09cee5468850f7546dfa637 (tag: v3.42.0)
func isHunkEmptyLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if !(latestState == HunkLineNumberLine || latestState == HunkContentLine) {
return false
}
// TODO: Can this also be `\n\r`?
if len(line) == 1 && bytes.Equal(line[:1], []byte("\n")) {
return true
}
return false
}
func isCommitSeparatorLine(latestState ParseState, line []byte) bool {
2023-07-25 22:52:34 +00:00
if (latestState == ModeLine || latestState == IndexLine || latestState == BinaryFileLine || latestState == ToFileLine) &&
len(line) == 1 && bytes.Equal(line[:1], []byte("\n")) {
return true
}
return false
}
func cleanupParse(ctx context.Context, currentCommit *Commit, currentDiff *Diff, diffChan chan *Diff, totalLogSize *int) {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err, "failed to finalize diff")
return
}
2023-07-25 22:52:34 +00:00
// Ignore empty or binary diffs (this condition may be redundant).
if currentDiff != nil && (currentDiff.Len() > 0 || currentDiff.IsBinary) {
currentDiff.Commit = currentCommit
diffChan <- currentDiff
2023-07-25 22:52:34 +00:00
}
if currentCommit != nil {
if totalLogSize != nil {
*totalLogSize += currentCommit.Size
}
}
}