2022-08-23 20:29:20 +00:00
package gitparse
import (
"bufio"
"bytes"
"fmt"
2022-09-02 18:02:38 +00:00
"io"
2022-08-23 20:29:20 +00:00
"os/exec"
"path/filepath"
"strconv"
"strings"
"time"
2022-08-29 18:45:37 +00:00
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
2022-08-23 20:29:20 +00:00
)
2023-01-23 18:14:10 +00:00
const (
	// defaultDateFormat is the Go time layout used to parse commit dates. It
	// mirrors the `--date=format:%a %b %d %H:%M:%S %Y %z` argument that this
	// package passes to git.
	defaultDateFormat = "Mon Jan 02 15:04:05 2006 -0700"

	// defaultMaxDiffSize is the default limit, in bytes, on the buffered
	// content of a single diff. FromReader stops parsing once a diff grows
	// past this limit.
	defaultMaxDiffSize = 1 * 1024 * 1024 * 1024 // 1GB
	// defaultMaxCommitSize is the default limit, in bytes, on the combined
	// diff content held for one commit. Once exceeded, the commit collected so
	// far is emitted and later diffs are attached to a fresh commit that
	// carries the same header.
	defaultMaxCommitSize = 1 * 1024 * 1024 * 1024 // 1GB
)
2022-08-23 20:29:20 +00:00
// Commit contains commit header info and diffs.
type Commit struct {
Hash string
Author string
Date time . Time
Message strings . Builder
Diffs [ ] Diff
2023-05-02 23:36:39 +00:00
Size int // in bytes
2022-08-23 20:29:20 +00:00
}
// Diff describes one chunk of added content for a single file within a commit.
type Diff struct {
	// PathB is the path of the file on the "b/" (post-change) side.
	PathB string
	// LineStart is the starting line number parsed from the "@@" hunk header.
	LineStart int
	// Content buffers the added lines of the diff.
	Content bytes.Buffer
	// IsBinary is set when git reported the file as a binary difference.
	IsBinary bool
}
2023-02-04 21:19:23 +00:00
// Parser holds the tunable limits and the date layout used while parsing git
// output.
type Parser struct {
	// maxDiffSize is the per-diff content limit in bytes.
	maxDiffSize int
	// maxCommitSize is the per-commit combined diff content limit in bytes.
	maxCommitSize int
	// dateFormat is the time layout used to parse "Date:" lines.
	dateFormat string
}
// WithMaxDiffSize returns an Option that overrides the parser's maxDiffSize,
// the byte limit applied to the buffered content of a single diff.
func WithMaxDiffSize(maxDiffSize int) Option {
	apply := func(p *Parser) {
		p.maxDiffSize = maxDiffSize
	}
	return apply
}
// WithMaxCommitSize returns an Option that overrides the parser's
// maxCommitSize. When the diffs gathered for a commit exceed this many bytes,
// the commit is sent on the commit channel and any further diffs are attached
// to a new commit.
func WithMaxCommitSize(maxCommitSize int) Option {
	apply := func(p *Parser) {
		p.maxCommitSize = maxCommitSize
	}
	return apply
}
// Option is a functional option that mutates a Parser; options are applied by
// NewParser after the defaults have been set.
type Option func(*Parser)
// NewParser builds a Parser populated with the package defaults and then
// applies each supplied option in order, so later options win.
func NewParser(options ...Option) *Parser {
	p := new(Parser)
	p.dateFormat = defaultDateFormat
	p.maxDiffSize = defaultMaxDiffSize
	p.maxCommitSize = defaultMaxCommitSize

	for i := range options {
		options[i](p)
	}
	return p
}
2022-09-09 04:46:12 +00:00
// Equal reports whether two Commits carry the same content: hash, author,
// date, message, and the same diffs in the same order. Size is not compared.
func (c1 *Commit) Equal(c2 *Commit) bool {
	headersMatch := c1.Hash == c2.Hash &&
		c1.Author == c2.Author &&
		c1.Date.Equal(c2.Date) &&
		c1.Message.String() == c2.Message.String() &&
		len(c1.Diffs) == len(c2.Diffs)
	if !headersMatch {
		return false
	}

	for i := range c1.Diffs {
		left, right := &c1.Diffs[i], &c2.Diffs[i]
		if left.PathB != right.PathB {
			return false
		}
		if left.LineStart != right.LineStart {
			return false
		}
		if left.Content.String() != right.Content.String() {
			return false
		}
		if left.IsBinary != right.IsBinary {
			return false
		}
	}
	return true
}
2022-08-23 20:29:20 +00:00
// RepoPath parses the output of the `git log` command for the `source` path.
2023-03-28 15:46:03 +00:00
func ( c * Parser ) RepoPath ( ctx context . Context , source string , head string , abbreviatedLog bool , excludedGlobs [ ] string ) ( chan Commit , error ) {
2023-05-05 19:51:27 +00:00
args := [ ] string { "-C" , source , "log" , "-p" , "--full-history" , "--date=format:%a %b %d %H:%M:%S %Y %z" }
2023-01-26 17:17:54 +00:00
if abbreviatedLog {
args = append ( args , "--diff-filter=AM" )
}
2022-08-23 20:29:20 +00:00
if head != "" {
args = append ( args , head )
} else {
args = append ( args , "--all" )
}
2023-03-28 15:46:03 +00:00
for _ , glob := range excludedGlobs {
args = append ( args , "--" , "." , fmt . Sprintf ( ":(exclude)%s" , glob ) )
}
2022-08-23 20:29:20 +00:00
cmd := exec . Command ( "git" , args ... )
absPath , err := filepath . Abs ( source )
if err == nil {
cmd . Env = append ( cmd . Env , fmt . Sprintf ( "GIT_DIR=%s" , filepath . Join ( absPath , ".git" ) ) )
}
2023-02-04 21:19:23 +00:00
return c . executeCommand ( ctx , cmd )
2022-09-04 01:01:36 +00:00
}
// Unstaged parses the output of the `git diff` command for the `source` path.
2023-02-04 21:19:23 +00:00
func ( c * Parser ) Unstaged ( ctx context . Context , source string ) ( chan Commit , error ) {
2023-03-01 16:58:36 +00:00
// Provide the --cached flag to diff to get the diff of the staged changes.
2023-05-05 19:51:27 +00:00
args := [ ] string { "-C" , source , "diff" , "-p" , "--cached" , "--full-history" , "--diff-filter=AM" , "--date=format:%a %b %d %H:%M:%S %Y %z" , "HEAD" }
2022-09-04 01:01:36 +00:00
cmd := exec . Command ( "git" , args ... )
absPath , err := filepath . Abs ( source )
if err == nil {
cmd . Env = append ( cmd . Env , fmt . Sprintf ( "GIT_DIR=%s" , filepath . Join ( absPath , ".git" ) ) )
}
2023-02-04 21:19:23 +00:00
return c . executeCommand ( ctx , cmd )
2022-09-04 01:01:36 +00:00
}
// executeCommand runs an exec.Cmd, reads stdout and stderr, and waits for the Cmd to complete.
2023-02-04 21:19:23 +00:00
func ( c * Parser ) executeCommand ( ctx context . Context , cmd * exec . Cmd ) ( chan Commit , error ) {
2022-10-12 19:55:08 +00:00
commitChan := make ( chan Commit , 64 )
2022-09-04 01:01:36 +00:00
2022-08-23 20:29:20 +00:00
stdOut , err := cmd . StdoutPipe ( )
if err != nil {
return commitChan , err
}
stdErr , err := cmd . StderrPipe ( )
if err != nil {
return commitChan , err
}
err = cmd . Start ( )
if err != nil {
return commitChan , err
}
go func ( ) {
scanner := bufio . NewScanner ( stdErr )
for scanner . Scan ( ) {
2023-02-14 23:00:07 +00:00
ctx . Logger ( ) . V ( 2 ) . Info ( scanner . Text ( ) )
2022-08-23 20:29:20 +00:00
}
} ( )
go func ( ) {
2023-04-03 00:54:43 +00:00
c . FromReader ( ctx , stdOut , commitChan )
2022-09-02 18:02:38 +00:00
if err := cmd . Wait ( ) ; err != nil {
2023-02-14 23:00:07 +00:00
ctx . Logger ( ) . V ( 2 ) . Info ( "Error waiting for git command to complete." , "error" , err )
2022-09-02 18:02:38 +00:00
}
} ( )
return commitChan , nil
}
2023-04-03 00:54:43 +00:00
func ( c * Parser ) FromReader ( ctx context . Context , stdOut io . Reader , commitChan chan Commit ) {
2022-09-02 18:02:38 +00:00
outReader := bufio . NewReader ( stdOut )
2023-05-02 23:36:39 +00:00
var (
currentCommit * Commit
currentDiff * Diff
recentlyPassedAuthor bool
totalLogSize int
)
2022-09-02 18:02:38 +00:00
2022-09-22 14:01:10 +00:00
defer common . RecoverWithExit ( ctx )
2023-02-07 23:25:00 +00:00
defer close ( commitChan )
2022-09-02 18:02:38 +00:00
for {
2023-02-07 23:25:00 +00:00
if common . IsDone ( ctx ) {
break
}
2022-09-02 18:02:38 +00:00
line , err := outReader . ReadBytes ( [ ] byte ( "\n" ) [ 0 ] )
if err != nil && len ( line ) == 0 {
break
}
switch {
case isCommitLine ( line ) :
// If there is a currentDiff, add it to currentCommit.
if currentDiff != nil && currentDiff . Content . Len ( ) > 0 {
currentCommit . Diffs = append ( currentCommit . Diffs , * currentDiff )
2023-05-02 23:36:39 +00:00
currentCommit . Size += currentDiff . Content . Len ( )
2022-09-02 18:02:38 +00:00
}
// If there is a currentCommit, send it to the channel.
if currentCommit != nil {
commitChan <- * currentCommit
2023-05-02 23:36:39 +00:00
totalLogSize += currentCommit . Size
2022-09-02 18:02:38 +00:00
}
// Create a new currentDiff and currentCommit
currentDiff = & Diff { }
currentCommit = & Commit {
Message : strings . Builder { } ,
}
// Check that the commit line contains a hash and set it.
if len ( line ) >= 47 {
currentCommit . Hash = string ( line [ 7 : 47 ] )
}
case isAuthorLine ( line ) :
2022-09-09 04:46:12 +00:00
currentCommit . Author = strings . TrimRight ( string ( line [ 8 : ] ) , "\n" )
2023-04-13 14:53:21 +00:00
recentlyPassedAuthor = true
2022-09-02 18:02:38 +00:00
case isDateLine ( line ) :
2023-02-04 21:19:23 +00:00
date , err := time . Parse ( c . dateFormat , strings . TrimSpace ( string ( line [ 6 : ] ) ) )
2022-09-02 18:02:38 +00:00
if err != nil {
2023-02-14 23:00:07 +00:00
ctx . Logger ( ) . V ( 2 ) . Info ( "Could not parse date from git stream." , "error" , err )
2022-09-02 18:02:38 +00:00
}
currentCommit . Date = date
case isDiffLine ( line ) :
// This should never be nil, but check in case the stdin stream is messed up.
2022-09-04 01:01:36 +00:00
if currentCommit == nil {
currentCommit = & Commit { }
}
2022-09-02 18:02:38 +00:00
if currentDiff != nil && currentDiff . Content . Len ( ) > 0 {
currentCommit . Diffs = append ( currentCommit . Diffs , * currentDiff )
2023-02-07 23:25:00 +00:00
// If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits.
totalSize := 0
for _ , diff := range currentCommit . Diffs {
totalSize += diff . Content . Len ( )
}
if totalSize > c . maxCommitSize {
2023-02-23 23:20:54 +00:00
oldCommit := currentCommit
2023-02-07 23:25:00 +00:00
commitChan <- * currentCommit
2023-05-02 23:36:39 +00:00
totalLogSize += currentCommit . Size
2023-02-07 23:25:00 +00:00
currentCommit = & Commit {
Hash : currentCommit . Hash ,
Author : currentCommit . Author ,
Date : currentCommit . Date ,
2023-02-23 23:20:54 +00:00
Message : strings . Builder { } ,
2023-02-07 23:25:00 +00:00
Diffs : [ ] Diff { } ,
}
2023-02-23 23:20:54 +00:00
// Message needs to be recreated here otherwise writing to it again will result in a panic.
currentCommit . Message . WriteString ( oldCommit . Message . String ( ) )
2023-02-07 23:25:00 +00:00
}
2022-09-02 18:02:38 +00:00
}
currentDiff = & Diff { }
case isModeLine ( line ) :
// NoOp
case isIndexLine ( line ) :
2023-04-13 14:53:21 +00:00
recentlyPassedAuthor = false
2022-09-02 18:02:38 +00:00
// NoOp
case isPlusFileLine ( line ) :
2023-02-17 01:15:32 +00:00
currentDiff . PathB = strings . TrimRight ( strings . TrimRight ( string ( line [ 6 : ] ) , "\n" ) , "\t" ) // Trim the newline and tab characters. https://github.com/trufflesecurity/trufflehog/issues/1060
2022-09-02 18:02:38 +00:00
case isMinusFileLine ( line ) :
// NoOp
case isPlusDiffLine ( line ) :
currentDiff . Content . Write ( line [ 1 : ] )
case isMinusDiffLine ( line ) :
// NoOp. We only care about additions.
2023-05-05 19:51:27 +00:00
case ( isMessageLine ( line ) && recentlyPassedAuthor ) :
currentCommit . Message . Write ( line [ 4 : ] )
2022-09-09 22:00:33 +00:00
case isContextDiffLine ( line ) :
currentDiff . Content . Write ( [ ] byte ( "\n" ) )
2022-09-02 18:02:38 +00:00
case isBinaryLine ( line ) :
currentDiff . IsBinary = true
currentDiff . PathB = pathFromBinaryLine ( line )
case isLineNumberDiffLine ( line ) :
if currentDiff != nil && currentDiff . Content . Len ( ) > 0 {
currentCommit . Diffs = append ( currentCommit . Diffs , * currentDiff )
}
newDiff := & Diff {
PathB : currentDiff . PathB ,
2022-08-23 20:29:20 +00:00
}
2022-09-02 18:02:38 +00:00
currentDiff = newDiff
2022-08-23 20:29:20 +00:00
2022-09-02 18:02:38 +00:00
words := bytes . Split ( line , [ ] byte ( " " ) )
if len ( words ) >= 3 {
startSlice := bytes . Split ( words [ 2 ] , [ ] byte ( "," ) )
lineStart , err := strconv . Atoi ( string ( startSlice [ 0 ] ) )
if err == nil {
currentDiff . LineStart = lineStart
2022-08-23 20:29:20 +00:00
}
}
}
2023-02-04 21:19:23 +00:00
if currentDiff . Content . Len ( ) > c . maxDiffSize {
2023-02-14 23:00:07 +00:00
ctx . Logger ( ) . V ( 2 ) . Info ( fmt . Sprintf (
"Diff for %s exceeded MaxDiffSize(%d)" , currentDiff . PathB , c . maxDiffSize ,
) )
2023-01-23 18:14:10 +00:00
break
}
2022-09-02 18:02:38 +00:00
}
2023-05-02 23:36:39 +00:00
cleanupParse ( currentCommit , currentDiff , commitChan , & totalLogSize )
ctx . Logger ( ) . V ( 2 ) . Info ( "finished parsing git log." , "total_log_size" , totalLogSize )
2023-02-07 23:25:00 +00:00
}
2023-05-02 23:36:39 +00:00
func cleanupParse ( currentCommit * Commit , currentDiff * Diff , commitChan chan Commit , totalLogSize * int ) {
2022-09-02 18:02:38 +00:00
if currentDiff != nil && currentDiff . Content . Len ( ) > 0 {
currentCommit . Diffs = append ( currentCommit . Diffs , * currentDiff )
}
if currentCommit != nil {
commitChan <- * currentCommit
2023-05-02 23:36:39 +00:00
if totalLogSize != nil {
* totalLogSize += currentCommit . Size
}
2022-09-02 18:02:38 +00:00
}
2022-08-23 20:29:20 +00:00
}
// isDateLine reports whether line is a commit date header, e.g.
//
//	Date:   Tue Aug 10 15:20:40 2021 +0100
//
// The line must be at least 8 bytes long and begin with "Date:".
func isDateLine(line []byte) bool {
	const minLen = 8
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("Date:"))
}
// isAuthorLine reports whether line is a commit author header, e.g.
//
//	Author: Bill Rich <bill.rich@trufflesec.com>
//
// The line must be at least 9 bytes long and begin with "Author:".
func isAuthorLine(line []byte) bool {
	const minLen = 9
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("Author:"))
}
// isCommitLine reports whether line is a commit header, e.g.
//
//	commit 7a95bbf0199e280a0e42dbb1d1a3f56cdd0f6e05
//
// The line must be at least 8 bytes long and begin with "commit".
func isCommitLine(line []byte) bool {
	const minLen = 8
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("commit"))
}
// isDiffLine reports whether line is a file diff header, e.g.
//
//	diff --git a/internal/addrs/move_endpoint_module.go b/internal/addrs/move_endpoint_module.go
//
// The line must be at least 6 bytes long and begin with "diff".
func isDiffLine(line []byte) bool {
	const minLen = 6
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("diff"))
}
// isIndexLine reports whether line is a diff index header, e.g.
//
//	index 1ed6fbee1..aea1e643a 100644
//
// The line must be at least 7 bytes long and begin with "index".
func isIndexLine(line []byte) bool {
	const minLen = 7
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("index"))
}
// isModeLine reports whether line is a new-file mode header, e.g.
//
//	new file mode 100644
//
// The line must be at least 14 bytes long and begin with "new file mode".
func isModeLine(line []byte) bool {
	const minLen = 14
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("new file mode"))
}
// isMinusFileLine reports whether line is the "old file" header of a diff, e.g.
//
//	--- a/internal/addrs/move_endpoint_module.go
//
// The line must be at least 6 bytes long and begin with "---".
func isMinusFileLine(line []byte) bool {
	return len(line) >= 6 && bytes.HasPrefix(line, []byte("---"))
}
// isPlusFileLine reports whether line is the "new file" header of a diff, e.g.
//
//	+++ b/internal/addrs/move_endpoint_module.go
//
// The line must be at least 6 bytes long and begin with "+++ " (including the
// space). Requiring the space keeps added content such as "++i;" — which git
// prints as "+++i;" — from being mistaken for a file header. An added line
// whose content itself starts with "++ " is still indistinguishable from a
// header at this level.
func isPlusFileLine(line []byte) bool {
	return len(line) >= 6 && bytes.HasPrefix(line, []byte("+++ "))
}
// isPlusDiffLine reports whether line is an added line of a diff, e.g.
//
//	+fmt.Println("ok")
//
// Any non-empty line whose first byte is '+' qualifies.
func isPlusDiffLine(line []byte) bool {
	if len(line) == 0 {
		return false
	}
	return line[0] == '+'
}
// isMinusDiffLine reports whether line is a removed line of a diff, e.g.
//
//	-fmt.Println("ok")
//
// Any non-empty line whose first byte is '-' qualifies.
func isMinusDiffLine(line []byte) bool {
	if len(line) == 0 {
		return false
	}
	return line[0] == '-'
}
2022-09-22 14:01:10 +00:00
// fmt.Println("ok")
2022-09-09 22:00:33 +00:00
// isContextDiffLine reports whether line is an unchanged (context) line of a
// diff, i.e. a non-empty line whose first byte is a space.
func isContextDiffLine(line []byte) bool {
	if len(line) == 0 {
		return false
	}
	return line[0] == ' '
}
2022-08-23 20:29:20 +00:00
// isMessageLine reports whether line looks like a commit message line: git
// indents message text by four spaces. The line must be longer than that
// indent (at least 5 bytes) and begin with exactly those four spaces.
//
// A context line of a diff that happens to start with four spaces matches as
// well; callers have to use surrounding state to tell the two apart.
func isMessageLine(line []byte) bool {
	return len(line) > 4 && bytes.HasPrefix(line, []byte("    "))
}
// isBinaryLine reports whether line is git's binary-difference notice, e.g.
//
//	Binary files /dev/null and b/plugin.sig differ
//
// The line must be at least 8 bytes long and begin with "Binary".
func isBinaryLine(line []byte) bool {
	const minLen = 8
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("Binary"))
}
// isLineNumberDiffLine reports whether line is a hunk header, e.g.
//
//	@@ -298 +298 @@ func maxRetryErrorHandler(resp *http.Response, err error, numTries int)
//
// The line must be at least 8 bytes long and begin with "@@".
func isLineNumberDiffLine(line []byte) bool {
	const minLen = 8
	return len(line) >= minLen && bytes.HasPrefix(line, []byte("@@"))
}
2023-02-16 22:11:35 +00:00
// Get the b/ file path. Ignoring the edge case of files having `and /b` in the name for simplicity.
2022-08-23 20:29:20 +00:00
func pathFromBinaryLine ( line [ ] byte ) string {
2023-02-14 23:00:07 +00:00
logger := context . Background ( ) . Logger ( )
2023-02-16 22:11:35 +00:00
sbytes := bytes . Split ( line , [ ] byte ( " and b/" ) )
2022-08-23 20:29:20 +00:00
if len ( sbytes ) != 2 {
2023-02-16 22:11:35 +00:00
logger . V ( 2 ) . Info ( "Expected binary line to be in 'Binary files a/fileA and b/fileB differ' format." , "got" , line )
2022-08-23 20:29:20 +00:00
return ""
}
bRaw := sbytes [ 1 ]
2023-02-16 22:11:35 +00:00
return string ( bRaw [ : len ( bRaw ) - 7 ] ) // drop the "b/" and " differ"
2022-08-23 20:29:20 +00:00
}