diff --git a/pkg/detectors/pypi/pypi.go b/pkg/detectors/pypi/pypi.go
new file mode 100644
index 000000000..140984963
--- /dev/null
+++ b/pkg/detectors/pypi/pypi.go
@@ -0,0 +1,144 @@
+package pypi
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+
+	regexp "github.com/wasilibs/go-re2"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+)
+
+// Scanner detects PyPI API tokens and optionally verifies them.
+type Scanner struct {
+	// client is the HTTP client used for verification; defaultClient is used when it is nil.
+	client *http.Client
+}
+
+// Ensure the Scanner satisfies the interface at compile time.
+var _ detectors.Detector = (*Scanner)(nil)
+
+var (
+	defaultClient = common.SaneHttpClient()
+	// keyPat captures the fixed token prefix followed by exactly 157 characters
+	// drawn from letters, digits, '-' and '_'. It has no boundary anchors.
+	keyPat = regexp.MustCompile("(pypi-AgEIcHlwaS5vcmcCJ[a-zA-Z0-9-_]{157})")
+)
+
+// Keywords are used for efficiently pre-filtering chunks.
+// Use identifiers in the secret preferably, or the provider name.
+func (s Scanner) Keywords() []string {
+	return []string{"pypi-AgEIcHlwaS5vcmcCJ"}
+}
+
+// FromData will find and optionally verify Pypi secrets in a given set of bytes.
+// Each distinct token in data yields one result; err is always nil, and any
+// verification failure is recorded on the individual result instead.
+func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
+	dataStr := string(data)
+
+	// De-duplicate so a token repeated in the chunk is reported and verified once.
+	uniqueMatches := make(map[string]struct{})
+	for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) {
+		uniqueMatches[match[1]] = struct{}{}
+	}
+
+	for match := range uniqueMatches {
+		s1 := detectors.Result{
+			DetectorType: detectorspb.DetectorType_PyPI,
+			Raw:          []byte(match),
+		}
+
+		if verify {
+			client := s.client
+			if client == nil {
+				client = defaultClient
+			}
+
+			isVerified, extraData, verificationErr := verifyMatch(ctx, client, match)
+			s1.Verified = isVerified
+			s1.ExtraData = extraData
+			s1.SetVerificationError(verificationErr, match)
+		}
+
+		results = append(results, s1)
+	}
+
+	return results, nil
+}
+
+// verifyMatch asks PyPI whether token is live by POSTing a deliberately
+// incomplete upload form to the legacy upload endpoint.
+//
+// It returns (true, nil, nil) for a 400 response whose body contains
+// "Include at least one message digest." (presumably the token was accepted
+// and only the dummy payload was rejected), (false, nil, nil) for a 403, and
+// a non-nil error for transport failures or any other response.
+func verifyMatch(ctx context.Context, client *http.Client, token string) (bool, map[string]string, error) {
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+
+	// Minimal upload form; it intentionally carries no file or digest.
+	fields := [][2]string{
+		{":action", "file_upload"},
+		{"name", "dummy-package"},
+		{"version", "0.0.1"},
+		{"content", "dummy-content"},
+	}
+	for _, field := range fields {
+		if err := writer.WriteField(field[0], field[1]); err != nil {
+			return false, nil, fmt.Errorf("writing form field %q: %w", field[0], err)
+		}
+	}
+	// Close writes the terminating boundary; surface a failure instead of
+	// sending a truncated form.
+	if err := writer.Close(); err != nil {
+		return false, nil, fmt.Errorf("finalizing multipart form: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://upload.pypi.org/legacy/", &body)
+	if err != nil {
+		return false, nil, err
+	}
+	req.Header.Set("Authorization", "token "+token)
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+
+	res, err := client.Do(req)
+	if err != nil {
+		return false, nil, err
+	}
+	defer func() {
+		// Drain before closing so the transport can reuse the connection.
+		_, _ = io.Copy(io.Discard, res.Body)
+		_ = res.Body.Close()
+	}()
+
+	switch res.StatusCode {
+	case http.StatusBadRequest:
+		verified, err := common.ResponseContainsSubstring(res.Body, "Include at least one message digest.")
+		if err != nil {
+			return false, nil, err
+		}
+		if verified {
+			return true, nil, nil
+		}
+		// A 400 without the expected message falls through to the error below.
+	case http.StatusForbidden:
+		// If we get a 403 status, the key is invalid
+		return false, nil, nil
+	}
+
+	// For all other status codes, return an error
+	return false, nil, fmt.Errorf("unexpected HTTP response status %d", res.StatusCode)
+}
+
+// Type returns the detector type reported for PyPI results.
+func (s Scanner) Type() detectorspb.DetectorType {
+	return detectorspb.DetectorType_PyPI
+}
diff --git a/pkg/detectors/pypi/pypi_test.go b/pkg/detectors/pypi/pypi_test.go
new file mode 100644
index 000000000..614da0756
--- /dev/null
+++ b/pkg/detectors/pypi/pypi_test.go
@@ -0,0 +1,223 @@
+//go:build detectors
+// +build detectors
+
+package pypi
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+)
+
+// TestPypi_Pattern checks keyword pre-filtering and regex extraction with verification disabled.
+func TestPypi_Pattern(t *testing.T) {
+	d := Scanner{}
+	ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
+	tests := []struct {
+		name  string
+		input string
+		want  []string
+	}{
+		{
+			name:  "typical pattern",
+			input: "pypi_token = 'pypi-AgEIcHlwaS5vcmcCJDQyM2M0Yjg4LWUyNDnnnnhhMy1hNigyLWI2ZWUyMTMwYzI2MgACKlszLCJhOWQwMWE0MS01Nzk4LTQyOWYtOTk4MS1lYzE5NTJhM2E3YzgiXQAABiBeGtDnnnnnV32VpiyeU-YUDKplSv0E5ngmwsnHaV2jGg'",
+			want:  []string{"pypi-AgEIcHlwaS5vcmcCJDQyM2M0Yjg4LWUyNDnnnnhhMy1hNigyLWI2ZWUyMTMwYzI2MgACKlszLCJhOWQwMWE0MS01Nzk4LTQyOWYtOTk4MS1lYzE5NTJhM2E3YzgiXQAABiBeGtDnnnnnV32VpiyeU-YUDKplSv0E5ngmwsnHaV2jGg"},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input))
+			if len(matchedDetectors) == 0 {
+				t.Errorf("keywords '%v' not matched by: %s", d.Keywords(), test.input)
+				return
+			}
+
+			results, err := d.FromData(context.Background(), false, []byte(test.input))
+			if err != nil {
+				t.Errorf("error = %v", err)
+				return
+			}
+
+			if len(results) != len(test.want) {
+				if len(results) == 0 {
+					t.Errorf("did not receive result")
+				} else {
+					t.Errorf("expected %d results, only received %d", len(test.want), len(results))
+				}
+				return
+			}
+
+			actual := make(map[string]struct{}, len(results))
+			for _, r := range results {
+				if len(r.RawV2) > 0 {
+					actual[string(r.RawV2)] = struct{}{}
+				} else {
+					actual[string(r.Raw)] = struct{}{}
+				}
+			}
+			expected := make(map[string]struct{}, len(test.want))
+			for _, v := range test.want {
+				expected[v] = struct{}{}
+			}
+
+			if diff := cmp.Diff(expected, actual); diff != "" {
+				t.Errorf("%s diff: (-want +got)\n%s", test.name, diff)
+			}
+		})
+	}
+}
+
+// TestPypi_FromChunk exercises FromData with verification enabled; it needs test secrets fetched from GCP.
+func TestPypi_FromChunk(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
+	defer cancel()
+	testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors5")
+	if err != nil {
+		t.Fatalf("could not get test secrets from GCP: %s", err)
+	}
+	secret := testSecrets.MustGetField("PYPI")
+	inactiveSecret := testSecrets.MustGetField("PYPI_INACTIVE")
+
+	type args struct {
+		ctx    context.Context
+		data   []byte
+		verify bool
+	}
+	tests := []struct {
+		name                string
+		s                   Scanner
+		args                args
+		want                []detectors.Result
+		wantErr             bool
+		wantVerificationErr bool
+	}{
+		{
+			name: "found, verified",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a pypi secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_PyPI,
+					Verified:     true,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "found, unverified",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a pypi secret %s within but not valid", inactiveSecret)), // the secret would satisfy the regex but not pass validation
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_PyPI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "not found",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte("You cannot find the secret within"),
+				verify: true,
+			},
+			want:                nil,
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "found, would be verified if not for timeout",
+			s:    Scanner{client: common.SaneHttpClientTimeOut(1 * time.Microsecond)},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a pypi secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_PyPI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: true,
+		},
+		{
+			name: "found, verified but unexpected api surface",
+			s:    Scanner{client: common.ConstantResponseHttpClient(404, "")},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a pypi secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_PyPI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := tt.s.FromData(tt.args.ctx, tt.args.verify, tt.args.data)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Pypi.FromData() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			for i := range got {
+				if len(got[i].Raw) == 0 {
+					t.Fatalf("no raw secret present: \n %+v", got[i])
+				}
+				if (got[i].VerificationError() != nil) != tt.wantVerificationErr {
+					t.Fatalf("wantVerificationError = %v, verification error = %v", tt.wantVerificationErr, got[i].VerificationError())
+				}
+			}
+			ignoreOpts := cmpopts.IgnoreFields(detectors.Result{}, "Raw", "verificationError")
+			if diff := cmp.Diff(got, tt.want, ignoreOpts); diff != "" {
+				t.Errorf("Pypi.FromData() %s diff: (-got +want)\n%s", tt.name, diff)
+			}
+		})
+	}
+}
+
+// BenchmarkFromData measures FromData with verification disabled over the shared benchmark data.
+func BenchmarkFromData(benchmark *testing.B) {
+	ctx := context.Background()
+	s := Scanner{}
+	for name, data := range detectors.MustGetBenchmarkData() {
+		benchmark.Run(name, func(b *testing.B) {
+			b.ResetTimer()
+			for n := 0; n < b.N; n++ {
+				_, err := s.FromData(ctx, false, data)
+				if err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/engine/defaults.go b/pkg/engine/defaults.go index b0eed55b7..9e77803b1 100644 --- a/pkg/engine/defaults.go +++ b/pkg/engine/defaults.go @@ -551,6 +551,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/purestake" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/pushbulletapikey" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/pusherchannelkey" + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/pypi" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/qase" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/qualaroo" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/qubole" @@ -806,6 +807,7 @@ import ( func DefaultDetectors() []detectors.Detector { return []detectors.Detector{ &heroku.Scanner{}, + &pypi.Scanner{}, &linearapi.Scanner{}, &alibaba.Scanner{}, aws.New(), diff --git a/pkg/pb/detectorspb/detectors.pb.go b/pkg/pb/detectorspb/detectors.pb.go index 638fbbd24..319e394cb 100644 --- a/pkg/pb/detectorspb/detectors.pb.go +++ b/pkg/pb/detectorspb/detectors.pb.go @@ -1099,6 +1099,7 @@ const ( DetectorType_Netsuite DetectorType = 995 DetectorType_RobinhoodCrypto DetectorType = 996 DetectorType_NVAPI DetectorType = 997 + DetectorType_PyPI DetectorType = 998 ) // Enum value maps for DetectorType. 
@@ -2098,6 +2099,7 @@ var ( 995: "Netsuite", 996: "RobinhoodCrypto", 997: "NVAPI", + 998: "PyPI", } DetectorType_value = map[string]int32{ "Alibaba": 0, @@ -3094,6 +3096,7 @@ var ( "Netsuite": 995, "RobinhoodCrypto": 996, "NVAPI": 997, + "PyPI": 998, } ) @@ -3547,7 +3550,7 @@ var file_detectors_proto_rawDesc = []byte{ 0x4c, 0x41, 0x49, 0x4e, 0x10, 0x01, 0x12, 0x0a, 0x0a, 0x06, 0x42, 0x41, 0x53, 0x45, 0x36, 0x34, 0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x54, 0x46, 0x31, 0x36, 0x10, 0x03, 0x12, 0x13, 0x0a, 0x0f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x49, 0x43, 0x4f, 0x44, 0x45, - 0x10, 0x04, 0x2a, 0xba, 0x7f, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54, + 0x10, 0x04, 0x2a, 0xc5, 0x7f, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x41, 0x6c, 0x69, 0x62, 0x61, 0x62, 0x61, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x41, 0x4d, 0x51, 0x50, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x41, 0x57, 0x53, 0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x41, 0x7a, 0x75, 0x72, 0x65, 0x10, 0x03, 0x12, 0x0a, @@ -4566,12 +4569,13 @@ var file_detectors_proto_rawDesc = []byte{ 0x0a, 0x45, 0x6c, 0x65, 0x76, 0x65, 0x6e, 0x4c, 0x61, 0x62, 0x73, 0x10, 0xe2, 0x07, 0x12, 0x0d, 0x0a, 0x08, 0x4e, 0x65, 0x74, 0x73, 0x75, 0x69, 0x74, 0x65, 0x10, 0xe3, 0x07, 0x12, 0x14, 0x0a, 0x0f, 0x52, 0x6f, 0x62, 0x69, 0x6e, 0x68, 0x6f, 0x6f, 0x64, 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, - 0x10, 0xe4, 0x07, 0x12, 0x0a, 0x0a, 0x05, 0x4e, 0x56, 0x41, 0x50, 0x49, 0x10, 0xe5, 0x07, 0x42, - 0x3d, 0x5a, 0x3b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, - 0x75, 0x66, 0x66, 0x6c, 0x65, 0x73, 0x65, 0x63, 0x75, 0x72, 0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, - 0x75, 0x66, 0x66, 0x6c, 0x65, 0x68, 0x6f, 0x67, 0x2f, 0x76, 0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, - 0x70, 0x62, 0x2f, 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x73, 0x70, 0x62, 0x62, 0x06, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x10, 0xe4, 0x07, 
0x12, 0x0a, 0x0a, 0x05, 0x4e, 0x56, 0x41, 0x50, 0x49, 0x10, 0xe5, 0x07, 0x12, + 0x09, 0x0a, 0x04, 0x50, 0x79, 0x50, 0x49, 0x10, 0xe6, 0x07, 0x42, 0x3d, 0x5a, 0x3b, 0x67, 0x69, + 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, + 0x73, 0x65, 0x63, 0x75, 0x72, 0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, + 0x68, 0x6f, 0x67, 0x2f, 0x76, 0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x70, 0x62, 0x2f, 0x64, 0x65, + 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x73, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x33, } var ( diff --git a/proto/detectors.proto b/proto/detectors.proto index 9da116a14..5f2817004 100644 --- a/proto/detectors.proto +++ b/proto/detectors.proto @@ -1007,6 +1007,7 @@ enum DetectorType { Netsuite = 995; RobinhoodCrypto = 996; NVAPI = 997; + PyPI = 998; } message Result {