feat(azure): create openai detector

This commit is contained in:
Richard Gomez 2024-01-27 17:08:47 -05:00
parent dc9c9a30b3
commit 2313629cd7
5 changed files with 431 additions and 7 deletions

View file

@ -0,0 +1,153 @@
package azure_openai
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
regexp "github.com/wasilibs/go-re2"
logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
)
// Scanner detects API keys for Azure's OpenAI service.
// https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
//
// The zero value is ready to use.
type Scanner struct {
// client is the HTTP client used for verification requests.
// When it is nil, FromData falls back to common.SaneHttpClient().
client *http.Client
}
// Ensure the Scanner satisfies the interface at compile time.
var _ detectors.Detector = (*Scanner)(nil)
// Patterns used by FromData to locate candidate endpoints and keys.
var (
// TODO: Investigate custom `azure-api.net` endpoints.
// https://github.com/openai/openai-python#microsoft-azure-openai
//
// azureUrlPat captures, case-insensitively, the host name of an Azure OpenAI
// resource (`<resource>.openai.azure.com`). The scheme and path are not part of the match.
azureUrlPat = regexp.MustCompile(`(?i)([a-z0-9-]+\.openai\.azure\.com)`)
// azureKeyPat captures a 32-character, lowercase-only hexadecimal string
// delimited by word boundaries. The capture must follow the expression that
// detectors.PrefixRegex builds from "api[_.-]?key"
// (presumably a nearby `api_key`-style identifier; see PrefixRegex for the exact rules).
azureKeyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"api[_.-]?key"}) + `\b(?-i:([a-f0-9]{32}))\b`)
)
// Keywords returns the substrings used to cheaply pre-filter chunks before the
// regular expressions run. The Azure OpenAI host suffix is used because the
// key itself has no distinctive prefix.
func (s Scanner) Keywords() []string {
	keywords := []string{".openai.azure.com"}
	return keywords
}
// FromData finds Azure OpenAI API keys in data and, when verify is true,
// checks each key against the Azure OpenAI endpoints found in the same data.
//
// Exactly one result is produced per distinct key:
//   - With verify disabled, or when no endpoint is present, the result carries
//     only the detector type, the redacted key and the raw key.
//   - When a key is accepted by an endpoint, or when there is exactly one
//     endpoint, the result is additionally bound to that endpoint through RawV2
//     ("<key>:<host>") and carries the verification outcome.
//   - When several endpoints are present and none accepts the key, the result
//     stays unbound; if any attempt failed with an error, the last such error is
//     recorded so the result is reported as indeterminate rather than as a
//     definite negative.
//
// The returned error is always nil; verification failures are attached to the
// individual results instead.
func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
	dataStr := string(data)

	// De-duplicate keys.
	tokens := make(map[string]struct{})
	for _, match := range azureKeyPat.FindAllStringSubmatch(dataStr, -1) {
		tokens[match[1]] = struct{}{}
	}
	if len(tokens) == 0 {
		return nil, nil
	}

	// De-duplicate endpoints.
	urls := make(map[string]struct{})
	for _, match := range azureUrlPat.FindAllStringSubmatch(dataStr, -1) {
		urls[match[1]] = struct{}{}
	}

	// Pick the HTTP client once; it does not change between keys or endpoints.
	client := s.client
	if verify && len(urls) > 0 && client == nil {
		client = common.SaneHttpClient()
	}

	// Process results.
	logCtx := logContext.AddLogger(ctx)
	for token := range tokens {
		s1 := detectors.Result{
			DetectorType: s.Type(),
			// The key pattern guarantees exactly 32 characters, so these slices are in range.
			Redacted: token[:3] + "..." + token[25:],
			Raw:      []byte(token),
		}

		if verify {
			// lastErr remembers a failed attempt in case no endpoint yields a conclusive answer.
			var lastErr error
			for url := range urls {
				isVerified, extraData, verificationErr := verifyAzureToken(logCtx, client, url, token)
				if isVerified || len(urls) == 1 {
					s1.RawV2 = []byte(token + ":" + url)
					s1.Verified = isVerified
					s1.ExtraData = extraData
					s1.SetVerificationError(verificationErr, token)
					// This endpoint's outcome is authoritative; earlier failures no longer matter.
					lastErr = nil
					break
				}
				if verificationErr != nil {
					lastErr = verificationErr
				}
			}
			if lastErr != nil {
				s1.SetVerificationError(lastErr, token)
			}
		}

		results = append(results, s1)
	}
	return results, nil
}
// verifyAzureToken checks whether token is accepted by the Azure OpenAI
// resource hosted at baseUrl (a bare host name, without scheme or path).
//
// It sends a GET request for the resource's deployment list with the token in
// the `api-key` header and returns:
//   - (true, nil, nil) when the service answers 200 with a JSON body whose
//     "object" field is non-empty;
//   - (false, nil, nil) when the token is rejected (401), or when a 200 body is
//     not JSON or lacks the "object" field;
//   - (false, nil, err) when the outcome is indeterminate: the request could not
//     be built or sent, the body could not be read, valid JSON could not be
//     decoded into the expected shape, or the status code was unexpected.
//
// The map return value is reserved for extra result metadata and is currently
// always nil.
func verifyAzureToken(ctx logContext.Context, client *http.Client, baseUrl, token string) (bool, map[string]string, error) {
	// TODO: Replace this with a more suitable long-term endpoint.
	// Most endpoints require additional info, e.g., deployment name, which complicates verification.
	// This may be retired in the future, so we should look for another candidate.
	// https://learn.microsoft.com/en-us/answers/questions/1371786/get-azure-openai-deployments-in-api
	endpoint := fmt.Sprintf("https://%s/openai/deployments?api-version=2023-03-15-preview", baseUrl)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		// A request that cannot be built says nothing about the token, so
		// report the failure instead of a definite "not verified".
		return false, nil, err
	}
	req.Header.Add("Content-Type", "application/json")
	req.Header.Add("api-key", token)

	res, err := client.Do(req)
	if err != nil {
		return false, nil, err
	}
	defer func() {
		// Drain before closing so the transport can reuse the connection.
		_, _ = io.Copy(io.Discard, res.Body)
		_ = res.Body.Close()
	}()

	switch res.StatusCode {
	case http.StatusOK:
		body, err := io.ReadAll(res.Body)
		if err != nil {
			return false, nil, err
		}

		var deployments deploymentsResponse
		if err := json.Unmarshal(body, &deployments); err != nil {
			if !json.Valid(body) {
				// If the response isn't JSON it's highly unlikely to be valid.
				return false, nil, nil
			}
			return false, nil, fmt.Errorf("failed to decode response %s: %w", req.URL, err)
		}

		// JSON unmarshal doesn't check whether the structure actually matches.
		if deployments.Object == "" {
			return false, nil, nil
		}

		// No extra data available at the moment.
		return true, nil, nil
	case http.StatusUnauthorized:
		return false, nil, nil
	default:
		return false, nil, fmt.Errorf("unexpected response status %d for %s", res.StatusCode, req.URL)
	}
}
// deploymentsResponse is the subset of the deployments-list response body that
// verifyAzureToken decodes.
type deploymentsResponse struct {
// Data holds the listed deployments; it is decoded but not otherwise read in this file.
Data []deployment `json:"data"`
// Object must be non-empty for verifyAzureToken to treat the response as genuine.
Object string `json:"object"`
}
// deployment is a single entry of deploymentsResponse.Data; only its "id" field is decoded.
type deployment struct {
ID string `json:"id"`
}
// Type reports the detector type that identifies results produced by this
// scanner: detectorspb.DetectorType_AzureOpenAI.
func (s Scanner) Type() detectorspb.DetectorType {
	const detectorType = detectorspb.DetectorType_AzureOpenAI
	return detectorType
}

View file

@ -0,0 +1,264 @@
//go:build detectors
// +build detectors
package azure_openai
import (
"context"
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)
// TestAzureOpenAI_Pattern feeds representative snippets through the keyword
// pre-filter and then through FromData with verification disabled, and checks
// that exactly the expected keys are reported.
func TestAzureOpenAI_Pattern(t *testing.T) {
	d := Scanner{}
	ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})

	cases := []struct {
		name  string
		input string
		want  []string
	}{
		{
			name: "Generic environment variables",
			input: `export OPENAI_API_VERSION=2023-07-15-preview
export OPENAI_API_TYPE=AZURE
export OPENAI_API_BASE=https://james-test-gpt4.openai.azure.com/
export OPENAI_API_KEY=3397348fcdcb4a5fbeb6cceb5a6a284f`,
			want: []string{"3397348fcdcb4a5fbeb6cceb5a6a284f"},
		},
		{
			name: "Generic non-structured",
			input: `# {'input': ['This is a test query.'], 'engine': 'text-embedding-ada-002'}
# url /openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
# params {'input': ['This is a test query.'], 'encoding_format': 'base64'}
# headers None
# message='Request to OpenAI API' method=post path=https://notebook-openai01.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
# api_version=2022-12-01 data='{"input": ["This is a test query."], "encoding_format": "base64"}' message='Post details'
# https://notebook-openai01.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
# {'X-OpenAI-Client-User-Agent': '{"bindings_version": "0.27.6", "httplib": "requests", "lang": "python", "lang_version": "3.11.2", "platform": "macOS-13.2-arm64-arm-64bit", "publisher": "openai", "uname": "Darwin 22.3.0 Darwin Kernel Version 22.3.0: Thu Jan 5 20:48:54 PST 2023; root:xnu-8792.81.2~2/RELEASE_ARM64_T6000 arm64 arm"}', 'User-Agent': 'OpenAI/v1 PythonBindings/0.27.6', 'api-key': '49eb7c2d3acd41f4ac31fef59ceacbba', 'OpenAI-Debug': 'true', 'Content-Type': 'application/json'}`,
			want: []string{"49eb7c2d3acd41f4ac31fef59ceacbba"},
		},
		{
			name: "Python",
			input: `import openai
openai.api_key = '1bb7dff73fe449de829363ea03bab134'
openai.api_base = "https://hrcop-openai.openai.azure.com/"
`,
			want: []string{"1bb7dff73fe449de829363ea03bab134"},
		},
		{
			name: "Python environment variables",
			input: `os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = "https://superhackathonai101-openai.openai.azure.com/"
os.environ["OPENAI_API_KEY"] = '1bb7dde73fe449de229361ea03bab234'`,
			want: []string{"1bb7dde73fe449de229361ea03bab234"},
		},
		{
			name: "TypeScript",
			input: `import OpenAI from "openai";
export const openai = new OpenAI({
apiKey: "3375e3ad9a874cd6bd954b6f163be84f",
baseURL:
"https://kumar-azure.openai.azure.com/openai/deployments/ChatAutoUpdate",
defaultQuery: { "api-version": "2023-06-01-preview" },
});`,
			want: []string{"3375e3ad9a874cd6bd954b6f163be84f"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The detector must survive the keyword pre-filter before its patterns are exercised.
			matched := make(map[ahocorasick.DetectorKey]detectors.Detector, 2)
			ahoCorasickCore.PopulateMatchingDetectors(tc.input, matched)
			if len(matched) == 0 {
				t.Errorf("keywords '%v' not matched by: %s", d.Keywords(), tc.input)
				return
			}

			results, err := d.FromData(context.Background(), false, []byte(tc.input))
			if err != nil {
				t.Errorf("error = %v", err)
				return
			}

			switch {
			case len(results) == len(tc.want):
				// Counts agree; compare the contents below.
			case len(results) == 0:
				t.Errorf("did not receive result")
				return
			default:
				t.Errorf("expected %d results, only received %d", len(tc.want), len(results))
				return
			}

			// Compare as sets, preferring RawV2 over Raw when it is populated.
			actual := make(map[string]struct{}, len(results))
			for _, r := range results {
				key := r.Raw
				if len(r.RawV2) > 0 {
					key = r.RawV2
				}
				actual[string(key)] = struct{}{}
			}

			expected := make(map[string]struct{}, len(tc.want))
			for _, w := range tc.want {
				expected[w] = struct{}{}
			}

			diff := cmp.Diff(expected, actual)
			if diff != "" {
				t.Errorf("%s diff: (-want +got)\n%s", tc.name, diff)
			}
		})
	}
}
// TestAzureOpenAI_FromChunk is an integration test that runs FromData with
// verification enabled against secrets fetched from the "trufflehog-testing"
// GCP project. It covers an active secret, an inactive secret, input without a
// secret, a client that times out, and a client that always answers 404.
//
// Only DetectorType and Verified are compared against the expectation. Raw,
// RawV2 and Redacted are derived from the secret itself and cannot be spelled
// out in the table, so they are excluded from the diff (Raw is still checked
// for presence).
func TestAzureOpenAI_FromChunk(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
	defer cancel()
	testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors5")
	if err != nil {
		t.Fatalf("could not get test secrets from GCP: %s", err)
	}
	secret := testSecrets.MustGetField("AZUREOPENAI")
	inactiveSecret := testSecrets.MustGetField("AZUREOPENAI_INACTIVE")

	type args struct {
		ctx    context.Context
		data   []byte
		verify bool
	}
	tests := []struct {
		name                string
		s                   Scanner
		args                args
		want                []detectors.Result
		wantErr             bool
		wantVerificationErr bool
	}{
		{
			name: "found, verified",
			s:    Scanner{},
			args: args{
				ctx:    context.Background(),
				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
				verify: true,
			},
			want: []detectors.Result{
				{
					DetectorType: detectorspb.DetectorType_AzureOpenAI,
					Verified:     true,
				},
			},
			wantErr:             false,
			wantVerificationErr: false,
		},
		{
			name: "found, unverified",
			s:    Scanner{},
			args: args{
				ctx:    context.Background(),
				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within but not valid", inactiveSecret)), // the secret would satisfy the regex but not pass validation
				verify: true,
			},
			want: []detectors.Result{
				{
					DetectorType: detectorspb.DetectorType_AzureOpenAI,
					Verified:     false,
				},
			},
			wantErr:             false,
			wantVerificationErr: false,
		},
		{
			name: "not found",
			s:    Scanner{},
			args: args{
				ctx:    context.Background(),
				data:   []byte("You cannot find the secret within"),
				verify: true,
			},
			want:                nil,
			wantErr:             false,
			wantVerificationErr: false,
		},
		{
			name: "found, would be verified if not for timeout",
			s:    Scanner{client: common.SaneHttpClientTimeOut(1 * time.Microsecond)},
			args: args{
				ctx:    context.Background(),
				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
				verify: true,
			},
			want: []detectors.Result{
				{
					DetectorType: detectorspb.DetectorType_AzureOpenAI,
					Verified:     false,
				},
			},
			wantErr:             false,
			wantVerificationErr: true,
		},
		{
			name: "found, verified but unexpected api surface",
			s:    Scanner{client: common.ConstantResponseHttpClient(404, "")},
			args: args{
				ctx:    context.Background(),
				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
				verify: true,
			},
			want: []detectors.Result{
				{
					DetectorType: detectorspb.DetectorType_AzureOpenAI,
					Verified:     false,
				},
			},
			wantErr:             false,
			wantVerificationErr: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := tt.s.FromData(tt.args.ctx, tt.args.verify, tt.args.data)
			if (err != nil) != tt.wantErr {
				t.Errorf("Azureopenai.FromData() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			for i := range got {
				if len(got[i].Raw) == 0 {
					t.Fatalf("no raw secret present: \n %+v", got[i])
				}
				if (got[i].VerificationError() != nil) != tt.wantVerificationErr {
					t.Fatalf("wantVerificationError = %v, verification error = %v", tt.wantVerificationErr, got[i].VerificationError())
				}
			}
			// FromData always fills Redacted, and fills RawV2 when it binds a key
			// to an endpoint. Both depend on the secret's value, so they must be
			// ignored or every "found" case reports a spurious diff.
			ignoreOpts := cmpopts.IgnoreFields(detectors.Result{}, "Raw", "RawV2", "Redacted", "verificationError")
			if diff := cmp.Diff(got, tt.want, ignoreOpts); diff != "" {
				t.Errorf("Azureopenai.FromData() %s diff: (-got +want)\n%s", tt.name, diff)
			}
		})
	}
}
// BenchmarkFromData measures FromData, with verification disabled, once per
// data set returned by detectors.MustGetBenchmarkData. Each data set runs as
// its own sub-benchmark.
func BenchmarkFromData(benchmark *testing.B) {
	scanner := Scanner{}
	background := context.Background()
	for name, data := range detectors.MustGetBenchmarkData() {
		benchmark.Run(name, func(b *testing.B) {
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				if _, err := scanner.FromData(background, false, data); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}

View file

@ -64,6 +64,7 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/aylien"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/ayrshare"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azure"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azure_openai"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azurebatch"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azurecontainerregistry"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azuredevopspersonalaccesstoken"
@ -1632,6 +1633,7 @@ func DefaultDetectors() []detectors.Detector {
netsuite.Scanner{},
robinhoodcrypto.Scanner{},
nvapi.Scanner{},
&azure_openai.Scanner{},
}
}

View file

@ -1100,6 +1100,7 @@ const (
DetectorType_RobinhoodCrypto DetectorType = 996
DetectorType_NVAPI DetectorType = 997
DetectorType_PyPI DetectorType = 998
DetectorType_AzureOpenAI DetectorType = 999
)
// Enum value maps for DetectorType.
@ -2100,6 +2101,7 @@ var (
996: "RobinhoodCrypto",
997: "NVAPI",
998: "PyPI",
999: "AzureOpenAI",
}
DetectorType_value = map[string]int32{
"Alibaba": 0,
@ -3097,6 +3099,7 @@ var (
"RobinhoodCrypto": 996,
"NVAPI": 997,
"PyPI": 998,
"AzureOpenAI": 999,
}
)
@ -3550,7 +3553,7 @@ var file_detectors_proto_rawDesc = []byte{
0x4c, 0x41, 0x49, 0x4e, 0x10, 0x01, 0x12, 0x0a, 0x0a, 0x06, 0x42, 0x41, 0x53, 0x45, 0x36, 0x34,
0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x54, 0x46, 0x31, 0x36, 0x10, 0x03, 0x12, 0x13, 0x0a,
0x0f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x49, 0x43, 0x4f, 0x44, 0x45,
0x10, 0x04, 0x2a, 0xc5, 0x7f, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54,
0x10, 0x04, 0x2a, 0xd7, 0x7f, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54,
0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x41, 0x6c, 0x69, 0x62, 0x61, 0x62, 0x61, 0x10, 0x00,
0x12, 0x08, 0x0a, 0x04, 0x41, 0x4d, 0x51, 0x50, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x41, 0x57,
0x53, 0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x41, 0x7a, 0x75, 0x72, 0x65, 0x10, 0x03, 0x12, 0x0a,
@ -4570,12 +4573,13 @@ var file_detectors_proto_rawDesc = []byte{
0x0a, 0x08, 0x4e, 0x65, 0x74, 0x73, 0x75, 0x69, 0x74, 0x65, 0x10, 0xe3, 0x07, 0x12, 0x14, 0x0a,
0x0f, 0x52, 0x6f, 0x62, 0x69, 0x6e, 0x68, 0x6f, 0x6f, 0x64, 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f,
0x10, 0xe4, 0x07, 0x12, 0x0a, 0x0a, 0x05, 0x4e, 0x56, 0x41, 0x50, 0x49, 0x10, 0xe5, 0x07, 0x12,
0x09, 0x0a, 0x04, 0x50, 0x79, 0x50, 0x49, 0x10, 0xe6, 0x07, 0x42, 0x3d, 0x5a, 0x3b, 0x67, 0x69,
0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65,
0x73, 0x65, 0x63, 0x75, 0x72, 0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65,
0x68, 0x6f, 0x67, 0x2f, 0x76, 0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x70, 0x62, 0x2f, 0x64, 0x65,
0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x73, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f,
0x33,
0x09, 0x0a, 0x04, 0x50, 0x79, 0x50, 0x49, 0x10, 0xe6, 0x07, 0x12, 0x10, 0x0a, 0x0b, 0x41, 0x7a,
0x75, 0x72, 0x65, 0x4f, 0x70, 0x65, 0x6e, 0x41, 0x49, 0x10, 0xe7, 0x07, 0x42, 0x3d, 0x5a, 0x3b,
0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66,
0x6c, 0x65, 0x73, 0x65, 0x63, 0x75, 0x72, 0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66,
0x6c, 0x65, 0x68, 0x6f, 0x67, 0x2f, 0x76, 0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x70, 0x62, 0x2f,
0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x73, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f,
0x74, 0x6f, 0x33,
}
var (

View file

@ -1008,6 +1008,7 @@ enum DetectorType {
RobinhoodCrypto = 996;
NVAPI = 997;
PyPI = 998;
AzureOpenAI = 999;
}
message Result {