fix: restore deleted tests, downgrade Twilio SK to UNCERTAIN, fix LINEAR_ regex

Pringled · Pringled · commit ce41c8d1441d · 2026-03-07T10:32:06.000+01:00
- Restore 14 value-pattern and name-regex tests accidentally deleted in 124de66 (recovered from db20f03 and merged with the tests added in HEAD).
- Downgrade the Twilio SK prefix from SeverityHigh to SeverityUncertain: the bare 'SK' prefix is too broad (the hex charset is not validated), so false positives are likely; the test is updated to assert UNCERTAIN.
- Fix the LINEAR_ name-regex: replace \bLINEAR_ with (^|_)LINEAR_ so that MY_LINEAR_TOKEN matches (underscore is a word character in RE2, so \b does not match between '_' and 'L') while BILINEAR_FILTER still does not match.
diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go
@@ -78,7 +78,7 @@ var nameRegexPatterns = []*regexp.Regexp{
 	regexp.MustCompile(`(?i)GITLAB`),
 	regexp.MustCompile(`(?i)BITBUCKET`),
 	// Productivity / project tools (common in agent contexts)
-	regexp.MustCompile(`(?i)\bLINEAR_`), // word boundary + underscore avoids BILINEAR_FILTER
+	regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while still matching MY_LINEAR_TOKEN
 	regexp.MustCompile(`(?i)NOTION`),
 	regexp.MustCompile(`(?i)AIRTABLE`),
 	// Database-as-a-service (API keys / connection tokens)
@@ -125,7 +125,9 @@ var valuePatterns = []valuePattern{
 	// Groq — gsk_ prefix confirmed in Groq docs.
 	{prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"},
 	// Twilio API key SID — SK + 32 hex chars = 34 total.
-	{prefix: "SK", totalLen: 34, severity: models.SeverityHigh, providerTag: "Twilio API key SID"},
+	// SeverityUncertain: the SK prefix is too broad (any 34-char string starting with SK
+	// would match); we don't validate the hex charset, so false positives are likely.
+	{prefix: "SK", totalLen: 34, severity: models.SeverityUncertain, providerTag: "Twilio API key SID"},
 	// SendGrid — SG. + 22 + . + 43 = 69 total (with the dots).
 	{prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"},
 	// HuggingFace
diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go
@@ -363,10 +363,333 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) {
 	}
 }
 
+// ── Value-pattern tests ───────────────────────────────────────────────────────
+
+// TestAPIKeyScanner_ValuePattern_AmbiguousSK verifies that a value matching the
+// generic sk- format (51 chars) produces an UNCERTAIN finding, not HIGH, because
+// sk- is used by many tools beyond OpenAI legacy.
+func TestAPIKeyScanner_ValuePattern_AmbiguousSK(t *testing.T) {
+	value := "sk-" + strings.Repeat("x", 48) // total 51 chars
+	t.Setenv("SOME_CRED", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "SOME_CRED")
+	for _, f := range result.Findings {
+		if f.Resource == "SOME_CRED" {
+			if f.Severity != "UNCERTAIN" {
+				t.Errorf("expected UNCERTAIN severity for ambiguous sk- key, got %q", f.Severity)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_StripeLiveSecret verifies that a Stripe live secret key
+// (sk_live_ + 47 chars = 55 total) produces a HIGH finding.
+func TestAPIKeyScanner_ValuePattern_StripeLiveSecret(t *testing.T) {
+	value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars
+	t.Setenv("PAYMENT_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "PAYMENT_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "PAYMENT_KEY" {
+			if f.Severity != "HIGH" {
+				t.Errorf("expected HIGH severity for Stripe live key, got %q", f.Severity)
+			}
+			if !strings.Contains(f.Description, "Stripe") {
+				t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_StripeTestSecret verifies that a Stripe test secret key
+// (sk_test_ + 47 chars = 55 total) produces a Stripe-tagged finding (severity is not asserted).
+func TestAPIKeyScanner_ValuePattern_StripeTestSecret(t *testing.T) {
+	value := "sk_test_" + strings.Repeat("t", 47) // total 55 chars
+	t.Setenv("TEST_PAYMENT_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "TEST_PAYMENT_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "TEST_PAYMENT_KEY" {
+			if !strings.Contains(f.Description, "Stripe") {
+				t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description)
+			}
+		}
+	}
+}
+
+// TestAPIKeyScanner_ValuePattern_GitLabPAT verifies that a GitLab personal access token
+// (glpat- + 20 chars = 26 total) produces a HIGH finding.
+func TestAPIKeyScanner_ValuePattern_GitLabPAT(t *testing.T) {
+	value := "glpat-" + strings.Repeat("g", 20) // total 26 chars
+	t.Setenv("REPO_TOKEN", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "REPO_TOKEN")
+	for _, f := range result.Findings {
+		if f.Resource == "REPO_TOKEN" {
+			if f.Severity != "HIGH" {
+				t.Errorf("expected HIGH severity for GitLab PAT, got %q", f.Severity)
+			}
+			if !strings.Contains(f.Description, "GitLab") {
+				t.Errorf("expected description to contain %q, got %q", "GitLab", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_NpmToken verifies that an npm granular access token
+// (npm_ + 36 chars = 40 total) produces an npm-tagged finding (severity is not asserted).
+func TestAPIKeyScanner_ValuePattern_NpmToken(t *testing.T) {
+	value := "npm_" + strings.Repeat("n", 36) // total 40 chars
+	t.Setenv("REGISTRY_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "REGISTRY_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "REGISTRY_KEY" {
+			if !strings.Contains(f.Description, "npm") {
+				t.Errorf("expected description to contain %q, got %q", "npm", f.Description)
+			}
+		}
+	}
+}
+
+// TestAPIKeyScanner_ValuePattern_Groq verifies that a Groq key (gsk_ + 52 chars = 56 total)
+// produces a HIGH finding.
+func TestAPIKeyScanner_ValuePattern_Groq(t *testing.T) {
+	value := "gsk_" + strings.Repeat("q", 52) // total 56 chars
+	t.Setenv("INFERENCE_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "INFERENCE_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "INFERENCE_KEY" {
+			if f.Severity != "HIGH" {
+				t.Errorf("expected HIGH severity for Groq key, got %q", f.Severity)
+			}
+			if !strings.Contains(f.Description, "Groq") {
+				t.Errorf("expected description to contain %q, got %q", "Groq", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_SendGrid verifies that a SendGrid key
+// (SG. + 22 chars + . + 43 chars = 69 total) produces a SendGrid-tagged finding (severity is not asserted).
+func TestAPIKeyScanner_ValuePattern_SendGrid(t *testing.T) {
+	// SG. (3) + 22 chars + . (1) + 43 chars = 69 total
+	value := "SG." + strings.Repeat("a", 22) + "." + strings.Repeat("b", 43)
+	t.Setenv("MAIL_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "MAIL_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "MAIL_KEY" {
+			if !strings.Contains(f.Description, "SendGrid") {
+				t.Errorf("expected description to contain %q, got %q", "SendGrid", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_Anthropic verifies that an Anthropic key
+// (sk-ant- prefix, 108 total chars) produces a HIGH finding.
+func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) {
+	value := "sk-ant-" + strings.Repeat("a", 101) // total 108 chars
+	t.Setenv("LLM_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "LLM_KEY")
+	for _, f := range result.Findings {
+		if f.Resource == "LLM_KEY" {
+			if f.Severity != "HIGH" {
+				t.Errorf("expected HIGH severity for Anthropic key, got %q", f.Severity)
+			}
+			if !strings.Contains(f.Description, "Anthropic") {
+				t.Errorf("expected description to contain %q, got %q", "Anthropic", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN
+// but, because the pattern is anchored, does NOT match BUTTERFLY_KEY (FLY_ appears
+// mid-word) or FLYWEIGHT_INDEX (FLY is not followed by an underscore).
+func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) {
+	clearHighRiskEnv(t)
+	t.Setenv("FLY_API_TOKEN", "real-token")
+	t.Setenv("BUTTERFLY_KEY", "not-a-fly-token")
+	t.Setenv("FLYWEIGHT_INDEX", "not-a-token")
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	// FLY_API_TOKEN must be flagged.
+	assertResource(t, result.Findings, "FLY_API_TOKEN")
+
+	// BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged.
+	for _, f := range result.Findings {
+		if f.Resource == "BUTTERFLY_KEY" {
+			t.Error("BUTTERFLY_KEY should not be flagged by FLY_ pattern")
+		}
+		if f.Resource == "FLYWEIGHT_INDEX" {
+			t.Error("FLYWEIGHT_INDEX should not be flagged by FLY_ pattern")
+		}
+	}
+}
+
+// TestAPIKeyScanner_NameRegex_NewProviders verifies that env var names containing the
+// provider keywords in the table below (Gemini, Linear, Supabase, ...) are each flagged.
+func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) {
+	clearHighRiskEnv(t)
+	cases := []struct {
+		envVar string
+		value  string
+	}{
+		{"MY_GEMINI_KEY", "gemini-key-value"},
+		{"VERTEX_API_KEY", "vertex-key-value"},
+		{"BEDROCK_ACCESS_KEY", "bedrock-key-value"},
+		{"AZURE_OPENAI_KEY", "azure-openai-key"},
+		{"RESEND_API_KEY", "resend-key-value"},
+		{"POSTMARK_TOKEN", "postmark-key-value"},
+		{"MY_LINEAR_TOKEN", "linear-key-value"},
+		{"NOTION_API_KEY", "notion-key-value"},
+		{"AIRTABLE_KEY", "airtable-key-value"},
+		{"SUPABASE_KEY", "supabase-key-value"},
+		{"NEON_API_KEY", "neon-key-value"},
+		{"PLANETSCALE_TOKEN", "ps-key-value"},
+	}
+
+	for _, tc := range cases {
+		t.Setenv(tc.envVar, tc.value)
+	}
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	for _, tc := range cases {
+		assertResource(t, result.Findings, tc.envVar)
+	}
+}
+
+// TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the
+// OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding
+// with the correct resource name and provider tag in the description.
+func TestAPIKeyScanner_ValuePattern_OpenAIProject(t *testing.T) {
+	value := "sk-proj-" + strings.Repeat("a", 48) // total 56 chars
+	t.Setenv("SOME_AI_CRED", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "SOME_AI_CRED")
+	for _, f := range result.Findings {
+		if f.Resource == "SOME_AI_CRED" {
+			if !strings.Contains(f.Description, "OpenAI project") {
+				t.Errorf("expected description to contain %q, got %q", "OpenAI project", f.Description)
+			}
+		}
+	}
+	assertNoSecretValue(t, result.Findings, value)
+}
+
+// TestAPIKeyScanner_ValuePattern_HuggingFace verifies that a value matching the
+// HuggingFace token format (hf_ + 34 chars = 37 total) produces a correct finding.
+func TestAPIKeyScanner_ValuePattern_HuggingFace(t *testing.T) {
+	value := "hf_" + strings.Repeat("b", 34) // total 37 chars
+	// Use a variable name that does NOT match any nameRegex pattern so the finding
+	// comes from scanValuePatterns (and the HuggingFace provider tag is in the description).
+	t.Setenv("ML_MODEL_CRED", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "ML_MODEL_CRED")
+	for _, f := range result.Findings {
+		if f.Resource == "ML_MODEL_CRED" {
+			if !strings.Contains(f.Description, "HuggingFace") {
+				t.Errorf("expected description to contain %q, got %q", "HuggingFace", f.Description)
+			}
+		}
+	}
+}
+
+// TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT verifies that a value matching the
+// GitHub classic PAT format (ghp_ + 36 chars = 40 total) produces a correct finding.
+func TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT(t *testing.T) {
+	value := "ghp_" + strings.Repeat("c", 36) // total 40 chars
+	t.Setenv("WORK_GH_TOKEN", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	assertResource(t, result.Findings, "WORK_GH_TOKEN")
+	for _, f := range result.Findings {
+		if f.Resource == "WORK_GH_TOKEN" {
+			if !strings.Contains(f.Description, "GitHub") {
+				t.Errorf("expected description to contain %q, got %q", "GitHub", f.Description)
+			}
+		}
+	}
+}
+
+// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the
+// right prefix but wrong length does NOT produce a finding.
+func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) {
+	value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern
+	t.Setenv("SOME_KEY", value)
+	clearHighRiskEnv(t)
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	for _, f := range result.Findings {
+		if f.Resource == "SOME_KEY" {
+			t.Errorf("got unexpected finding for SOME_KEY with wrong-length value")
+		}
+	}
+}
+
 // TestAPIKeyScanner_ValuePattern_TwilioSID verifies that a Twilio API key SID
-// (SK + 32 hex chars = 34 total) produces a HIGH finding.
-// The variable name is intentionally neutral (no provider keyword) so the finding
-// comes from the value-pattern pass, confirming the pattern itself works.
+// (SK + 32 hex chars = 34 total) produces an UNCERTAIN finding.
+// The SK prefix is broad (any 34-char string starting with SK matches, since the hex
+// charset is not validated), so the pattern uses SeverityUncertain rather than SeverityHigh.
 func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) {
 	value := "SK" + strings.Repeat("f", 32) // total 34 chars
 	t.Setenv("CRED_SID", value)
@@ -378,8 +701,8 @@ func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) {
 	assertResource(t, result.Findings, "CRED_SID")
 	for _, f := range result.Findings {
 		if f.Resource == "CRED_SID" {
-			if f.Severity != "HIGH" {
-				t.Errorf("expected HIGH severity for Twilio SID, got %q", f.Severity)
+			if f.Severity != "UNCERTAIN" {
+				t.Errorf("expected UNCERTAIN severity for Twilio SID (broad SK prefix), got %q", f.Severity)
 			}
 			if !strings.Contains(f.Description, "Twilio") {
 				t.Errorf("expected description to contain %q, got %q", "Twilio", f.Description)