Fixes

Pringled · Pringled · commit c7b9c0044b2f · 2026-03-07T10:58:35.000+01:00
diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go
@@ -13,10 +13,14 @@ import (
 	"github.com/Pringled/agentcheck/internal/models"
 )
 
-// nameRegexPatterns is compiled once at package init. It matches env var names that suggest
-// they hold credentials for known providers or generic secret terms.
-// Case-insensitive match on the full variable name.
-var nameRegexPatterns = []*regexp.Regexp{
+// credentialSuffixRe matches env var names containing a credential-related term anywhere
+// in the name; the pattern is unanchored, so the term need not be a literal suffix. Provider
+// name patterns require this match to skip vars like GITHUB_WORKSPACE or OPENAI_BASE_URL.
+var credentialSuffixRe = regexp.MustCompile(`(?i)(KEY|TOKEN|SECRET|PASSWORD|CRED)`)
+
+// providerNamePatterns matches env var names containing a known provider keyword.
+// These only produce a finding when the name also matches credentialSuffixRe.
+var providerNamePatterns = []*regexp.Regexp{
 	// AI / ML providers
 	regexp.MustCompile(`(?i)OPENAI`),
 	regexp.MustCompile(`(?i)ANTHROPIC`),
@@ -81,15 +85,19 @@ var nameRegexPatterns = []*regexp.Regexp{
 	regexp.MustCompile(`(?i)GITHUB`),
 	regexp.MustCompile(`(?i)GITLAB`),
 	regexp.MustCompile(`(?i)BITBUCKET`),
-	// Productivity / project tools (common in agent contexts)
-	regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while still matching MY_LINEAR_TOKEN
+	// Productivity / project tools
+	regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while matching MY_LINEAR_TOKEN
 	regexp.MustCompile(`(?i)NOTION`),
 	regexp.MustCompile(`(?i)AIRTABLE`),
-	// Database-as-a-service (API keys / connection tokens)
+	// Database-as-a-service
 	regexp.MustCompile(`(?i)SUPABASE`),
 	regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) avoids ANEMONE_CONFIG, NEONLIGHTS_COLOR while matching MY_NEON_KEY
 	regexp.MustCompile(`(?i)PLANETSCALE`),
-	// Generic credential terms
+}
+
+// credentialSuffixPatterns matches generic credential terms in env var names.
+// These match standalone without requiring a provider keyword.
+var credentialSuffixPatterns = []*regexp.Regexp{
 	regexp.MustCompile(`(?i)API_KEY`),
 	regexp.MustCompile(`(?i)API_TOKEN`),
 	regexp.MustCompile(`(?i)SECRET_KEY`),
@@ -109,15 +117,15 @@ type valuePattern struct {
 
 // valuePatterns lists known API key formats identified by a distinctive prefix and exact total length.
 var valuePatterns = []valuePattern{
-	// OpenAI — more-specific prefixes listed first so they match before the generic sk- entry.
+	// OpenAI - more-specific prefixes listed first so they match before the generic sk- entry.
 	{prefix: "sk-proj-", totalLen: 56, severity: models.SeverityHigh, providerTag: "OpenAI project"},
 	{prefix: "sk-admin-", totalLen: 57, severity: models.SeverityHigh, providerTag: "OpenAI admin"},
-	// sk- is shared by many tools (OpenAI legacy, LangChain proxies, self-hosted LLMs, …).
+	// sk- is shared by many tools (OpenAI legacy, LangChain proxies, self-hosted LLMs, etc.).
 	// Flag as UNCERTAIN so the user can confirm the actual provider via the variable name.
 	{prefix: "sk-", totalLen: 51, severity: models.SeverityUncertain, providerTag: "possible OpenAI legacy or other sk- key"},
-	// Anthropic — prefix is distinctive enough for HIGH confidence.
+	// Anthropic - prefix is distinctive enough for HIGH confidence.
 	{prefix: "sk-ant-", totalLen: 108, severity: models.SeverityHigh, providerTag: "Anthropic"},
-	// Stripe — underscore separator makes these provider-specific.
+	// Stripe - underscore separator makes these provider-specific.
 	{prefix: "sk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live secret"},
 	{prefix: "sk_test_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe test secret"},
 	{prefix: "rk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live restricted"},
@@ -128,9 +136,8 @@ var valuePatterns = []valuePattern{
 	{prefix: "npm_", totalLen: 40, severity: models.SeverityHigh, providerTag: "npm access token"},
 	// Groq — gsk_ prefix confirmed in Groq docs.
 	{prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"},
-	// Twilio API key SID — SK + 32 hex chars = 34 total.
-	// SeverityUncertain: the SK prefix is too broad (any 34-char string starting with SK
-	// would match); we don't validate the hex charset, so false positives are likely.
+	// Twilio API key SID - SK + 32 hex chars = 34 total.
+	// SeverityUncertain: the SK prefix is broad and the hex charset is not validated, so false positives are likely.
 	{prefix: "SK", totalLen: 34, severity: models.SeverityUncertain, providerTag: "Twilio API key SID"},
 	// SendGrid — SG. + 22 + . + 43 = 69 total (with the dots).
 	{prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"},
@@ -160,9 +167,6 @@ var credentialFiles = []config.CredentialFile{
 
 // APIKeyScanner scans for high-risk API keys in environment variables and credential config files.
 // Key names and file paths only are reported in findings; values and file contents are never emitted.
-// Exception: scanValuePatterns transiently reads env var values solely for prefix+length pattern
-// matching; values are discarded immediately and never stored in findings, logs, or any
-// data structure. See scanValuePatterns for the full security contract.
 // It never returns skipped=true.
 type APIKeyScanner struct {
 	Base
@@ -218,8 +222,6 @@ func (s *APIKeyScanner) Scan() models.ScanResult {
 
 // scanEnvKeys checks built-in and extra environment variable key names for presence.
 // Key names only are reported; values are never read or stored.
-// seenEnvNames is the shared cross-pass dedup set; matched names are added to it so
-// that scanNameRegex and scanValuePatterns will skip variables already claimed here.
 func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Finding {
 	var findings []models.Finding
 
@@ -284,30 +286,39 @@ func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Fin
 			continue
 		}
 
-		for _, re := range nameRegexPatterns {
-			if re.MatchString(name) {
-				seenEnvNames[name] = true
-				findings = append(findings, models.Finding{
-					Scanner:     "api_keys",
-					Resource:    name, // key name only, never the value
-					Severity:    models.SeverityHigh,
-					Description: "Can be used to make authenticated API calls.",
-				})
+		matched := false
+		// Provider patterns require the name to also contain a credential suffix.
+		for _, re := range providerNamePatterns {
+			if re.MatchString(name) && credentialSuffixRe.MatchString(name) {
+				matched = true
 				break
 			}
 		}
+		// Credential suffix patterns match standalone.
+		if !matched {
+			for _, re := range credentialSuffixPatterns {
+				if re.MatchString(name) {
+					matched = true
+					break
+				}
+			}
+		}
+		if matched {
+			seenEnvNames[name] = true
+			findings = append(findings, models.Finding{
+				Scanner:     "api_keys",
+				Resource:    name,
+				Severity:    models.SeverityHigh,
+				Description: "Can be used to make authenticated API calls.",
+			})
+		}
 	}
 
 	return findings
 }
 
 // scanValuePatterns reads env var values to match against known provider prefixes.
-// NOTE: unlike scanEnvKeys and scanNameRegex, this method reads the actual value.
-// Values are used only for prefix+length pattern matching and then discarded immediately.
-// No value is stored in findings, logs, or returned data structures.
-// This is a deliberate, scoped relaxation of the "values never read" contract.
-// seenEnvNames is the shared cross-pass dedup set; names already claimed by scanNameRegex
-// are skipped, and newly matched names are added.
+// Values are used only for prefix+length matching and then discarded.
 func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models.Finding {
 	var findings []models.Finding
 
@@ -344,7 +355,6 @@ func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models
 				break // one finding per variable name
 			}
 		}
-		// value goes out of scope here; it is not stored anywhere
 	}
 
 	return findings
@@ -359,9 +369,7 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding {
 	// If home directory cannot be resolved, skip all ~-based paths to avoid
 	// scanning incorrect root-relative paths (e.g. /.aws/credentials).
 	homeDir := s.resolveHomeDir()
-	// Combine built-in and extra credential files into a single pass.
-	// seenPath deduplicates so that an extra path duplicating a built-in
-	// (e.g. ~/.netrc in both lists) produces only one finding.
+	// seenPath deduplicates built-in and extra paths.
 	allCredFiles := append(credentialFiles, s.ExtraCredentialFiles...)
 	seenPath := make(map[string]bool, len(allCredFiles))
 	for _, cf := range allCredFiles {
@@ -386,11 +394,10 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding {
 	return findings
 }
 
-// envKeyFinding builds a HIGH severity finding for a detected environment variable key.
 func envKeyFinding(key string) models.Finding {
 	return models.Finding{
 		Scanner:     "api_keys",
-		Resource:    key, // key name only, never the value
+		Resource:    key,
 		Severity:    models.SeverityHigh,
 		Description: "Can be used to make authenticated API calls.",
 	}
diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go
@@ -19,9 +19,6 @@ func clearHighRiskEnv(t *testing.T) {
 }
 
 // clearAllEnv sets every environment variable to empty for the duration of the test.
-// Use this in tests that assert 0 findings, since nameRegex patterns (e.g. (?i)GITHUB)
-// can match CI variables like GITHUB_WORKSPACE that aren't credentials.
-// t.Setenv restores original values after the test.
 func clearAllEnv(t *testing.T) {
 	t.Helper()
 	for _, entry := range os.Environ() {
@@ -31,7 +28,6 @@ func clearAllEnv(t *testing.T) {
 	}
 }
 
-// newScannerWithHome creates an APIKeyScanner with HomeDir set to home and no extras.
 func newScannerWithHome(home string) *scan.APIKeyScanner {
 	s := scan.NewAPIKeyScanner()
 	s.HomeDir = home
@@ -342,11 +338,6 @@ func TestAPIKeyScanner_ExtraCredentialFiles_TildeExpanded(t *testing.T) {
 	assertResource(t, result.Findings, tokenFile)
 }
 
-// ── Name-regex tests ──────────────────────────────────────────────────────────
-
-// TestAPIKeyScanner_NameRegex_ProviderKeyword verifies that an env var with a
-// provider keyword in its name (MY_OPENAI_KEY) is flagged even though it is not
-// in HighRiskEnvKeys.
 func TestAPIKeyScanner_NameRegex_ProviderKeyword(t *testing.T) {
 	t.Setenv("MY_OPENAI_KEY", "sk-something")
 	clearHighRiskEnv(t)
@@ -357,8 +348,6 @@ func TestAPIKeyScanner_NameRegex_ProviderKeyword(t *testing.T) {
 	assertResource(t, result.Findings, "MY_OPENAI_KEY")
 }
 
-// TestAPIKeyScanner_NameRegex_GenericTerm verifies that an env var containing a
-// generic credential term (INTERNAL_API_KEY) is flagged.
 func TestAPIKeyScanner_NameRegex_GenericTerm(t *testing.T) {
 	t.Setenv("INTERNAL_API_KEY", "secret")
 	clearHighRiskEnv(t)
@@ -369,9 +358,6 @@ func TestAPIKeyScanner_NameRegex_GenericTerm(t *testing.T) {
 	assertResource(t, result.Findings, "INTERNAL_API_KEY")
 }
 
-// TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin verifies that a key already in
-// HighRiskEnvKeys (OPENAI_API_KEY) produces exactly ONE finding — scanEnvKeys() gets it
-// and scanNameRegex() skips it.
 func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) {
 	t.Setenv("OPENAI_API_KEY", "sk-test")
 	// Clear all built-in keys except OPENAI_API_KEY.
@@ -395,12 +381,6 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) {
 	}
 }
 
-// ── Value-pattern tests ───────────────────────────────────────────────────────
-
-// TestAPIKeyScanner_ValuePatterns verifies that each known provider value pattern
-// produces a finding with the correct severity and provider tag in the description.
-// Variable names are intentionally neutral (no provider keyword) so the finding
-// comes from scanValuePatterns, not scanNameRegex.
 func TestAPIKeyScanner_ValuePatterns(t *testing.T) {
 	cases := []struct {
 		name         string
@@ -447,9 +427,6 @@ func TestAPIKeyScanner_ValuePatterns(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN
-// but does NOT match BUTTERFLY_KEY (which contains the substring FLY_ but should not
-// be treated as a Fly.io credential due to the word-boundary anchor in the pattern).
 func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) {
 	clearHighRiskEnv(t)
 	t.Setenv("FLY_API_TOKEN", "real-token")
@@ -475,8 +452,6 @@ func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_NameRegex_NewProviders verifies that new provider keywords
-// added in this session are recognised.
 func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) {
 	clearHighRiskEnv(t)
 	cases := []struct {
@@ -509,8 +484,6 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the
-// right prefix but wrong length does NOT produce a finding.
 func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) {
 	value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern
 	t.Setenv("SOME_KEY", value)
@@ -526,10 +499,6 @@ func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_ValuePattern_TwilioSID verifies that a Twilio API key SID
-// (SK + 32 hex chars = 34 total) produces an UNCERTAIN finding.
-// The SK prefix is intentionally broad (any 34-char string starting with SK matches)
-// so we use SeverityUncertain rather than SeverityHigh to avoid false positives.
 func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) {
 	value := "SK" + strings.Repeat("f", 32) // total 34 chars
 	t.Setenv("CRED_SID", value)
@@ -552,9 +521,6 @@ func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) {
 	assertNoSecretValue(t, result.Findings, value)
 }
 
-// TestAPIKeyScanner_CrossPassDedup_NameRegexWins verifies that a variable whose name
-// matches a nameRegex pattern AND whose value matches a value pattern produces exactly
-// ONE finding — from the name-regex pass — not two.
 func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) {
 	// CUSTOM_STRIPE_KEY matches the STRIPE name-regex.
 	// sk_live_ + 47 chars matches the Stripe live secret value pattern.
@@ -585,10 +551,6 @@ func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) {
 	}
 }
 
-// ── Tightened-regex false-positive tests ─────────────────────────────────────
-
-// TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern verifies that the tightened \bNEON_
-// pattern does not fire on variable names that contain "neon" as part of a longer word.
 func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) {
 	clearHighRiskEnv(t)
 	// These should NOT be flagged.
@@ -613,8 +575,6 @@ func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern verifies that the tightened \bLINEAR_
-// pattern does not fire on names containing "linear" as a substring.
 func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) {
 	clearHighRiskEnv(t)
 	t.Setenv("BILINEAR_FILTER", "some-value")
@@ -631,8 +591,6 @@ func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern verifies that the tightened \bPALM_
-// pattern does not fire on names like NAPALM_MODE.
 func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) {
 	clearHighRiskEnv(t)
 	t.Setenv("NAPALM_MODE", "some-value")
@@ -652,8 +610,6 @@ func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_NameRegex_NewAIProviders verifies that newly added AI provider
-// name patterns are recognised.
 func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) {
 	clearHighRiskEnv(t)
 	cases := []struct {
@@ -683,10 +639,6 @@ func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex verifies that a key listed in
-// ExtraEnvKeys whose name also matches a nameRegexPattern produces exactly ONE finding.
-// Previously scanEnvKeys and scanNameRegex were not sharing the seenEnvNames dedup map,
-// so MY_OPENAI_KEY in extra_env_keys would fire twice.
 func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) {
 	const key = "MY_OPENAI_KEY" // matches OPENAI nameRegexPattern AND is in ExtraEnvKeys
 	t.Setenv(key, "sk-test-value")
@@ -709,11 +661,8 @@ func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) {
 	}
 }
 
-// TestAPIKeyScanner_ValuePattern_BuiltinSkipped verifies that a key in HighRiskEnvKeys
-// whose value also matches a value pattern produces exactly ONE finding (from scanEnvKeys,
-// not from scanValuePatterns which skips it).
 func TestAPIKeyScanner_ValuePattern_BuiltinSkipped(t *testing.T) {
-	value := "sk-proj-" + strings.Repeat("z", 48) // total 56 chars — matches OpenAI project pattern
+	value := "sk-proj-" + strings.Repeat("z", 48) // total 56 chars - matches OpenAI project pattern
 	t.Setenv("OPENAI_API_KEY", value)
 	// Clear all other built-in keys.
 	for k := range scan.HighRiskEnvKeys {
@@ -735,3 +684,22 @@ func TestAPIKeyScanner_ValuePattern_BuiltinSkipped(t *testing.T) {
 		t.Errorf("expected exactly 1 finding for OPENAI_API_KEY, got %d", count)
 	}
 }
+
+func TestAPIKeyScanner_NameRegex_ProviderWithoutSuffix_NotFlagged(t *testing.T) {
+	clearAllEnv(t)
+	// Provider keyword present but no credential suffix - should NOT be flagged.
+	t.Setenv("GITHUB_WORKSPACE", "/home/runner/work")
+	t.Setenv("GITHUB_ACTIONS", "true")
+	t.Setenv("OPENAI_BASE_URL", "https://api.openai.com")
+	t.Setenv("STRIPE_WEBHOOK_ENDPOINT", "https://example.com/webhook")
+
+	s := newScannerWithHome(t.TempDir())
+	result := s.Scan()
+
+	for _, f := range result.Findings {
+		switch f.Resource {
+		case "GITHUB_WORKSPACE", "GITHUB_ACTIONS", "OPENAI_BASE_URL", "STRIPE_WEBHOOK_ENDPOINT":
+			t.Errorf("%s should not be flagged (provider keyword without credential suffix)", f.Resource)
+		}
+	}
+}