Skip to content

Commit 70ca861

Browse files
authored
Merge pull request #404 from Lexus2016/evolution/issue-398-secret-detection
feat(security-guidance): secret detection slice (#398, child of #390)
2 parents 9526b75 + 1149b0d commit 70ca861

3 files changed

Lines changed: 360 additions & 0 deletions

File tree

plugins/security-guidance/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from typing import Any, Dict, List, Optional, Tuple
3939

4040
from . import patterns as _patterns
41+
from . import secrets as _secrets
4142

4243
logger = logging.getLogger(__name__)
4344

@@ -196,6 +197,7 @@ def _scan_args(tool_name: str, args: Any) -> List[Tuple[str, str]]:
196197
findings: List[Tuple[str, str]] = []
197198
for path, content in _extract_path_and_content(tool_name, args):
198199
findings.extend(_scan_content(path, content))
200+
findings.extend(_secrets.scan_secrets(path, content))
199201
return findings
200202

201203

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""Secret detection for the security-guidance plugin (Hermes addition, #398).
2+
3+
Child of #390 — first shippable slice of the security code-review plugin.
4+
NOT part of the Anthropic fork: ``patterns.py`` is byte-for-byte upstream, so
5+
this Hermes-side logic lives in its own module. Two layers:
6+
7+
1. Regex rules for well-known credential formats (AWS, GitHub, Slack, Google,
8+
Stripe, npm, PEM private keys, JWT, generic api-key assignments).
9+
2. A conservative Shannon-entropy check: a high-entropy value assigned to a
10+
secret-named key, with obvious placeholders/example values excluded. The
11+
threshold is deliberately conservative (~4.0 bits/char) to keep the
12+
false-positive rate low, so it will NOT flag low-entropy human passphrases
13+
(e.g. "correcthorsebatterystaple"); known-format keys are caught by layer 1.
14+
15+
Findings are returned as ``(ruleName, reminder)`` tuples — the same shape the
16+
regex security rules use — so they flow through the existing warn/block path in
17+
``__init__.py`` with no special handling.
18+
"""
19+
20+
from __future__ import annotations
21+
22+
import math
23+
import re
24+
from typing import Dict, List, Set, Tuple
25+
26+
# Scan cap, mirroring _MAX_SCAN_BYTES in __init__.py: pattern-matching a huge
# blob is poor signal-to-noise and slows the agent loop. The value is duplicated
# here (rather than imported) so this module stays stdlib-only and importable in
# isolation. If you change one, change both.
_MAX_SCAN_BYTES = 256 * 1024

# Obvious non-secrets — example keys, placeholders, redactions — are excluded so
# AWS's documented ``AKIAIOSFODNN7EXAMPLE`` and friends, or
# ``api_key = "your-key-here"``, don't generate false warnings. The exclusion
# regexes are matched against the text a rule matched. Two sets:
#   _EXAMPLE_RE — unambiguous "this is documentation, not a real key" words.
#     Applied even to fixed-prefix tokens (AKIA…/ghp_…), on the assumption that
#     a real random key is very unlikely to contain the literal word
#     "example"/"dummy"/etc.
#   _PLACEHOLDER_RE — broader, includes structural fillers (your-, xxxx, 0000,
#     <...>). Applied ONLY to assignment-style/entropy values, never to a
#     fixed-prefix token — otherwise a real key that merely *contains* "xxxx"
#     or "0000" as a substring would be silently dropped (a fail-open miss in
#     a security tool). See scan_secrets().
_EXAMPLE_RE = re.compile(
    r"(?i)(example|redacted|placeholder|dummy|sample|changeme|fake|"
    r"test[_-]?(?:key|token|secret))"
)
_PLACEHOLDER_RE = re.compile(
    r"(?i)(example|redacted|placeholder|dummy|sample|changeme|your[_-]?|"
    r"x{4,}|\.\.\.|<[a-z0-9_ .-]+>|fake|test[_-]?(?:key|token|secret)|0{8,})"
)

# Reminder for a known-format credential; ``{kind}`` is filled with the
# human-readable kind from _SECRET_RULES.
_SECRET_REMINDER = (
    "⚠️ Security Warning: a hardcoded credential ({kind}) appears in "
    "this content. Never commit live secrets to source. Move it to an "
    "environment variable or a secrets manager, and rotate the credential if it "
    "was ever real. If this is a placeholder/example, document that inline."
)

# Reminder for the entropy layer (no specific credential kind is known).
_ENTROPY_REMINDER = (
    "⚠️ Security Warning: a high-entropy value is assigned to a "
    "secret-named variable — this looks like a hardcoded credential. Move it to "
    "an environment variable or secrets manager and rotate it if real. If it is "
    "not a secret, rename the variable or document why it is safe."
)

# (ruleName, human-readable kind, compiled regex). Most-specific first.
# NOTE: the assignment-style rules (aws_secret_access_key,
# generic_secret_assignment) only match values wrapped in quotes, so an
# unquoted ``KEY=value`` line is not covered by them.
_SECRET_RULES: List[Tuple[str, str, "re.Pattern[str]"]] = [
    # PGP armor headers end in "PRIVATE KEY BLOCK-----", unlike PEM headers,
    # hence the optional " BLOCK" suffix.
    ("private_key_pem", "PEM private key",
     re.compile(
         r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |PGP |ENCRYPTED )?"
         r"PRIVATE KEY(?: BLOCK)?-----"
     )),
    ("aws_access_key_id", "AWS access key id",
     re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b")),
    ("aws_secret_access_key", "AWS secret access key",
     re.compile(r"(?i)aws_secret_access_key\s*[=:]\s*[\"'][A-Za-z0-9/+]{40}[\"']")),
    ("github_token", "GitHub token",
     re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,}\b")),
    ("github_pat_finegrained", "GitHub fine-grained PAT",
     re.compile(r"\bgithub_pat_[A-Za-z0-9_]{22,}\b")),
    ("slack_token", "Slack token",
     re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
    ("slack_webhook", "Slack webhook URL",
     re.compile(r"https://hooks\.slack\.com/services/T[A-Za-z0-9_/]+")),
    ("google_api_key", "Google API key",
     re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b")),
    # Live keys only; sk_test_ is low-risk by design.
    ("stripe_secret_key", "Stripe secret key",
     re.compile(r"\b(?:sk|rk)_live_[0-9a-zA-Z]{24,}\b")),
    ("npm_token", "npm token",
     re.compile(r"\bnpm_[A-Za-z0-9]{36}\b")),
    ("jwt_token", "JSON Web Token",
     re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b")),
    ("generic_secret_assignment", "hardcoded API key / token",
     re.compile(
         r"(?i)\b(?:api[_-]?key|client[_-]?secret|access[_-]?token|auth[_-]?token|"
         r"secret[_-]?key)\b\s*[=:]\s*[\"'][A-Za-z0-9_\-]{16,}[\"']"
     )),
]

# Entropy layer: a high-entropy value assigned to a secret-named key.
# Group 1 is the key name, group 2 the quoted value (20+ chars, no
# whitespace or quotes).
_SECRET_ASSIGN_RE = re.compile(
    r"(?i)\b([A-Za-z0-9_]*(?:secret|token|passwd|password|api[_-]?key|"
    r"access[_-]?key|client[_-]?secret|private[_-]?key|credential)[A-Za-z0-9_]*)"
    r"\s*[=:]\s*[\"']([^\"'\s]{20,})[\"']"
)
# bits/char; long random base64 approaches 6, English prose ~4.0-4.2.
# The per-character entropy of an n-character string cannot exceed log2(n),
# i.e. ~4.32 for the 20-character minimum matched above, so short random
# values sit close to this threshold.
_ENTROPY_THRESHOLD = 4.0
106+
107+
108+
def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in *s*. An empty string yields ``0.0``.
    """
    if not s:
        return 0.0
    total = len(s)
    frequency: Dict[str, int] = {}
    for symbol in s:
        if symbol in frequency:
            frequency[symbol] += 1
        else:
            frequency[symbol] = 1
    # Terms are non-positive; the sum is negated once at the end.
    weighted_logs = [
        (occurrences / total) * math.log2(occurrences / total)
        for occurrences in frequency.values()
    ]
    return -sum(weighted_logs)
117+
118+
119+
def _is_placeholder(value: str) -> bool:
    """Return True when *value* contains any pattern from ``_PLACEHOLDER_RE``."""
    placeholder_match = _PLACEHOLDER_RE.search(value)
    return placeholder_match is not None
121+
122+
123+
def _too_big(content: str) -> bool:
    """Return True when *content* encodes to more than ``_MAX_SCAN_BYTES`` of UTF-8.

    Characters that cannot be encoded are ignored when measuring the size.
    """
    encoded_size = len(content.encode("utf-8", errors="ignore"))
    return encoded_size > _MAX_SCAN_BYTES
125+
126+
127+
def scan_secrets(path: str, content: str) -> List[Tuple[str, str]]:
    """Return ``[(ruleName, reminder), ...]`` for credentials found in *content*.

    Each rule fires at most once. Obvious placeholders/example values are
    excluded to keep the false-positive rate low, but an excluded match never
    hides a later, non-excluded match of the same rule. The entropy check runs
    only when no known-format rule fired.

    Args:
        path: Accepted for symmetry with the regex scanner and currently
            unused; secrets are scanned in any file type.
        content: Text to scan. Empty content, or content larger than
            ``_MAX_SCAN_BYTES`` once UTF-8 encoded, is skipped entirely.

    Returns:
        A list of ``(ruleName, reminder)`` tuples in ``_SECRET_RULES`` order,
        or a single ``("high_entropy_secret", ...)`` entry from the entropy
        backstop, or an empty list.
    """
    if not content or _too_big(content):
        return []

    hits: List[Tuple[str, str]] = []
    for rule_name, kind, rx in _SECRET_RULES:
        # Fixed-prefix rules are high-precision — only suppress documented
        # EXAMPLE-style tokens. The assignment-style rule's value can legitimately
        # be a structural placeholder ("your-key-here"), so it gets the broad set.
        excl = _PLACEHOLDER_RE if rule_name == "generic_secret_assignment" else _EXAMPLE_RE
        # Examine every match, not just the first: a documented example token
        # early in the content must not mask a real credential of the same
        # format further down. any() stops at the first non-excluded match.
        has_real_match = any(
            not excl.search(m.group(0)) for m in rx.finditer(content)
        )
        if has_real_match:
            hits.append((rule_name, _SECRET_REMINDER.format(kind=kind)))

    # Entropy backstop — only when no known-format secret already fired, so a
    # single hardcoded secret never produces two near-duplicate warnings.
    if not hits:
        for m in _SECRET_ASSIGN_RE.finditer(content):
            value = m.group(2)
            if _is_placeholder(value):
                continue
            if shannon_entropy(value) >= _ENTROPY_THRESHOLD:
                hits.append(("high_entropy_secret", _ENTROPY_REMINDER))
                break  # one entropy warning per scan is enough
    return hits
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
"""Tests for secret detection in the security-guidance plugin (#398).
2+
3+
Covers ``plugins/security-guidance/secrets.py``:
4+
* regex detection of well-known credential formats (AWS, GitHub, Slack,
5+
Google, Stripe, npm, PEM private key, JWT, generic assignment),
6+
* the conservative Shannon-entropy backstop,
7+
* false-positive sanity (benign code + placeholder/example values),
8+
* end-to-end wiring through the plugin's warn-mode hook.
9+
10+
Token-shaped fixtures are ASSEMBLED FROM PARTS at runtime so neither the
11+
repo's secret scanners (GitGuardian on the PR) nor the I/O redactor sees a
12+
contiguous credential in this file. The detector runs on the concatenated
13+
runtime value, so detection still exercises the real regexes.
14+
"""
15+
16+
import importlib.util
17+
import sys
18+
import types
19+
from pathlib import Path
20+
21+
import pytest
22+
23+
24+
def _repo_root() -> Path:
    """Return the directory two levels above the folder containing this file."""
    this_file = Path(__file__).resolve()
    return this_file.parents[2]
26+
27+
28+
def _load_secrets():
    """Load ``secrets.py`` straight from its file path, without the plugin package.

    The module is executed under a private name and is not registered in
    ``sys.modules``, so every call returns a fresh module object.
    """
    secrets_path = _repo_root() / "plugins" / "security-guidance" / "secrets.py"
    module_spec = importlib.util.spec_from_file_location(
        "security_guidance_secrets_under_test", secrets_path
    )
    secrets_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(secrets_module)
    return secrets_module
37+
38+
39+
def _load_plugin_init():
    """Load the plugin's ``__init__.py`` as package ``hermes_plugins.security_guidance``.

    The module is given a package name and search path so that its relative
    imports (``from . import patterns`` / ``from . import secrets``) resolve to
    the sibling files in the plugin directory.
    """
    parent_name = "hermes_plugins"
    package_name = "hermes_plugins.security_guidance"
    plugin_dir = _repo_root() / "plugins" / "security-guidance"

    # Register an empty stub for the parent package the first time through.
    if parent_name not in sys.modules:
        parent_pkg = types.ModuleType(parent_name)
        parent_pkg.__path__ = []
        sys.modules[parent_name] = parent_pkg

    package_spec = importlib.util.spec_from_file_location(
        package_name,
        plugin_dir / "__init__.py",
        submodule_search_locations=[str(plugin_dir)],
    )
    plugin = importlib.util.module_from_spec(package_spec)
    plugin.__package__ = package_name
    plugin.__path__ = [str(plugin_dir)]
    # The package must be in sys.modules before its body runs so the relative
    # imports inside it can find their parent.
    sys.modules[package_name] = plugin
    package_spec.loader.exec_module(plugin)
    return plugin
57+
58+
59+
# Assembled fake credentials (split so secret scanners don't match the file).
60+
_AWS_KEY = "AKIA" + "QKZ7X2MNOP3RTUV9" # AKIA + 16 upper/digits
61+
_GH_TOKEN = "ghp" + "_" + ("b" * 36) # gh?_ + 36 alnum
62+
_SLACK = "xoxb" + "-" + "123456789012" + "-" + "abcdefghijkl"
63+
_GOOGLE = "AIza" + "Sy" + ("C" * 33) # AIza + 35
64+
_STRIPE = "sk" + "_live_" + ("9" * 24)
65+
_NPM = "npm" + "_" + ("a" * 36)
66+
_PEM = "-----BEGIN " + "RSA PRIVATE KEY-----"
67+
_JWT = "eyJ" + ("hbGciOiJIUzI1NiJ9") + "." + "eyJ" + ("zdWIiOiIxMjM0NTY3ODkwIn0") + "." + ("SflKxwRJ_signature_part")
68+
_HIGH_ENTROPY = "kJ8x2Qm9Zp4Lw7Nv1Rb6Tc3Yd5Fg0Hh" # 32 mixed chars
69+
70+
71+
class TestRegexSecretDetection:
    """Regex layer: each known credential format is reported under its rule name."""

    def setup_method(self):
        # Load the module under test fresh for every test.
        self.s = _load_secrets()

    def _names(self, content):
        """Return the set of rule names that ``scan_secrets`` reports for *content*."""
        findings = self.s.scan_secrets("f.py", content)
        return {rule_name for rule_name, _reminder in findings}

    def test_aws_access_key_detected(self):
        reported = self._names(f'key = "{_AWS_KEY}"\n')
        assert "aws_access_key_id" in reported

    def test_pem_private_key_detected(self):
        reported = self._names(_PEM + "\nMIIE...\n")
        assert "private_key_pem" in reported

    def test_slack_token_detected(self):
        reported = self._names(f'tok = "{_SLACK}"\n')
        assert "slack_token" in reported

    def test_github_token_detected(self):
        reported = self._names(f'gh = "{_GH_TOKEN}"\n')
        assert "github_token" in reported

    def test_google_api_key_detected(self):
        reported = self._names(f'g = "{_GOOGLE}"\n')
        assert "google_api_key" in reported

    def test_stripe_key_detected(self):
        reported = self._names(f'sk = "{_STRIPE}"\n')
        assert "stripe_secret_key" in reported

    def test_npm_token_detected(self):
        reported = self._names(f'n = "{_NPM}"\n')
        assert "npm_token" in reported

    def test_jwt_detected(self):
        reported = self._names(f'jwt = "{_JWT}"\n')
        assert "jwt_token" in reported

    def test_generic_api_key_assignment_detected(self):
        source = 'api_key = "' + ("Z" * 24) + '"\n'
        assert "generic_secret_assignment" in self._names(source)

    def test_prefix_key_with_filler_substring_still_detected(self):
        # A fixed-prefix key that happens to contain "00000000" must still be
        # reported: prefix rules are only exempted for EXAMPLE-style words, so
        # a filler-looking substring never silently drops a real secret.
        token = "ghp" + "_" + ("0" * 8) + ("c" * 28)  # 36 chars after ghp_
        assert "github_token" in self._names(f'gh = "{token}"\n')

    def test_each_rule_fires_once(self):
        # The same key twice in one file yields a single finding for the rule.
        source = f'a = "{_AWS_KEY}"\nb = "{_AWS_KEY}"\n'
        rule_names = [name for name, _ in self.s.scan_secrets("f.py", source)]
        assert rule_names.count("aws_access_key_id") == 1
117+
118+
119+
class TestEntropyBackstop:
    """Entropy layer: high-entropy values assigned to secret-named keys."""

    def setup_method(self):
        # Load the module under test fresh for every test.
        self.s = _load_secrets()

    def test_high_entropy_secret_assignment_flagged(self):
        # 'db_credential' matches the entropy keyword set but none of the
        # known-format rules, so only the entropy backstop can report it.
        source = f'db_credential = "{_HIGH_ENTROPY}"\n'
        reported = {name for name, _ in self.s.scan_secrets("f.py", source)}
        assert "high_entropy_secret" in reported

    def test_low_entropy_secret_named_value_not_flagged(self):
        # Long enough to be examined, but repetitive and therefore low-entropy.
        source = 'password = "aaaaaaaaaaaaaaaaaaaaaaaa"\n'
        reported = {name for name, _ in self.s.scan_secrets("f.py", source)}
        assert "high_entropy_secret" not in reported

    def test_shannon_entropy_sanity(self):
        entropy = self.s.shannon_entropy
        assert entropy("") == 0.0
        assert entropy("aaaaaaaa") < 1.0
        assert entropy(_HIGH_ENTROPY) > 4.0

    def test_entropy_skipped_when_known_secret_already_found(self):
        # The AWS rule fires, so the entropy backstop stays quiet (no
        # near-duplicate warning for the same value).
        source = f'secret = "{_AWS_KEY}"\n'
        reported = {name for name, _ in self.s.scan_secrets("f.py", source)}
        assert "high_entropy_secret" not in reported
143+
144+
145+
class TestFalsePositiveSanity:
    """Inputs that must produce no findings at all."""

    def setup_method(self):
        # Load the module under test fresh for every test.
        self.s = _load_secrets()

    def test_benign_code_no_findings(self):
        source = "def add(a, b):\n return a + b\n\nAPI_TIMEOUT = 30\n"
        findings = self.s.scan_secrets("f.py", source)
        assert findings == []

    def test_placeholder_api_key_not_flagged(self):
        findings = self.s.scan_secrets("f.py", 'api_key = "your-api-key-here"\n')
        assert findings == []

    def test_example_value_not_flagged(self):
        source = 'token = "EXAMPLE_TOKEN_VALUE_1234567890"\n'
        findings = self.s.scan_secrets("f.py", source)
        assert findings == []

    def test_empty_content_no_findings(self):
        findings = self.s.scan_secrets("f.py", "")
        assert findings == []

    def test_huge_content_skipped(self):
        # 6 bytes per repetition * 60000 = 360000 bytes, above the 256 KB cap.
        oversized = "x = 1\n" * 60000
        findings = self.s.scan_secrets("f.py", oversized)
        assert findings == []
165+
166+
167+
class TestHookIntegration:
    """End-to-end: secret findings reach the plugin's warn and block hooks."""

    def test_write_file_with_aws_key_warns(self, monkeypatch):
        # Neither block mode nor the disable switch is set for this test.
        for env_name in ("SECURITY_GUIDANCE_BLOCK", "SECURITY_GUIDANCE_DISABLE"):
            monkeypatch.delenv(env_name, raising=False)
        plugin = _load_plugin_init()
        call_args = {"path": "/tmp/config.py", "content": f'AWS = "{_AWS_KEY}"\n'}
        transformed = plugin._on_transform_tool_result(
            tool_name="write_file",
            args=call_args,
            result='{"success": true, "bytes_written": 40}',
        )
        assert isinstance(transformed, str)
        assert "Security guidance" in transformed
        assert "credential" in transformed.lower()

    def test_clean_write_no_warning(self, monkeypatch):
        for env_name in ("SECURITY_GUIDANCE_BLOCK", "SECURITY_GUIDANCE_DISABLE"):
            monkeypatch.delenv(env_name, raising=False)
        plugin = _load_plugin_init()
        call_args = {"path": "/tmp/ok.py", "content": "x = 1\n"}
        transformed = plugin._on_transform_tool_result(
            tool_name="write_file", args=call_args, result='{"success": true}'
        )
        assert transformed is None

    def test_block_mode_refuses_write_with_secret(self, monkeypatch):
        # Block mode on, disable switch cleared.
        monkeypatch.setenv("SECURITY_GUIDANCE_BLOCK", "1")
        monkeypatch.delenv("SECURITY_GUIDANCE_DISABLE", raising=False)
        plugin = _load_plugin_init()
        call_args = {"path": "/tmp/config.py", "content": f'GH = "{_GH_TOKEN}"\n'}
        decision = plugin._on_pre_tool_call(tool_name="write_file", args=call_args)
        assert isinstance(decision, dict)
        assert decision.get("action") == "block"

0 commit comments

Comments
 (0)