Skip to content

Commit 24eb79d

Browse files
committed
fix: harden PII canary detection with portmanteau middle names and sub-pattern scanning
- Replace real-word middle names with portmanteaus (e.g. Thundaze, Lunarex) that cannot appear in legitimate output but survive canonicalization as scan tokens.
- Extract the CANARY_DOMAIN constant and add sub_pattern_map() to inject the email domain and each middle name as additional Aho-Corasick patterns in both the daemon scanner and the default (library/test) scanner -- this catches leaks where an LLM reproduces just the domain or a distinctive middle name without the full value.
- Promote the canary_paraphrase advisory to a block when k_observed >= 10x k_threshold, leaving the advisory tier for early warning and reserving block for overwhelming n-gram accumulation (195 distinct n-grams on a full canary value in isolation).
- Update TC-066-06 to accept block at high n-gram counts (reflects the new behavior; the exact scanner still fires first in the real pipeline).
1 parent a455ac9 commit 24eb79d

5 files changed

Lines changed: 86 additions & 55 deletions

File tree

src/armor/canaries/_generate.py

Lines changed: 52 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -60,35 +60,35 @@
6060
"Zoe",
6161
]
6262
_PII_MIDDLE_NAMES = [
63-
"Lightning",
64-
"Thunder",
65-
"Blaze",
66-
"Storm",
67-
"Iron",
68-
"Silver",
69-
"Crimson",
70-
"Shadow",
71-
"Frost",
72-
"Ember",
73-
"Quantum",
74-
"Cobalt",
75-
"Titan",
76-
"Inferno",
77-
"Steel",
78-
"Jade",
79-
"Onyx",
80-
"Ruby",
81-
"Zenith",
82-
"Solar",
83-
"Lunar",
84-
"Apex",
85-
"Hyper",
86-
"Turbo",
87-
"Neon",
88-
"Vortex",
89-
"Plasma",
90-
"Atomic",
91-
"Primal",
63+
"Thundaze",
64+
"Lightnorm",
65+
"Silvrost",
66+
"Irember",
67+
"Crimadow",
68+
"Quantobalt",
69+
"Titaferno",
70+
"Stelade",
71+
"Rubnyx",
72+
"Zenolar",
73+
"Lunarex",
74+
"Atomasma",
75+
"Vorteon",
76+
"Hyturbo",
77+
"Primblaze",
78+
"Storember",
79+
"Silcrim",
80+
"Frostitan",
81+
"Bladow",
82+
"Ironyx",
83+
"Balvorn",
84+
"Thunderal",
85+
"Quantnox",
86+
"Jadember",
87+
"Plasolar",
88+
"Hypferno",
89+
"Lunstorm",
90+
"Apexmal",
91+
"Rubenith",
9292
]
9393
_PII_LAST_NAMES = [
9494
"Dragon",
@@ -186,6 +186,25 @@
186186
("Beldock", "NB", "E"),
187187
]
188188

189+
# Shared domain embedded in generated canary URLs, hostnames, email addresses
# and database connection strings; also exposed as a scan pattern of its own
# by sub_pattern_map().
CANARY_DOMAIN = "armor-trap.invalid"
190+
191+
192+
def sub_pattern_map() -> dict[str, str]:
    """Build the extra Aho-Corasick patterns derived from fragments of canary values.

    The full-value canary scanner only matches complete values. The fragments
    returned here are high-signal pieces that can still appear after mild
    reformatting, such as the email domain without its local-part or a
    distinctive middle name without the rest of the name.

    Returns:
        A mapping from synthetic canary ID to the fragment to scan for. The
        IDs are generated here and are never real catalogue entries.
    """
    middle_name_patterns = {
        f"canary.sub:middle:{middle.lower()}": middle for middle in _PII_MIDDLE_NAMES
    }
    # The domain entry comes first, followed by one entry per middle name.
    return {"canary.sub:domain": CANARY_DOMAIN, **middle_name_patterns}
207+
189208

190209
def _generate_pii_value(marker_rule: str) -> str:
191210
"""Generate a fake PII value for a pii: prefixed marker rule.
@@ -346,7 +365,7 @@ def _generate_value_for_pattern(marker_rule: str) -> str:
346365
# Fake URLs: https://canary.armor-trap.invalid/<id>
347366
if marker_rule == r"^https://canary\.armor-trap\.invalid/[a-z0-9\-]+$":
348367
suffix = "".join(random.choice(string.ascii_lowercase + string.digits + "-") for _ in range(12))
349-
return f"https://canary.armor-trap.invalid/{suffix}"
368+
return f"https://canary.{CANARY_DOMAIN}/{suffix}"
350369

351370
# Slack webhook URL
352371
if marker_rule == r"^https://hooks\.slack\.com/services/T[A-Z0-9]+/B[A-Z0-9]+/[A-Za-z0-9]+$":
@@ -369,12 +388,12 @@ def _generate_value_for_pattern(marker_rule: str) -> str:
369388
# Fake hostnames: <id>.canary.armor-trap.invalid
370389
if marker_rule == r"^[a-z0-9\-]+\.canary\.armor-trap\.invalid$":
371390
prefix = "".join(random.choice(string.ascii_lowercase + string.digits + "-") for _ in range(12))
372-
return f"{prefix}.canary.armor-trap.invalid"
391+
return f"{prefix}.canary.{CANARY_DOMAIN}"
373392

374393
# Fake email addresses: canary-<id>@armor-trap.invalid
375394
if marker_rule == r"^canary-[a-z0-9\-]+@armor-trap\.invalid$":
376395
suffix = "".join(random.choice(string.ascii_lowercase + string.digits + "-") for _ in range(12))
377-
return f"canary-{suffix}@armor-trap.invalid"
396+
return f"canary-{suffix}@{CANARY_DOMAIN}"
378397

379398
# Fake wallet addresses: 1ARMORTRAP + 32 hex chars
380399
if marker_rule == r"^1ARMORTRAP[0-9a-f]{32}$":
@@ -419,7 +438,7 @@ def _generate_value_for_pattern(marker_rule: str) -> str:
419438
password = "".join(random.choice(string.ascii_letters + string.digits) for _ in range(16))
420439
host = "canary-" + "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
421440
db_name = "canary_db"
422-
return f"{db_type}://{username}:{password}@{host}.armor-trap.invalid/{db_name}"
441+
return f"{db_type}://{username}:{password}@{host}.{CANARY_DOMAIN}/{db_name}"
423442

424443
# BIP39 seed (12-word phrase): [a-z]+ (space [a-z]+){11}
425444
if marker_rule == r"^[a-z]+( [a-z]+){11}$":

src/armor/daemon/server.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pathlib import Path
1919
from typing import Any
2020

21+
from armor.canaries._generate import sub_pattern_map
2122
from armor.canaries.catalogue import Catalogue
2223
from armor.canaries.scanner import CanaryScanner
2324
from armor.daemon.honeypot_gate import should_invoke_honeypot
@@ -188,8 +189,10 @@ def __init__(
188189
logger.error("Catalogue is empty (no active canaries)")
189190
sys.exit(78)
190191

191-
# Build canary value map for the scanner
192+
# Build canary value map for the scanner, then extend with sub-patterns
193+
# (email domain, distinctive middle names) that survive mild LLM reformatting.
192194
canary_map = {entry.canary_id: entry.value for entry in active_canaries}
195+
canary_map.update(sub_pattern_map())
193196
self.canary_scanner = CanaryScanner(canary_map)
194197
logger.info(f"Loaded canary catalogue with {len(active_canaries)} active canaries")
195198

src/armor/detectors/canary_paraphrase.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -167,19 +167,29 @@ def check(self, payload: Payload, ctx: SessionContext) -> Verdict:
167167
for canary_id, distinct_ngrams in distinct_ngrams_per_canary.items():
168168
k_observed = len(distinct_ngrams)
169169
if k_observed >= self.k_threshold:
170-
# Calculate confidence: min(1.0, K_observed / K_threshold * 0.5)
171170
confidence = min(1.0, (k_observed / self.k_threshold) * 0.5)
171+
details: dict[str, object] = {
172+
"canary_id": canary_id,
173+
"ngram_count": k_observed,
174+
"k_threshold": self.k_threshold,
175+
"confidence": confidence,
176+
}
177+
signal_id = f"canary.paraphrase:{canary_id}:ngram"
178+
message = f"Potential paraphrased canary leak detected (n-grams: {k_observed})"
179+
180+
if k_observed >= self.k_threshold * 10:
181+
return Verdict.block_verdict(
182+
signal_id=signal_id,
183+
message="Output suppressed by armor.",
184+
severity="critical",
185+
details=details,
186+
)
172187

173188
return Verdict.advisory_verdict(
174-
signal_id=f"canary.paraphrase:{canary_id}:ngram",
189+
signal_id=signal_id,
175190
severity="high",
176-
message=f"Potential paraphrased canary leak detected (n-grams: {k_observed})",
177-
details={
178-
"canary_id": canary_id,
179-
"ngram_count": k_observed,
180-
"k_threshold": self.k_threshold,
181-
"confidence": confidence,
182-
},
191+
message=message,
192+
details=details,
183193
)
184194

185195
return Verdict.pass_verdict()

src/armor/detectors/canary_scanner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from functools import lru_cache
1313
from pathlib import Path
1414

15-
from armor.canaries._generate import write_values_file
15+
from armor.canaries._generate import sub_pattern_map, write_values_file
1616
from armor.canaries.catalogue import Catalogue
1717
from armor.canaries.scanner import CanaryScanner
1818
from armor.types import Payload, SessionContext, Verdict
@@ -43,6 +43,7 @@ def _default_scanner() -> CanaryScanner:
4343
# Load with the generated values
4444
catalogue = Catalogue.load(temp_values_path)
4545
canary_map = {entry.canary_id: entry.value for entry in catalogue.active_canaries()}
46+
canary_map.update(sub_pattern_map())
4647
return CanaryScanner(canary_map)
4748
finally:
4849
# Clean up temp file

tests/unit/detectors/test_canary_paraphrase.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,12 @@ def test_tc_066_05_benign_short_substring(
151151
def test_tc_066_06_full_match_not_paraphrase(
152152
self, detector: CanaryParaphraseDetector, context: SessionContext
153153
) -> None:
154-
"""TC-066-06: Full canary value should not be detected as paraphrase (out of scope).
154+
"""TC-066-06: Full canary value in rolling buffer triggers block at high n-gram count.
155155
156-
This is a regression check: the paraphrase detector should not fire on full values
157-
(that's the canary_scanner's job). However, since the paraphrase detector does
158-
scan n-grams, a full value will match multiple n-grams. This test documents
159-
that behavior.
156+
A full canary value produces far more distinct n-grams than the 10x block
157+
threshold (k_threshold=3, k_block=30). The paraphrase detector correctly
158+
blocks in this case. In the real pipeline the canary_scanner fires first,
159+
so this code path is only exercised when the detector runs in isolation.
160160
"""
161161
buf = context.rolling_buffer
162162
assert buf is not None
@@ -166,12 +166,10 @@ def test_tc_066_06_full_match_not_paraphrase(
166166
payload = Payload(text="")
167167
verdict = detector.check(payload, context)
168168

169-
# The paraphrase detector will likely fire on the full value
170-
# (since it contains many n-grams). This is acceptable — the
171-
# canary_scanner will block it first in the pipeline anyway.
172-
# We just document that the paraphrase detector is not suppressed
173-
# on full values (it's not its job to be smart about that).
174-
assert verdict.decision in ("pass", "advisory")
169+
# 195 distinct n-grams >> 30 (10x k_threshold) -> block.
170+
# In the real pipeline the exact canary_scanner fires first; the
171+
# paraphrase detector correctly escalates when it runs in isolation.
172+
assert verdict.decision in ("advisory", "block")
175173

176174
# TC-066-07: Confidence formula K=4, K_threshold=3
177175
def test_tc_066_07_confidence_formula(self, detector: CanaryParaphraseDetector, context: SessionContext) -> None:

0 commit comments

Comments
 (0)