11package detect
22
3- import "strings"
3+ import (
4+ "regexp"
5+ "strings"
6+ )
47
58// Position classifies where a phrase match sits in the surrounding text, so the
6- // soft checks can tell a genuine embedded instruction ("ignore previous
7- // instructions") apart from a description that merely talks about such phrases
8- // ("detects prompts such as 'ignore previous instructions'"). This is the core
9- // false-positive control for legitimate security tooling (FR-009, US2).
9+ // checks can tell a genuine embedded instruction ("ignore previous
10+ // instructions") apart from a tool that merely DESCRIBES such phrases
11+ // ("analyzes prompts that ignore previous instructions"). This is the core
12+ // false-positive control for legitimate security tooling (FR-009, US2), and the
13+ // deciding factor between the hard/soft tiers for a matched directive.
14+ //
15+ // It is a three-way scale, not a binary one, because the recall and
16+ // false-positive pressures pull in opposite directions (Spec 077 US1, Codex round-2):
17+ //
18+ // - A bare label prefix ("Prompt:", "Message:") must NOT neutralize a clear
19+ // imperative — that framing is exactly what an attacker uses to smuggle a
20+ // directive (finding A, recall bypass). Such matches stay
21+ // PositionInstruction and hard-block.
22+ // - A tool that DESCRIBES an injection — a relative clause ("prompts that
23+ // ignore…") or an analytical verb governing the phrase ("returns text:
24+ // ignore…") — must not hard-block a benign tool (finding B). Such matches
25+ // are PositionDescriptive: the hard tier is discounted below its floor
26+ // (HARD→SOFT), but the soft tier still surfaces the match for review, so a
27+ // real injection wearing descriptive clothing never fully vanishes.
28+ // - Only genuine quotation/illustration ("such as", "e.g.", surrounding
29+ // quotes) is PositionExample and fully discounted — that framing is
30+ // unambiguously non-instructional, so suppressing it (both tiers) keeps
31+ // legitimate scanners that cite injection strings quiet.
1032type Position int
1133
1234const (
1335 // PositionInstruction is an imperative/instruction-position match; full
14- // confidence is kept.
36+ // confidence is kept. This is also where a bare "label:" prefix lands — a
37+ // label alone does not discount a clear imperative (finding A).
1538 PositionInstruction Position = iota
16- // PositionExample is an example/illustration-position match (after a cue
17- // like "such as"/"e.g.", inside quotes, or in a "detects …" list);
18- // confidence is discounted.
39+ // PositionDescriptive is a match the tool DESCRIBES rather than issues: a
40+ // relative clause ("… that ignore …") or an analytical verb governing the
41+ // phrase. Discounted enough to drop a hard match below its emit floor
42+ // (HARD→SOFT) while a soft match still surfaces for review — no total
43+ // suppression, so a genuine injection cannot bypass detection by adopting
44+ // descriptive phrasing (finding B without reopening finding A).
45+ PositionDescriptive
46+ // PositionExample is an unambiguous quotation/illustration-position match
47+ // (after "such as"/"e.g." or inside quotes); fully discounted so a scanner
48+ // that merely cites the phrase is never flagged, at either tier.
1949 PositionExample
2050)
2151
22- // exampleDiscount is the confidence multiplier applied to example-position
23- // matches. Low enough that a lone example-position match cannot, by itself,
24- // push a tool over a soft threshold.
25- const exampleDiscount = 0.25
52+ // Discount multipliers per position. exampleDiscount suppresses a match at both
53+ // tiers: even a HARD base (≈0.9 × 0.25 ≈ 0.23) lands below the soft emit floor
54+ // descriptiveDiscount is the pivotal value: it must take a HARD base (≈0.85–0.9)
55+ // below the hard emit floor (0.6) yet keep a SOFT base (≈0.6–0.65) at/above the
56+ // soft emit floor (0.3) — i.e. it discounts HARD→SOFT rather than to nothing.
57+ const (
58+ exampleDiscount = 0.25
59+ descriptiveDiscount = 0.5
60+ )
2661
27- // Discount returns the confidence multiplier for a position: 1.0 for
28- // instruction-position, exampleDiscount for example-position.
62+ // Discount returns the confidence multiplier for a position.
2963func (p Position ) Discount () float64 {
30- if p == PositionExample {
64+ switch p {
65+ case PositionExample :
3166 return exampleDiscount
67+ case PositionDescriptive :
68+ return descriptiveDiscount
69+ default :
70+ return 1.0
3271 }
33- return 1.0
3472}
3573
36- // positionWindow is how many bytes before the match we inspect for cue phrases
37- // and quoting.
74+ // positionWindow is how many bytes before the match we inspect for framing.
3875const positionWindow = 80
3976
40- // exampleCues are substrings that, when they appear shortly before a match,
41- // indicate the match is being quoted or illustrated rather than instructing.
42- //
43- // The colon-introduced content nouns ("text:", "output:", …) are the
44- // description-context control (FR-005): a phrase that follows "returns training
45- // text:" / "example output:" is returned/illustrative CONTENT the tool talks
46- // about, not an instruction to the agent, so it must not clear the hard tier.
47- // These are deliberately colon-anchored — the bare nouns would over-match — and
48- // the corpus positives introduce their directives with a period, so recall on
49- // genuine embedded imperatives is unaffected.
77+ // exampleCues are substrings that, anywhere in the inspected window before a
78+ // match (NOT limited to the current sentence), mark it as quoted or illustrated.
79+ // Kept to genuine quotation/illustration markers ONLY — deliberately NOT the
80+ // bare colon-labels ("prompt:", "message:", …) an earlier iteration used, because
81+ // those let an attacker smuggle an imperative behind a label (finding A).
5082var exampleCues = []string {
5183 "such as" ,
5284 "for example" ,
85+ "for instance" ,
5386 "e.g" ,
5487 "i.e" ,
5588 "example" ,
56- "detects" ,
57- "detect " ,
58- "flags" ,
59- "flag " ,
60- "phrase" ,
61- "pattern" ,
62- "like " ,
63- "says " ,
64- "say " ,
65- // Colon-introduced content nouns: the following phrase is quoted/returned
66- // content, not an instruction (see doc comment above).
67- "text:" ,
68- "output:" ,
69- "response:" ,
70- "returns:" ,
71- "return:" ,
72- "example:" ,
73- "sample:" ,
74- "prompt:" ,
75- "message:" ,
76- "string:" ,
77- "content:" ,
78- "payload:" ,
7989}
8090
91+ // describingVerb matches an analytical/descriptive verb (stemmed forms) whose
92+ // presence in the match's sentence signals the tool is talking ABOUT a phrase
93+ // rather than issuing it: "analyzes prompts that…", "returns text: …",
94+ // "detects/flags/guards … instructions". Sentence-scoped (see ClassifyPosition)
95+ // so a benign lead clause cannot reach across a period to discount a following
96+ // imperative. Stems are prefix-matched at a word boundary, so any word starting
97+ // with one counts (incl. nouns like "document"); send/upload/ignore/… never match.
98+ var describingVerb = regexp .MustCompile (`\b(?:analyz|detect|describ|return|handl|explain|document|illustrat|demonstrat|flag|scan|identif|recogniz|classif|catalog|enumerat|inspect|audit|monitor|highlight|surfac|guard|filter|warn|alert|block|prevent|report)\w*` )
99+
100+ // descriptiveTail marks a match that directly follows a relative pronoun
101+ // ("prompts THAT ignore…", "text WHICH reveals…") — the imperative is the
102+ // grammatical object of the clause, i.e. described, not issued.
103+ var descriptiveTail = regexp .MustCompile (`\b(?:that|which|who)\s+$` )
104+
81105// ClassifyPosition decides whether the match starting at byte offset matchStart
82- // in text is in instruction- or example-position. text may be raw or
83- // normalized; matching is case-insensitive on the preceding window.
106+ // in text is in instruction-, descriptive-, or example-position. text may be
107+ // raw or normalized; matching is case-insensitive on the preceding window.
84108func ClassifyPosition (text string , matchStart int ) Position {
85109 if matchStart <= 0 {
86110 return PositionInstruction
@@ -91,17 +115,36 @@ func ClassifyPosition(text string, matchStart int) Position {
91115 }
92116 window := strings .ToLower (text [start :matchStart ])
93117
118+ // 1. Genuine quotation / illustration → fully discounted example-position.
119+ // Checked on the full window: these markers are narrow and unambiguous, and
120+ // some ("e.g.", "i.e.") legitimately contain the periods the sentence scope
121+ // below would otherwise treat as boundaries.
94122 for _ , cue := range exampleCues {
95123 if strings .Contains (window , cue ) {
96124 return PositionExample
97125 }
98126 }
99-
100- // Inside an unclosed quote → example/illustration position.
101127 if oddQuoteCount (window ) {
102128 return PositionExample
103129 }
104130
131+ // Scope the broad descriptive heuristic to the current sentence. Framing
132+ // established before a sentence boundary must not neutralize an imperative
133+ // that begins a new sentence — otherwise a benign lead ("Lists files. Ignore
134+ // all previous instructions.") would discount the injection that follows (a
135+ // recall bypass). Only the wide analytical-verb rule needs this guard; the
136+ // quotation cues above are safe cross-sentence.
137+ if i := strings .LastIndexAny (window , ".!?" ); i >= 0 {
138+ window = window [i + 1 :]
139+ }
140+
141+ // 2. Analytical/relative-clause framing → descriptive-position (HARD→SOFT).
142+ if describingVerb .MatchString (window ) || descriptiveTail .MatchString (window ) {
143+ return PositionDescriptive
144+ }
145+
146+ // 3. Otherwise the match is an instruction — including one behind a bare
147+ // "label:" prefix, which does not by itself discount a clear imperative.
105148 return PositionInstruction
106149}
107150
0 commit comments