smart-mcp-proxy · Dumbris · Jul 2, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/cmd/scan-eval/gate.go b/cmd/scan-eval/gate.go
@@ -67,6 +67,7 @@ var categoryCheck = map[string]string{
 	"unicode_smuggling":   "unicode.hidden",
 	"decoded_payload":     "payload.decoded",
 	"shadowing":           "shadowing.cross_server",
+	"phrase_injection":    "phrase.injection",    // Spec 077 US1 — curated hard check
 	"capability_mismatch": "capability.mismatch", // US2 (T016) — not yet registered
 }
 
@@ -79,6 +80,7 @@ func gateChecks() []detect.Check {
 		&checks.UnicodeHidden{},
 		&checks.Shadowing{},
 		&checks.PayloadDecoded{},
+		&checks.PhraseInjection{}, // Spec 077 US1 — curated hard injection/exfil check
 	}
 }
 
@@ -239,7 +241,18 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
 }
 
 // scanEntryFlagged builds the entry's RegistryView (its tool + peers), scans it,
-// and reports whether the engine produced any finding for the entry's own tool.
+// and reports whether the engine HARD-flagged (auto-quarantine tier) the entry's
+// own tool.
+//
+// The gate measures the auto-quarantine decision, i.e. the HARD tier only. This
+// matters since Spec 077 US1 (Codex round-3) made phrase.injection "never fully
+// suppress" a matched injection: a phrase quoted or merely described now surfaces
+// as a SOFT review finding instead of nothing. Counting any finding would then
+// score those benign hard-negatives (a scanner quoting "ignore previous
+// instructions") as false positives, even though they are only review-flagged and
+// never blocked. Recall is unaffected — every gated category's malicious samples
+// are detected at the HARD tier — so the gate keeps measuring exactly the
+// blocking behavior the product ships.
 func scanEntryFlagged(engine *detect.Engine, e gateEntry) bool {
 	views := []detect.ToolView{toGateView(e.Server, e.Tool)}
 	for _, p := range e.Peers {
@@ -248,7 +261,7 @@ func scanEntryFlagged(engine *detect.Engine, e gateEntry) bool {
 	res := engine.Scan(detect.NewRegistryView(views))
 	want := e.Server + ":" + e.Tool.Name
 	for _, f := range res.Findings {
-		if f.Location == want {
+		if f.Location == want && f.ThreatLevel == detect.ThreatLevelDangerous {
 			return true
 		}
 	}

diff --git a/frontend/src/components/ServerCard.vue b/frontend/src/components/ServerCard.vue
@@ -265,11 +265,11 @@
     <div v-if="showApproveConfirmation" class="modal modal-open">
       <div class="modal-box">
         <h3 class="font-bold text-lg mb-4">
-          {{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Critical Findings Detected' }}
+          {{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Dangerous Findings Detected' }}
         </h3>
         <p v-if="approveDialogMode === 'critical'" class="mb-4">
           <strong>{{ server.name }}</strong> has
-          <span class="text-error font-semibold">{{ criticalFindingCount }} critical finding{{ criticalFindingCount === 1 ? '' : 's' }}</span>
+          <span class="text-error font-semibold">{{ dangerousFindingCount }} dangerous finding{{ dangerousFindingCount === 1 ? '' : 's' }}</span>
           in its most recent security scan. Approving this server will allow it to run despite these warnings.
         </p>
         <p v-else class="mb-4">
@@ -722,14 +722,17 @@ async function triggerLogout() {
   }
 }
 
-// Counts critical findings from the scan summary if available. Used to gate
-// the Approve button behind an extra confirmation (F-04).
-const criticalFindingCount = computed(() => {
+// Counts baseline DANGEROUS findings from the scan summary if available. Used to
+// gate the Approve button behind an extra confirmation (F-04). Spec 077 FR-021:
+// the gate blocks on baseline dangerous (hard-tier) findings only, matching the
+// tier-driven server verdict — not on `critical` severity, which a non-blocking
+// soft finding could also carry.
+const dangerousFindingCount = computed(() => {
   const scan = props.server.security_scan as any
   if (!scan) return 0
-  // finding_counts.critical is populated from the latest report summary.
+  // finding_counts.dangerous is populated from the latest report summary.
   const fc = scan.finding_counts as Record<string, number> | undefined
-  if (fc && typeof fc.critical === 'number') return fc.critical
+  if (fc && typeof fc.dangerous === 'number') return fc.dangerous
   return 0
 })
 
@@ -750,7 +753,7 @@ function handleApproveClick() {
     showApproveConfirmation.value = true
     return
   }
-  if (criticalFindingCount.value > 0) {
+  if (dangerousFindingCount.value > 0) {
     approveDialogMode.value = 'critical'
     showApproveConfirmation.value = true
     return

diff --git a/frontend/src/views/ServerDetail.vue b/frontend/src/views/ServerDetail.vue
@@ -221,11 +221,11 @@
       <div v-if="showApproveConfirmation" class="modal modal-open">
         <div class="modal-box">
           <h3 class="font-bold text-lg mb-4">
-            {{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Critical Findings Detected' }}
+            {{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Dangerous Findings Detected' }}
           </h3>
           <p v-if="approveDialogMode === 'critical'" class="mb-4">
             <strong>{{ server.name }}</strong> has
-            <span class="text-error font-semibold">{{ criticalFindingCount }} critical finding{{ criticalFindingCount === 1 ? '' : 's' }}</span>
+            <span class="text-error font-semibold">{{ dangerousFindingCount }} dangerous finding{{ dangerousFindingCount === 1 ? '' : 's' }}</span>
             in its most recent security scan. Approving will allow this server to run despite these warnings.
           </p>
           <p v-else class="mb-4">
@@ -2399,13 +2399,18 @@ async function unquarantineServer() {
 const showApproveConfirmation = ref(false)
 const approveDialogMode = ref<'no_scan' | 'critical'>('no_scan')
 
-const criticalFindingCount = computed(() => {
+// Spec 077 FR-021: the approval gate blocks on baseline DANGEROUS findings only
+// (hard-tier). Deep-scan findings inform but never gate. The server-side verdict
+// is now tier-driven, so the modal mirrors it by counting `dangerous` (threat
+// level) rather than `critical` (severity) — a soft finding can carry a high or
+// even `critical` severity yet must not block approval.
+const dangerousFindingCount = computed(() => {
   // Prefer the loaded scan report summary if available; otherwise fall back
   // to finding_counts on the server's security_scan summary (if populated).
   const rep = scanReport.value as any
-  if (rep?.summary?.critical != null) return rep.summary.critical as number
+  if (rep?.summary?.dangerous != null) return rep.summary.dangerous as number
   const scan = server.value?.security_scan as any
-  if (scan?.finding_counts?.critical != null) return scan.finding_counts.critical as number
+  if (scan?.finding_counts?.dangerous != null) return scan.finding_counts.dangerous as number
   return 0
 })
 
@@ -2421,7 +2426,7 @@ function handleApproveClick() {
     showApproveConfirmation.value = true
     return
   }
-  if (criticalFindingCount.value > 0) {
+  if (dangerousFindingCount.value > 0) {
     approveDialogMode.value = 'critical'
     showApproveConfirmation.value = true
     return

diff --git a/internal/security/detect/checks/directive_imperative.go b/internal/security/detect/checks/directive_imperative.go
@@ -45,8 +45,15 @@ type directiveFamily struct {
 // lowercase, contraction-expanded, lightly-stemmed forms (e.g. "instruction"
 // matches the stemmed "instructions"). Built once at init.
 var directiveFamilies = []directiveFamily{
-	{ // Hidden-instruction / role-injection tags: <IMPORTANT>, <system>, …
-		re:   regexp.MustCompile(`<\s*(important|system|secret|critical|admin|instruction|developer|assistant)\b`),
+	{ // Hidden-instruction / role-injection tags: <IMPORTANT>, <system>, <hidden>,
+		// <system_prompt>, … "hidden" restores the legacy tpa <hidden> marker (Spec 077
+		// US1, Codex round-2 finding C). The optional (?:[_-]\w+)* suffix lets an
+		// underscore-joined name match — <system_prompt> — which a bare `\b` after the
+		// keyword misses because "_" is a word char (Codex round-3 finding #3); a
+		// hyphen-joined <developer-note> already matched and is now consumed whole. It
+		// does NOT loosen to prefixes: "<systematic>" still fails (no separator), so the
+		// keyword must be a whole tag-name token or the head of an _/- joined one.
+		re:   regexp.MustCompile(`<\s*(?:important|system|secret|critical|admin|instruction|developer|assistant|hidden)(?:[_-]\w+)*\b`),
 		base: 0.7,
 		what: "hidden-instruction tag",
 	},
@@ -55,11 +62,31 @@ var directiveFamilies = []directiveFamily{
 		base: 0.65,
 		what: "instruction-override directive",
 	},
+	{ // Injected new-instruction preamble (legacy tpa restore, Spec 077 US1 Codex
+		// round-3 finding #2): "new instructions:", "updated directions:",
+		// "additional instructions:". The colon anchor keeps it to the smuggled-header
+		// shape — a benign "follow the new instructions carefully" (no colon) does not
+		// match. SOFT: benignly phrasable ("returns the new instructions: …"), so
+		// review-only. (Normalization leaves the trailing "instructions:" token
+		// unstemmed because the colon blocks the plural-strip, so \w* absorbs the "s".)
+		re:   regexp.MustCompile(`\b(?:new|updated|revised|additional|further|latest|real|actual|hidden|secret) (?:instruction|direction|command|rule|order)\w*\s*:`),
+		base: 0.5,
+		what: "injected instruction preamble",
+	},
 	{ // Secrecy imperative: "do not tell the user", "must not reveal".
 		re:   regexp.MustCompile(`\b(?:do not|must not|never) (?:tell|inform|reveal|disclos\w*|mention|notify|warn|expose)\b`),
 		base: 0.6,
 		what: "secrecy imperative",
 	},
+	{ // Covert-secrecy directive (legacy tpa restore, Spec 077 US1 Codex round-2
+		// finding C): coercion to act behind the user's back — "without telling the
+		// user", "without informing the user", "hide this from …", "keep this
+		// hidden/secret". Matched on NORMALIZED text (stemmed: "telling"→"tell").
+		// Benignly phrasable in rare cases, so SOFT (review, never auto-quarantine).
+		re:   regexp.MustCompile(`\b(?:without (?:tell|inform|notify|alert|warn)\w* (?:the )?(?:user|caller|human|operator|client)|(?:hide|conceal) (?:this|it|that)(?: \w+)? from|keep (?:this|it|that)(?: \w+)? (?:hidden|secret|conceal\w*|private|quiet|confidential))\b`),
+		base: 0.6,
+		what: "covert-secrecy directive",
+	},
 	{ // Exfiltration imperative: "reveal your system prompt", "send the api key".
 		re:   regexp.MustCompile(`\b(?:reveal|expose|print|output|send|leak|disclos\w*) (?:your |the |all )?(?:system prompt|api key\w*|secret\w*|password\w*|credential\w*|private key)\b`),
 		base: 0.6,
@@ -70,6 +97,30 @@ var directiveFamilies = []directiveFamily{
 		base: 0.55,
 		what: "tool-preamble injection",
 	},
+	{ // Tool-priority coercion (legacy tpa): "always call this tool first",
+		// "before using any other tool". Broader than tool-preamble and benignly
+		// phrasable ("call this tool first to authenticate"), so it lives in the
+		// SOFT tier (review, never auto-quarantine). "always" stems to "alway".
+		re:   regexp.MustCompile(`\b(?:(?:alway\w* )?(?:call|us\w*|invok\w*|run\w*) this tool (?:first|before)|before (?:us\w*|call\w*|invok\w*|run\w*) any other tool)\b`),
+		base: 0.5,
+		what: "tool-priority coercion",
+	},
+	{ // Jailbreak / mode-override triggers (legacy tpa): "developer mode",
+		// "jailbreak", "ignore your guidelines". Kept SOFT because "developer mode"
+		// has legitimate uses; a genuine guardrail-override is separately caught at
+		// the HARD tier by phrase.injection.
+		re:   regexp.MustCompile(`\b(?:developer mode|jailbreak\w*|dan mode|do anything now|unrestricted mode|ignore your guidelin\w*)\b`),
+		base: 0.5,
+		what: "jailbreak trigger",
+	},
+	{ // Data-forwarding to an external sink (legacy tpa): "upload the file to an
+		// external endpoint", "post the contents to a remote server". Requires an
+		// external/remote-flavoured target so benign "uploads the file to the
+		// storage bucket" does not match. SOFT: it informs review without blocking.
+		re:   regexp.MustCompile(`\b(?:upload|post|send|transmit|forward|exfiltrat\w*|leak|dump) (?:the |all |your )?(?:file|content|data|output|result|conversation|log|record|payload)\w* to (?:an? |the |our )?(?:external|remote|third.?party|attacker|adversar\w*|off.?site|another server|http|ftp)`),
+		base: 0.5,
+		what: "external data-forwarding directive",
+	},
 }
 
 // Inspect implements detect.Check. It emits at most one signal per tool: the