Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions cmd/scan-eval/gate.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ var categoryCheck = map[string]string{
"unicode_smuggling": "unicode.hidden",
"decoded_payload": "payload.decoded",
"shadowing": "shadowing.cross_server",
"phrase_injection": "phrase.injection", // Spec 077 US1 — curated hard check
"capability_mismatch": "capability.mismatch", // US2 (T016) — not yet registered
}

Expand All @@ -79,6 +80,7 @@ func gateChecks() []detect.Check {
&checks.UnicodeHidden{},
&checks.Shadowing{},
&checks.PayloadDecoded{},
&checks.PhraseInjection{}, // Spec 077 US1 — curated hard injection/exfil check
}
}

Expand Down Expand Up @@ -239,7 +241,18 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
}

// scanEntryFlagged builds the entry's RegistryView (its tool + peers), scans it,
// and reports whether the engine produced any finding for the entry's own tool.
// and reports whether the engine HARD-flagged (auto-quarantine tier) the entry's
// own tool.
//
// The gate measures the auto-quarantine decision, i.e. the HARD tier only. This
// matters because Spec 077 US1 (Codex round-3) made phrase.injection "never fully
// suppress" a matched injection: a phrase that is quoted or merely described now
// surfaces as a SOFT review finding instead of producing no finding at all.
// Counting any finding would then score those benign hard-negatives (e.g. a
// scanner quoting "ignore previous instructions") as false positives, even though
// they are only review-flagged and never blocked. Recall is unaffected — every
// gated category's malicious samples are detected at the HARD tier — so the gate
// keeps measuring exactly the blocking behavior the product ships.
func scanEntryFlagged(engine *detect.Engine, e gateEntry) bool {
views := []detect.ToolView{toGateView(e.Server, e.Tool)}
for _, p := range e.Peers {
Expand All @@ -248,7 +261,7 @@ func scanEntryFlagged(engine *detect.Engine, e gateEntry) bool {
res := engine.Scan(detect.NewRegistryView(views))
want := e.Server + ":" + e.Tool.Name
for _, f := range res.Findings {
if f.Location == want {
if f.Location == want && f.ThreatLevel == detect.ThreatLevelDangerous {
return true
}
}
Expand Down
19 changes: 11 additions & 8 deletions frontend/src/components/ServerCard.vue
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@
<div v-if="showApproveConfirmation" class="modal modal-open">
<div class="modal-box">
<h3 class="font-bold text-lg mb-4">
{{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Critical Findings Detected' }}
{{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Dangerous Findings Detected' }}
</h3>
<p v-if="approveDialogMode === 'critical'" class="mb-4">
<strong>{{ server.name }}</strong> has
<span class="text-error font-semibold">{{ criticalFindingCount }} critical finding{{ criticalFindingCount === 1 ? '' : 's' }}</span>
<span class="text-error font-semibold">{{ dangerousFindingCount }} dangerous finding{{ dangerousFindingCount === 1 ? '' : 's' }}</span>
in its most recent security scan. Approving this server will allow it to run despite these warnings.
</p>
<p v-else class="mb-4">
Expand Down Expand Up @@ -722,14 +722,17 @@ async function triggerLogout() {
}
}

// Counts critical findings from the scan summary if available. Used to gate
// the Approve button behind an extra confirmation (F-04).
const criticalFindingCount = computed(() => {
// Counts baseline DANGEROUS findings from the scan summary if available. Used to
// gate the Approve button behind an extra confirmation (F-04). Spec 077 FR-021:
// the gate blocks on baseline dangerous (hard-tier) findings only, matching the
// tier-driven server verdict — not on `critical` severity, which a non-blocking
// soft finding could also carry.
const dangerousFindingCount = computed(() => {
const scan = props.server.security_scan as any
if (!scan) return 0
// finding_counts.critical is populated from the latest report summary.
// finding_counts.dangerous is populated from the latest report summary.
const fc = scan.finding_counts as Record<string, number> | undefined
if (fc && typeof fc.critical === 'number') return fc.critical
if (fc && typeof fc.dangerous === 'number') return fc.dangerous
return 0
})

Expand All @@ -750,7 +753,7 @@ function handleApproveClick() {
showApproveConfirmation.value = true
return
}
if (criticalFindingCount.value > 0) {
if (dangerousFindingCount.value > 0) {
approveDialogMode.value = 'critical'
showApproveConfirmation.value = true
return
Expand Down
17 changes: 11 additions & 6 deletions frontend/src/views/ServerDetail.vue
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,11 @@
<div v-if="showApproveConfirmation" class="modal modal-open">
<div class="modal-box">
<h3 class="font-bold text-lg mb-4">
{{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Critical Findings Detected' }}
{{ approveDialogMode === 'no_scan' ? 'No Security Scan Run' : 'Dangerous Findings Detected' }}
</h3>
<p v-if="approveDialogMode === 'critical'" class="mb-4">
<strong>{{ server.name }}</strong> has
<span class="text-error font-semibold">{{ criticalFindingCount }} critical finding{{ criticalFindingCount === 1 ? '' : 's' }}</span>
<span class="text-error font-semibold">{{ dangerousFindingCount }} dangerous finding{{ dangerousFindingCount === 1 ? '' : 's' }}</span>
in its most recent security scan. Approving will allow this server to run despite these warnings.
</p>
<p v-else class="mb-4">
Expand Down Expand Up @@ -2399,13 +2399,18 @@ async function unquarantineServer() {
const showApproveConfirmation = ref(false)
const approveDialogMode = ref<'no_scan' | 'critical'>('no_scan')

const criticalFindingCount = computed(() => {
// Spec 077 FR-021: the approval gate blocks on baseline DANGEROUS findings only
// (hard-tier). Deep-scan findings inform but never gate. The server-side verdict
// is now tier-driven, so the modal mirrors it by counting `dangerous` (threat
// level) rather than `critical` (severity) — a soft finding can carry `critical`
// severity yet must not block approval.
const dangerousFindingCount = computed(() => {
// Prefer the loaded scan report summary if available; otherwise fall back
// to finding_counts on the server's security_scan summary (if populated).
const rep = scanReport.value as any
if (rep?.summary?.critical != null) return rep.summary.critical as number
if (rep?.summary?.dangerous != null) return rep.summary.dangerous as number
const scan = server.value?.security_scan as any
if (scan?.finding_counts?.critical != null) return scan.finding_counts.critical as number
if (scan?.finding_counts?.dangerous != null) return scan.finding_counts.dangerous as number
return 0
})

Expand All @@ -2421,7 +2426,7 @@ function handleApproveClick() {
showApproveConfirmation.value = true
return
}
if (criticalFindingCount.value > 0) {
if (dangerousFindingCount.value > 0) {
approveDialogMode.value = 'critical'
showApproveConfirmation.value = true
return
Expand Down
55 changes: 53 additions & 2 deletions internal/security/detect/checks/directive_imperative.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,15 @@ type directiveFamily struct {
// lowercase, contraction-expanded, lightly-stemmed forms (e.g. "instruction"
// matches the stemmed "instructions"). Built once at init.
var directiveFamilies = []directiveFamily{
{ // Hidden-instruction / role-injection tags: <IMPORTANT>, <system>, …
re: regexp.MustCompile(`<\s*(important|system|secret|critical|admin|instruction|developer|assistant)\b`),
{ // Hidden-instruction / role-injection tags: <IMPORTANT>, <system>, <hidden>,
// <system_prompt>, … "hidden" restores the legacy tpa <hidden> marker (Spec 077
// US1, Codex round-2 finding C). The optional (?:[_-]\w+)* suffix lets a
// compound tag name match — <system_prompt> / <developer-note> — which a bare
// `\b` after the keyword misses because "_" is a word char (Codex round-3
// finding #3). It does NOT loosen to prefixes: "<systematic>" still fails (no
// separator), so the keyword must be a whole tag-name token or the head of an
// underscore/hyphen-joined one.
re: regexp.MustCompile(`<\s*(?:important|system|secret|critical|admin|instruction|developer|assistant|hidden)(?:[_-]\w+)*\b`),
base: 0.7,
what: "hidden-instruction tag",
},
Expand All @@ -55,11 +62,31 @@ var directiveFamilies = []directiveFamily{
base: 0.65,
what: "instruction-override directive",
},
{ // Injected new-instruction preamble (legacy tpa restore, Spec 077 US1 Codex
// round-3 finding #2): "new instructions:", "updated directions:",
// "additional instructions:". The colon anchor keeps it to the smuggled-header
// shape — a benign "follow the new instructions carefully" (no colon) does not
// match. SOFT: benignly phrasable ("returns the new instructions: …"), so
// review-only. (Normalization leaves the trailing "instructions:" token
// unstemmed because the colon blocks the plural-strip, so \w* absorbs the "s".)
re: regexp.MustCompile(`\b(?:new|updated|revised|additional|further|latest|real|actual|hidden|secret) (?:instruction|direction|command|rule|order)\w*\s*:`),
base: 0.5,
what: "injected instruction preamble",
},
{ // Secrecy imperative: "do not tell the user", "must not reveal".
re: regexp.MustCompile(`\b(?:do not|must not|never) (?:tell|inform|reveal|disclos\w*|mention|notify|warn|expose)\b`),
base: 0.6,
what: "secrecy imperative",
},
{ // Covert-secrecy directive (legacy tpa restore, Spec 077 US1 Codex round-2
// finding C): coercion to act behind the user's back — "without telling the
// user", "without informing the user", "hide this from …", "keep this
// hidden/secret". Matched on NORMALIZED text (stemmed: "telling"→"tell").
// Benignly phrasable in rare cases, so SOFT (review, never auto-quarantine).
re: regexp.MustCompile(`\b(?:without (?:tell|inform|notify|alert|warn)\w* (?:the )?(?:user|caller|human|operator|client)|(?:hide|conceal) (?:this|it|that)(?: \w+)? from|keep (?:this|it|that)(?: \w+)? (?:hidden|secret|conceal\w*|private|quiet|confidential))\b`),
base: 0.6,
what: "covert-secrecy directive",
},
{ // Exfiltration imperative: "reveal your system prompt", "send the api key".
re: regexp.MustCompile(`\b(?:reveal|expose|print|output|send|leak|disclos\w*) (?:your |the |all )?(?:system prompt|api key\w*|secret\w*|password\w*|credential\w*|private key)\b`),
base: 0.6,
Expand All @@ -70,6 +97,30 @@ var directiveFamilies = []directiveFamily{
base: 0.55,
what: "tool-preamble injection",
},
{ // Tool-priority coercion (legacy tpa): "always call this tool first",
// "before using any other tool". Broader than tool-preamble and benignly
// phrasable ("call this tool first to authenticate"), so it lives in the
// SOFT tier (review, never auto-quarantine). "always" stems to "alway".
re: regexp.MustCompile(`\b(?:(?:alway\w* )?(?:call|us\w*|invok\w*|run\w*) this tool (?:first|before)|before (?:us\w*|call\w*|invok\w*|run\w*) any other tool)\b`),
base: 0.5,
what: "tool-priority coercion",
},
{ // Jailbreak / mode-override triggers (legacy tpa): "developer mode",
// "jailbreak", "ignore your guidelines". Kept SOFT because "developer mode"
// has legitimate uses; a genuine guardrail-override is separately caught at
// the HARD tier by phrase.injection.
re: regexp.MustCompile(`\b(?:developer mode|jailbreak\w*|dan mode|do anything now|unrestricted mode|ignore your guidelin\w*)\b`),
base: 0.5,
what: "jailbreak trigger",
},
{ // Data-forwarding to an external sink (legacy tpa): "upload the file to an
// external endpoint", "post the contents to a remote server". Requires an
// external/remote-flavoured target so benign "uploads the file to the
// storage bucket" does not match. SOFT: it informs review without blocking.
re: regexp.MustCompile(`\b(?:upload|post|send|transmit|forward|exfiltrat\w*|leak|dump) (?:the |all |your )?(?:file|content|data|output|result|conversation|log|record|payload)\w* to (?:an? |the |our )?(?:external|remote|third.?party|attacker|adversar\w*|off.?site|another server|http|ftp)`),
base: 0.5,
what: "external data-forwarding directive",
},
}

// Inspect implements detect.Check. It emits at most one signal per tool: the
Expand Down
Loading
Loading