-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathrun.ts
More file actions
110 lines (91 loc) · 4.34 KB
/
Copy pathrun.ts
File metadata and controls
110 lines (91 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env npx tsx
// bench/run.ts — ShellWard detection benchmark.
//
// Runs every detector over the labeled corpus and reports precision/recall/F1
// per category plus the actual false positives & false negatives (the
// actionable part — those are the rows to go fix).
//
// npm run bench # report metrics, always exit 0
// npm run bench -- --ci # exit 1 if any category F1 < MIN_F1 (regression gate)
import { ShellWard } from '../src/core/engine'
import { CORPUS, type Sample, type Category } from './corpus'
// Lowest per-category F1 the --ci gate at the end of this script accepts.
const MIN_F1 = 0.85 // CI regression floor
// True when '--ci' appears anywhere in the process arguments.
const ci = process.argv.includes('--ci')
// Single engine instance that detect() uses for every sample.
const guard = new ShellWard({ mode: 'enforce', locale: 'en' })
/**
 * Runs the detector that matches the sample's category and reports whether
 * it flagged the input as malicious/sensitive.
 *
 * @param s - Corpus sample; `s.category` selects the detector and `s.input`
 *   is the text handed to it.
 * @returns `true` when the selected detector flags the input.
 * @throws Error when the category has no detector wired up here, so an
 *   unhandled corpus row fails loudly instead of being scored as "not flagged".
 */
function detect(s: Sample): boolean {
  const { category, input } = s
  switch (category) {
    case 'injection':
      return !guard.checkInjection(input).safe
    case 'command':
      return !guard.checkCommand(input).allowed
    case 'pii':
      return guard.scanData(input).hasSensitiveData
    case 'tool_poisoning':
      // The sample text is scanned as the description of a stub tool named 'tool'.
      return !guard.scanToolDefinition({ name: 'tool', description: input }).safe
    default:
      // Without this branch the function falls off the end and yields
      // `undefined`, which callers treat the same as "not flagged".
      throw new Error(`detect: no detector for category "${String(category)}"`)
  }
}
/** Confusion-matrix counters for one group of samples. */
interface Metrics {
  /** Malicious samples that were flagged. */
  tp: number
  /** Benign samples that were flagged. */
  fp: number
  /** Malicious samples that were not flagged. */
  fn: number
  /** Benign samples that were not flagged. */
  tn: number
}

/** Builds a fresh tally with every counter at zero. */
function empty(): Metrics {
  return { tp: 0, fp: 0, fn: 0, tn: 0 }
}
// Per-category tallies, plus the individual misclassified samples for the report.
const byCat = new Map<Category, Metrics>()
const falsePos: Sample[] = []
const falseNeg: Sample[] = []

// Split the corpus in one pass: known-limitation samples are tracked on the
// side and never contribute to the gated metrics.
const gated: Sample[] = []
const limitations: Sample[] = []
for (const sample of CORPUS) {
  if (sample.knownLimitation) limitations.push(sample)
  else gated.push(sample)
}

// Score every gated sample against its category's detector.
for (const sample of gated) {
  const flagged = detect(sample)

  let tally = byCat.get(sample.category)
  if (tally == null) {
    tally = empty()
    byCat.set(sample.category, tally)
  }

  if (sample.malicious) {
    if (flagged) {
      tally.tp++
    } else {
      tally.fn++
      falseNeg.push(sample)
    }
  } else if (flagged) {
    tally.fp++
    falsePos.push(sample)
  } else {
    tally.tn++
  }
}

// How many known-limitation samples the detectors flag anyway.
const limitationsCaught = limitations.filter(sample => detect(sample)).length
/**
 * Derives precision, recall and F1 from a tally.
 *
 * Precision is 1 when nothing was flagged (tp + fp is 0) and recall is 1 when
 * there was nothing to catch (tp + fn is 0). F1 is 0 when both are 0.
 *
 * @param m - Confusion-matrix counters.
 * @returns The three scores, each in the range 0..1.
 */
function prf(m: Metrics) {
  const flaggedTotal = m.tp + m.fp
  const maliciousTotal = m.tp + m.fn

  const precision = flaggedTotal !== 0 ? m.tp / flaggedTotal : 1
  const recall = maliciousTotal !== 0 ? m.tp / maliciousTotal : 1

  const denominator = precision + recall
  const f1 = denominator !== 0 ? (2 * precision * recall) / denominator : 0

  return { precision, recall, f1 }
}
/** Renders a 0..1 ratio as a percentage with one decimal, number part left-padded to 5 characters. */
const pct = (n: number) => {
  const fixed = (n * 100).toFixed(1)
  return `${fixed.padStart(5)}%`
}
// ── Report: metrics table ───────────────────────────────────────────────
console.log('\n🔬 ShellWard Detection Benchmark\n')
console.log(' category samples precision recall F1 (TP/FP/FN/TN)')
const rule = ' ' + '─'.repeat(72)
console.log(rule)

/** Prints one table row for `label` and returns the F1 shown in it. */
const printRow = (label: string, tally: Metrics): number => {
  const { precision, recall, f1 } = prf(tally)
  const samples = tally.tp + tally.fp + tally.fn + tally.tn
  console.log(` ${label.padEnd(16)} ${String(samples).padStart(5)} ${pct(precision)} ${pct(recall)} ${pct(f1)} (${tally.tp}/${tally.fp}/${tally.fn}/${tally.tn})`)
  return f1
}

// One row per category, while accumulating the overall tally and tracking
// the lowest category F1 for the CI gate.
const overall = empty()
let worstF1 = 1
for (const [category, tally] of byCat) {
  worstF1 = Math.min(worstF1, printRow(category, tally))
  overall.tp += tally.tp
  overall.fp += tally.fp
  overall.fn += tally.fn
  overall.tn += tally.tn
}
console.log(rule)
printRow('OVERALL', overall)

// ── Report: individual misclassifications ───────────────────────────────
/** `[category] truncated-input` label shared by all sample listings. */
const sampleLabel = (s: Sample): string => `[${s.category}] ${truncate(s.input)}`
/** ` — note` when the sample carries a note, otherwise an empty string. */
const noteSuffix = (s: Sample): string => (s.note ? ` — ${s.note}` : '')

if (falseNeg.length > 0) {
  console.log(`\n ❌ False negatives (missed attacks — ${falseNeg.length}):`)
  falseNeg.forEach(s => console.log(` ${sampleLabel(s)}`))
}
if (falsePos.length > 0) {
  console.log(`\n ⚠️ False positives (benign flagged — ${falsePos.length}):`)
  falsePos.forEach(s => console.log(` ${sampleLabel(s)}${noteSuffix(s)}`))
}
if (falseNeg.length + falsePos.length === 0) {
  console.log('\n ✅ No false positives or negatives on the gated corpus.')
}

// ── Report: known limitations (informational only) ──────────────────────
if (limitations.length > 0) {
  console.log(`\n 📋 Known limitations (documented, NOT gated): ${limitationsCaught}/${limitations.length} incidentally caught`)
  for (const s of limitations) {
    const mark = detect(s) ? '✓' : '·'
    console.log(` ${mark} ${sampleLabel(s)}${noteSuffix(s)}`)
  }
}
console.log()
// CI regression gate (only with --ci): exit non-zero when the benchmark
// cannot vouch for detection quality.
if (ci) {
  if (byCat.size === 0) {
    // worstF1 starts at 1 and is only lowered by scored categories, so with
    // nothing scored the F1 comparison below would pass vacuously.
    console.error(' ❌ CI gate: no gated samples were scored — nothing to validate')
    process.exit(1)
  }
  if (worstF1 < MIN_F1) {
    // Round the floor before printing: a raw `MIN_F1 * 100` can show float
    // noise (e.g. 0.57 * 100 → 56.99999999999999). 0.85 still prints as "85".
    const floorPct = Number((MIN_F1 * 100).toFixed(1))
    console.error(` ❌ CI gate: lowest category F1 ${(worstF1 * 100).toFixed(1)}% < ${floorPct}% floor`)
    process.exit(1)
  }
}
/**
 * Collapses a sample to a single line and caps it at 70 characters for the report.
 *
 * Every run of whitespace (newlines included) becomes one space. Length is
 * measured in Unicode code points rather than UTF-16 code units, so the cut
 * never lands inside a surrogate pair and leaves half an emoji in the output.
 *
 * @param s - Raw sample text.
 * @returns The one-line text, followed by '…' when anything was cut off.
 */
function truncate(s: string): string {
  const oneLine = s.replace(/\s+/g, ' ')
  // Iterating a string yields whole code points, unlike index-based slicing.
  const codePoints = Array.from(oneLine)
  return codePoints.length > 70 ? codePoints.slice(0, 70).join('') + '…' : oneLine
}