Skip to content

Commit ade047f

Browse files
committed
test(bench): 合规扫描检测基准(精确率/召回率/F1) v0.7.4
- bench/scan-bench.ts: 31 标注样例跑真实 scanProject 管线,算 P/R/F1,CI 门禁 - 基线 P100%/R100%/F1 100%; 建基准过程验证了占位符过滤器边界 - npm run bench:scan + CI 接入; README Detection Benchmark 加合规扫描行(诚实标注自建语料) - 把"信我能检"变成"看数字"
1 parent 9e77d6d commit ade047f

5 files changed

Lines changed: 121 additions & 1 deletion

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,6 @@ jobs:
3838

3939
- name: Detection benchmark (precision/recall regression gate)
4040
run: npm run bench -- --ci
41+
42+
- name: Compliance-scan benchmark (precision/recall regression gate)
43+
run: npm run bench:scan -- --ci

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/),
66
and this project adheres to [Semantic Versioning](https://semver.org/).
77

8+
## [0.7.4] - 2026-06-20
9+
10+
### Added — 合规扫描检测基准(把"信我能检"变成"看数字")
11+
- `bench/scan-bench.ts` + `npm run bench:scan`:用 **31 个标注样例**(17 真实风险 + 14 硬负例:境内端点/占位符/文档示例/lock/无效校验位)跑**真实 scanProject 管线**,算精确率/召回率/F1,CI 门禁(低于 90% 失败)
12+
- 基线:**精确率 100% · 召回率 100% · F1 100%**
13+
- 建基准过程即暴露并验证了占位符过滤器的边界(含 `abcdef`/`123456` 的密钥会被当示例滤掉——偏向精确率的有意取舍)
14+
- README「Detection Benchmark」新增合规扫描一行(诚实标注为自建语料)
15+
816
## [0.7.3] - 2026-06-20
917

1018
### Added — 扫描透明度("秒出=干活了吗"的证据)

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,9 @@ Effectiveness is measured, not asserted. `npm run bench` runs every detector ove
390390
| Dangerous commands | 100% | 100% | 100% |
391391
| PII / secrets | 100% | 100% | 100% |
392392
| MCP tool poisoning | 100% | 100% | 100% |
393+
| **Compliance scan** (overseas / secret / PII vs hard negatives) | 100% | 100% | 100% |
394+
395+
The compliance scanner has its own gated corpus — `npm run bench:scan` runs the **real `scanProject` pipeline** over 31 labeled cases (17 real risks + 14 hard negatives: domestic endpoints, placeholder keys, doc examples, lock files, invalid checksums). The corpus is self-authored, and CI fails the build if precision or recall drops below 90%.
393396

394397
83 gated samples (attacks + hard negatives). Zero-width-interleaved and empty-quote (`r''m`) obfuscation are normalized before matching. The corpus also tracks **5 documented bypasses** (leetspeak, base64, non-zh/en languages, shell variable indirection) that regex/heuristics are not expected to catch — listed explicitly and excluded from the gate rather than hidden.
395398

bench/scan-bench.ts

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env npx tsx
2+
// bench/scan-bench.ts — 合规扫描检测基准
3+
//
4+
// 用标注语料跑「真实 scanProject 管线」,算精确率/召回率/F1,把"信我能检"变成"看数字"。
5+
// 正例=应检出的真实风险;硬负例=不该误报的(境内端点/占位符/文档示例/lock/无效校验位)。
6+
// npx tsx bench/scan-bench.ts 打印结果
7+
// npx tsx bench/scan-bench.ts --ci 精确率或召回率低于阈值则非零退出(CI 门禁)
8+
9+
import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from 'fs'
10+
import { join } from 'path'
11+
import { tmpdir } from 'os'
12+
import { scanProject } from '../src/compliance/project-scan'
13+
import type { FindingKind } from '../src/compliance/project-scan'
14+
15+
/** Expected outcome of a case: the finding kind that must be reported, or 'none' for a hard negative. */
type Expect = FindingKind | 'none'

/**
 * One labeled benchmark sample.
 *
 * - `id`      unique label; `run` also uses it as the per-case sub-directory name.
 *             Convention: `pNN` = positive (a finding is expected), `nNN` = hard negative.
 * - `path`    file path (relative to the case directory) the content is written to.
 * - `content` raw file content handed to the scanner.
 * - `expect`  the finding kind that must be present, or 'none' when nothing may be reported.
 * - `note`    human-readable description printed in the report.
 */
interface Case { id: string; path: string; content: string; expect: Expect; note: string }

/** Labeled corpus: 17 positives (p01–p17) and 14 hard negatives (n01–n14). */
const C: Case[] = [
  // ===== Positives: overseas endpoints / SDK dependencies =====
  { id: 'p01', path: 'a.ts', content: 'const u="https://api.openai.com/v1"', expect: 'overseas', note: 'OpenAI 端点(代码)' },
  { id: 'p02', path: 'b.py', content: 'BASE="https://api.anthropic.com"', expect: 'overseas', note: 'Anthropic 端点' },
  { id: 'p03', path: 'c.js', content: 'const g="https://generativelanguage.googleapis.com/v1"', expect: 'overseas', note: 'Gemini 端点' },
  { id: 'p04', path: 'package.json', content: '{"dependencies":{"openai":"^4"}}', expect: 'overseas', note: 'openai 依赖' },
  { id: 'p05', path: 'requirements.txt', content: 'flask==2.0\nanthropic>=0.20', expect: 'overseas', note: 'anthropic 依赖(py)' },
  { id: 'p06', path: 'go.mod', content: 'module x\nrequire github.com/sashabaranov/go-openai v1.2.0', expect: 'overseas', note: 'go-openai 依赖' },
  // ===== Positives: secrets =====
  { id: 'p07', path: 'k1.js', content: 'const k="sk-RZ9mKp2QwLs7Yv3Nd8Tb1Hc4Xj6Pq"', expect: 'secret', note: 'OpenAI key' },
  { id: 'p08', path: 'k2.ts', content: 'const t="ghp_Rz9MkP2qWlS7yV3nD8tB1hC4xJ6pQsTuVwYz"', expect: 'secret', note: 'GitHub token(36位)' },
  { id: 'p09', path: 'k3.py', content: 'AWS="AKIARZ9MKP2QWLS7YV3N"', expect: 'secret', note: 'AWS key(真实格式)' },
  { id: 'p10', path: 'k4.txt', content: '-----BEGIN RSA PRIVATE KEY-----', expect: 'secret', note: '私钥' },
  { id: 'p11', path: 'k5.env', content: 'DB_PASSWORD=Sup3rS3cretPwd2026', expect: 'secret', note: '口令' },
  // ===== Positives: PII =====
  { id: 'p12', path: 'd1.txt', content: '身份证 110101199003071233', expect: 'pii', note: '身份证(校验位有效)' },
  { id: 'p13', path: 'd2.ts', content: 'const phone="13912345678"', expect: 'pii', note: '手机号' },
  { id: 'p14', path: 'd3.json', content: '{"card":"4111111111111111"}', expect: 'pii', note: '银行卡(Luhn)' },
  { id: 'p15', path: 'd4.txt', content: 'SSN: 123-45-6789', expect: 'pii', note: 'US SSN' },

  // ===== Hard negatives: must not be reported =====
  { id: 'n01', path: 'dom1.ts', content: 'const u="https://dashscope.aliyuncs.com/compatible-mode/v1"', expect: 'none', note: '境内通义端点' },
  { id: 'n02', path: 'dom2.ts', content: 'const u="https://api.deepseek.com"', expect: 'none', note: '境内 DeepSeek 端点' },
  { id: 'n03', path: 'ph1.ts', content: 'const k="sk-EXAMPLEEXAMPLEEXAMPLE12"', expect: 'none', note: '占位符 key(EXAMPLE)' },
  { id: 'n04', path: 'ph2.ts', content: 'API_KEY=your-api-key-placeholder-here', expect: 'none', note: '占位符 your-...' },
  { id: 'n05', path: 'README.md', content: '示例: key=sk-RZ9mKp2QwLs7Yv3Nd8Tb1Hc4Xj6Pq 手机 13912345678', expect: 'none', note: 'Markdown 文档示例' },
  { id: 'n06', path: 'package-lock.json', content: '{"x":"sk-RZ9mKp2QwLs7Yv3Nd8Tb1Hc4Xj6Pq"}', expect: 'none', note: 'lock 文件(噪声)' },
  { id: 'n07', path: 'badid.txt', content: '身份证 110101199003071234', expect: 'none', note: '身份证校验位错误' },
  { id: 'n08', path: 'badphone.txt', content: '工号 12345678901', expect: 'none', note: '非手机号' },
  { id: 'n09', path: 'b64.ts', content: 'const h="YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnc="', expect: 'none', note: '普通 base64(非密钥)' },
  { id: 'n10', path: 'pkg2/package.json', content: '{"dependencies":{"express":"^4","lodash":"^4"}}', expect: 'none', note: '纯境内无关依赖' },
  // ===== Additional, harder samples =====
  { id: 'p16', path: 'e.ts', content: 'const c="https://api.cohere.ai/v1/chat"', expect: 'overseas', note: 'Cohere 端点' },
  { id: 'p17', path: 'GUIDE.md', content: '调用示例:向 https://api.openai.com/v1 发请求', expect: 'overseas', note: 'Markdown 里的境外端点(应保留检测)' },
  // Hard negative (expect 'none'), so it carries an `n` id rather than a `p` id.
  { id: 'n14', path: 'r2.txt', content: 'dep: @langchain/openai and langchain-anthropic', expect: 'none', note: '散文提到包名(非依赖清单,不应误报)' },
  { id: 'n11', path: 'sha.ts', content: 'const commit="a1b2c3d4e5f60718293a4b5c6d7e8f9012345678"', expect: 'none', note: 'git commit sha(40hex,非密钥)' },
  { id: 'n12', path: 'req2/requirements.txt', content: 'dashscope==1.0\nzhipuai>=2.0', expect: 'none', note: '境内 SDK 依赖(不算出境)' },
  { id: 'n13', path: 'uuid.ts', content: 'const id="550e8400-e29b-41d4-a716-446655440000"', expect: 'none', note: 'UUID(非密钥/PII)' },
]
57+
58+
/**
 * Runs the compliance-scan benchmark.
 *
 * Writes every case of `C` into its own sub-directory (named after the case id)
 * of a fresh temp directory, runs `scanProject` once over the whole tree, then
 * attributes findings back to cases by path prefix and prints TP/FP/FN/TN,
 * precision, recall and F1.
 *
 * Scoring rules:
 * - negative case (`expect === 'none'`): any finding at all counts as a false positive;
 * - positive case: a true positive when at least one finding has the expected kind,
 *   otherwise a false negative. Findings of other kinds on a positive case are ignored.
 *
 * @param ci when true, marks the process as failed (exit code 1) if precision or
 *           recall is below 90%.
 */
function run(ci: boolean): void {
  const dir = mkdtempSync(join(tmpdir(), 'sw-bench-'))
  try {
    for (const c of C) {
      const full = join(dir, c.id, c.path)
      // `c.path` may contain sub-directories (e.g. 'pkg2/package.json'), so create the parent first.
      mkdirSync(join(full, '..'), { recursive: true })
      writeFileSync(full, c.content)
    }

    const scan = scanProject(dir)
    // NOTE(review): assumes `f.file` is relative to the scanned root — confirm against scanProject.
    // Backslashes are normalized so the prefix match also works with Windows-style separators.
    const byCase = (id: string) =>
      scan.findings.filter(f => f.file.replace(/\\/g, '/').startsWith(id + '/'))

    let tp = 0, fp = 0, fn = 0, tn = 0
    const fails: string[] = []
    console.log('\n========== ShellWard 扫描检测基准 ==========\n')
    for (const c of C) {
      const found = byCase(c.id)
      const negative = c.expect === 'none'
      const hit = negative ? found.length === 0 : found.some(f => f.kind === c.expect)
      if (negative) {
        if (hit) { tn++ } else { fp++; fails.push(`FP ${c.id} ${c.note} — 误报: ${found.map(f => f.kind).join(',')}`) }
      } else {
        if (hit) { tp++ } else { fn++; fails.push(`FN ${c.id} ${c.note} — 漏报 (期望 ${c.expect})`) }
      }
      console.log(` ${hit ? '✅' : '❌'} [${negative ? '负' : '正'}] ${c.id} ${c.note}`)
    }

    // An empty denominator is treated as a perfect score (nothing reported / nothing to find).
    const precision = tp + fp === 0 ? 1 : tp / (tp + fp)
    const recall = tp + fn === 0 ? 1 : tp / (tp + fn)
    const f1 = precision + recall === 0 ? 0 : (2 * precision * recall) / (precision + recall)
    console.log('\n--- 指标 ---')
    console.log(` 正例 ${tp + fn} | 负例 ${tn + fp}`)
    console.log(` TP=${tp} FP=${fp} FN=${fn} TN=${tn}`)
    console.log(` 精确率 Precision: ${(precision * 100).toFixed(1)}%`)
    console.log(` 召回率 Recall: ${(recall * 100).toFixed(1)}%`)
    console.log(` F1: ${(f1 * 100).toFixed(1)}%`)
    if (fails.length) { console.log('\n--- 未通过 ---'); fails.forEach(f => console.log(' ' + f)) }
    console.log('')

    if (ci) {
      const pass = precision >= 0.9 && recall >= 0.9
      console.log(pass ? '✅ 基准达标 (P≥90% R≥90%)' : '❌ 基准未达标')
      // Set the exit code instead of calling process.exit(1): process.exit() terminates
      // immediately and would skip the `finally` below, leaking the temp directory.
      if (!pass) process.exitCode = 1
    }
  } finally {
    rmSync(dir, { recursive: true, force: true })
  }
}
104+
105+
run(process.argv.includes('--ci'))

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "shellward",
3-
"version": "0.7.3",
3+
"version": "0.7.4",
44
"mcpName": "io.github.jnMetaCode/shellward",
55
"description": "AI agent security & MCP security middleware — prompt injection detection, AI firewall, runtime guardrails & data-loss prevention for LLM tool calls. 8-layer defense against data exfiltration & dangerous commands. Zero dependencies. SDK + OpenClaw plugin. Supports LangChain, AutoGPT, Claude Code, Cursor, OpenAI Agents, Hermes Agent.",
66
"keywords": [
@@ -65,6 +65,7 @@
6565
"test:sdk": "npx tsx test-sdk.ts",
6666
"test:mcp": "npx tsx test-mcp.ts",
6767
"bench": "npx tsx bench/run.ts",
68+
"bench:scan": "npx tsx bench/scan-bench.ts",
6869
"prepublishOnly": "npm run build"
6970
},
7071
"openclaw": {

0 commit comments

Comments
 (0)