Skip to content

Commit 061dc43

Browse files
committed
test: deepen eval suite and live reporting
Preserve benchmark metadata through eval runs and add fixture filters, run labels, and richer reports so live provider sweeps are easier to compare and expand over time.

Made-with: Cursor
1 parent 0a035b2 commit 061dc43

File tree

24 files changed

+1086
-23
lines changed

24 files changed

+1086
-23
lines changed

TODO.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@
88
- Prefer extracting pure helpers and formatter/parsing boundaries before moving async orchestration.
99
- Keep module roots thin; if a root becomes mostly re-exports, let children carry the logic.
1010

11+
## Improvement Queue
12+
13+
- [ ] `src/commands/eval/`
14+
- Persist labeled eval runs into `QualityTrend` JSON so trends across live provider sweeps can be tracked over time.
15+
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
16+
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
17+
- Harden verification fallback for live eval runs that return unparseable verification responses.
18+
- [ ] `src/commands/feedback_eval/`
19+
- Correlate feedback calibration with eval-suite category and rule-level performance.
20+
1121
## Immediate Queue
1222

1323
- [ ] `src/core/semantic.rs`

eval/fixtures/README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,33 @@ Run:
1212
diffscope eval --fixtures eval/fixtures --output eval-report.json
1313
```
1414

15+
Filter and label a deeper suite run:
16+
17+
```bash
18+
diffscope eval \
19+
--fixtures eval/fixtures \
20+
--suite review-depth-core \
21+
--max-fixtures 3 \
22+
--label smoke \
23+
--output eval-report.json
24+
```
25+
26+
Live OpenRouter example:
27+
28+
```bash
29+
OPENROUTER_API_KEY=... \
30+
diffscope \
31+
--adapter openrouter \
32+
--base-url https://openrouter.ai/api/v1 \
33+
--model anthropic/claude-opus-4.1 \
34+
eval \
35+
--fixtures eval/fixtures \
36+
--suite review-depth-core \
37+
--max-fixtures 3 \
38+
--label openrouter-smoke
39+
```
40+
1541
Notes:
1642
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
1743
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
44+
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
{
2+
"name": "review-depth-core",
3+
"author": "diffscope",
4+
"version": "1.0.0",
5+
"description": "Broader live-review benchmark pack spanning security, correctness, performance, and maintainability signals.",
6+
"languages": [
7+
"rust",
8+
"python",
9+
"typescript",
10+
"go"
11+
],
12+
"categories": [
13+
"security",
14+
"bug",
15+
"performance",
16+
"maintainability"
17+
],
18+
"thresholds": {
19+
"min_precision": 0.45,
20+
"min_recall": 0.35,
21+
"min_f1": 0.4,
22+
"max_false_positive_rate": 0.45,
23+
"min_weighted_score": 0.45
24+
},
25+
"metadata": {
26+
"purpose": "live-regression-eval",
27+
"pack": "review-depth-core"
28+
},
29+
"fixtures": [
30+
{
31+
"name": "rust-shell-command-injection",
32+
"category": "security",
33+
"language": "rust",
34+
"difficulty": "Hard",
35+
"repo_path": "../../..",
36+
"diff_content": "diff --git a/src/main.rs b/src/main.rs\nindex 2f4f9cb..8f128ab 100644\n--- a/src/main.rs\n+++ b/src/main.rs\n@@ -1226,6 +1226,14 @@ async fn suggest_pr_title(config: config::Config) -> Result<()> {\n \n Ok(())\n }\n+\n+fn run_debug_command(user_command: &str) {\n+ let _ = std::process::Command::new(\"sh\")\n+ .arg(\"-c\")\n+ .arg(user_command)\n+ .status();\n+}\n",
37+
"expected_findings": [
38+
{
39+
"description": "Shell command execution uses unsanitized user input.",
40+
"severity": "Error",
41+
"category": "Security",
42+
"file_pattern": "src/main.rs",
43+
"line_hint": 1230,
44+
"contains_any": [
45+
"shell injection",
46+
"command injection",
47+
"user-controlled command"
48+
],
49+
"tags_any": [
50+
"command-injection",
51+
"injection"
52+
]
53+
}
54+
],
55+
"negative_findings": [
56+
{
57+
"description": "Avoid style-only comments.",
58+
"contains": "style"
59+
}
60+
],
61+
"min_total": 1,
62+
"max_total": 10,
63+
"description": "Obvious shell execution bug in a Rust helper.",
64+
"source": "repo-regression"
65+
},
66+
{
67+
"name": "python-cross-file-sql-helper",
68+
"category": "security",
69+
"language": "python",
70+
"difficulty": "Expert",
71+
"repo_path": "../repo_regressions/graph_call_chain_repo",
72+
"diff_content": "diff --git a/routes.py b/routes.py\nindex 1111111..2222222 100644\n--- a/routes.py\n+++ b/routes.py\n@@ -2,4 +2,5 @@ from auth import lookup_user\n \n \n def get_profile(request, db):\n- return {\"ok\": True}\n+ user = lookup_user(request.args[\"name\"], db)\n+ return {\"user\": user}\n",
73+
"expected_findings": [
74+
{
75+
"description": "Cross-file SQL helper is called with unsanitized request input.",
76+
"category": "Security",
77+
"contains_any": [
78+
"sql injection",
79+
"unsafe sql",
80+
"query built from user input",
81+
"interpolates user-controlled"
82+
],
83+
"tags_any": [
84+
"sql-injection"
85+
]
86+
}
87+
],
88+
"negative_findings": [
89+
{
90+
"description": "Avoid style-only comments.",
91+
"contains": "style"
92+
}
93+
],
94+
"min_total": 1,
95+
"max_total": 12,
96+
"description": "Tests repo-context retrieval across a helper call chain.",
97+
"source": "repo-regression"
98+
},
99+
{
100+
"name": "rust-path-traversal-read",
101+
"category": "security",
102+
"language": "rust",
103+
"difficulty": "Hard",
104+
"diff_content": "diff --git a/src/download.rs b/src/download.rs\nnew file mode 100644\n--- /dev/null\n+++ b/src/download.rs\n@@ -0,0 +1,5 @@\n+use std::fs;\n+\n+pub fn load_user_file(name: &str) -> std::io::Result<String> {\n+ fs::read_to_string(format!(\"/srv/uploads/{}\", name))\n+}\n",
105+
"expected_findings": [
106+
{
107+
"description": "User-controlled file path is read without validation.",
108+
"severity": "Error",
109+
"category": "Security",
110+
"file_pattern": "src/download.rs",
111+
"line_hint": 4,
112+
"contains_any": [
113+
"path traversal",
114+
"directory traversal",
115+
"user-controlled path"
116+
],
117+
"tags_any": [
118+
"path-traversal"
119+
]
120+
}
121+
],
122+
"negative_findings": [
123+
{
124+
"description": "Avoid style-only comments.",
125+
"contains": "style"
126+
}
127+
],
128+
"min_total": 1,
129+
"max_total": 8,
130+
"description": "Simple path traversal regression in a file-read helper.",
131+
"source": "deep-review-suite"
132+
},
133+
{
134+
"name": "typescript-open-redirect",
135+
"category": "security",
136+
"language": "typescript",
137+
"difficulty": "Medium",
138+
"diff_content": "diff --git a/src/redirect.ts b/src/redirect.ts\nnew file mode 100644\n--- /dev/null\n+++ b/src/redirect.ts\n@@ -0,0 +1,4 @@\n+export function continueLogin(nextUrl: string) {\n+ window.location.href = nextUrl;\n+}\n+\n",
139+
"expected_findings": [
140+
{
141+
"description": "Redirect target is controlled by caller input.",
142+
"severity": "Warning",
143+
"category": "Security",
144+
"file_pattern": "src/redirect.ts",
145+
"line_hint": 2,
146+
"contains_any": [
147+
"open redirect",
148+
"unvalidated redirect",
149+
"redirect to arbitrary url"
150+
],
151+
"tags_any": [
152+
"open-redirect"
153+
]
154+
}
155+
],
156+
"negative_findings": [
157+
{
158+
"description": "Avoid style-only comments.",
159+
"contains": "style"
160+
}
161+
],
162+
"min_total": 1,
163+
"max_total": 8,
164+
"description": "Client-side redirect should validate or constrain destinations.",
165+
"source": "deep-review-suite"
166+
},
167+
{
168+
"name": "python-n-plus-one-query",
169+
"category": "performance",
170+
"language": "python",
171+
"difficulty": "Hard",
172+
"diff_content": "diff --git a/service.py b/service.py\nindex 1111111..2222222 100644\n--- a/service.py\n+++ b/service.py\n@@ -1,4 +1,8 @@\n def load_profiles(db, users):\n- return []\n+ profiles = []\n+ for user in users:\n+ profiles.append(db.query(\"SELECT * FROM profiles WHERE user_id = %s\", [user.id]))\n+ return profiles\n+\n",
173+
"expected_findings": [
174+
{
175+
"description": "Database query is executed inside a loop.",
176+
"severity": "Warning",
177+
"category": "Performance",
178+
"file_pattern": "service.py",
179+
"line_hint": 4,
180+
"contains_any": [
181+
"n+1",
182+
"query inside loop",
183+
"database query in loop"
184+
]
185+
}
186+
],
187+
"negative_findings": [
188+
{
189+
"description": "Avoid style-only comments.",
190+
"contains": "style"
191+
}
192+
],
193+
"min_total": 1,
194+
"max_total": 8,
195+
"description": "Classic N+1 query pattern in a service helper.",
196+
"source": "deep-review-suite"
197+
},
198+
{
199+
"name": "go-swallowed-error",
200+
"category": "bug",
201+
"language": "go",
202+
"difficulty": "Medium",
203+
"diff_content": "diff --git a/cache.go b/cache.go\nindex 1111111..2222222 100644\n--- a/cache.go\n+++ b/cache.go\n@@ -1,5 +1,9 @@\n func loadConfig(path string) (*Config, error) {\n- return parseConfig(path)\n+ cfg, err := parseConfig(path)\n+ if err != nil {\n+ return nil, nil\n+ }\n+ return cfg, nil\n }\n",
204+
"expected_findings": [
205+
{
206+
"description": "Parse error is swallowed and converted into a nil success.",
207+
"severity": "Warning",
208+
"category": "Bug",
209+
"file_pattern": "cache.go",
210+
"line_hint": 4,
211+
"contains_any": [
212+
"error is ignored",
213+
"swallowed error",
214+
"return nil, nil"
215+
]
216+
}
217+
],
218+
"negative_findings": [
219+
{
220+
"description": "Avoid style-only comments.",
221+
"contains": "style"
222+
}
223+
],
224+
"min_total": 1,
225+
"max_total": 8,
226+
"description": "Error handling regression in a Go helper.",
227+
"source": "deep-review-suite"
228+
},
229+
{
230+
"name": "rust-unwrap-on-user-input",
231+
"category": "bug",
232+
"language": "rust",
233+
"difficulty": "Medium",
234+
"diff_content": "diff --git a/src/parser.rs b/src/parser.rs\nindex 1111111..2222222 100644\n--- a/src/parser.rs\n+++ b/src/parser.rs\n@@ -1,3 +1,3 @@\n pub fn parse_user_id(input: &str) -> u64 {\n- input.parse().unwrap_or(0)\n+ input.parse::<u64>().unwrap()\n }\n",
235+
"expected_findings": [
236+
{
237+
"description": "Parsing user input now panics on invalid values.",
238+
"severity": "Warning",
239+
"category": "Bug",
240+
"file_pattern": "src/parser.rs",
241+
"line_hint": 2,
242+
"contains_any": [
243+
"unwrap can panic",
244+
"panic on invalid input",
245+
"untrusted input"
246+
]
247+
}
248+
],
249+
"negative_findings": [
250+
{
251+
"description": "Avoid style-only comments.",
252+
"contains": "style"
253+
}
254+
],
255+
"min_total": 1,
256+
"max_total": 8,
257+
"description": "Simple panic regression in a parser helper.",
258+
"source": "deep-review-suite"
259+
},
260+
{
261+
"name": "python-stack-trace-response-leak",
262+
"category": "security",
263+
"language": "python",
264+
"difficulty": "Medium",
265+
"diff_content": "diff --git a/handlers.py b/handlers.py\nindex 1111111..2222222 100644\n--- a/handlers.py\n+++ b/handlers.py\n@@ -1,4 +1,6 @@\n+import traceback\n+\n def fetch_profile(request):\n try:\n return do_fetch(request)\n+ except Exception:\n+ return {\"error\": traceback.format_exc()}, 500\n",
266+
"expected_findings": [
267+
{
268+
"description": "Stack traces should not be returned to clients.",
269+
"severity": "Warning",
270+
"category": "Security",
271+
"file_pattern": "handlers.py",
272+
"line_hint": 6,
273+
"contains_any": [
274+
"stack trace",
275+
"information disclosure",
276+
"debug details"
277+
],
278+
"tags_any": [
279+
"verbose-error"
280+
]
281+
}
282+
],
283+
"negative_findings": [
284+
{
285+
"description": "Avoid style-only comments.",
286+
"contains": "style"
287+
}
288+
],
289+
"min_total": 1,
290+
"max_total": 8,
291+
"description": "Error responses should not leak stack traces.",
292+
"source": "deep-review-suite"
293+
}
294+
]
295+
}

src/commands/eval.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ pub use types::EvalRunOptions;
2020

2121
#[allow(unused_imports)]
2222
use types::{
23-
EvalExpectations, EvalFixture, EvalFixtureResult, EvalPattern, EvalReport, EvalRuleMetrics,
24-
EvalRuleScoreSummary, EvalSuiteResult, LoadedEvalFixture,
23+
EvalExpectations, EvalFixture, EvalFixtureMetadata, EvalFixtureResult, EvalPattern, EvalReport,
24+
EvalRuleMetrics, EvalRuleScoreSummary, EvalRunFilters, EvalRunMetadata, EvalSuiteResult,
25+
LoadedEvalFixture,
2526
};

0 commit comments

Comments
 (0)