Skip to content

Commit 11466cc

Browse files
committed
feat: add dimension-aware eval regression tracking
Compare eval runs against baselines by suite, category, and language, and persist those richer slices into trend history so regressions are easier to spot.

Made-with: Cursor
1 parent 62504f9 commit 11466cc

File tree

19 files changed

+642
-21
lines changed

19 files changed

+642
-21
lines changed

TODO.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,15 @@
1111
## Improvement Queue
1212

1313
- [ ] `src/commands/eval/`
14-
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
14+
- Add suite/category/language baseline comparisons so regressions are gated by dimension, not only whole-run totals.
15+
- Add model-matrix and repeat execution support so the same suite can be compared across frontier models and flake-checked.
16+
- Capture failed-run artifacts, including emitted comments, verifier warnings, and per-fixture mismatch details.
17+
- Reduce fixture brittleness with semantic/alias expectation matching instead of exact wording dependence.
18+
- Extend trend history with suite/category/language series plus verifier-health counters and model/provider labels.
1519
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
1620
- [ ] `src/commands/feedback_eval/`
17-
- Correlate feedback calibration with eval-suite category and rule-level performance.
21+
- Correlate feedback calibration with eval-suite category performance and rule-level precision/recall.
22+
- Surface high-confidence but frequently rejected categories/rules so review quality gaps are obvious.
1823

1924
## Immediate Queue
2025

eval/fixtures/README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,23 @@ diffscope \
4040
--trend-file eval/trends/openrouter-smoke.json
4141
```
4242

43+
Baseline-gated regression check:
44+
45+
```bash
46+
diffscope eval \
47+
--fixtures eval/fixtures \
48+
--suite review-depth-core \
49+
--baseline eval/baselines/review-depth-core.json \
50+
--max-micro-f1-drop 0.03 \
51+
--max-suite-f1-drop 0.05 \
52+
--max-category-f1-drop 0.05 \
53+
--max-language-f1-drop 0.05 \
54+
--output eval-report.json
55+
```
56+
4357
Notes:
4458
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
4559
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
4660
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
47-
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history.
61+
- Use `--baseline` together with the dimension drop flags when you want regressions to fail on shared suites, categories, or languages instead of only on the whole run.
62+
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.

src/commands/eval.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub use types::EvalRunOptions;
2020

2121
#[allow(unused_imports)]
2222
use types::{
23-
EvalExpectations, EvalFixture, EvalFixtureMetadata, EvalFixtureResult, EvalPattern, EvalReport,
24-
EvalRuleMetrics, EvalRuleScoreSummary, EvalRunFilters, EvalRunMetadata, EvalSuiteResult,
25-
LoadedEvalFixture,
23+
EvalExpectations, EvalFixture, EvalFixtureMetadata, EvalFixtureResult,
24+
EvalNamedMetricComparison, EvalPattern, EvalReport, EvalRuleMetrics, EvalRuleScoreSummary,
25+
EvalRunFilters, EvalRunMetadata, EvalSuiteResult, EvalVerificationHealth, LoadedEvalFixture,
2626
};

src/commands/eval/command/fixtures.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ mod tests {
195195
&EvalRunOptions {
196196
baseline_report: None,
197197
max_micro_f1_drop: None,
198+
max_suite_f1_drop: None,
199+
max_category_f1_drop: None,
200+
max_language_f1_drop: None,
198201
min_micro_f1: None,
199202
min_macro_f1: None,
200203
min_rule_f1: Vec::new(),
@@ -228,6 +231,9 @@ mod tests {
228231
&EvalRunOptions {
229232
baseline_report: None,
230233
max_micro_f1_drop: None,
234+
max_suite_f1_drop: None,
235+
max_category_f1_drop: None,
236+
max_language_f1_drop: None,
231237
min_micro_f1: None,
232238
min_macro_f1: None,
233239
min_rule_f1: Vec::new(),

src/commands/eval/command/options.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
2323
baseline,
2424
threshold_options: EvalThresholdOptions {
2525
max_micro_f1_drop: options.max_micro_f1_drop,
26+
max_suite_f1_drop: options.max_suite_f1_drop,
27+
max_category_f1_drop: options.max_category_f1_drop,
28+
max_language_f1_drop: options.max_language_f1_drop,
2629
min_micro_f1: options.min_micro_f1,
2730
min_macro_f1: options.min_macro_f1,
2831
min_rule_f1: min_rule_thresholds,

src/commands/eval/metrics.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
#[path = "metrics/comparisons.rs"]
2+
mod comparisons;
13
#[path = "metrics/rules.rs"]
24
mod rules;
35
#[path = "metrics/suites.rs"]
46
mod suites;
57

8+
pub(super) use comparisons::{
9+
build_named_breakdown_comparisons, build_suite_comparisons, build_verification_health,
10+
};
611
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
712
pub(super) use suites::{
813
build_benchmark_breakdowns, build_overall_benchmark_summary, build_suite_results,
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
use std::collections::HashMap;
2+
3+
use crate::core::eval_benchmarks::AggregateMetrics as BenchmarkAggregateMetrics;
4+
5+
use super::super::{
6+
EvalFixtureResult, EvalNamedMetricComparison, EvalReport, EvalSuiteResult,
7+
EvalVerificationHealth,
8+
};
9+
10+
pub(in super::super) fn build_suite_comparisons(
11+
current: &[EvalSuiteResult],
12+
baseline: Option<&EvalReport>,
13+
) -> Vec<EvalNamedMetricComparison> {
14+
let Some(baseline) = baseline else {
15+
return Vec::new();
16+
};
17+
18+
let baseline_by_suite = baseline
19+
.suite_results
20+
.iter()
21+
.map(|suite| (suite.suite.as_str(), &suite.aggregate))
22+
.collect::<HashMap<_, _>>();
23+
24+
let mut comparisons = current
25+
.iter()
26+
.filter_map(|suite| {
27+
let baseline_metrics = baseline_by_suite.get(suite.suite.as_str())?;
28+
Some(build_comparison(
29+
suite.suite.clone(),
30+
&suite.aggregate,
31+
baseline_metrics,
32+
))
33+
})
34+
.collect::<Vec<_>>();
35+
comparisons.sort_by(|left, right| left.name.cmp(&right.name));
36+
comparisons
37+
}
38+
39+
pub(in super::super) fn build_named_breakdown_comparisons(
40+
current: &HashMap<String, BenchmarkAggregateMetrics>,
41+
baseline: Option<&HashMap<String, BenchmarkAggregateMetrics>>,
42+
) -> Vec<EvalNamedMetricComparison> {
43+
let Some(baseline) = baseline else {
44+
return Vec::new();
45+
};
46+
47+
let mut comparisons = current
48+
.iter()
49+
.filter_map(|(name, current_metrics)| {
50+
baseline.get(name).map(|baseline_metrics| {
51+
build_comparison(name.clone(), current_metrics, baseline_metrics)
52+
})
53+
})
54+
.collect::<Vec<_>>();
55+
comparisons.sort_by(|left, right| left.name.cmp(&right.name));
56+
comparisons
57+
}
58+
59+
pub(in super::super) fn build_verification_health(
60+
results: &[EvalFixtureResult],
61+
) -> Option<EvalVerificationHealth> {
62+
let warnings_total = results
63+
.iter()
64+
.map(|result| result.warnings.len())
65+
.sum::<usize>();
66+
if warnings_total == 0 {
67+
return None;
68+
}
69+
70+
let mut health = EvalVerificationHealth {
71+
warnings_total,
72+
fixtures_with_warnings: results
73+
.iter()
74+
.filter(|result| !result.warnings.is_empty())
75+
.count(),
76+
..Default::default()
77+
};
78+
79+
for warning in results.iter().flat_map(|result| &result.warnings) {
80+
let lower = warning.to_ascii_lowercase();
81+
if lower.contains("verification fail-open kept") {
82+
health.fail_open_warning_count += 1;
83+
}
84+
if lower.contains("unparseable verifier output") {
85+
health.parse_failure_count += 1;
86+
}
87+
if lower.contains("verifier request error") {
88+
health.request_failure_count += 1;
89+
}
90+
}
91+
92+
Some(health)
93+
}
94+
95+
fn build_comparison(
96+
name: String,
97+
current: &BenchmarkAggregateMetrics,
98+
baseline: &BenchmarkAggregateMetrics,
99+
) -> EvalNamedMetricComparison {
100+
EvalNamedMetricComparison {
101+
name,
102+
current_micro_f1: current.micro_f1,
103+
baseline_micro_f1: baseline.micro_f1,
104+
micro_f1_delta: current.micro_f1 - baseline.micro_f1,
105+
current_weighted_score: current.weighted_score,
106+
baseline_weighted_score: baseline.weighted_score,
107+
weighted_score_delta: current.weighted_score - baseline.weighted_score,
108+
current_fixture_count: current.fixture_count,
109+
baseline_fixture_count: baseline.fixture_count,
110+
}
111+
}
112+
113+
#[cfg(test)]
mod tests {
    use crate::core::eval_benchmarks::AggregateMetrics;

    use super::*;

    /// Shorthand for aggregate metrics with only the compared fields set.
    fn metrics(micro_f1: f32, weighted_score: f32, fixture_count: usize) -> AggregateMetrics {
        AggregateMetrics {
            micro_f1,
            weighted_score,
            fixture_count,
            ..Default::default()
        }
    }

    /// Builds a minimal passing fixture result that carries only the given
    /// warnings, so tests avoid repeating the full struct literal.
    fn fixture_with_warnings(name: &str, warnings: Vec<String>) -> EvalFixtureResult {
        EvalFixtureResult {
            fixture: name.to_string(),
            suite: Some("suite".to_string()),
            passed: true,
            total_comments: 1,
            required_matches: 1,
            required_total: 1,
            benchmark_metrics: None,
            suite_thresholds: None,
            difficulty: None,
            metadata: None,
            rule_metrics: vec![],
            rule_summary: None,
            warnings,
            failures: vec![],
        }
    }

    #[test]
    fn build_named_breakdown_comparisons_intersects_current_and_baseline() {
        let current = HashMap::from([
            ("bug".to_string(), metrics(0.7, 0.72, 2)),
            ("security".to_string(), metrics(0.9, 0.93, 3)),
        ]);
        let baseline = HashMap::from([
            ("security".to_string(), metrics(0.95, 0.96, 3)),
            ("style".to_string(), metrics(0.8, 0.81, 1)),
        ]);

        let comparisons = build_named_breakdown_comparisons(&current, Some(&baseline));

        // Only "security" appears in both maps, so it is the sole comparison.
        assert_eq!(comparisons.len(), 1);
        assert_eq!(comparisons[0].name, "security");
        assert!((comparisons[0].micro_f1_delta + 0.05).abs() < f32::EPSILON);
    }

    #[test]
    fn build_verification_health_counts_fail_open_signals() {
        let results = vec![
            fixture_with_warnings(
                "suite/a",
                vec![
                    "verification fail-open kept 1 comment(s) after verifier request error: boom"
                        .to_string(),
                    "verification fail-open kept 1 comment(s) after unparseable verifier output"
                        .to_string(),
                ],
            ),
            // A clean fixture must not count toward fixtures_with_warnings.
            fixture_with_warnings("suite/b", vec![]),
        ];

        let health = build_verification_health(&results).unwrap();
        assert_eq!(health.warnings_total, 2);
        assert_eq!(health.fixtures_with_warnings, 1);
        // Both warnings carry the fail-open prefix; categories overlap.
        assert_eq!(health.fail_open_warning_count, 2);
        assert_eq!(health.parse_failure_count, 1);
        assert_eq!(health.request_failure_count, 1);
    }
}

src/commands/eval/report/build.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use super::super::metrics::{
2-
aggregate_rule_metrics, build_benchmark_breakdowns, build_overall_benchmark_summary,
3-
build_suite_results, collect_suite_threshold_failures, summarize_rule_metrics,
2+
aggregate_rule_metrics, build_benchmark_breakdowns, build_named_breakdown_comparisons,
3+
build_overall_benchmark_summary, build_suite_comparisons, build_suite_results,
4+
build_verification_health, collect_suite_threshold_failures, summarize_rule_metrics,
45
};
56
use super::super::thresholds::{evaluate_eval_thresholds, EvalThresholdOptions};
67
use super::super::{EvalFixtureResult, EvalReport, EvalRunMetadata};
@@ -28,6 +29,16 @@ pub(in super::super) fn build_eval_report(
2829
let benchmark_summary = build_overall_benchmark_summary(&results);
2930
let suite_results = build_suite_results(&results);
3031
let breakdowns = build_benchmark_breakdowns(&results);
32+
let suite_comparisons = build_suite_comparisons(&suite_results, baseline);
33+
let category_comparisons = build_named_breakdown_comparisons(
34+
&breakdowns.by_category,
35+
baseline.map(|report| &report.benchmark_by_category),
36+
);
37+
let language_comparisons = build_named_breakdown_comparisons(
38+
&breakdowns.by_language,
39+
baseline.map(|report| &report.benchmark_by_language),
40+
);
41+
let verification_health = build_verification_health(&results);
3142

3243
let mut report = EvalReport {
3344
run,
@@ -41,6 +52,10 @@ pub(in super::super) fn build_eval_report(
4152
benchmark_by_category: breakdowns.by_category,
4253
benchmark_by_language: breakdowns.by_language,
4354
benchmark_by_difficulty: breakdowns.by_difficulty,
55+
suite_comparisons,
56+
category_comparisons,
57+
language_comparisons,
58+
verification_health,
4459
warnings,
4560
threshold_failures: Vec::new(),
4661
results,

src/commands/eval/report/output.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,59 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
187187
}
188188
}
189189

190+
if !report.suite_comparisons.is_empty() {
191+
println!("Baseline suite deltas:");
192+
for comparison in &report.suite_comparisons {
193+
println!(
194+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
195+
comparison.name,
196+
comparison.micro_f1_delta * 100.0,
197+
comparison.weighted_score_delta * 100.0,
198+
comparison.baseline_micro_f1 * 100.0,
199+
comparison.current_micro_f1 * 100.0
200+
);
201+
}
202+
}
203+
204+
if !report.category_comparisons.is_empty() {
205+
println!("Baseline category deltas:");
206+
for comparison in &report.category_comparisons {
207+
println!(
208+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
209+
comparison.name,
210+
comparison.micro_f1_delta * 100.0,
211+
comparison.weighted_score_delta * 100.0,
212+
comparison.baseline_micro_f1 * 100.0,
213+
comparison.current_micro_f1 * 100.0
214+
);
215+
}
216+
}
217+
218+
if !report.language_comparisons.is_empty() {
219+
println!("Baseline language deltas:");
220+
for comparison in &report.language_comparisons {
221+
println!(
222+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
223+
comparison.name,
224+
comparison.micro_f1_delta * 100.0,
225+
comparison.weighted_score_delta * 100.0,
226+
comparison.baseline_micro_f1 * 100.0,
227+
comparison.current_micro_f1 * 100.0
228+
);
229+
}
230+
}
231+
232+
if let Some(verification_health) = report.verification_health.as_ref() {
233+
println!(
234+
"Verification health: warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
235+
verification_health.warnings_total,
236+
verification_health.fixtures_with_warnings,
237+
verification_health.fail_open_warning_count,
238+
verification_health.parse_failure_count,
239+
verification_health.request_failure_count
240+
);
241+
}
242+
190243
for warning in &report.warnings {
191244
println!("Warning: {}", warning);
192245
}

0 commit comments

Comments
 (0)