Skip to content

Commit 62504f9

Browse files
committed
feat: persist eval quality trend history
Add benchmark-summary and trend-file support so labeled eval runs can accumulate into reusable QualityTrend histories for live provider sweeps.

Made-with: Cursor
1 parent 3ffc8e7 commit 62504f9

File tree

16 files changed

+275
-6
lines changed

16 files changed

+275
-6
lines changed

TODO.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
## Improvement Queue
1212

1313
- [ ] `src/commands/eval/`
14-
- Persist labeled eval runs into `QualityTrend` JSON so live provider sweeps can be trended over time.
1514
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
1615
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
1716
- [ ] `src/commands/feedback_eval/`

eval/fixtures/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ diffscope eval \
2020
--suite review-depth-core \
2121
--max-fixtures 3 \
2222
--label smoke \
23+
--trend-file eval/trends/review-depth-core.json \
2324
--output eval-report.json
2425
```
2526

@@ -35,10 +36,12 @@ diffscope \
3536
--fixtures eval/fixtures \
3637
--suite review-depth-core \
3738
--max-fixtures 3 \
38-
--label openrouter-smoke
39+
--label openrouter-smoke \
40+
--trend-file eval/trends/openrouter-smoke.json
3941
```
4042

4143
Notes:
4244
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
4345
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
4446
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
47+
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history.

src/commands/eval/command.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ fn build_eval_run_metadata(
6464
max_fixtures: options.max_fixtures,
6565
},
6666
verification_fail_open: config.verification_fail_open,
67+
trend_file: options
68+
.trend_file
69+
.as_ref()
70+
.map(|path| path.display().to_string()),
6771
}
6872
}
6973

src/commands/eval/command/fixtures.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ mod tests {
205205
fixture_name_filters: vec!["shell".to_string()],
206206
max_fixtures: None,
207207
label: None,
208+
trend_file: None,
208209
},
209210
);
210211

@@ -237,6 +238,7 @@ mod tests {
237238
fixture_name_filters: vec![],
238239
max_fixtures: Some(1),
239240
label: None,
241+
trend_file: None,
240242
},
241243
);
242244

src/commands/eval/command/options.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use super::super::{EvalReport, EvalRunOptions};
77
pub(super) struct PreparedEvalOptions {
88
pub(super) baseline: Option<EvalReport>,
99
pub(super) threshold_options: EvalThresholdOptions,
10+
pub(super) trend_path: Option<std::path::PathBuf>,
1011
}
1112

1213
pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedEvalOptions> {
@@ -27,5 +28,6 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
2728
min_rule_f1: min_rule_thresholds,
2829
max_rule_f1_drop: max_rule_drop_thresholds,
2930
},
31+
trend_path: options.trend_file.clone(),
3032
})
3133
}

src/commands/eval/command/report.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ use anyhow::Result;
22
use std::path::Path;
33

44
use super::super::report::{
5-
build_eval_report, evaluation_failure_message, print_eval_report, write_eval_report,
5+
build_eval_report, evaluation_failure_message, print_eval_report, update_eval_quality_trend,
6+
write_eval_report,
67
};
78
use super::super::{EvalFixtureResult, EvalRunMetadata};
89
use super::options::PreparedEvalOptions;
@@ -24,6 +25,9 @@ pub(super) async fn emit_eval_report(
2425
if let Some(path) = output_path {
2526
write_eval_report(&report, path).await?;
2627
}
28+
if let Some(path) = prepared_options.trend_path.as_deref() {
29+
update_eval_quality_trend(&report, path).await?;
30+
}
2731

2832
if let Some(message) = evaluation_failure_message(&report) {
2933
anyhow::bail!("{}", message);

src/commands/eval/metrics.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ mod suites;
55

66
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
77
pub(super) use suites::{
8-
build_benchmark_breakdowns, build_suite_results, collect_suite_threshold_failures,
8+
build_benchmark_breakdowns, build_overall_benchmark_summary, build_suite_results,
9+
collect_suite_threshold_failures,
910
};

src/commands/eval/metrics/suites.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,29 @@ pub(in super::super) struct EvalBenchmarkBreakdowns {
1313
pub(in super::super) by_difficulty: HashMap<String, BenchmarkAggregateMetrics>,
1414
}
1515

16+
pub(in super::super) fn build_overall_benchmark_summary(
17+
results: &[EvalFixtureResult],
18+
) -> Option<BenchmarkAggregateMetrics> {
19+
let benchmark_results = collect_weighted_benchmark_results(results);
20+
if benchmark_results.is_empty() {
21+
return None;
22+
}
23+
24+
let fixture_results = benchmark_results
25+
.iter()
26+
.map(|(result, _)| *result)
27+
.collect::<Vec<_>>();
28+
let weights = benchmark_results
29+
.iter()
30+
.map(|(_, weight)| *weight)
31+
.collect::<Vec<_>>();
32+
33+
Some(BenchmarkAggregateMetrics::compute(
34+
&fixture_results,
35+
Some(&weights),
36+
))
37+
}
38+
1639
pub(in super::super) fn build_suite_results(results: &[EvalFixtureResult]) -> Vec<EvalSuiteResult> {
1740
let mut grouped: HashMap<String, Vec<&EvalFixtureResult>> = HashMap::new();
1841
for result in results {
@@ -169,6 +192,24 @@ fn difficulty_label(difficulty: &Difficulty) -> &'static str {
169192
}
170193
}
171194

195+
/// Pair each fixture's benchmark metrics with its difficulty weight.
///
/// Fixtures without benchmark metrics are skipped entirely; fixtures without
/// an explicit difficulty contribute with a neutral weight of 1.0.
fn collect_weighted_benchmark_results(
    results: &[EvalFixtureResult],
) -> Vec<(&crate::core::eval_benchmarks::FixtureResult, f32)> {
    let mut weighted = Vec::new();
    for result in results {
        let metrics = match result.benchmark_metrics.as_ref() {
            Some(metrics) => metrics,
            None => continue,
        };
        let weight = match result.difficulty.as_ref() {
            Some(difficulty) => difficulty.weight(),
            None => 1.0,
        };
        weighted.push((metrics, weight));
    }
    weighted
}
212+
172213
#[cfg(test)]
173214
mod tests {
174215
use super::*;
@@ -290,4 +331,47 @@ mod tests {
290331
Some(1)
291332
);
292333
}
334+
335+
#[test]
336+
fn test_build_overall_benchmark_summary_aggregates_fixture_metrics() {
337+
let results = vec![
338+
EvalFixtureResult {
339+
fixture: "suite/a".to_string(),
340+
suite: Some("suite".to_string()),
341+
passed: true,
342+
total_comments: 1,
343+
required_matches: 1,
344+
required_total: 1,
345+
benchmark_metrics: Some(FixtureResult::compute("suite/a", 1, 0, 1, 0, 0)),
346+
suite_thresholds: None,
347+
difficulty: Some(Difficulty::Easy),
348+
metadata: None,
349+
rule_metrics: vec![],
350+
rule_summary: None,
351+
warnings: vec![],
352+
failures: vec![],
353+
},
354+
EvalFixtureResult {
355+
fixture: "suite/b".to_string(),
356+
suite: Some("suite".to_string()),
357+
passed: false,
358+
total_comments: 1,
359+
required_matches: 0,
360+
required_total: 1,
361+
benchmark_metrics: Some(FixtureResult::compute("suite/b", 1, 0, 0, 0, 1)),
362+
suite_thresholds: None,
363+
difficulty: Some(Difficulty::Hard),
364+
metadata: None,
365+
rule_metrics: vec![],
366+
rule_summary: None,
367+
warnings: vec![],
368+
failures: vec!["missing".to_string()],
369+
},
370+
];
371+
372+
let summary = build_overall_benchmark_summary(&results).unwrap();
373+
374+
assert_eq!(summary.fixture_count, 2);
375+
assert!(summary.micro_f1 < 1.0);
376+
}
293377
}

src/commands/eval/report.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ mod build;
44
mod failure;
55
#[path = "report/output.rs"]
66
mod output;
7+
#[path = "report/trend.rs"]
8+
mod trend;
79

810
pub(super) use build::build_eval_report;
911
pub(super) use failure::evaluation_failure_message;
1012
pub(super) use output::{print_eval_report, write_eval_report};
13+
pub(super) use trend::update_eval_quality_trend;

src/commands/eval/report/build.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use super::super::metrics::{
2-
aggregate_rule_metrics, build_benchmark_breakdowns, build_suite_results,
3-
collect_suite_threshold_failures, summarize_rule_metrics,
2+
aggregate_rule_metrics, build_benchmark_breakdowns, build_overall_benchmark_summary,
3+
build_suite_results, collect_suite_threshold_failures, summarize_rule_metrics,
44
};
55
use super::super::thresholds::{evaluate_eval_thresholds, EvalThresholdOptions};
66
use super::super::{EvalFixtureResult, EvalReport, EvalRunMetadata};
@@ -25,6 +25,7 @@ pub(in super::super) fn build_eval_report(
2525
.collect::<Vec<_>>();
2626
let rule_metrics = aggregate_rule_metrics(&results);
2727
let rule_summary = summarize_rule_metrics(&rule_metrics);
28+
let benchmark_summary = build_overall_benchmark_summary(&results);
2829
let suite_results = build_suite_results(&results);
2930
let breakdowns = build_benchmark_breakdowns(&results);
3031

@@ -35,6 +36,7 @@ pub(in super::super) fn build_eval_report(
3536
fixtures_failed,
3637
rule_metrics,
3738
rule_summary,
39+
benchmark_summary,
3840
suite_results,
3941
benchmark_by_category: breakdowns.by_category,
4042
benchmark_by_language: breakdowns.by_language,

0 commit comments

Comments
 (0)