Skip to content

Commit 62504f9

Browse files
committed
feat: persist eval quality trend history
Add benchmark-summary and trend-file support so labeled eval runs can accumulate into reusable QualityTrend histories for live provider sweeps.

Made-with: Cursor
1 parent 3ffc8e7 commit 62504f9

File tree

16 files changed

+275
-6
lines changed

16 files changed

+275
-6
lines changed

TODO.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
## Improvement Queue
1212

1313
- [ ] `src/commands/eval/`
14-
- Persist labeled eval runs into `QualityTrend` JSON so live provider sweeps can be trended over time.
1514
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
1615
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
1716
- [ ] `src/commands/feedback_eval/`

eval/fixtures/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ diffscope eval \
2020
--suite review-depth-core \
2121
--max-fixtures 3 \
2222
--label smoke \
23+
--trend-file eval/trends/review-depth-core.json \
2324
--output eval-report.json
2425
```
2526

@@ -35,10 +36,12 @@ diffscope \
3536
--fixtures eval/fixtures \
3637
--suite review-depth-core \
3738
--max-fixtures 3 \
38-
--label openrouter-smoke
39+
--label openrouter-smoke \
40+
--trend-file eval/trends/openrouter-smoke.json
3941
```
4042

4143
Notes:
4244
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
4345
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
4446
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
47+
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history.

src/commands/eval/command.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ fn build_eval_run_metadata(
6464
max_fixtures: options.max_fixtures,
6565
},
6666
verification_fail_open: config.verification_fail_open,
67+
trend_file: options
68+
.trend_file
69+
.as_ref()
70+
.map(|path| path.display().to_string()),
6771
}
6872
}
6973

src/commands/eval/command/fixtures.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ mod tests {
205205
fixture_name_filters: vec!["shell".to_string()],
206206
max_fixtures: None,
207207
label: None,
208+
trend_file: None,
208209
},
209210
);
210211

@@ -237,6 +238,7 @@ mod tests {
237238
fixture_name_filters: vec![],
238239
max_fixtures: Some(1),
239240
label: None,
241+
trend_file: None,
240242
},
241243
);
242244

src/commands/eval/command/options.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use super::super::{EvalReport, EvalRunOptions};
77
pub(super) struct PreparedEvalOptions {
88
pub(super) baseline: Option<EvalReport>,
99
pub(super) threshold_options: EvalThresholdOptions,
10+
pub(super) trend_path: Option<std::path::PathBuf>,
1011
}
1112

1213
pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedEvalOptions> {
@@ -27,5 +28,6 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
2728
min_rule_f1: min_rule_thresholds,
2829
max_rule_f1_drop: max_rule_drop_thresholds,
2930
},
31+
trend_path: options.trend_file.clone(),
3032
})
3133
}

src/commands/eval/command/report.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ use anyhow::Result;
22
use std::path::Path;
33

44
use super::super::report::{
5-
build_eval_report, evaluation_failure_message, print_eval_report, write_eval_report,
5+
build_eval_report, evaluation_failure_message, print_eval_report, update_eval_quality_trend,
6+
write_eval_report,
67
};
78
use super::super::{EvalFixtureResult, EvalRunMetadata};
89
use super::options::PreparedEvalOptions;
@@ -24,6 +25,9 @@ pub(super) async fn emit_eval_report(
2425
if let Some(path) = output_path {
2526
write_eval_report(&report, path).await?;
2627
}
28+
if let Some(path) = prepared_options.trend_path.as_deref() {
29+
update_eval_quality_trend(&report, path).await?;
30+
}
2731

2832
if let Some(message) = evaluation_failure_message(&report) {
2933
anyhow::bail!("{}", message);

src/commands/eval/metrics.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ mod suites;
55

66
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
77
pub(super) use suites::{
8-
build_benchmark_breakdowns, build_suite_results, collect_suite_threshold_failures,
8+
build_benchmark_breakdowns, build_overall_benchmark_summary, build_suite_results,
9+
collect_suite_threshold_failures,
910
};

src/commands/eval/metrics/suites.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,29 @@ pub(in super::super) struct EvalBenchmarkBreakdowns {
1313
pub(in super::super) by_difficulty: HashMap<String, BenchmarkAggregateMetrics>,
1414
}
1515

16+
pub(in super::super) fn build_overall_benchmark_summary(
17+
results: &[EvalFixtureResult],
18+
) -> Option<BenchmarkAggregateMetrics> {
19+
let benchmark_results = collect_weighted_benchmark_results(results);
20+
if benchmark_results.is_empty() {
21+
return None;
22+
}
23+
24+
let fixture_results = benchmark_results
25+
.iter()
26+
.map(|(result, _)| *result)
27+
.collect::<Vec<_>>();
28+
let weights = benchmark_results
29+
.iter()
30+
.map(|(_, weight)| *weight)
31+
.collect::<Vec<_>>();
32+
33+
Some(BenchmarkAggregateMetrics::compute(
34+
&fixture_results,
35+
Some(&weights),
36+
))
37+
}
38+
1639
pub(in super::super) fn build_suite_results(results: &[EvalFixtureResult]) -> Vec<EvalSuiteResult> {
1740
let mut grouped: HashMap<String, Vec<&EvalFixtureResult>> = HashMap::new();
1841
for result in results {
@@ -169,6 +192,24 @@ fn difficulty_label(difficulty: &Difficulty) -> &'static str {
169192
}
170193
}
171194

195+
/// Pair each fixture's benchmark metrics with its difficulty weight.
///
/// Fixtures without benchmark metrics are skipped entirely; fixtures without
/// an explicit difficulty contribute with a neutral weight of 1.0.
fn collect_weighted_benchmark_results(
    results: &[EvalFixtureResult],
) -> Vec<(&crate::core::eval_benchmarks::FixtureResult, f32)> {
    let mut weighted = Vec::new();
    for result in results {
        let metrics = match result.benchmark_metrics.as_ref() {
            Some(metrics) => metrics,
            None => continue,
        };
        let weight = match result.difficulty.as_ref() {
            Some(difficulty) => difficulty.weight(),
            None => 1.0,
        };
        weighted.push((metrics, weight));
    }
    weighted
}
212+
172213
#[cfg(test)]
173214
mod tests {
174215
use super::*;
@@ -290,4 +331,47 @@ mod tests {
290331
Some(1)
291332
);
292333
}
334+
335+
#[test]
336+
fn test_build_overall_benchmark_summary_aggregates_fixture_metrics() {
337+
let results = vec![
338+
EvalFixtureResult {
339+
fixture: "suite/a".to_string(),
340+
suite: Some("suite".to_string()),
341+
passed: true,
342+
total_comments: 1,
343+
required_matches: 1,
344+
required_total: 1,
345+
benchmark_metrics: Some(FixtureResult::compute("suite/a", 1, 0, 1, 0, 0)),
346+
suite_thresholds: None,
347+
difficulty: Some(Difficulty::Easy),
348+
metadata: None,
349+
rule_metrics: vec![],
350+
rule_summary: None,
351+
warnings: vec![],
352+
failures: vec![],
353+
},
354+
EvalFixtureResult {
355+
fixture: "suite/b".to_string(),
356+
suite: Some("suite".to_string()),
357+
passed: false,
358+
total_comments: 1,
359+
required_matches: 0,
360+
required_total: 1,
361+
benchmark_metrics: Some(FixtureResult::compute("suite/b", 1, 0, 0, 0, 1)),
362+
suite_thresholds: None,
363+
difficulty: Some(Difficulty::Hard),
364+
metadata: None,
365+
rule_metrics: vec![],
366+
rule_summary: None,
367+
warnings: vec![],
368+
failures: vec!["missing".to_string()],
369+
},
370+
];
371+
372+
let summary = build_overall_benchmark_summary(&results).unwrap();
373+
374+
assert_eq!(summary.fixture_count, 2);
375+
assert!(summary.micro_f1 < 1.0);
376+
}
293377
}

src/commands/eval/report.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ mod build;
44
mod failure;
55
#[path = "report/output.rs"]
66
mod output;
7+
#[path = "report/trend.rs"]
8+
mod trend;
79

810
pub(super) use build::build_eval_report;
911
pub(super) use failure::evaluation_failure_message;
1012
pub(super) use output::{print_eval_report, write_eval_report};
13+
pub(super) use trend::update_eval_quality_trend;

src/commands/eval/report/build.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use super::super::metrics::{
2-
aggregate_rule_metrics, build_benchmark_breakdowns, build_suite_results,
3-
collect_suite_threshold_failures, summarize_rule_metrics,
2+
aggregate_rule_metrics, build_benchmark_breakdowns, build_overall_benchmark_summary,
3+
build_suite_results, collect_suite_threshold_failures, summarize_rule_metrics,
44
};
55
use super::super::thresholds::{evaluate_eval_thresholds, EvalThresholdOptions};
66
use super::super::{EvalFixtureResult, EvalReport, EvalRunMetadata};
@@ -25,6 +25,7 @@ pub(in super::super) fn build_eval_report(
2525
.collect::<Vec<_>>();
2626
let rule_metrics = aggregate_rule_metrics(&results);
2727
let rule_summary = summarize_rule_metrics(&rule_metrics);
28+
let benchmark_summary = build_overall_benchmark_summary(&results);
2829
let suite_results = build_suite_results(&results);
2930
let breakdowns = build_benchmark_breakdowns(&results);
3031

@@ -35,6 +36,7 @@ pub(in super::super) fn build_eval_report(
3536
fixtures_failed,
3637
rule_metrics,
3738
rule_summary,
39+
benchmark_summary,
3840
suite_results,
3941
benchmark_by_category: breakdowns.by_category,
4042
benchmark_by_language: breakdowns.by_language,

0 commit comments

Comments
 (0)