Skip to content

Commit 11466cc

Browse files
committed
feat: add dimension-aware eval regression tracking
Compare eval runs against baselines by suite, category, and language, and persist those richer slices into trend history so regressions are easier to spot.

Made-with: Cursor
1 parent 62504f9 commit 11466cc

File tree

19 files changed

+642
-21
lines changed

19 files changed

+642
-21
lines changed

TODO.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,15 @@
1111
## Improvement Queue
1212

1313
- [ ] `src/commands/eval/`
14-
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
14+
- Add suite/category/language baseline comparisons so regressions are gated by dimension, not only whole-run totals.
15+
- Add model-matrix and repeat execution support so the same suite can be compared across frontier models and flake-checked.
16+
- Capture failed-run artifacts, including emitted comments, verifier warnings, and per-fixture mismatch details.
17+
- Reduce fixture brittleness with semantic/alias expectation matching instead of exact wording dependence.
18+
- Extend trend history with suite/category/language series plus verifier-health counters and model/provider labels.
1519
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
1620
- [ ] `src/commands/feedback_eval/`
17-
- Correlate feedback calibration with eval-suite category and rule-level performance.
21+
- Correlate feedback calibration with eval-suite category performance and rule-level precision/recall.
22+
- Surface high-confidence but frequently rejected categories/rules so review quality gaps are obvious.
1823

1924
## Immediate Queue
2025

eval/fixtures/README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,23 @@ diffscope \
4040
--trend-file eval/trends/openrouter-smoke.json
4141
```
4242

43+
Baseline-gated regression check:
44+
45+
```bash
46+
diffscope eval \
47+
--fixtures eval/fixtures \
48+
--suite review-depth-core \
49+
--baseline eval/baselines/review-depth-core.json \
50+
--max-micro-f1-drop 0.03 \
51+
--max-suite-f1-drop 0.05 \
52+
--max-category-f1-drop 0.05 \
53+
--max-language-f1-drop 0.05 \
54+
--output eval-report.json
55+
```
56+
4357
Notes:
4458
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
4559
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
4660
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
47-
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history.
61+
- Use `--baseline` together with the dimension drop flags when you want regressions to fail on shared suites, categories, or languages instead of only on the whole run.
62+
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.

src/commands/eval.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub use types::EvalRunOptions;
2020

2121
#[allow(unused_imports)]
2222
use types::{
23-
EvalExpectations, EvalFixture, EvalFixtureMetadata, EvalFixtureResult, EvalPattern, EvalReport,
24-
EvalRuleMetrics, EvalRuleScoreSummary, EvalRunFilters, EvalRunMetadata, EvalSuiteResult,
25-
LoadedEvalFixture,
23+
EvalExpectations, EvalFixture, EvalFixtureMetadata, EvalFixtureResult,
24+
EvalNamedMetricComparison, EvalPattern, EvalReport, EvalRuleMetrics, EvalRuleScoreSummary,
25+
EvalRunFilters, EvalRunMetadata, EvalSuiteResult, EvalVerificationHealth, LoadedEvalFixture,
2626
};

src/commands/eval/command/fixtures.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ mod tests {
195195
&EvalRunOptions {
196196
baseline_report: None,
197197
max_micro_f1_drop: None,
198+
max_suite_f1_drop: None,
199+
max_category_f1_drop: None,
200+
max_language_f1_drop: None,
198201
min_micro_f1: None,
199202
min_macro_f1: None,
200203
min_rule_f1: Vec::new(),
@@ -228,6 +231,9 @@ mod tests {
228231
&EvalRunOptions {
229232
baseline_report: None,
230233
max_micro_f1_drop: None,
234+
max_suite_f1_drop: None,
235+
max_category_f1_drop: None,
236+
max_language_f1_drop: None,
231237
min_micro_f1: None,
232238
min_macro_f1: None,
233239
min_rule_f1: Vec::new(),

src/commands/eval/command/options.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
2323
baseline,
2424
threshold_options: EvalThresholdOptions {
2525
max_micro_f1_drop: options.max_micro_f1_drop,
26+
max_suite_f1_drop: options.max_suite_f1_drop,
27+
max_category_f1_drop: options.max_category_f1_drop,
28+
max_language_f1_drop: options.max_language_f1_drop,
2629
min_micro_f1: options.min_micro_f1,
2730
min_macro_f1: options.min_macro_f1,
2831
min_rule_f1: min_rule_thresholds,

src/commands/eval/metrics.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
#[path = "metrics/comparisons.rs"]
2+
mod comparisons;
13
#[path = "metrics/rules.rs"]
24
mod rules;
35
#[path = "metrics/suites.rs"]
46
mod suites;
57

8+
pub(super) use comparisons::{
9+
build_named_breakdown_comparisons, build_suite_comparisons, build_verification_health,
10+
};
611
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
712
pub(super) use suites::{
813
build_benchmark_breakdowns, build_overall_benchmark_summary, build_suite_results,
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
use std::collections::HashMap;
2+
3+
use crate::core::eval_benchmarks::AggregateMetrics as BenchmarkAggregateMetrics;
4+
5+
use super::super::{
6+
EvalFixtureResult, EvalNamedMetricComparison, EvalReport, EvalSuiteResult,
7+
EvalVerificationHealth,
8+
};
9+
10+
pub(in super::super) fn build_suite_comparisons(
11+
current: &[EvalSuiteResult],
12+
baseline: Option<&EvalReport>,
13+
) -> Vec<EvalNamedMetricComparison> {
14+
let Some(baseline) = baseline else {
15+
return Vec::new();
16+
};
17+
18+
let baseline_by_suite = baseline
19+
.suite_results
20+
.iter()
21+
.map(|suite| (suite.suite.as_str(), &suite.aggregate))
22+
.collect::<HashMap<_, _>>();
23+
24+
let mut comparisons = current
25+
.iter()
26+
.filter_map(|suite| {
27+
let baseline_metrics = baseline_by_suite.get(suite.suite.as_str())?;
28+
Some(build_comparison(
29+
suite.suite.clone(),
30+
&suite.aggregate,
31+
baseline_metrics,
32+
))
33+
})
34+
.collect::<Vec<_>>();
35+
comparisons.sort_by(|left, right| left.name.cmp(&right.name));
36+
comparisons
37+
}
38+
39+
pub(in super::super) fn build_named_breakdown_comparisons(
40+
current: &HashMap<String, BenchmarkAggregateMetrics>,
41+
baseline: Option<&HashMap<String, BenchmarkAggregateMetrics>>,
42+
) -> Vec<EvalNamedMetricComparison> {
43+
let Some(baseline) = baseline else {
44+
return Vec::new();
45+
};
46+
47+
let mut comparisons = current
48+
.iter()
49+
.filter_map(|(name, current_metrics)| {
50+
baseline.get(name).map(|baseline_metrics| {
51+
build_comparison(name.clone(), current_metrics, baseline_metrics)
52+
})
53+
})
54+
.collect::<Vec<_>>();
55+
comparisons.sort_by(|left, right| left.name.cmp(&right.name));
56+
comparisons
57+
}
58+
59+
pub(in super::super) fn build_verification_health(
60+
results: &[EvalFixtureResult],
61+
) -> Option<EvalVerificationHealth> {
62+
let warnings_total = results
63+
.iter()
64+
.map(|result| result.warnings.len())
65+
.sum::<usize>();
66+
if warnings_total == 0 {
67+
return None;
68+
}
69+
70+
let mut health = EvalVerificationHealth {
71+
warnings_total,
72+
fixtures_with_warnings: results
73+
.iter()
74+
.filter(|result| !result.warnings.is_empty())
75+
.count(),
76+
..Default::default()
77+
};
78+
79+
for warning in results.iter().flat_map(|result| &result.warnings) {
80+
let lower = warning.to_ascii_lowercase();
81+
if lower.contains("verification fail-open kept") {
82+
health.fail_open_warning_count += 1;
83+
}
84+
if lower.contains("unparseable verifier output") {
85+
health.parse_failure_count += 1;
86+
}
87+
if lower.contains("verifier request error") {
88+
health.request_failure_count += 1;
89+
}
90+
}
91+
92+
Some(health)
93+
}
94+
95+
fn build_comparison(
96+
name: String,
97+
current: &BenchmarkAggregateMetrics,
98+
baseline: &BenchmarkAggregateMetrics,
99+
) -> EvalNamedMetricComparison {
100+
EvalNamedMetricComparison {
101+
name,
102+
current_micro_f1: current.micro_f1,
103+
baseline_micro_f1: baseline.micro_f1,
104+
micro_f1_delta: current.micro_f1 - baseline.micro_f1,
105+
current_weighted_score: current.weighted_score,
106+
baseline_weighted_score: baseline.weighted_score,
107+
weighted_score_delta: current.weighted_score - baseline.weighted_score,
108+
current_fixture_count: current.fixture_count,
109+
baseline_fixture_count: baseline.fixture_count,
110+
}
111+
}
112+
113+
#[cfg(test)]
mod tests {
    use crate::core::eval_benchmarks::AggregateMetrics;

    use super::*;

    /// Shorthand for aggregate metrics with only the compared fields set.
    fn metrics(micro_f1: f32, weighted_score: f32, fixture_count: usize) -> AggregateMetrics {
        AggregateMetrics {
            micro_f1,
            weighted_score,
            fixture_count,
            ..Default::default()
        }
    }

    /// Builds a minimal passing fixture result that carries only the given
    /// warnings, so tests avoid repeating the full struct literal.
    fn fixture_with_warnings(name: &str, warnings: Vec<String>) -> EvalFixtureResult {
        EvalFixtureResult {
            fixture: name.to_string(),
            suite: Some("suite".to_string()),
            passed: true,
            total_comments: 1,
            required_matches: 1,
            required_total: 1,
            benchmark_metrics: None,
            suite_thresholds: None,
            difficulty: None,
            metadata: None,
            rule_metrics: vec![],
            rule_summary: None,
            warnings,
            failures: vec![],
        }
    }

    #[test]
    fn build_named_breakdown_comparisons_intersects_current_and_baseline() {
        let current = HashMap::from([
            ("bug".to_string(), metrics(0.7, 0.72, 2)),
            ("security".to_string(), metrics(0.9, 0.93, 3)),
        ]);
        let baseline = HashMap::from([
            ("security".to_string(), metrics(0.95, 0.96, 3)),
            ("style".to_string(), metrics(0.8, 0.81, 1)),
        ]);

        let comparisons = build_named_breakdown_comparisons(&current, Some(&baseline));

        // Only "security" appears in both maps, so it is the sole comparison.
        assert_eq!(comparisons.len(), 1);
        assert_eq!(comparisons[0].name, "security");
        assert!((comparisons[0].micro_f1_delta + 0.05).abs() < f32::EPSILON);
    }

    #[test]
    fn build_verification_health_counts_fail_open_signals() {
        let results = vec![
            fixture_with_warnings(
                "suite/a",
                vec![
                    "verification fail-open kept 1 comment(s) after verifier request error: boom"
                        .to_string(),
                    "verification fail-open kept 1 comment(s) after unparseable verifier output"
                        .to_string(),
                ],
            ),
            // A clean fixture must not count toward fixtures_with_warnings.
            fixture_with_warnings("suite/b", vec![]),
        ];

        let health = build_verification_health(&results).unwrap();
        assert_eq!(health.warnings_total, 2);
        assert_eq!(health.fixtures_with_warnings, 1);
        // Both warnings carry the fail-open prefix; categories overlap.
        assert_eq!(health.fail_open_warning_count, 2);
        assert_eq!(health.parse_failure_count, 1);
        assert_eq!(health.request_failure_count, 1);
    }
}

src/commands/eval/report/build.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use super::super::metrics::{
2-
aggregate_rule_metrics, build_benchmark_breakdowns, build_overall_benchmark_summary,
3-
build_suite_results, collect_suite_threshold_failures, summarize_rule_metrics,
2+
aggregate_rule_metrics, build_benchmark_breakdowns, build_named_breakdown_comparisons,
3+
build_overall_benchmark_summary, build_suite_comparisons, build_suite_results,
4+
build_verification_health, collect_suite_threshold_failures, summarize_rule_metrics,
45
};
56
use super::super::thresholds::{evaluate_eval_thresholds, EvalThresholdOptions};
67
use super::super::{EvalFixtureResult, EvalReport, EvalRunMetadata};
@@ -28,6 +29,16 @@ pub(in super::super) fn build_eval_report(
2829
let benchmark_summary = build_overall_benchmark_summary(&results);
2930
let suite_results = build_suite_results(&results);
3031
let breakdowns = build_benchmark_breakdowns(&results);
32+
let suite_comparisons = build_suite_comparisons(&suite_results, baseline);
33+
let category_comparisons = build_named_breakdown_comparisons(
34+
&breakdowns.by_category,
35+
baseline.map(|report| &report.benchmark_by_category),
36+
);
37+
let language_comparisons = build_named_breakdown_comparisons(
38+
&breakdowns.by_language,
39+
baseline.map(|report| &report.benchmark_by_language),
40+
);
41+
let verification_health = build_verification_health(&results);
3142

3243
let mut report = EvalReport {
3344
run,
@@ -41,6 +52,10 @@ pub(in super::super) fn build_eval_report(
4152
benchmark_by_category: breakdowns.by_category,
4253
benchmark_by_language: breakdowns.by_language,
4354
benchmark_by_difficulty: breakdowns.by_difficulty,
55+
suite_comparisons,
56+
category_comparisons,
57+
language_comparisons,
58+
verification_health,
4459
warnings,
4560
threshold_failures: Vec::new(),
4661
results,

src/commands/eval/report/output.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,59 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
187187
}
188188
}
189189

190+
if !report.suite_comparisons.is_empty() {
191+
println!("Baseline suite deltas:");
192+
for comparison in &report.suite_comparisons {
193+
println!(
194+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
195+
comparison.name,
196+
comparison.micro_f1_delta * 100.0,
197+
comparison.weighted_score_delta * 100.0,
198+
comparison.baseline_micro_f1 * 100.0,
199+
comparison.current_micro_f1 * 100.0
200+
);
201+
}
202+
}
203+
204+
if !report.category_comparisons.is_empty() {
205+
println!("Baseline category deltas:");
206+
for comparison in &report.category_comparisons {
207+
println!(
208+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
209+
comparison.name,
210+
comparison.micro_f1_delta * 100.0,
211+
comparison.weighted_score_delta * 100.0,
212+
comparison.baseline_micro_f1 * 100.0,
213+
comparison.current_micro_f1 * 100.0
214+
);
215+
}
216+
}
217+
218+
if !report.language_comparisons.is_empty() {
219+
println!("Baseline language deltas:");
220+
for comparison in &report.language_comparisons {
221+
println!(
222+
" - {}: micro F1 {:+.0}% weighted {:+.0}% (baseline {:.0}% -> current {:.0}%)",
223+
comparison.name,
224+
comparison.micro_f1_delta * 100.0,
225+
comparison.weighted_score_delta * 100.0,
226+
comparison.baseline_micro_f1 * 100.0,
227+
comparison.current_micro_f1 * 100.0
228+
);
229+
}
230+
}
231+
232+
if let Some(verification_health) = report.verification_health.as_ref() {
233+
println!(
234+
"Verification health: warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
235+
verification_health.warnings_total,
236+
verification_health.fixtures_with_warnings,
237+
verification_health.fail_open_warning_count,
238+
verification_health.parse_failure_count,
239+
verification_health.request_failure_count
240+
);
241+
}
242+
190243
for warning in &report.warnings {
191244
println!("Warning: {}", warning);
192245
}

0 commit comments

Comments
 (0)