feat(eval): gate low verification health

haasonsaas · haasonsaas · commit 5224c3274769 · 2026-03-14T13:41:56.000-07:00
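Track verified comment checks (verified count, total count, and verified percentage) in eval reports and quality trends, add a configurable `--min-verification-health` threshold, and enforce a minimum verification health of 80% (`--min-verification-health 0.80`) in the eval workflow.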
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -74,6 +74,7 @@ jobs:
             --baseline /tmp/eval-baseline.json \
             --max-micro-f1-drop 0.20 \
             --min-micro-f1 0.20 \
+            --min-verification-health 0.80 \
             --min-rule-f1 sec.shell.injection=0.10 \
             --min-rule-f1 reliability.unwrap_panic=0.10 \
             --max-rule-f1-drop sec.shell.injection=0.25 \
diff --git a/src/commands/eval/command/batch.rs b/src/commands/eval/command/batch.rs
@@ -319,6 +319,7 @@ mod tests {
             max_language_f1_drop: None,
             min_micro_f1: None,
             min_macro_f1: None,
+            min_verification_health: None,
             min_rule_f1: vec![],
             max_rule_f1_drop: vec![],
             matrix_models: vec![],
diff --git a/src/commands/eval/command/fixtures.rs b/src/commands/eval/command/fixtures.rs
@@ -218,6 +218,7 @@ mod tests {
                 max_language_f1_drop: None,
                 min_micro_f1: None,
                 min_macro_f1: None,
+                min_verification_health: None,
                 min_rule_f1: Vec::new(),
                 max_rule_f1_drop: Vec::new(),
                 matrix_models: Vec::new(),
@@ -260,6 +261,7 @@ mod tests {
                 max_language_f1_drop: None,
                 min_micro_f1: None,
                 min_macro_f1: None,
+                min_verification_health: None,
                 min_rule_f1: Vec::new(),
                 max_rule_f1_drop: Vec::new(),
                 matrix_models: Vec::new(),
diff --git a/src/commands/eval/command/options.rs b/src/commands/eval/command/options.rs
@@ -32,6 +32,7 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
             max_language_f1_drop: options.max_language_f1_drop,
             min_micro_f1: options.min_micro_f1,
             min_macro_f1: options.min_macro_f1,
+            min_verification_health: options.min_verification_health,
             min_rule_f1: min_rule_thresholds,
             max_rule_f1_drop: max_rule_drop_thresholds,
         },
diff --git a/src/commands/eval/metrics/comparisons.rs b/src/commands/eval/metrics/comparisons.rs
@@ -61,22 +61,60 @@ pub(in super::super) fn build_verification_health(
 ) -> Option<EvalVerificationHealth> {
     let warnings_total = results
         .iter()
-        .map(|result| result.warnings.len())
+        .map(|result| {
+            result
+                .warnings
+                .iter()
+                .filter(|warning| is_verification_warning(warning))
+                .count()
+        })
         .sum::<usize>();
-    if warnings_total == 0 {
-        return None;
-    }
 
     let mut health = EvalVerificationHealth {
         warnings_total,
         fixtures_with_warnings: results
             .iter()
-            .filter(|result| !result.warnings.is_empty())
+            .filter(|result| {
+                result
+                    .warnings
+                    .iter()
+                    .any(|warning| is_verification_warning(warning))
+            })
             .count(),
         ..Default::default()
     };
 
+    let mut observed_verification = false;
+    for result in results {
+        if let Some(report) = result.verification_report.as_ref() {
+            observed_verification = true;
+            for judge in &report.judges {
+                health.total_checks += judge.total_comments;
+                health.verified_checks += judge.passed_comments + judge.filtered_comments;
+            }
+        } else if result.total_comments > 0
+            && result
+                .warnings
+                .iter()
+                .any(|warning| is_verification_warning(warning))
+        {
+            observed_verification = true;
+            health.total_checks += result.total_comments;
+        }
+    }
+
+    if health.total_checks > 0 {
+        health.verified_pct = health.verified_checks as f32 / health.total_checks as f32;
+    }
+
+    if !observed_verification && warnings_total == 0 {
+        return None;
+    }
+
     for warning in results.iter().flat_map(|result| &result.warnings) {
+        if !is_verification_warning(warning) {
+            continue;
+        }
         let lower = warning.to_ascii_lowercase();
         if lower.contains("verification fail-open kept") {
             health.fail_open_warning_count += 1;
@@ -92,6 +130,11 @@ pub(in super::super) fn build_verification_health(
     Some(health)
 }
 
+fn is_verification_warning(warning: &str) -> bool {
+    let lower = warning.to_ascii_lowercase();
+    lower.contains("verification") || lower.contains("verifier")
+}
+
 fn build_comparison(
     name: String,
     current: &BenchmarkAggregateMetrics,
@@ -112,6 +155,7 @@ fn build_comparison(
 
 #[cfg(test)]
 mod tests {
+    use crate::commands::eval::{EvalVerificationJudgeReport, EvalVerificationReport};
     use crate::core::eval_benchmarks::AggregateMetrics;
 
     use super::*;
@@ -196,10 +240,58 @@ mod tests {
         ];
 
         let health = build_verification_health(&results).unwrap();
+        assert_eq!(health.verified_checks, 0);
+        assert_eq!(health.total_checks, 1);
+        assert_eq!(health.verified_pct, 0.0);
         assert_eq!(health.warnings_total, 2);
         assert_eq!(health.fixtures_with_warnings, 1);
         assert_eq!(health.fail_open_warning_count, 2);
         assert_eq!(health.parse_failure_count, 1);
         assert_eq!(health.request_failure_count, 1);
     }
+
+    #[test]
+    fn build_verification_health_uses_judge_reports_without_warnings() {
+        let results = vec![EvalFixtureResult {
+            fixture: "suite/a".to_string(),
+            suite: Some("suite".to_string()),
+            passed: true,
+            total_comments: 5,
+            required_matches: 1,
+            required_total: 1,
+            benchmark_metrics: None,
+            suite_thresholds: None,
+            difficulty: None,
+            metadata: None,
+            rule_metrics: vec![],
+            rule_summary: None,
+            warnings: vec![],
+            verification_report: Some(EvalVerificationReport {
+                consensus_mode: "majority".to_string(),
+                required_votes: 1,
+                judge_count: 1,
+                judges: vec![EvalVerificationJudgeReport {
+                    model: "judge".to_string(),
+                    total_comments: 5,
+                    passed_comments: 3,
+                    filtered_comments: 1,
+                    abstained_comments: 1,
+                    warnings: vec![],
+                }],
+            }),
+            agent_activity: None,
+            reproduction_summary: None,
+            artifact_path: None,
+            failures: vec![],
+            dag_traces: vec![],
+        }];
+
+        let health = build_verification_health(&results).unwrap();
+
+        assert_eq!(health.verified_checks, 4);
+        assert_eq!(health.total_checks, 5);
+        assert!((health.verified_pct - 0.8).abs() < f32::EPSILON);
+        assert_eq!(health.warnings_total, 0);
+        assert_eq!(health.fixtures_with_warnings, 0);
+    }
 }
diff --git a/src/commands/eval/report/output.rs b/src/commands/eval/report/output.rs
@@ -295,7 +295,10 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
 
     if let Some(verification_health) = report.verification_health.as_ref() {
         println!(
-            "Verification health: warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
+            "Verification health: {:.0}% ({}/{}) warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
+            verification_health.verified_pct * 100.0,
+            verification_health.verified_checks,
+            verification_health.total_checks,
             verification_health.warnings_total,
             verification_health.fixtures_with_warnings,
             verification_health.fail_open_warning_count,
diff --git a/src/commands/eval/report/trend.rs b/src/commands/eval/report/trend.rs
@@ -71,6 +71,9 @@ fn trend_entry_for_report(report: &EvalReport) -> Option<TrendEntry> {
             .map(|health| health.parse_failure_count),
         verification_request_failure_count: verification_health
             .map(|health| health.request_failure_count),
+        verification_verified_checks: verification_health.map(|health| health.verified_checks),
+        verification_total_checks: verification_health.map(|health| health.total_checks),
+        verification_verified_pct: verification_health.map(|health| health.verified_pct),
     })
 }
 
@@ -170,6 +173,9 @@ mod tests {
             category_comparisons: vec![],
             language_comparisons: vec![],
             verification_health: Some(EvalVerificationHealth {
+                verified_checks: 8,
+                total_checks: 10,
+                verified_pct: 0.8,
                 warnings_total: 2,
                 fixtures_with_warnings: 1,
                 fail_open_warning_count: 2,
@@ -233,5 +239,23 @@ mod tests {
                 .unwrap_or_default(),
             1
         );
+        assert_eq!(
+            trend.entries[0]
+                .verification_verified_checks
+                .unwrap_or_default(),
+            8
+        );
+        assert_eq!(
+            trend.entries[0]
+                .verification_total_checks
+                .unwrap_or_default(),
+            10
+        );
+        assert_eq!(
+            trend.entries[0]
+                .verification_verified_pct
+                .unwrap_or_default(),
+            0.8
+        );
     }
 }
diff --git a/src/commands/eval/thresholds.rs b/src/commands/eval/thresholds.rs
@@ -11,6 +11,7 @@ pub(super) struct EvalThresholdOptions {
     pub(super) max_language_f1_drop: Option<f32>,
     pub(super) min_micro_f1: Option<f32>,
     pub(super) min_macro_f1: Option<f32>,
+    pub(super) min_verification_health: Option<f32>,
     pub(super) min_rule_f1: Vec<EvalRuleThreshold>,
     pub(super) max_rule_f1_drop: Vec<EvalRuleThreshold>,
 }
diff --git a/src/commands/eval/thresholds/evaluation/run.rs b/src/commands/eval/thresholds/evaluation/run.rs
@@ -25,6 +25,16 @@ pub(in super::super::super) fn evaluate_eval_thresholds(
         &current_by_rule,
         options,
     );
+    if let Some(threshold) = options.min_verification_health {
+        if let Some(health) = current.verification_health.as_ref() {
+            if health.total_checks > 0 && health.verified_pct < threshold {
+                failures.push(format!(
+                    "verification health {:.3} fell below minimum {:.3} ({}/{})",
+                    health.verified_pct, threshold, health.verified_checks, health.total_checks
+                ));
+            }
+        }
+    }
     failures.extend(check_drop_thresholds(
         current,
         current_micro_f1,
@@ -79,6 +89,7 @@ mod tests {
             max_language_f1_drop: None,
             min_micro_f1: None,
             min_macro_f1: None,
+            min_verification_health: None,
             min_rule_f1: vec![],
             max_rule_f1_drop: vec![],
         };
@@ -160,6 +171,7 @@ mod tests {
             max_language_f1_drop: None,
             min_micro_f1: None,
             min_macro_f1: None,
+            min_verification_health: None,
             min_rule_f1: vec![],
             max_rule_f1_drop: vec![EvalRuleThreshold {
                 rule_id: "sec.sql.injection".to_string(),
@@ -220,6 +232,7 @@ mod tests {
             max_language_f1_drop: None,
             min_micro_f1: None,
             min_macro_f1: None,
+            min_verification_health: None,
             min_rule_f1: vec![],
             max_rule_f1_drop: vec![],
         };
@@ -230,4 +243,51 @@ mod tests {
         assert!(failures[0].contains("category 'security'"));
         assert!(failures[0].contains("exceeded max 0.100"));
     }
+
+    #[test]
+    fn test_evaluate_eval_thresholds_checks_verification_health() {
+        let current = EvalReport {
+            run: Default::default(),
+            fixtures_total: 1,
+            fixtures_passed: 1,
+            fixtures_failed: 0,
+            rule_metrics: vec![],
+            rule_summary: Some(EvalRuleScoreSummary::default()),
+            benchmark_summary: None,
+            suite_results: vec![],
+            benchmark_by_category: Default::default(),
+            benchmark_by_language: Default::default(),
+            benchmark_by_difficulty: Default::default(),
+            suite_comparisons: vec![],
+            category_comparisons: vec![],
+            language_comparisons: vec![],
+            verification_health: Some(crate::commands::eval::EvalVerificationHealth {
+                verified_checks: 7,
+                total_checks: 10,
+                verified_pct: 0.7,
+                ..Default::default()
+            }),
+            warnings: vec![],
+            threshold_failures: vec![],
+            results: vec![],
+        };
+        let options = EvalThresholdOptions {
+            max_micro_f1_drop: None,
+            max_suite_f1_drop: None,
+            max_category_f1_drop: None,
+            max_language_f1_drop: None,
+            min_micro_f1: None,
+            min_macro_f1: None,
+            min_verification_health: Some(0.8),
+            min_rule_f1: vec![],
+            max_rule_f1_drop: vec![],
+        };
+
+        let failures = evaluate_eval_thresholds(&current, None, &options);
+
+        assert_eq!(failures.len(), 1);
+        assert!(failures[0].contains("verification health 0.700"));
+        assert!(failures[0].contains("minimum 0.800"));
+        assert!(failures[0].contains("7/10"));
+    }
 }
diff --git a/src/commands/eval/types/options.rs b/src/commands/eval/types/options.rs
@@ -9,6 +9,7 @@ pub struct EvalRunOptions {
     pub max_language_f1_drop: Option<f32>,
     pub min_micro_f1: Option<f32>,
     pub min_macro_f1: Option<f32>,
+    pub min_verification_health: Option<f32>,
     pub min_rule_f1: Vec<String>,
     pub max_rule_f1_drop: Vec<String>,
     pub matrix_models: Vec<String>,
diff --git a/src/commands/eval/types/report.rs b/src/commands/eval/types/report.rs
@@ -88,6 +88,12 @@ pub struct EvalNamedMetricComparison {
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct EvalVerificationHealth {
+    #[serde(default)]
+    pub verified_checks: usize,
+    #[serde(default)]
+    pub total_checks: usize,
+    #[serde(default)]
+    pub verified_pct: f32,
     #[serde(default)]
     pub warnings_total: usize,
     #[serde(default)]
diff --git a/src/core/eval_benchmarks.rs b/src/core/eval_benchmarks.rs
@@ -454,6 +454,12 @@ pub struct TrendEntry {
     pub verification_parse_failure_count: Option<usize>,
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub verification_request_failure_count: Option<usize>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub verification_verified_checks: Option<usize>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub verification_total_checks: Option<usize>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub verification_verified_pct: Option<f32>,
 }
 
 impl QualityTrend {
diff --git a/src/main.rs b/src/main.rs

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@ pub(super) struct EvalThresholdOptions {`
`11`	`11`	`pub(super) max_language_f1_drop: Option<f32>,`
`12`	`12`	`pub(super) min_micro_f1: Option<f32>,`
`13`	`13`	`pub(super) min_macro_f1: Option<f32>,`
	`14`	`+ pub(super) min_verification_health: Option<f32>,`
`14`	`15`	`pub(super) min_rule_f1: Vec<EvalRuleThreshold>,`
`15`	`16`	`pub(super) max_rule_f1_drop: Vec<EvalRuleThreshold>,`
`16`	`17`	`}`