Skip to content

Commit 5224c32

Browse files
committed
feat(eval): gate low verification health
Track verified comment checks in eval reports and quality trends, add a configurable minimum verification-health threshold, and enforce 80% health in the eval workflow.
1 parent 6f71492 commit 5224c32

File tree

13 files changed

+212
-6
lines changed

13 files changed

+212
-6
lines changed

.github/workflows/eval.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ jobs:
7474
--baseline /tmp/eval-baseline.json \
7575
--max-micro-f1-drop 0.20 \
7676
--min-micro-f1 0.20 \
77+
--min-verification-health 0.80 \
7778
--min-rule-f1 sec.shell.injection=0.10 \
7879
--min-rule-f1 reliability.unwrap_panic=0.10 \
7980
--max-rule-f1-drop sec.shell.injection=0.25 \

src/commands/eval/command/batch.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -319,6 +319,7 @@ mod tests {
319319
max_language_f1_drop: None,
320320
min_micro_f1: None,
321321
min_macro_f1: None,
322+
min_verification_health: None,
322323
min_rule_f1: vec![],
323324
max_rule_f1_drop: vec![],
324325
matrix_models: vec![],

src/commands/eval/command/fixtures.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,7 @@ mod tests {
218218
max_language_f1_drop: None,
219219
min_micro_f1: None,
220220
min_macro_f1: None,
221+
min_verification_health: None,
221222
min_rule_f1: Vec::new(),
222223
max_rule_f1_drop: Vec::new(),
223224
matrix_models: Vec::new(),
@@ -260,6 +261,7 @@ mod tests {
260261
max_language_f1_drop: None,
261262
min_micro_f1: None,
262263
min_macro_f1: None,
264+
min_verification_health: None,
263265
min_rule_f1: Vec::new(),
264266
max_rule_f1_drop: Vec::new(),
265267
matrix_models: Vec::new(),

src/commands/eval/command/options.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
3232
max_language_f1_drop: options.max_language_f1_drop,
3333
min_micro_f1: options.min_micro_f1,
3434
min_macro_f1: options.min_macro_f1,
35+
min_verification_health: options.min_verification_health,
3536
min_rule_f1: min_rule_thresholds,
3637
max_rule_f1_drop: max_rule_drop_thresholds,
3738
},

src/commands/eval/metrics/comparisons.rs

Lines changed: 97 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -61,22 +61,60 @@ pub(in super::super) fn build_verification_health(
6161
) -> Option<EvalVerificationHealth> {
6262
let warnings_total = results
6363
.iter()
64-
.map(|result| result.warnings.len())
64+
.map(|result| {
65+
result
66+
.warnings
67+
.iter()
68+
.filter(|warning| is_verification_warning(warning))
69+
.count()
70+
})
6571
.sum::<usize>();
66-
if warnings_total == 0 {
67-
return None;
68-
}
6972

7073
let mut health = EvalVerificationHealth {
7174
warnings_total,
7275
fixtures_with_warnings: results
7376
.iter()
74-
.filter(|result| !result.warnings.is_empty())
77+
.filter(|result| {
78+
result
79+
.warnings
80+
.iter()
81+
.any(|warning| is_verification_warning(warning))
82+
})
7583
.count(),
7684
..Default::default()
7785
};
7886

87+
let mut observed_verification = false;
88+
for result in results {
89+
if let Some(report) = result.verification_report.as_ref() {
90+
observed_verification = true;
91+
for judge in &report.judges {
92+
health.total_checks += judge.total_comments;
93+
health.verified_checks += judge.passed_comments + judge.filtered_comments;
94+
}
95+
} else if result.total_comments > 0
96+
&& result
97+
.warnings
98+
.iter()
99+
.any(|warning| is_verification_warning(warning))
100+
{
101+
observed_verification = true;
102+
health.total_checks += result.total_comments;
103+
}
104+
}
105+
106+
if health.total_checks > 0 {
107+
health.verified_pct = health.verified_checks as f32 / health.total_checks as f32;
108+
}
109+
110+
if !observed_verification && warnings_total == 0 {
111+
return None;
112+
}
113+
79114
for warning in results.iter().flat_map(|result| &result.warnings) {
115+
if !is_verification_warning(warning) {
116+
continue;
117+
}
80118
let lower = warning.to_ascii_lowercase();
81119
if lower.contains("verification fail-open kept") {
82120
health.fail_open_warning_count += 1;
@@ -92,6 +130,11 @@ pub(in super::super) fn build_verification_health(
92130
Some(health)
93131
}
94132

133+
fn is_verification_warning(warning: &str) -> bool {
134+
let lower = warning.to_ascii_lowercase();
135+
lower.contains("verification") || lower.contains("verifier")
136+
}
137+
95138
fn build_comparison(
96139
name: String,
97140
current: &BenchmarkAggregateMetrics,
@@ -112,6 +155,7 @@ fn build_comparison(
112155

113156
#[cfg(test)]
114157
mod tests {
158+
use crate::commands::eval::{EvalVerificationJudgeReport, EvalVerificationReport};
115159
use crate::core::eval_benchmarks::AggregateMetrics;
116160

117161
use super::*;
@@ -196,10 +240,58 @@ mod tests {
196240
];
197241

198242
let health = build_verification_health(&results).unwrap();
243+
assert_eq!(health.verified_checks, 0);
244+
assert_eq!(health.total_checks, 1);
245+
assert_eq!(health.verified_pct, 0.0);
199246
assert_eq!(health.warnings_total, 2);
200247
assert_eq!(health.fixtures_with_warnings, 1);
201248
assert_eq!(health.fail_open_warning_count, 2);
202249
assert_eq!(health.parse_failure_count, 1);
203250
assert_eq!(health.request_failure_count, 1);
204251
}
252+
253+
#[test]
254+
fn build_verification_health_uses_judge_reports_without_warnings() {
255+
let results = vec![EvalFixtureResult {
256+
fixture: "suite/a".to_string(),
257+
suite: Some("suite".to_string()),
258+
passed: true,
259+
total_comments: 5,
260+
required_matches: 1,
261+
required_total: 1,
262+
benchmark_metrics: None,
263+
suite_thresholds: None,
264+
difficulty: None,
265+
metadata: None,
266+
rule_metrics: vec![],
267+
rule_summary: None,
268+
warnings: vec![],
269+
verification_report: Some(EvalVerificationReport {
270+
consensus_mode: "majority".to_string(),
271+
required_votes: 1,
272+
judge_count: 1,
273+
judges: vec![EvalVerificationJudgeReport {
274+
model: "judge".to_string(),
275+
total_comments: 5,
276+
passed_comments: 3,
277+
filtered_comments: 1,
278+
abstained_comments: 1,
279+
warnings: vec![],
280+
}],
281+
}),
282+
agent_activity: None,
283+
reproduction_summary: None,
284+
artifact_path: None,
285+
failures: vec![],
286+
dag_traces: vec![],
287+
}];
288+
289+
let health = build_verification_health(&results).unwrap();
290+
291+
assert_eq!(health.verified_checks, 4);
292+
assert_eq!(health.total_checks, 5);
293+
assert!((health.verified_pct - 0.8).abs() < f32::EPSILON);
294+
assert_eq!(health.warnings_total, 0);
295+
assert_eq!(health.fixtures_with_warnings, 0);
296+
}
205297
}

src/commands/eval/report/output.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -295,7 +295,10 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
295295

296296
if let Some(verification_health) = report.verification_health.as_ref() {
297297
println!(
298-
"Verification health: warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
298+
"Verification health: {:.0}% ({}/{}) warnings={} fixtures={} fail-open={} parse-failures={} request-failures={}",
299+
verification_health.verified_pct * 100.0,
300+
verification_health.verified_checks,
301+
verification_health.total_checks,
299302
verification_health.warnings_total,
300303
verification_health.fixtures_with_warnings,
301304
verification_health.fail_open_warning_count,

src/commands/eval/report/trend.rs

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,9 @@ fn trend_entry_for_report(report: &EvalReport) -> Option<TrendEntry> {
7171
.map(|health| health.parse_failure_count),
7272
verification_request_failure_count: verification_health
7373
.map(|health| health.request_failure_count),
74+
verification_verified_checks: verification_health.map(|health| health.verified_checks),
75+
verification_total_checks: verification_health.map(|health| health.total_checks),
76+
verification_verified_pct: verification_health.map(|health| health.verified_pct),
7477
})
7578
}
7679

@@ -170,6 +173,9 @@ mod tests {
170173
category_comparisons: vec![],
171174
language_comparisons: vec![],
172175
verification_health: Some(EvalVerificationHealth {
176+
verified_checks: 8,
177+
total_checks: 10,
178+
verified_pct: 0.8,
173179
warnings_total: 2,
174180
fixtures_with_warnings: 1,
175181
fail_open_warning_count: 2,
@@ -233,5 +239,23 @@ mod tests {
233239
.unwrap_or_default(),
234240
1
235241
);
242+
assert_eq!(
243+
trend.entries[0]
244+
.verification_verified_checks
245+
.unwrap_or_default(),
246+
8
247+
);
248+
assert_eq!(
249+
trend.entries[0]
250+
.verification_total_checks
251+
.unwrap_or_default(),
252+
10
253+
);
254+
assert_eq!(
255+
trend.entries[0]
256+
.verification_verified_pct
257+
.unwrap_or_default(),
258+
0.8
259+
);
236260
}
237261
}

src/commands/eval/thresholds.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ pub(super) struct EvalThresholdOptions {
1111
pub(super) max_language_f1_drop: Option<f32>,
1212
pub(super) min_micro_f1: Option<f32>,
1313
pub(super) min_macro_f1: Option<f32>,
14+
pub(super) min_verification_health: Option<f32>,
1415
pub(super) min_rule_f1: Vec<EvalRuleThreshold>,
1516
pub(super) max_rule_f1_drop: Vec<EvalRuleThreshold>,
1617
}

src/commands/eval/thresholds/evaluation/run.rs

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,16 @@ pub(in super::super::super) fn evaluate_eval_thresholds(
2525
&current_by_rule,
2626
options,
2727
);
28+
if let Some(threshold) = options.min_verification_health {
29+
if let Some(health) = current.verification_health.as_ref() {
30+
if health.total_checks > 0 && health.verified_pct < threshold {
31+
failures.push(format!(
32+
"verification health {:.3} fell below minimum {:.3} ({}/{})",
33+
health.verified_pct, threshold, health.verified_checks, health.total_checks
34+
));
35+
}
36+
}
37+
}
2838
failures.extend(check_drop_thresholds(
2939
current,
3040
current_micro_f1,
@@ -79,6 +89,7 @@ mod tests {
7989
max_language_f1_drop: None,
8090
min_micro_f1: None,
8191
min_macro_f1: None,
92+
min_verification_health: None,
8293
min_rule_f1: vec![],
8394
max_rule_f1_drop: vec![],
8495
};
@@ -160,6 +171,7 @@ mod tests {
160171
max_language_f1_drop: None,
161172
min_micro_f1: None,
162173
min_macro_f1: None,
174+
min_verification_health: None,
163175
min_rule_f1: vec![],
164176
max_rule_f1_drop: vec![EvalRuleThreshold {
165177
rule_id: "sec.sql.injection".to_string(),
@@ -220,6 +232,7 @@ mod tests {
220232
max_language_f1_drop: None,
221233
min_micro_f1: None,
222234
min_macro_f1: None,
235+
min_verification_health: None,
223236
min_rule_f1: vec![],
224237
max_rule_f1_drop: vec![],
225238
};
@@ -230,4 +243,51 @@ mod tests {
230243
assert!(failures[0].contains("category 'security'"));
231244
assert!(failures[0].contains("exceeded max 0.100"));
232245
}
246+
247+
#[test]
248+
fn test_evaluate_eval_thresholds_checks_verification_health() {
249+
let current = EvalReport {
250+
run: Default::default(),
251+
fixtures_total: 1,
252+
fixtures_passed: 1,
253+
fixtures_failed: 0,
254+
rule_metrics: vec![],
255+
rule_summary: Some(EvalRuleScoreSummary::default()),
256+
benchmark_summary: None,
257+
suite_results: vec![],
258+
benchmark_by_category: Default::default(),
259+
benchmark_by_language: Default::default(),
260+
benchmark_by_difficulty: Default::default(),
261+
suite_comparisons: vec![],
262+
category_comparisons: vec![],
263+
language_comparisons: vec![],
264+
verification_health: Some(crate::commands::eval::EvalVerificationHealth {
265+
verified_checks: 7,
266+
total_checks: 10,
267+
verified_pct: 0.7,
268+
..Default::default()
269+
}),
270+
warnings: vec![],
271+
threshold_failures: vec![],
272+
results: vec![],
273+
};
274+
let options = EvalThresholdOptions {
275+
max_micro_f1_drop: None,
276+
max_suite_f1_drop: None,
277+
max_category_f1_drop: None,
278+
max_language_f1_drop: None,
279+
min_micro_f1: None,
280+
min_macro_f1: None,
281+
min_verification_health: Some(0.8),
282+
min_rule_f1: vec![],
283+
max_rule_f1_drop: vec![],
284+
};
285+
286+
let failures = evaluate_eval_thresholds(&current, None, &options);
287+
288+
assert_eq!(failures.len(), 1);
289+
assert!(failures[0].contains("verification health 0.700"));
290+
assert!(failures[0].contains("minimum 0.800"));
291+
assert!(failures[0].contains("7/10"));
292+
}
233293
}

src/commands/eval/types/options.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ pub struct EvalRunOptions {
99
pub max_language_f1_drop: Option<f32>,
1010
pub min_micro_f1: Option<f32>,
1111
pub min_macro_f1: Option<f32>,
12+
pub min_verification_health: Option<f32>,
1213
pub min_rule_f1: Vec<String>,
1314
pub max_rule_f1_drop: Vec<String>,
1415
pub matrix_models: Vec<String>,

0 commit comments

Comments
 (0)