Skip to content

Commit 294c3c2

Browse files
committed
test(eval): harden comparison metric coverage
Add suite and verification-health regression tests so targeted mutation runs on comparisons.rs drop to a single equivalent miss.
1 parent 5224c32 commit 294c3c2

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed

src/commands/eval/metrics/comparisons.rs

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,29 @@ mod tests {
160160

161161
use super::*;
162162

163+
fn empty_report() -> EvalReport {
164+
EvalReport {
165+
run: Default::default(),
166+
fixtures_total: 0,
167+
fixtures_passed: 0,
168+
fixtures_failed: 0,
169+
rule_metrics: vec![],
170+
rule_summary: None,
171+
benchmark_summary: None,
172+
suite_results: vec![],
173+
benchmark_by_category: HashMap::new(),
174+
benchmark_by_language: HashMap::new(),
175+
benchmark_by_difficulty: HashMap::new(),
176+
suite_comparisons: vec![],
177+
category_comparisons: vec![],
178+
language_comparisons: vec![],
179+
verification_health: None,
180+
warnings: vec![],
181+
threshold_failures: vec![],
182+
results: vec![],
183+
}
184+
}
185+
163186
fn metrics(micro_f1: f32, weighted_score: f32, fixture_count: usize) -> AggregateMetrics {
164187
AggregateMetrics {
165188
micro_f1,
@@ -169,6 +192,38 @@ mod tests {
169192
}
170193
}
171194

195+
/// A suite present in both the current results and the baseline report
/// yields exactly one comparison carrying the metric deltas and both
/// fixture counts.
#[test]
fn build_suite_comparisons_intersects_current_and_baseline() {
    // Both sides share every field except the aggregate metrics.
    let suite_with = |aggregate: AggregateMetrics| EvalSuiteResult {
        suite: "review-depth-infra".to_string(),
        fixture_count: 2,
        aggregate,
        thresholds_enforced: false,
        threshold_pass: true,
        threshold_failures: vec![],
    };

    let current = vec![suite_with(metrics(0.8, 0.75, 2))];
    let baseline = EvalReport {
        suite_results: vec![suite_with(metrics(0.9, 0.85, 2))],
        ..empty_report()
    };

    let comparisons = build_suite_comparisons(&current, Some(&baseline));

    assert_eq!(comparisons.len(), 1);

    let only = &comparisons[0];
    assert_eq!(only.name, "review-depth-infra");

    // Current trails the baseline by 0.1 on both metrics, so each delta
    // plus 0.1 should cancel to (approximately) zero.
    let micro_f1_residual = (only.micro_f1_delta + 0.1).abs();
    let weighted_residual = (only.weighted_score_delta + 0.1).abs();
    assert!(micro_f1_residual < f32::EPSILON);
    assert!(weighted_residual < f32::EPSILON);

    assert_eq!(only.current_fixture_count, 2);
    assert_eq!(only.baseline_fixture_count, 2);
}
226+
172227
#[test]
173228
fn build_named_breakdown_comparisons_intersects_current_and_baseline() {
174229
let current = HashMap::from([
@@ -250,6 +305,108 @@ mod tests {
250305
assert_eq!(health.request_failure_count, 1);
251306
}
252307

308+
/// A fixture whose only warning is a reproduction-validator message, and
/// which has no verification report, must not produce a health summary.
#[test]
fn build_verification_health_returns_none_for_non_verification_warnings_only() {
    let fixtures = vec![EvalFixtureResult {
        // Identity and outcome.
        fixture: String::from("suite/a"),
        suite: Some(String::from("suite")),
        passed: true,
        total_comments: 1,
        required_matches: 1,
        required_total: 1,

        // The single warning under test.
        warnings: vec![String::from("reproduction validator warning")],
        verification_report: None,

        // Everything else is left empty.
        benchmark_metrics: None,
        suite_thresholds: None,
        difficulty: None,
        metadata: None,
        rule_summary: None,
        agent_activity: None,
        reproduction_summary: None,
        artifact_path: None,
        rule_metrics: Vec::new(),
        failures: Vec::new(),
        dag_traces: Vec::new(),
    }];

    let health = build_verification_health(&fixtures);

    assert!(health.is_none());
}
334+
335+
/// With no verification report attached, a warning string that mentions the
/// verifier is still expected to produce a health summary that counts the
/// warning, the fixture, one request failure, and one check.
#[test]
fn build_verification_health_detects_verifier_only_warning_text() {
    let fixtures = vec![EvalFixtureResult {
        // Identity and outcome.
        fixture: String::from("suite/a"),
        suite: Some(String::from("suite")),
        passed: true,
        total_comments: 1,
        required_matches: 1,
        required_total: 1,

        // The warning text is the only verification signal present.
        warnings: vec![String::from("verifier request error: timeout")],
        verification_report: None,

        // Everything else is left empty.
        benchmark_metrics: None,
        suite_thresholds: None,
        difficulty: None,
        metadata: None,
        rule_summary: None,
        agent_activity: None,
        reproduction_summary: None,
        artifact_path: None,
        rule_metrics: Vec::new(),
        failures: Vec::new(),
        dag_traces: Vec::new(),
    }];

    let summary = build_verification_health(&fixtures).unwrap();

    assert_eq!(summary.warnings_total, 1);
    assert_eq!(summary.fixtures_with_warnings, 1);
    assert_eq!(summary.request_failure_count, 1);
    assert_eq!(summary.total_checks, 1);
}
366+
367+
/// A verification report whose single judge saw zero comments should still
/// yield a health summary, with zero checks and a verified percentage of
/// exactly 0.0.
// NOTE(review): presumably this guards a zero-denominator case in the
// percentage calculation — confirm against `build_verification_health`.
#[test]
fn build_verification_health_keeps_zero_percent_when_no_checks_ran() {
    // One judge that reviewed nothing and raised no warnings.
    let idle_judge = EvalVerificationJudgeReport {
        model: String::from("judge"),
        total_comments: 0,
        passed_comments: 0,
        filtered_comments: 0,
        abstained_comments: 0,
        warnings: Vec::new(),
    };
    let report = EvalVerificationReport {
        consensus_mode: String::from("majority"),
        required_votes: 1,
        judge_count: 1,
        judges: vec![idle_judge],
    };

    let fixtures = vec![EvalFixtureResult {
        // Identity and outcome.
        fixture: String::from("suite/a"),
        suite: Some(String::from("suite")),
        passed: true,
        total_comments: 0,
        required_matches: 0,
        required_total: 0,

        // Verification ran, but had nothing to check.
        warnings: Vec::new(),
        verification_report: Some(report),

        // Everything else is left empty.
        benchmark_metrics: None,
        suite_thresholds: None,
        difficulty: None,
        metadata: None,
        rule_summary: None,
        agent_activity: None,
        reproduction_summary: None,
        artifact_path: None,
        rule_metrics: Vec::new(),
        failures: Vec::new(),
        dag_traces: Vec::new(),
    }];

    let summary = build_verification_health(&fixtures).unwrap();

    assert_eq!(summary.total_checks, 0);
    assert_eq!(summary.verified_checks, 0);
    assert_eq!(summary.verified_pct, 0.0);
}
409+
253410
#[test]
254411
fn build_verification_health_uses_judge_reports_without_warnings() {
255412
let results = vec![EvalFixtureResult {

0 commit comments

Comments
 (0)