Skip to content

Commit e0f78f0

Browse files
committed
refactor: split discussion, threshold, and report-build helpers
Break up the next set of mixed-responsibility command modules into smaller units so that follow-on changes are easier to isolate and verify. Made-with: Cursor
1 parent 1cc994f commit e0f78f0

12 files changed

Lines changed: 803 additions & 745 deletions

File tree

src/commands/eval/thresholds.rs

Lines changed: 6 additions & 235 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
use anyhow::Result;
2-
use std::collections::HashMap;
3-
4-
use super::{EvalReport, EvalRuleMetrics};
1+
#[path = "thresholds/evaluation.rs"]
2+
mod evaluation;
3+
#[path = "thresholds/parsing.rs"]
4+
mod parsing;
55

66
#[derive(Debug, Clone)]
77
pub(super) struct EvalThresholdOptions {
@@ -18,234 +18,5 @@ pub(super) struct EvalRuleThreshold {
1818
pub(super) value: f32,
1919
}
2020

21-
/// Parses per-rule threshold arguments of the form `rule_id=value`.
///
/// Each entry in `values` is split at its first `=`. The rule id is trimmed
/// and ASCII-lowercased; the value is trimmed and parsed as an `f32` that
/// must lie in `0.0..=1.0`. `label` is only used to name the offending
/// option in error messages.
///
/// # Errors
///
/// Returns an error on the first entry that has no `=`, has an empty rule
/// id, has a value that does not parse as a float, or has a value outside
/// `0.0..=1.0`. Entries parsed before the failing one are discarded.
pub(super) fn parse_rule_threshold_args(
22-
values: &[String],
23-
label: &str,
24-
) -> Result<Vec<EvalRuleThreshold>> {
25-
let mut parsed = Vec::new();
26-
for raw in values {
27-
// Only the first '=' separates id from value; anything after it belongs to the value.
let Some((rule_id, value)) = raw.split_once('=') else {
28-
anyhow::bail!("Invalid {} entry '{}': expected rule_id=value", label, raw);
29-
};
30-
// Normalise the id so it matches the lowercased keys built by `build_rule_f1_map`.
let rule_id = rule_id.trim().to_ascii_lowercase();
31-
if rule_id.is_empty() {
32-
anyhow::bail!("Invalid {} entry '{}': empty rule id", label, raw);
33-
}
34-
let value: f32 = value
35-
.trim()
36-
.parse()
37-
.map_err(|_| anyhow::anyhow!("Invalid {} entry '{}': invalid float", label, raw))?;
38-
// Written as a negated range check so that NaN, which compares false
// against both bounds, is rejected as well.
if !(0.0..=1.0).contains(&value) {
39-
anyhow::bail!(
40-
"Invalid {} entry '{}': value must be between 0.0 and 1.0",
41-
label,
42-
raw
43-
);
44-
}
45-
parsed.push(EvalRuleThreshold { rule_id, value });
46-
}
47-
Ok(parsed)
48-
}
49-
50-
/// Checks `current` against the configured thresholds and returns one
/// human-readable message per violated threshold.
///
/// Checks run in this order: minimum micro-F1, minimum macro-F1, per-rule
/// minimum F1, and then (only if any drop-based threshold is configured)
/// maximum micro-F1 drop and per-rule maximum F1 drop relative to
/// `baseline`.
///
/// A missing `rule_summary` on `current` is treated as an F1 of `0.0`, and a
/// rule that is absent from a report's `rule_metrics` is treated as having
/// an F1 of `0.0`.
///
/// If a drop-based threshold is configured but `baseline` is `None`, a
/// message saying the baseline is required is appended and the function
/// returns immediately with the failures collected so far.
///
/// An empty return value means no threshold was violated.
pub(super) fn evaluate_eval_thresholds(
51-
current: &EvalReport,
52-
baseline: Option<&EvalReport>,
53-
options: &EvalThresholdOptions,
54-
) -> Vec<String> {
55-
let mut failures = Vec::new();
56-
// A report without a summary is scored as 0.0 for both aggregate F1 values.
let current_micro_f1 = current
57-
.rule_summary
58-
.map(|summary| summary.micro_f1)
59-
.unwrap_or(0.0);
60-
let current_macro_f1 = current
61-
.rule_summary
62-
.map(|summary| summary.macro_f1)
63-
.unwrap_or(0.0);
64-
65-
if let Some(threshold) = options.min_micro_f1 {
66-
// Out-of-range configured thresholds are clamped into [0.0, 1.0] rather than rejected.
let threshold = threshold.clamp(0.0, 1.0);
67-
if current_micro_f1 < threshold {
68-
failures.push(format!(
69-
"micro-F1 {:.3} is below minimum {:.3}",
70-
current_micro_f1, threshold
71-
));
72-
}
73-
}
74-
75-
if let Some(threshold) = options.min_macro_f1 {
76-
let threshold = threshold.clamp(0.0, 1.0);
77-
if current_macro_f1 < threshold {
78-
failures.push(format!(
79-
"macro-F1 {:.3} is below minimum {:.3}",
80-
current_macro_f1, threshold
81-
));
82-
}
83-
}
84-
85-
// Map keys are ASCII-lowercased rule ids; lookups below use `threshold.rule_id` as-is,
// so threshold ids are expected to be lowercase already (as `parse_rule_threshold_args` produces).
let current_by_rule = build_rule_f1_map(&current.rule_metrics);
86-
for threshold in &options.min_rule_f1 {
87-
// A rule with no metrics in the current report counts as F1 0.0.
let current = current_by_rule
88-
.get(&threshold.rule_id)
89-
.copied()
90-
.unwrap_or(0.0);
91-
// Per-rule values are compared as given; unlike the aggregate thresholds they are not clamped here.
if current < threshold.value {
92-
failures.push(format!(
93-
"rule '{}' F1 {:.3} is below minimum {:.3}",
94-
threshold.rule_id, current, threshold.value
95-
));
96-
}
97-
}
98-
99-
// Drop-based checks need a baseline; they are skipped entirely when none is configured.
if options.max_micro_f1_drop.is_some() || !options.max_rule_f1_drop.is_empty() {
100-
let Some(baseline) = baseline else {
101-
failures.push(
102-
"baseline report is required for drop-based thresholds (--baseline)".to_string(),
103-
);
104-
// Nothing further can be evaluated without a baseline.
return failures;
105-
};
106-
107-
// A baseline without a summary falls back to the summary type's `Default` value.
let baseline_summary = baseline.rule_summary.unwrap_or_default();
108-
if let Some(max_drop) = options.max_micro_f1_drop {
109-
let max_drop = max_drop.clamp(0.0, 1.0);
110-
// Floor at 0.0 so an improvement over the baseline counts as no drop.
let drop = (baseline_summary.micro_f1 - current_micro_f1).max(0.0);
111-
if drop > max_drop {
112-
failures.push(format!(
113-
"micro-F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
114-
drop, max_drop, baseline_summary.micro_f1, current_micro_f1
115-
));
116-
}
117-
}
118-
119-
if !options.max_rule_f1_drop.is_empty() {
120-
let baseline_by_rule = build_rule_f1_map(&baseline.rule_metrics);
121-
for threshold in &options.max_rule_f1_drop {
122-
// Rules missing from either report are treated as F1 0.0 on that side.
let baseline_f1 = baseline_by_rule
123-
.get(&threshold.rule_id)
124-
.copied()
125-
.unwrap_or(0.0);
126-
let current_f1 = current_by_rule
127-
.get(&threshold.rule_id)
128-
.copied()
129-
.unwrap_or(0.0);
130-
let drop = (baseline_f1 - current_f1).max(0.0);
131-
if drop > threshold.value {
132-
failures.push(format!(
133-
"rule '{}' F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
134-
threshold.rule_id, drop, threshold.value, baseline_f1, current_f1
135-
));
136-
}
137-
}
138-
}
139-
}
140-
141-
failures
142-
}
143-
144-
/// Builds a lookup from ASCII-lowercased rule id to that rule's F1 score.
///
/// If two metrics share the same id after lowercasing, the later one in
/// `metrics` overwrites the earlier one.
fn build_rule_f1_map(metrics: &[EvalRuleMetrics]) -> HashMap<String, f32> {
145-
let mut by_rule = HashMap::new();
146-
for metric in metrics {
147-
// Lowercase the key so lookups are insensitive to the ASCII case used in the report.
by_rule.insert(metric.rule_id.to_ascii_lowercase(), metric.f1);
148-
}
149-
by_rule
150-
}
151-
152-
// Unit tests for `evaluate_eval_thresholds`, covering the drop-based threshold paths.
#[cfg(test)]
153-
mod tests {
154-
use super::super::{EvalReport, EvalRuleMetrics, EvalRuleScoreSummary};
155-
use super::*;
156-
157-
// A drop-based threshold with no baseline must yield exactly the
// "baseline report is required" message and nothing else.
#[test]
158-
fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() {
159-
let report = EvalReport {
160-
fixtures_total: 1,
161-
fixtures_passed: 1,
162-
fixtures_failed: 0,
163-
rule_metrics: vec![],
164-
rule_summary: Some(EvalRuleScoreSummary {
165-
micro_precision: 1.0,
166-
micro_recall: 1.0,
167-
micro_f1: 1.0,
168-
macro_precision: 1.0,
169-
macro_recall: 1.0,
170-
macro_f1: 1.0,
171-
}),
172-
suite_results: vec![],
173-
threshold_failures: vec![],
174-
results: vec![],
175-
};
176-
// Only the micro-F1 drop threshold is set, which is enough to require a baseline.
let options = EvalThresholdOptions {
177-
max_micro_f1_drop: Some(0.05),
178-
min_micro_f1: None,
179-
min_macro_f1: None,
180-
min_rule_f1: vec![],
181-
max_rule_f1_drop: vec![],
182-
};
183-
184-
let failures = evaluate_eval_thresholds(&report, None, &options);
185-
186-
assert_eq!(
187-
failures,
188-
vec!["baseline report is required for drop-based thresholds (--baseline)".to_string()]
189-
);
190-
}
191-
192-
// A single rule whose F1 falls from 1.0 (baseline) to 0.0 (current) must
// trip a per-rule max-drop threshold of 0.2 and be named in the message.
#[test]
193-
fn test_evaluate_eval_thresholds_checks_rule_specific_drop() {
194-
let current = EvalReport {
195-
fixtures_total: 1,
196-
fixtures_passed: 1,
197-
fixtures_failed: 0,
198-
rule_metrics: vec![EvalRuleMetrics {
199-
rule_id: "sec.sql.injection".to_string(),
200-
expected: 1,
201-
predicted: 1,
202-
true_positives: 0,
203-
false_positives: 1,
204-
false_negatives: 1,
205-
precision: 0.0,
206-
recall: 0.0,
207-
f1: 0.0,
208-
}],
209-
rule_summary: Some(EvalRuleScoreSummary::default()),
210-
suite_results: vec![],
211-
threshold_failures: vec![],
212-
results: vec![],
213-
};
214-
let baseline = EvalReport {
215-
fixtures_total: 1,
216-
fixtures_passed: 1,
217-
fixtures_failed: 0,
218-
rule_metrics: vec![EvalRuleMetrics {
219-
rule_id: "sec.sql.injection".to_string(),
220-
expected: 1,
221-
predicted: 1,
222-
true_positives: 1,
223-
false_positives: 0,
224-
false_negatives: 0,
225-
precision: 1.0,
226-
recall: 1.0,
227-
f1: 1.0,
228-
}],
229-
rule_summary: Some(EvalRuleScoreSummary::default()),
230-
suite_results: vec![],
231-
threshold_failures: vec![],
232-
results: vec![],
233-
};
234-
// Only the per-rule drop threshold is configured, so it is the sole possible failure source.
let options = EvalThresholdOptions {
235-
max_micro_f1_drop: None,
236-
min_micro_f1: None,
237-
min_macro_f1: None,
238-
min_rule_f1: vec![],
239-
max_rule_f1_drop: vec![EvalRuleThreshold {
240-
rule_id: "sec.sql.injection".to_string(),
241-
value: 0.2,
242-
}],
243-
};
244-
245-
let failures = evaluate_eval_thresholds(&current, Some(&baseline), &options);
246-
247-
assert_eq!(failures.len(), 1);
248-
assert!(failures[0].contains("sec.sql.injection"));
249-
assert!(failures[0].contains("exceeded max 0.200"));
250-
}
251-
}
21+
pub(super) use evaluation::evaluate_eval_thresholds;
22+
pub(super) use parsing::parse_rule_threshold_args;

0 commit comments

Comments
 (0)