1- use anyhow :: Result ;
2- use std :: collections :: HashMap ;
3-
4- use super :: { EvalReport , EvalRuleMetrics } ;
1+ # [ path = "thresholds/evaluation.rs" ]
2+ mod evaluation ;
3+ # [ path = "thresholds/parsing.rs" ]
4+ mod parsing ;
55
66#[ derive( Debug , Clone ) ]
77pub ( super ) struct EvalThresholdOptions {
@@ -18,234 +18,5 @@ pub(super) struct EvalRuleThreshold {
1818 pub ( super ) value : f32 ,
1919}
2020
21- pub ( super ) fn parse_rule_threshold_args (
22- values : & [ String ] ,
23- label : & str ,
24- ) -> Result < Vec < EvalRuleThreshold > > {
25- let mut parsed = Vec :: new ( ) ;
26- for raw in values {
27- let Some ( ( rule_id, value) ) = raw. split_once ( '=' ) else {
28- anyhow:: bail!( "Invalid {} entry '{}': expected rule_id=value" , label, raw) ;
29- } ;
30- let rule_id = rule_id. trim ( ) . to_ascii_lowercase ( ) ;
31- if rule_id. is_empty ( ) {
32- anyhow:: bail!( "Invalid {} entry '{}': empty rule id" , label, raw) ;
33- }
34- let value: f32 = value
35- . trim ( )
36- . parse ( )
37- . map_err ( |_| anyhow:: anyhow!( "Invalid {} entry '{}': invalid float" , label, raw) ) ?;
38- if !( 0.0 ..=1.0 ) . contains ( & value) {
39- anyhow:: bail!(
40- "Invalid {} entry '{}': value must be between 0.0 and 1.0" ,
41- label,
42- raw
43- ) ;
44- }
45- parsed. push ( EvalRuleThreshold { rule_id, value } ) ;
46- }
47- Ok ( parsed)
48- }
49-
50- pub ( super ) fn evaluate_eval_thresholds (
51- current : & EvalReport ,
52- baseline : Option < & EvalReport > ,
53- options : & EvalThresholdOptions ,
54- ) -> Vec < String > {
55- let mut failures = Vec :: new ( ) ;
56- let current_micro_f1 = current
57- . rule_summary
58- . map ( |summary| summary. micro_f1 )
59- . unwrap_or ( 0.0 ) ;
60- let current_macro_f1 = current
61- . rule_summary
62- . map ( |summary| summary. macro_f1 )
63- . unwrap_or ( 0.0 ) ;
64-
65- if let Some ( threshold) = options. min_micro_f1 {
66- let threshold = threshold. clamp ( 0.0 , 1.0 ) ;
67- if current_micro_f1 < threshold {
68- failures. push ( format ! (
69- "micro-F1 {:.3} is below minimum {:.3}" ,
70- current_micro_f1, threshold
71- ) ) ;
72- }
73- }
74-
75- if let Some ( threshold) = options. min_macro_f1 {
76- let threshold = threshold. clamp ( 0.0 , 1.0 ) ;
77- if current_macro_f1 < threshold {
78- failures. push ( format ! (
79- "macro-F1 {:.3} is below minimum {:.3}" ,
80- current_macro_f1, threshold
81- ) ) ;
82- }
83- }
84-
85- let current_by_rule = build_rule_f1_map ( & current. rule_metrics ) ;
86- for threshold in & options. min_rule_f1 {
87- let current = current_by_rule
88- . get ( & threshold. rule_id )
89- . copied ( )
90- . unwrap_or ( 0.0 ) ;
91- if current < threshold. value {
92- failures. push ( format ! (
93- "rule '{}' F1 {:.3} is below minimum {:.3}" ,
94- threshold. rule_id, current, threshold. value
95- ) ) ;
96- }
97- }
98-
99- if options. max_micro_f1_drop . is_some ( ) || !options. max_rule_f1_drop . is_empty ( ) {
100- let Some ( baseline) = baseline else {
101- failures. push (
102- "baseline report is required for drop-based thresholds (--baseline)" . to_string ( ) ,
103- ) ;
104- return failures;
105- } ;
106-
107- let baseline_summary = baseline. rule_summary . unwrap_or_default ( ) ;
108- if let Some ( max_drop) = options. max_micro_f1_drop {
109- let max_drop = max_drop. clamp ( 0.0 , 1.0 ) ;
110- let drop = ( baseline_summary. micro_f1 - current_micro_f1) . max ( 0.0 ) ;
111- if drop > max_drop {
112- failures. push ( format ! (
113- "micro-F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})" ,
114- drop, max_drop, baseline_summary. micro_f1, current_micro_f1
115- ) ) ;
116- }
117- }
118-
119- if !options. max_rule_f1_drop . is_empty ( ) {
120- let baseline_by_rule = build_rule_f1_map ( & baseline. rule_metrics ) ;
121- for threshold in & options. max_rule_f1_drop {
122- let baseline_f1 = baseline_by_rule
123- . get ( & threshold. rule_id )
124- . copied ( )
125- . unwrap_or ( 0.0 ) ;
126- let current_f1 = current_by_rule
127- . get ( & threshold. rule_id )
128- . copied ( )
129- . unwrap_or ( 0.0 ) ;
130- let drop = ( baseline_f1 - current_f1) . max ( 0.0 ) ;
131- if drop > threshold. value {
132- failures. push ( format ! (
133- "rule '{}' F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})" ,
134- threshold. rule_id, drop, threshold. value, baseline_f1, current_f1
135- ) ) ;
136- }
137- }
138- }
139- }
140-
141- failures
142- }
143-
144- fn build_rule_f1_map ( metrics : & [ EvalRuleMetrics ] ) -> HashMap < String , f32 > {
145- let mut by_rule = HashMap :: new ( ) ;
146- for metric in metrics {
147- by_rule. insert ( metric. rule_id . to_ascii_lowercase ( ) , metric. f1 ) ;
148- }
149- by_rule
150- }
151-
#[cfg(test)]
mod tests {
    use super::super::{EvalReport, EvalRuleMetrics, EvalRuleScoreSummary};
    use super::*;

    /// Builds a one-fixture, all-passing report with the given rule data.
    fn report_with(
        rule_metrics: Vec<EvalRuleMetrics>,
        rule_summary: EvalRuleScoreSummary,
    ) -> EvalReport {
        EvalReport {
            fixtures_total: 1,
            fixtures_passed: 1,
            fixtures_failed: 0,
            rule_metrics,
            rule_summary: Some(rule_summary),
            suite_results: vec![],
            threshold_failures: vec![],
            results: vec![],
        }
    }

    /// Options with every threshold disabled; tests override single fields.
    fn no_thresholds() -> EvalThresholdOptions {
        EvalThresholdOptions {
            max_micro_f1_drop: None,
            min_micro_f1: None,
            min_macro_f1: None,
            min_rule_f1: vec![],
            max_rule_f1_drop: vec![],
        }
    }

    #[test]
    fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() {
        let perfect_summary = EvalRuleScoreSummary {
            micro_precision: 1.0,
            micro_recall: 1.0,
            micro_f1: 1.0,
            macro_precision: 1.0,
            macro_recall: 1.0,
            macro_f1: 1.0,
        };
        let report = report_with(vec![], perfect_summary);
        let options = EvalThresholdOptions {
            max_micro_f1_drop: Some(0.05),
            ..no_thresholds()
        };

        // A drop threshold without a baseline must surface as a failure.
        let failures = evaluate_eval_thresholds(&report, None, &options);

        assert_eq!(
            failures,
            ["baseline report is required for drop-based thresholds (--baseline)"]
        );
    }

    #[test]
    fn test_evaluate_eval_thresholds_checks_rule_specific_drop() {
        // Current run misses the rule entirely (F1 0.0) ...
        let current = report_with(
            vec![EvalRuleMetrics {
                rule_id: "sec.sql.injection".to_string(),
                expected: 1,
                predicted: 1,
                true_positives: 0,
                false_positives: 1,
                false_negatives: 1,
                precision: 0.0,
                recall: 0.0,
                f1: 0.0,
            }],
            EvalRuleScoreSummary::default(),
        );
        // ... while the baseline detected it perfectly (F1 1.0).
        let baseline = report_with(
            vec![EvalRuleMetrics {
                rule_id: "sec.sql.injection".to_string(),
                expected: 1,
                predicted: 1,
                true_positives: 1,
                false_positives: 0,
                false_negatives: 0,
                precision: 1.0,
                recall: 1.0,
                f1: 1.0,
            }],
            EvalRuleScoreSummary::default(),
        );
        let options = EvalThresholdOptions {
            max_rule_f1_drop: vec![EvalRuleThreshold {
                rule_id: "sec.sql.injection".to_string(),
                value: 0.2,
            }],
            ..no_thresholds()
        };

        let failures = evaluate_eval_thresholds(&current, Some(&baseline), &options);

        assert_eq!(failures.len(), 1);
        let failure = &failures[0];
        assert!(failure.contains("sec.sql.injection"));
        assert!(failure.contains("exceeded max 0.200"));
    }
}
21+ pub ( super ) use evaluation:: evaluate_eval_thresholds;
22+ pub ( super ) use parsing:: parse_rule_threshold_args;
0 commit comments