|
1 | | -use std::collections::HashMap; |
| 1 | +#[path = "metrics/rules.rs"] |
| 2 | +mod rules; |
| 3 | +#[path = "metrics/suites.rs"] |
| 4 | +mod suites; |
2 | 5 |
|
3 | | -use crate::core; |
4 | | -use crate::core::eval_benchmarks::{ |
5 | | - evaluate_against_thresholds, AggregateMetrics as BenchmarkAggregateMetrics, BenchmarkResult, |
6 | | - Difficulty, |
7 | | -}; |
8 | | -use crate::review::normalize_rule_id; |
9 | | - |
10 | | -use super::{ |
11 | | - EvalFixtureResult, EvalPattern, EvalRuleMetrics, EvalRuleScoreSummary, EvalSuiteResult, |
12 | | -}; |
13 | | - |
/// Per-rule tallies accumulated while scoring eval fixtures.
#[derive(Debug, Default, Clone, Copy)]
struct RuleMetricCounts {
    // How many findings the fixture's expected patterns declare for this rule.
    expected: usize,
    // How many emitted comments were tagged with this rule.
    predicted: usize,
    // Matched (expected, predicted) pairs whose normalized rule ids agree.
    true_positives: usize,
}
20 | | - |
/// Group per-fixture results by suite, compute a weighted suite aggregate,
/// and evaluate it against the suite's thresholds when any fixture carries
/// them.
///
/// Fixtures missing either a suite name or benchmark metrics are skipped.
/// The returned list is sorted by suite name so output is deterministic
/// despite `HashMap` iteration order.
pub(super) fn build_suite_results(results: &[EvalFixtureResult]) -> Vec<EvalSuiteResult> {
    // suite name -> fixtures that belong to it (only fixtures with metrics).
    let mut grouped: HashMap<String, Vec<&EvalFixtureResult>> = HashMap::new();
    for result in results {
        if let (Some(suite), Some(_)) = (&result.suite, &result.benchmark_metrics) {
            grouped.entry(suite.clone()).or_default().push(result);
        }
    }

    let mut suites = Vec::new();
    for (suite_name, suite_results) in grouped {
        let mut fixture_results = Vec::new();
        let mut weights = Vec::new();
        // First fixture with thresholds wins; within a suite the Vec preserves
        // input order, so this is the first such fixture in `results`.
        let mut thresholds = None;

        for result in suite_results {
            if let Some(metrics) = result.benchmark_metrics.as_ref() {
                fixture_results.push(metrics);
                // A fixture with no difficulty contributes a neutral weight of 1.0.
                weights.push(
                    result
                        .difficulty
                        .as_ref()
                        .map(Difficulty::weight)
                        .unwrap_or(1.0),
                );
                if thresholds.is_none() {
                    thresholds = result.suite_thresholds.clone();
                }
            }
        }

        let aggregate = BenchmarkAggregateMetrics::compute(&fixture_results, Some(&weights));
        let (thresholds_enforced, threshold_pass, threshold_failures) =
            if let Some(thresholds) = thresholds.as_ref() {
                // Throwaway BenchmarkResult so the shared threshold evaluator can
                // be reused. The placeholder fields (by_category, by_difficulty,
                // timestamp, ...) are presumably not read by
                // evaluate_against_thresholds — NOTE(review): confirm.
                let benchmark_result = BenchmarkResult {
                    suite_name: suite_name.clone(),
                    fixture_results: fixture_results
                        .iter()
                        .map(|result| (*result).clone())
                        .collect(),
                    aggregate: aggregate.clone(),
                    by_category: HashMap::new(),
                    by_difficulty: HashMap::new(),
                    threshold_pass: true,
                    threshold_failures: Vec::new(),
                    timestamp: String::new(),
                };
                let (passed, failures) = evaluate_against_thresholds(&benchmark_result, thresholds);
                (true, passed, failures)
            } else {
                // No thresholds configured: nothing enforced, trivially passing.
                (false, true, Vec::new())
            };

        suites.push(EvalSuiteResult {
            suite: suite_name,
            fixture_count: fixture_results.len(),
            aggregate,
            thresholds_enforced,
            threshold_pass,
            threshold_failures,
        });
    }

    // Stable ordering for reports and tests.
    suites.sort_by(|left, right| left.suite.cmp(&right.suite));
    suites
}
86 | | - |
87 | | -pub(super) fn collect_suite_threshold_failures(suites: &[EvalSuiteResult]) -> Vec<String> { |
88 | | - let mut failures = Vec::new(); |
89 | | - for suite in suites { |
90 | | - for failure in &suite.threshold_failures { |
91 | | - failures.push(format!("suite '{}' {}", suite.suite, failure)); |
92 | | - } |
93 | | - } |
94 | | - failures |
95 | | -} |
96 | | - |
97 | | -pub(super) fn compute_rule_metrics( |
98 | | - expected_patterns: &[EvalPattern], |
99 | | - comments: &[core::Comment], |
100 | | - matched_pairs: &[(usize, usize)], |
101 | | -) -> Vec<EvalRuleMetrics> { |
102 | | - let mut counts_by_rule: HashMap<String, RuleMetricCounts> = HashMap::new(); |
103 | | - |
104 | | - for pattern in expected_patterns { |
105 | | - if let Some(rule_id) = pattern.normalized_rule_id() { |
106 | | - counts_by_rule.entry(rule_id).or_default().expected += 1; |
107 | | - } |
108 | | - } |
109 | | - |
110 | | - for comment in comments { |
111 | | - if let Some(rule_id) = normalize_rule_id(comment.rule_id.as_deref()) { |
112 | | - counts_by_rule.entry(rule_id).or_default().predicted += 1; |
113 | | - } |
114 | | - } |
115 | | - |
116 | | - for (expected_idx, comment_idx) in matched_pairs { |
117 | | - let expected_rule = expected_patterns |
118 | | - .get(*expected_idx) |
119 | | - .and_then(EvalPattern::normalized_rule_id); |
120 | | - let predicted_rule = comments |
121 | | - .get(*comment_idx) |
122 | | - .and_then(|comment| normalize_rule_id(comment.rule_id.as_deref())); |
123 | | - if let (Some(expected_rule), Some(predicted_rule)) = (expected_rule, predicted_rule) { |
124 | | - if expected_rule == predicted_rule { |
125 | | - counts_by_rule |
126 | | - .entry(expected_rule) |
127 | | - .or_default() |
128 | | - .true_positives += 1; |
129 | | - } |
130 | | - } |
131 | | - } |
132 | | - |
133 | | - build_rule_metrics_from_counts(&counts_by_rule) |
134 | | -} |
135 | | - |
136 | | -pub(super) fn aggregate_rule_metrics(results: &[EvalFixtureResult]) -> Vec<EvalRuleMetrics> { |
137 | | - let mut counts_by_rule: HashMap<String, RuleMetricCounts> = HashMap::new(); |
138 | | - for result in results { |
139 | | - for metric in &result.rule_metrics { |
140 | | - let counts = counts_by_rule.entry(metric.rule_id.clone()).or_default(); |
141 | | - counts.expected = counts.expected.saturating_add(metric.expected); |
142 | | - counts.predicted = counts.predicted.saturating_add(metric.predicted); |
143 | | - counts.true_positives = counts.true_positives.saturating_add(metric.true_positives); |
144 | | - } |
145 | | - } |
146 | | - |
147 | | - build_rule_metrics_from_counts(&counts_by_rule) |
148 | | -} |
149 | | - |
150 | | -fn build_rule_metrics_from_counts( |
151 | | - counts_by_rule: &HashMap<String, RuleMetricCounts>, |
152 | | -) -> Vec<EvalRuleMetrics> { |
153 | | - let mut metrics = Vec::new(); |
154 | | - for (rule_id, counts) in counts_by_rule { |
155 | | - let false_positives = counts.predicted.saturating_sub(counts.true_positives); |
156 | | - let false_negatives = counts.expected.saturating_sub(counts.true_positives); |
157 | | - let precision = if counts.predicted > 0 { |
158 | | - counts.true_positives as f32 / counts.predicted as f32 |
159 | | - } else { |
160 | | - 0.0 |
161 | | - }; |
162 | | - let recall = if counts.expected > 0 { |
163 | | - counts.true_positives as f32 / counts.expected as f32 |
164 | | - } else { |
165 | | - 0.0 |
166 | | - }; |
167 | | - let f1 = harmonic_mean(precision, recall); |
168 | | - |
169 | | - metrics.push(EvalRuleMetrics { |
170 | | - rule_id: rule_id.clone(), |
171 | | - expected: counts.expected, |
172 | | - predicted: counts.predicted, |
173 | | - true_positives: counts.true_positives, |
174 | | - false_positives, |
175 | | - false_negatives, |
176 | | - precision, |
177 | | - recall, |
178 | | - f1, |
179 | | - }); |
180 | | - } |
181 | | - |
182 | | - metrics.sort_by(|left, right| { |
183 | | - right |
184 | | - .expected |
185 | | - .cmp(&left.expected) |
186 | | - .then_with(|| right.predicted.cmp(&left.predicted)) |
187 | | - .then_with(|| left.rule_id.cmp(&right.rule_id)) |
188 | | - }); |
189 | | - metrics |
190 | | -} |
191 | | - |
192 | | -pub(super) fn summarize_rule_metrics(metrics: &[EvalRuleMetrics]) -> Option<EvalRuleScoreSummary> { |
193 | | - if metrics.is_empty() { |
194 | | - return None; |
195 | | - } |
196 | | - |
197 | | - let mut tp_sum = 0usize; |
198 | | - let mut predicted_sum = 0usize; |
199 | | - let mut expected_sum = 0usize; |
200 | | - let mut precision_sum = 0.0f32; |
201 | | - let mut recall_sum = 0.0f32; |
202 | | - let mut f1_sum = 0.0f32; |
203 | | - |
204 | | - for metric in metrics { |
205 | | - tp_sum = tp_sum.saturating_add(metric.true_positives); |
206 | | - predicted_sum = predicted_sum.saturating_add(metric.predicted); |
207 | | - expected_sum = expected_sum.saturating_add(metric.expected); |
208 | | - precision_sum += metric.precision; |
209 | | - recall_sum += metric.recall; |
210 | | - f1_sum += metric.f1; |
211 | | - } |
212 | | - |
213 | | - let micro_precision = if predicted_sum > 0 { |
214 | | - tp_sum as f32 / predicted_sum as f32 |
215 | | - } else { |
216 | | - 0.0 |
217 | | - }; |
218 | | - let micro_recall = if expected_sum > 0 { |
219 | | - tp_sum as f32 / expected_sum as f32 |
220 | | - } else { |
221 | | - 0.0 |
222 | | - }; |
223 | | - let micro_f1 = harmonic_mean(micro_precision, micro_recall); |
224 | | - let count = metrics.len() as f32; |
225 | | - |
226 | | - Some(EvalRuleScoreSummary { |
227 | | - micro_precision, |
228 | | - micro_recall, |
229 | | - micro_f1, |
230 | | - macro_precision: precision_sum / count, |
231 | | - macro_recall: recall_sum / count, |
232 | | - macro_f1: f1_sum / count, |
233 | | - }) |
234 | | -} |
235 | | - |
/// Harmonic mean of precision and recall — i.e. the F1 score.
///
/// Returns 0.0 when the sum is at or below `f32::EPSILON` to avoid dividing
/// by (near-)zero.
fn harmonic_mean(precision: f32, recall: f32) -> f32 {
    let denominator = precision + recall;
    if denominator <= f32::EPSILON {
        return 0.0;
    }
    2.0 * precision * recall / denominator
}
243 | | - |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::eval_benchmarks::{BenchmarkThresholds, Difficulty, FixtureResult};

    // One failing fixture whose suite thresholds (0.9/0.9/0.9, zero FP rate)
    // are stricter than its metrics (precision 0.5): the resulting suite must
    // be reported as threshold-enforced and failing.
    #[test]
    fn test_build_suite_results_applies_pack_thresholds() {
        let results = vec![EvalFixtureResult {
            fixture: "community/sql-injection".to_string(),
            suite: Some("community".to_string()),
            passed: false,
            total_comments: 2,
            required_matches: 1,
            required_total: 1,
            // 1 TP + 1 FP -> precision 0.5, recall 1.0, F1 ~ 0.667.
            benchmark_metrics: Some(FixtureResult {
                fixture_name: "community/sql-injection".to_string(),
                true_positives: 1,
                false_positives: 1,
                false_negatives: 0,
                true_negatives: 0,
                precision: 0.5,
                recall: 1.0,
                f1: 0.6666667,
                passed: false,
                details: vec![],
            }),
            // Deliberately unattainable thresholds for the metrics above.
            suite_thresholds: Some(BenchmarkThresholds {
                min_precision: 0.9,
                min_recall: 0.9,
                min_f1: 0.9,
                max_false_positive_rate: 0.0,
                min_weighted_score: 0.95,
            }),
            difficulty: Some(Difficulty::Hard),
            rule_metrics: vec![],
            rule_summary: None,
            failures: vec!["missing finding".to_string()],
        }];

        let suites = build_suite_results(&results);

        // Exactly one suite, enforced thresholds failed with messages recorded.
        assert_eq!(suites.len(), 1);
        assert_eq!(suites[0].suite, "community");
        assert!(!suites[0].threshold_pass);
        assert!(!suites[0].threshold_failures.is_empty());
    }
}
| 6 | +pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics}; |
| 7 | +pub(super) use suites::{build_suite_results, collect_suite_threshold_failures}; |
0 commit comments