Skip to content

Commit 6b2221b

Browse files
committed
refactor: split eval metrics helpers
Separate suite aggregation from rule-score calculations so that future eval changes stay smaller and easier to verify.

Made-with: Cursor
1 parent 5c2d219 commit 6b2221b

File tree

3 files changed

+304
-289
lines changed

3 files changed

+304
-289
lines changed

src/commands/eval/metrics.rs

Lines changed: 6 additions & 289 deletions
Original file line numberDiff line numberDiff line change
@@ -1,290 +1,7 @@
1-
use std::collections::HashMap;
1+
#[path = "metrics/rules.rs"]
2+
mod rules;
3+
#[path = "metrics/suites.rs"]
4+
mod suites;
25

3-
use crate::core;
4-
use crate::core::eval_benchmarks::{
5-
evaluate_against_thresholds, AggregateMetrics as BenchmarkAggregateMetrics, BenchmarkResult,
6-
Difficulty,
7-
};
8-
use crate::review::normalize_rule_id;
9-
10-
use super::{
11-
EvalFixtureResult, EvalPattern, EvalRuleMetrics, EvalRuleScoreSummary, EvalSuiteResult,
12-
};
13-
14-
/// Running tallies for a single rule id while computing precision/recall.
#[derive(Debug, Default, Clone, Copy)]
struct RuleMetricCounts {
    // Number of annotated (expected) findings carrying this rule id.
    expected: usize,
    // Number of review comments attributed to this rule id.
    predicted: usize,
    // Matched pairs where the expected and predicted rule ids agree.
    true_positives: usize,
}
20-
21-
/// Groups per-fixture eval results by suite name and computes, for each suite,
/// a difficulty-weighted aggregate plus threshold pass/fail status.
///
/// Fixtures without a `suite` or without `benchmark_metrics` are skipped
/// entirely. A fixture with no difficulty contributes weight 1.0. Returned
/// suites are sorted by name so output ordering is deterministic.
pub(super) fn build_suite_results(results: &[EvalFixtureResult]) -> Vec<EvalSuiteResult> {
    let mut grouped: HashMap<String, Vec<&EvalFixtureResult>> = HashMap::new();
    for result in results {
        if let (Some(suite), Some(_)) = (&result.suite, &result.benchmark_metrics) {
            grouped.entry(suite.clone()).or_default().push(result);
        }
    }

    let mut suites = Vec::new();
    for (suite_name, suite_results) in grouped {
        let mut fixture_results = Vec::new();
        let mut weights = Vec::new();
        let mut thresholds = None;

        for result in suite_results {
            if let Some(metrics) = result.benchmark_metrics.as_ref() {
                fixture_results.push(metrics);
                weights.push(
                    result
                        .difficulty
                        .as_ref()
                        .map(Difficulty::weight)
                        .unwrap_or(1.0),
                );
                // First fixture carrying thresholds wins. NOTE(review): HashMap
                // iteration makes "first" nondeterministic, so this is only
                // safe if all fixtures in a suite share identical thresholds —
                // confirm that invariant upstream.
                if thresholds.is_none() {
                    thresholds = result.suite_thresholds.clone();
                }
            }
        }

        let aggregate = BenchmarkAggregateMetrics::compute(&fixture_results, Some(&weights));
        let (thresholds_enforced, threshold_pass, threshold_failures) =
            if let Some(thresholds) = thresholds.as_ref() {
                // Construct a throwaway BenchmarkResult solely so the shared
                // threshold evaluator can be reused; by_category/by_difficulty/
                // timestamp are empty placeholders here — presumably the
                // evaluator only reads the aggregate/fixture data (confirm).
                let benchmark_result = BenchmarkResult {
                    suite_name: suite_name.clone(),
                    fixture_results: fixture_results
                        .iter()
                        .map(|result| (*result).clone())
                        .collect(),
                    aggregate: aggregate.clone(),
                    by_category: HashMap::new(),
                    by_difficulty: HashMap::new(),
                    threshold_pass: true,
                    threshold_failures: Vec::new(),
                    timestamp: String::new(),
                };
                let (passed, failures) = evaluate_against_thresholds(&benchmark_result, thresholds);
                (true, passed, failures)
            } else {
                // No thresholds configured: nothing enforced, treated as pass.
                (false, true, Vec::new())
            };

        suites.push(EvalSuiteResult {
            suite: suite_name,
            fixture_count: fixture_results.len(),
            aggregate,
            thresholds_enforced,
            threshold_pass,
            threshold_failures,
        });
    }

    // Stable, reproducible ordering across runs.
    suites.sort_by(|left, right| left.suite.cmp(&right.suite));
    suites
}
86-
87-
/// Flattens every suite's threshold failures into one list, prefixing each
/// message with the suite it came from.
pub(super) fn collect_suite_threshold_failures(suites: &[EvalSuiteResult]) -> Vec<String> {
    suites
        .iter()
        .flat_map(|suite| {
            suite
                .threshold_failures
                .iter()
                .map(move |failure| format!("suite '{}' {}", suite.suite, failure))
        })
        .collect()
}
96-
97-
pub(super) fn compute_rule_metrics(
98-
expected_patterns: &[EvalPattern],
99-
comments: &[core::Comment],
100-
matched_pairs: &[(usize, usize)],
101-
) -> Vec<EvalRuleMetrics> {
102-
let mut counts_by_rule: HashMap<String, RuleMetricCounts> = HashMap::new();
103-
104-
for pattern in expected_patterns {
105-
if let Some(rule_id) = pattern.normalized_rule_id() {
106-
counts_by_rule.entry(rule_id).or_default().expected += 1;
107-
}
108-
}
109-
110-
for comment in comments {
111-
if let Some(rule_id) = normalize_rule_id(comment.rule_id.as_deref()) {
112-
counts_by_rule.entry(rule_id).or_default().predicted += 1;
113-
}
114-
}
115-
116-
for (expected_idx, comment_idx) in matched_pairs {
117-
let expected_rule = expected_patterns
118-
.get(*expected_idx)
119-
.and_then(EvalPattern::normalized_rule_id);
120-
let predicted_rule = comments
121-
.get(*comment_idx)
122-
.and_then(|comment| normalize_rule_id(comment.rule_id.as_deref()));
123-
if let (Some(expected_rule), Some(predicted_rule)) = (expected_rule, predicted_rule) {
124-
if expected_rule == predicted_rule {
125-
counts_by_rule
126-
.entry(expected_rule)
127-
.or_default()
128-
.true_positives += 1;
129-
}
130-
}
131-
}
132-
133-
build_rule_metrics_from_counts(&counts_by_rule)
134-
}
135-
136-
pub(super) fn aggregate_rule_metrics(results: &[EvalFixtureResult]) -> Vec<EvalRuleMetrics> {
137-
let mut counts_by_rule: HashMap<String, RuleMetricCounts> = HashMap::new();
138-
for result in results {
139-
for metric in &result.rule_metrics {
140-
let counts = counts_by_rule.entry(metric.rule_id.clone()).or_default();
141-
counts.expected = counts.expected.saturating_add(metric.expected);
142-
counts.predicted = counts.predicted.saturating_add(metric.predicted);
143-
counts.true_positives = counts.true_positives.saturating_add(metric.true_positives);
144-
}
145-
}
146-
147-
build_rule_metrics_from_counts(&counts_by_rule)
148-
}
149-
150-
fn build_rule_metrics_from_counts(
151-
counts_by_rule: &HashMap<String, RuleMetricCounts>,
152-
) -> Vec<EvalRuleMetrics> {
153-
let mut metrics = Vec::new();
154-
for (rule_id, counts) in counts_by_rule {
155-
let false_positives = counts.predicted.saturating_sub(counts.true_positives);
156-
let false_negatives = counts.expected.saturating_sub(counts.true_positives);
157-
let precision = if counts.predicted > 0 {
158-
counts.true_positives as f32 / counts.predicted as f32
159-
} else {
160-
0.0
161-
};
162-
let recall = if counts.expected > 0 {
163-
counts.true_positives as f32 / counts.expected as f32
164-
} else {
165-
0.0
166-
};
167-
let f1 = harmonic_mean(precision, recall);
168-
169-
metrics.push(EvalRuleMetrics {
170-
rule_id: rule_id.clone(),
171-
expected: counts.expected,
172-
predicted: counts.predicted,
173-
true_positives: counts.true_positives,
174-
false_positives,
175-
false_negatives,
176-
precision,
177-
recall,
178-
f1,
179-
});
180-
}
181-
182-
metrics.sort_by(|left, right| {
183-
right
184-
.expected
185-
.cmp(&left.expected)
186-
.then_with(|| right.predicted.cmp(&left.predicted))
187-
.then_with(|| left.rule_id.cmp(&right.rule_id))
188-
});
189-
metrics
190-
}
191-
192-
pub(super) fn summarize_rule_metrics(metrics: &[EvalRuleMetrics]) -> Option<EvalRuleScoreSummary> {
193-
if metrics.is_empty() {
194-
return None;
195-
}
196-
197-
let mut tp_sum = 0usize;
198-
let mut predicted_sum = 0usize;
199-
let mut expected_sum = 0usize;
200-
let mut precision_sum = 0.0f32;
201-
let mut recall_sum = 0.0f32;
202-
let mut f1_sum = 0.0f32;
203-
204-
for metric in metrics {
205-
tp_sum = tp_sum.saturating_add(metric.true_positives);
206-
predicted_sum = predicted_sum.saturating_add(metric.predicted);
207-
expected_sum = expected_sum.saturating_add(metric.expected);
208-
precision_sum += metric.precision;
209-
recall_sum += metric.recall;
210-
f1_sum += metric.f1;
211-
}
212-
213-
let micro_precision = if predicted_sum > 0 {
214-
tp_sum as f32 / predicted_sum as f32
215-
} else {
216-
0.0
217-
};
218-
let micro_recall = if expected_sum > 0 {
219-
tp_sum as f32 / expected_sum as f32
220-
} else {
221-
0.0
222-
};
223-
let micro_f1 = harmonic_mean(micro_precision, micro_recall);
224-
let count = metrics.len() as f32;
225-
226-
Some(EvalRuleScoreSummary {
227-
micro_precision,
228-
micro_recall,
229-
micro_f1,
230-
macro_precision: precision_sum / count,
231-
macro_recall: recall_sum / count,
232-
macro_f1: f1_sum / count,
233-
})
234-
}
235-
236-
/// F1-style harmonic mean of two scores.
///
/// Returns 0.0 when both inputs are (effectively) zero, avoiding a divide
/// by zero; the epsilon guard treats denormal-small sums as zero too.
fn harmonic_mean(precision: f32, recall: f32) -> f32 {
    let total = precision + recall;
    if total > f32::EPSILON {
        (2.0 * precision * recall) / total
    } else {
        0.0
    }
}
243-
244-
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::eval_benchmarks::{BenchmarkThresholds, Difficulty, FixtureResult};

    // One fixture whose metrics (precision 0.5, recall 1.0) sit below the
    // suite thresholds (all 0.9): the resulting suite must report
    // threshold_pass == false with at least one failure message.
    #[test]
    fn test_build_suite_results_applies_pack_thresholds() {
        let results = vec![EvalFixtureResult {
            fixture: "community/sql-injection".to_string(),
            suite: Some("community".to_string()),
            passed: false,
            total_comments: 2,
            required_matches: 1,
            required_total: 1,
            benchmark_metrics: Some(FixtureResult {
                fixture_name: "community/sql-injection".to_string(),
                true_positives: 1,
                false_positives: 1,
                false_negatives: 0,
                true_negatives: 0,
                precision: 0.5,
                recall: 1.0,
                f1: 0.6666667,
                passed: false,
                details: vec![],
            }),
            // Thresholds intentionally stricter than the metrics above.
            suite_thresholds: Some(BenchmarkThresholds {
                min_precision: 0.9,
                min_recall: 0.9,
                min_f1: 0.9,
                max_false_positive_rate: 0.0,
                min_weighted_score: 0.95,
            }),
            difficulty: Some(Difficulty::Hard),
            rule_metrics: vec![],
            rule_summary: None,
            failures: vec!["missing finding".to_string()],
        }];

        let suites = build_suite_results(&results);

        assert_eq!(suites.len(), 1);
        assert_eq!(suites[0].suite, "community");
        assert!(!suites[0].threshold_pass);
        assert!(!suites[0].threshold_failures.is_empty());
    }
}
6+
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
7+
pub(super) use suites::{build_suite_results, collect_suite_threshold_failures};

0 commit comments

Comments
 (0)