@@ -13,6 +13,29 @@ pub(in super::super) struct EvalBenchmarkBreakdowns {
1313 pub ( in super :: super ) by_difficulty : HashMap < String , BenchmarkAggregateMetrics > ,
1414}
1515
16+ pub ( in super :: super ) fn build_overall_benchmark_summary (
17+ results : & [ EvalFixtureResult ] ,
18+ ) -> Option < BenchmarkAggregateMetrics > {
19+ let benchmark_results = collect_weighted_benchmark_results ( results) ;
20+ if benchmark_results. is_empty ( ) {
21+ return None ;
22+ }
23+
24+ let fixture_results = benchmark_results
25+ . iter ( )
26+ . map ( |( result, _) | * result)
27+ . collect :: < Vec < _ > > ( ) ;
28+ let weights = benchmark_results
29+ . iter ( )
30+ . map ( |( _, weight) | * weight)
31+ . collect :: < Vec < _ > > ( ) ;
32+
33+ Some ( BenchmarkAggregateMetrics :: compute (
34+ & fixture_results,
35+ Some ( & weights) ,
36+ ) )
37+ }
38+
1639pub ( in super :: super ) fn build_suite_results ( results : & [ EvalFixtureResult ] ) -> Vec < EvalSuiteResult > {
1740 let mut grouped: HashMap < String , Vec < & EvalFixtureResult > > = HashMap :: new ( ) ;
1841 for result in results {
@@ -169,6 +192,24 @@ fn difficulty_label(difficulty: &Difficulty) -> &'static str {
169192 }
170193}
171194
195+ fn collect_weighted_benchmark_results (
196+ results : & [ EvalFixtureResult ] ,
197+ ) -> Vec < ( & crate :: core:: eval_benchmarks:: FixtureResult , f32 ) > {
198+ results
199+ . iter ( )
200+ . filter_map ( |result| {
201+ result. benchmark_metrics . as_ref ( ) . map ( |metrics| {
202+ let weight = result
203+ . difficulty
204+ . as_ref ( )
205+ . map ( Difficulty :: weight)
206+ . unwrap_or ( 1.0 ) ;
207+ ( metrics, weight)
208+ } )
209+ } )
210+ . collect ( )
211+ }
212+
172213#[ cfg( test) ]
173214mod tests {
174215 use super :: * ;
@@ -290,4 +331,47 @@ mod tests {
290331 Some ( 1 )
291332 ) ;
292333 }
334+
#[test]
fn test_build_overall_benchmark_summary_aggregates_fixture_metrics() {
    // Easy fixture that passed: its one required match was found.
    let passing_fixture = EvalFixtureResult {
        fixture: "suite/a".to_string(),
        suite: Some("suite".to_string()),
        passed: true,
        total_comments: 1,
        required_matches: 1,
        required_total: 1,
        benchmark_metrics: Some(FixtureResult::compute("suite/a", 1, 0, 1, 0, 0)),
        suite_thresholds: None,
        difficulty: Some(Difficulty::Easy),
        metadata: None,
        rule_metrics: vec![],
        rule_summary: None,
        warnings: vec![],
        failures: vec![],
    };

    // Hard fixture that failed: zero of its one required matches, one recorded failure.
    let failing_fixture = EvalFixtureResult {
        fixture: "suite/b".to_string(),
        suite: Some("suite".to_string()),
        passed: false,
        total_comments: 1,
        required_matches: 0,
        required_total: 1,
        benchmark_metrics: Some(FixtureResult::compute("suite/b", 1, 0, 0, 0, 1)),
        suite_thresholds: None,
        difficulty: Some(Difficulty::Hard),
        metadata: None,
        rule_metrics: vec![],
        rule_summary: None,
        warnings: vec![],
        failures: vec!["missing".to_string()],
    };

    let results = vec![passing_fixture, failing_fixture];

    let summary = build_overall_benchmark_summary(&results).unwrap();

    // Both fixtures carry metrics, so both are counted in the aggregate.
    assert_eq!(summary.fixture_count, 2);
    assert!(summary.micro_f1 < 1.0);
}
293377}
0 commit comments