@@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
55
66use crate :: config;
77use crate :: core;
8+ use crate :: core:: eval_benchmarks:: CommunityFixturePack ;
89use crate :: review:: { normalize_rule_id, review_diff_content_raw} ;
910
1011#[ derive( Debug , Clone , Deserialize , Default ) ]
@@ -158,17 +159,16 @@ pub async fn eval_command(
158159 output_path : Option < PathBuf > ,
159160 options : EvalRunOptions ,
160161) -> Result < ( ) > {
161- let fixture_paths = collect_fixture_paths ( & fixtures_dir) ?;
162- if fixture_paths . is_empty ( ) {
162+ let fixtures = collect_eval_fixtures ( & fixtures_dir) ?;
163+ if fixtures . is_empty ( ) {
163164 anyhow:: bail!(
164165 "No fixture files found in {} (expected .json/.yml/.yaml)" ,
165166 fixtures_dir. display( )
166167 ) ;
167168 }
168169
169170 let mut results = Vec :: new ( ) ;
170- for fixture_path in fixture_paths {
171- let fixture = load_eval_fixture ( & fixture_path) ?;
171+ for ( fixture_path, fixture) in fixtures {
172172 let result = run_eval_fixture ( & config, & fixture_path, fixture) . await ?;
173173 results. push ( result) ;
174174 }
@@ -333,21 +333,86 @@ fn collect_fixture_paths(fixtures_dir: &Path) -> Result<Vec<PathBuf>> {
333333 Ok ( paths)
334334}
335335
336- fn load_eval_fixture ( path : & Path ) -> Result < EvalFixture > {
336+ fn collect_eval_fixtures ( fixtures_dir : & Path ) -> Result < Vec < ( PathBuf , EvalFixture ) > > {
337+ let mut fixtures = Vec :: new ( ) ;
338+ for path in collect_fixture_paths ( fixtures_dir) ? {
339+ for fixture in load_eval_fixtures_from_path ( & path) ? {
340+ fixtures. push ( ( path. clone ( ) , fixture) ) ;
341+ }
342+ }
343+ Ok ( fixtures)
344+ }
345+
346+ fn load_eval_fixtures_from_path ( path : & Path ) -> Result < Vec < EvalFixture > > {
337347 let content = std:: fs:: read_to_string ( path) ?;
348+
349+ if let Ok ( pack) = load_fixture_file :: < CommunityFixturePack > ( path, & content) {
350+ return Ok ( expand_community_fixture_pack ( pack) ) ;
351+ }
352+
353+ Ok ( vec ! [ load_eval_fixture_from_content( path, & content) ?] )
354+ }
355+
356+ fn load_eval_fixture_from_content ( path : & Path , content : & str ) -> Result < EvalFixture > {
357+ load_fixture_file :: < EvalFixture > ( path, content)
358+ }
359+
360+ fn load_fixture_file < T > ( path : & Path , content : & str ) -> Result < T >
361+ where
362+ T : for < ' de > Deserialize < ' de > ,
363+ {
338364 let extension = path
339365 . extension ( )
340366 . and_then ( |value| value. to_str ( ) )
341367 . map ( |value| value. to_ascii_lowercase ( ) ) ;
342368 match extension. as_deref ( ) {
343- Some ( "json" ) => Ok ( serde_json:: from_str ( & content) ?) ,
344- _ => match serde_yaml:: from_str ( & content) {
369+ Some ( "json" ) => Ok ( serde_json:: from_str ( content) ?) ,
370+ _ => match serde_yaml:: from_str ( content) {
345371 Ok ( parsed) => Ok ( parsed) ,
346- Err ( _) => Ok ( serde_json:: from_str ( & content) ?) ,
372+ Err ( _) => Ok ( serde_json:: from_str ( content) ?) ,
347373 } ,
348374 }
349375}
350376
377+ fn expand_community_fixture_pack ( pack : CommunityFixturePack ) -> Vec < EvalFixture > {
378+ let pack_name = pack. name ;
379+ pack. fixtures
380+ . into_iter ( )
381+ . map ( |fixture| EvalFixture {
382+ name : Some ( format ! ( "{}/{}" , pack_name, fixture. name) ) ,
383+ diff : Some ( fixture. diff_content ) ,
384+ diff_file : None ,
385+ repo_path : None ,
386+ expect : EvalExpectations {
387+ must_find : fixture
388+ . expected_findings
389+ . into_iter ( )
390+ . map ( |finding| EvalPattern {
391+ file : finding. file_pattern ,
392+ line : finding. line_hint ,
393+ contains : finding. contains ,
394+ severity : finding. severity ,
395+ category : finding. category ,
396+ rule_id : finding. rule_id . clone ( ) ,
397+ require_rule_id : finding. rule_id . is_some ( ) ,
398+ } )
399+ . collect ( ) ,
400+ must_not_find : fixture
401+ . negative_findings
402+ . into_iter ( )
403+ . map ( |finding| EvalPattern {
404+ file : finding. file_pattern ,
405+ contains : finding. contains ,
406+ ..Default :: default ( )
407+ } )
408+ . collect ( ) ,
409+ min_total : None ,
410+ max_total : None ,
411+ } ,
412+ } )
413+ . collect ( )
414+ }
415+
351416fn load_eval_report ( path : & Path ) -> Result < EvalReport > {
352417 let content = std:: fs:: read_to_string ( path) ?;
353418 let report: EvalReport = serde_json:: from_str ( & content) ?;
@@ -914,6 +979,10 @@ fn summarize_for_eval(content: &str) -> String {
914979#[ cfg( test) ]
915980mod tests {
916981 use super :: * ;
982+ use crate :: core:: eval_benchmarks:: {
983+ BenchmarkFixture , CommunityFixturePack , Difficulty , ExpectedFinding , NegativeFinding ,
984+ } ;
985+ use tempfile:: tempdir;
917986
918987 #[ test]
919988 fn test_summarize_for_eval_short ( ) {
@@ -949,4 +1018,223 @@ mod tests {
9491018 let result = summarize_for_eval ( & content) ;
9501019 assert ! ( result. len( ) <= 120 ) ;
9511020 }
1021+
/// A JSON community pack on disk is expanded into one eval fixture per pack entry, with
/// the pack name prefixed to the fixture name and the findings mapped to expectations.
#[test]
fn test_load_eval_fixtures_from_path_expands_benchmark_pack() {
    let expected = ExpectedFinding {
        description: String::from("detect sql injection"),
        severity: Some(String::from("error")),
        category: Some(String::from("security")),
        file_pattern: Some(String::from("app.py")),
        line_hint: Some(12),
        contains: Some(String::from("sql injection")),
        rule_id: Some(String::from("sec.sql.injection")),
    };
    let negative = NegativeFinding {
        description: String::from("no false positive on sanitizer"),
        file_pattern: Some(String::from("app.py")),
        contains: Some(String::from("sanitized")),
    };
    let benchmark = BenchmarkFixture {
        name: String::from("sql-injection"),
        category: String::from("security"),
        language: String::from("python"),
        difficulty: Difficulty::Easy,
        diff_content: String::from("diff --git a/app.py b/app.py"),
        expected_findings: vec![expected],
        negative_findings: vec![negative],
        description: None,
        source: None,
    };
    let pack = CommunityFixturePack {
        name: String::from("owasp-top10"),
        author: String::from("community"),
        version: String::from("1.0.0"),
        description: String::from("security regressions"),
        languages: vec![String::from("python")],
        categories: vec![String::from("security")],
        fixtures: vec![benchmark],
    };

    // Serialize the pack into a temp directory so it is loaded through the real file path.
    let workdir = tempdir().unwrap();
    let json_path = workdir.path().join("pack.json");
    let serialized = serde_json::to_string(&pack).unwrap();
    std::fs::write(&json_path, serialized).unwrap();

    let loaded = load_eval_fixtures_from_path(&json_path).unwrap();

    assert_eq!(loaded.len(), 1);
    let only = &loaded[0];
    assert_eq!(only.name.as_deref(), Some("owasp-top10/sql-injection"));
    assert_eq!(only.diff.as_deref(), Some("diff --git a/app.py b/app.py"));

    let expectations = &only.expect;
    assert_eq!(expectations.must_find.len(), 1);
    assert_eq!(expectations.must_not_find.len(), 1);

    let first_pattern = &expectations.must_find[0];
    assert!(first_pattern.require_rule_id);
    assert_eq!(first_pattern.rule_id.as_deref(), Some("sec.sql.injection"));
}
1076+
// Checks that a plain (non-pack) YAML fixture file still loads as exactly one fixture,
// with its `name` and its first `must_find` pattern's `contains` value preserved.
1077+ #[ test]
1078+ fn test_load_eval_fixtures_from_path_keeps_standard_fixture_shape ( ) {
1079+ let dir = tempdir ( ) . unwrap ( ) ;
1080+ let fixture_path = dir. path ( ) . join ( "standard.yml" ) ;
// Write a single standard fixture as YAML into the temp directory.
// NOTE(review): the YAML literal's indentation appears collapsed in this view — confirm against the real file.
1081+ std:: fs:: write (
1082+ & fixture_path,
1083+ r#"name: standard
1084+ diff: |
1085+ diff --git a/lib.rs b/lib.rs
1086+ expect:
1087+ must_find:
1088+ - contains: injection
1089+ severity: error
1090+ "# ,
1091+ )
1092+ . unwrap ( ) ;
1093+
1094+ let fixtures = load_eval_fixtures_from_path ( & fixture_path) . unwrap ( ) ;
1095+
// Exactly one fixture is expected, i.e. the file was not treated as a multi-entry pack.
1096+ assert_eq ! ( fixtures. len( ) , 1 ) ;
1097+ assert_eq ! ( fixtures[ 0 ] . name. as_deref( ) , Some ( "standard" ) ) ;
1098+ assert_eq ! (
1099+ fixtures[ 0 ] . expect. must_find[ 0 ] . contains. as_deref( ) ,
1100+ Some ( "injection" )
1101+ ) ;
1102+ }
1103+
// Checks that `collect_eval_fixtures` returns both the expanded pack entry and the
// standard fixture from one directory, with the pack entry first.
1104+ #[ test]
1105+ fn test_collect_eval_fixtures_expands_pack_entries_in_sorted_order ( ) {
1106+ let dir = tempdir ( ) . unwrap ( ) ;
// The standard fixture is written first but named `b-...`, while the pack below is named `a-...`.
// NOTE(review): the YAML literal's indentation appears collapsed in this view — confirm against the real file.
1107+ let standard_path = dir. path ( ) . join ( "b-standard.yml" ) ;
1108+ std:: fs:: write (
1109+ & standard_path,
1110+ r#"name: standard
1111+ diff: |
1112+ diff --git a/lib.rs b/lib.rs
1113+ expect:
1114+ must_find:
1115+ - contains: unwrap
1116+ "# ,
1117+ )
1118+ . unwrap ( ) ;
1119+
// A community pack with a single entry and no expected or negative findings.
1120+ let pack_path = dir. path ( ) . join ( "a-pack.json" ) ;
1121+ let pack = CommunityFixturePack {
1122+ name : "community" . to_string ( ) ,
1123+ author : "tester" . to_string ( ) ,
1124+ version : "1.0.0" . to_string ( ) ,
1125+ description : "regressions" . to_string ( ) ,
1126+ languages : vec ! [ "rust" . to_string( ) ] ,
1127+ categories : vec ! [ "correctness" . to_string( ) ] ,
1128+ fixtures : vec ! [ BenchmarkFixture {
1129+ name: "panic" . to_string( ) ,
1130+ category: "correctness" . to_string( ) ,
1131+ language: "rust" . to_string( ) ,
1132+ difficulty: Difficulty :: Medium ,
1133+ diff_content: "diff --git a/lib.rs b/lib.rs" . to_string( ) ,
1134+ expected_findings: vec![ ] ,
1135+ negative_findings: vec![ ] ,
1136+ description: None ,
1137+ source: None ,
1138+ } ] ,
1139+ } ;
1140+ std:: fs:: write ( & pack_path, serde_json:: to_string ( & pack) . unwrap ( ) ) . unwrap ( ) ;
1141+
1142+ let fixtures = collect_eval_fixtures ( dir. path ( ) ) . unwrap ( ) ;
1143+
// Expected order follows the file names (`a-pack.json` before `b-standard.yml`);
// presumably `collect_fixture_paths` sorts the paths — not visible here, verify.
1144+ assert_eq ! ( fixtures. len( ) , 2 ) ;
1145+ assert_eq ! ( fixtures[ 0 ] . 1 . name. as_deref( ) , Some ( "community/panic" ) ) ;
1146+ assert_eq ! ( fixtures[ 1 ] . 1 . name. as_deref( ) , Some ( "standard" ) ) ;
1147+ }
1148+
/// A drop-based threshold with no baseline report yields exactly one failure message
/// asking for a baseline.
#[test]
fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() {
    let perfect_scores = EvalRuleScoreSummary {
        micro_precision: 1.0,
        micro_recall: 1.0,
        micro_f1: 1.0,
        macro_precision: 1.0,
        macro_recall: 1.0,
        macro_f1: 1.0,
    };
    let report = EvalReport {
        fixtures_total: 1,
        fixtures_passed: 1,
        fixtures_failed: 0,
        rule_metrics: Vec::new(),
        rule_summary: Some(perfect_scores),
        threshold_failures: Vec::new(),
        results: Vec::new(),
    };
    // Only the drop-based micro-F1 threshold is configured.
    let options = EvalThresholdOptions {
        max_micro_f1_drop: Some(0.05),
        min_micro_f1: None,
        min_macro_f1: None,
        min_rule_f1: Vec::new(),
        max_rule_f1_drop: Vec::new(),
    };

    let failures = evaluate_eval_thresholds(&report, None, &options);

    let expected =
        vec![String::from("baseline report is required for drop-based thresholds (--baseline)")];
    assert_eq!(failures, expected);
}
1182+
/// A per-rule F1 drop from 1.0 (baseline) to 0.0 (current) with a max allowed drop of 0.2
/// produces a single failure that names the rule and the exceeded limit.
#[test]
fn test_evaluate_eval_thresholds_checks_rule_specific_drop() {
    // Builds a single-fixture report whose only rule metric is `metrics`.
    let report_with = |metrics: EvalRuleMetrics| EvalReport {
        fixtures_total: 1,
        fixtures_passed: 1,
        fixtures_failed: 0,
        rule_metrics: vec![metrics],
        rule_summary: Some(EvalRuleScoreSummary::default()),
        threshold_failures: vec![],
        results: vec![],
    };

    let current = report_with(EvalRuleMetrics {
        rule_id: String::from("sec.sql.injection"),
        expected: 1,
        predicted: 1,
        true_positives: 0,
        false_positives: 1,
        false_negatives: 1,
        precision: 0.0,
        recall: 0.0,
        f1: 0.0,
    });
    let baseline = report_with(EvalRuleMetrics {
        rule_id: String::from("sec.sql.injection"),
        expected: 1,
        predicted: 1,
        true_positives: 1,
        false_positives: 0,
        false_negatives: 0,
        precision: 1.0,
        recall: 1.0,
        f1: 1.0,
    });

    let rule_drop_limit = EvalRuleThreshold {
        rule_id: String::from("sec.sql.injection"),
        value: 0.2,
    };
    let options = EvalThresholdOptions {
        max_micro_f1_drop: None,
        min_micro_f1: None,
        min_macro_f1: None,
        min_rule_f1: vec![],
        max_rule_f1_drop: vec![rule_drop_limit],
    };

    let failures = evaluate_eval_thresholds(&current, Some(&baseline), &options);

    assert_eq!(failures.len(), 1);
    let message = &failures[0];
    assert!(message.contains("sec.sql.injection"));
    assert!(message.contains("exceeded max 0.200"));
}
9521240}
0 commit comments