Skip to content

Commit 3318a6d

Browse files
committed
test: expand eval coverage for benchmark packs
1 parent 1b3fd30 commit 3318a6d

1 file changed

Lines changed: 296 additions & 8 deletions

File tree

src/commands/eval.rs

Lines changed: 296 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
55

66
use crate::config;
77
use crate::core;
8+
use crate::core::eval_benchmarks::CommunityFixturePack;
89
use crate::review::{normalize_rule_id, review_diff_content_raw};
910

1011
#[derive(Debug, Clone, Deserialize, Default)]
@@ -158,17 +159,16 @@ pub async fn eval_command(
158159
output_path: Option<PathBuf>,
159160
options: EvalRunOptions,
160161
) -> Result<()> {
161-
let fixture_paths = collect_fixture_paths(&fixtures_dir)?;
162-
if fixture_paths.is_empty() {
162+
let fixtures = collect_eval_fixtures(&fixtures_dir)?;
163+
if fixtures.is_empty() {
163164
anyhow::bail!(
164165
"No fixture files found in {} (expected .json/.yml/.yaml)",
165166
fixtures_dir.display()
166167
);
167168
}
168169

169170
let mut results = Vec::new();
170-
for fixture_path in fixture_paths {
171-
let fixture = load_eval_fixture(&fixture_path)?;
171+
for (fixture_path, fixture) in fixtures {
172172
let result = run_eval_fixture(&config, &fixture_path, fixture).await?;
173173
results.push(result);
174174
}
@@ -333,21 +333,86 @@ fn collect_fixture_paths(fixtures_dir: &Path) -> Result<Vec<PathBuf>> {
333333
Ok(paths)
334334
}
335335

336-
fn load_eval_fixture(path: &Path) -> Result<EvalFixture> {
336+
fn collect_eval_fixtures(fixtures_dir: &Path) -> Result<Vec<(PathBuf, EvalFixture)>> {
337+
let mut fixtures = Vec::new();
338+
for path in collect_fixture_paths(fixtures_dir)? {
339+
for fixture in load_eval_fixtures_from_path(&path)? {
340+
fixtures.push((path.clone(), fixture));
341+
}
342+
}
343+
Ok(fixtures)
344+
}
345+
346+
fn load_eval_fixtures_from_path(path: &Path) -> Result<Vec<EvalFixture>> {
337347
let content = std::fs::read_to_string(path)?;
348+
349+
if let Ok(pack) = load_fixture_file::<CommunityFixturePack>(path, &content) {
350+
return Ok(expand_community_fixture_pack(pack));
351+
}
352+
353+
Ok(vec![load_eval_fixture_from_content(path, &content)?])
354+
}
355+
356+
fn load_eval_fixture_from_content(path: &Path, content: &str) -> Result<EvalFixture> {
357+
load_fixture_file::<EvalFixture>(path, content)
358+
}
359+
360+
fn load_fixture_file<T>(path: &Path, content: &str) -> Result<T>
361+
where
362+
T: for<'de> Deserialize<'de>,
363+
{
338364
let extension = path
339365
.extension()
340366
.and_then(|value| value.to_str())
341367
.map(|value| value.to_ascii_lowercase());
342368
match extension.as_deref() {
343-
Some("json") => Ok(serde_json::from_str(&content)?),
344-
_ => match serde_yaml::from_str(&content) {
369+
Some("json") => Ok(serde_json::from_str(content)?),
370+
_ => match serde_yaml::from_str(content) {
345371
Ok(parsed) => Ok(parsed),
346-
Err(_) => Ok(serde_json::from_str(&content)?),
372+
Err(_) => Ok(serde_json::from_str(content)?),
347373
},
348374
}
349375
}
350376

377+
fn expand_community_fixture_pack(pack: CommunityFixturePack) -> Vec<EvalFixture> {
378+
let pack_name = pack.name;
379+
pack.fixtures
380+
.into_iter()
381+
.map(|fixture| EvalFixture {
382+
name: Some(format!("{}/{}", pack_name, fixture.name)),
383+
diff: Some(fixture.diff_content),
384+
diff_file: None,
385+
repo_path: None,
386+
expect: EvalExpectations {
387+
must_find: fixture
388+
.expected_findings
389+
.into_iter()
390+
.map(|finding| EvalPattern {
391+
file: finding.file_pattern,
392+
line: finding.line_hint,
393+
contains: finding.contains,
394+
severity: finding.severity,
395+
category: finding.category,
396+
rule_id: finding.rule_id.clone(),
397+
require_rule_id: finding.rule_id.is_some(),
398+
})
399+
.collect(),
400+
must_not_find: fixture
401+
.negative_findings
402+
.into_iter()
403+
.map(|finding| EvalPattern {
404+
file: finding.file_pattern,
405+
contains: finding.contains,
406+
..Default::default()
407+
})
408+
.collect(),
409+
min_total: None,
410+
max_total: None,
411+
},
412+
})
413+
.collect()
414+
}
415+
351416
fn load_eval_report(path: &Path) -> Result<EvalReport> {
352417
let content = std::fs::read_to_string(path)?;
353418
let report: EvalReport = serde_json::from_str(&content)?;
@@ -914,6 +979,10 @@ fn summarize_for_eval(content: &str) -> String {
914979
#[cfg(test)]
915980
mod tests {
916981
use super::*;
982+
use crate::core::eval_benchmarks::{
983+
BenchmarkFixture, CommunityFixturePack, Difficulty, ExpectedFinding, NegativeFinding,
984+
};
985+
use tempfile::tempdir;
917986

918987
#[test]
919988
fn test_summarize_for_eval_short() {
@@ -949,4 +1018,223 @@ mod tests {
9491018
let result = summarize_for_eval(&content);
9501019
assert!(result.len() <= 120);
9511020
}
1021+
1022+
#[test]
fn test_load_eval_fixtures_from_path_expands_benchmark_pack() {
    // One pack entry carrying a single expected finding and a single negative finding.
    let expected = ExpectedFinding {
        description: "detect sql injection".to_string(),
        severity: Some("error".to_string()),
        category: Some("security".to_string()),
        file_pattern: Some("app.py".to_string()),
        line_hint: Some(12),
        contains: Some("sql injection".to_string()),
        rule_id: Some("sec.sql.injection".to_string()),
    };
    let negative = NegativeFinding {
        description: "no false positive on sanitizer".to_string(),
        file_pattern: Some("app.py".to_string()),
        contains: Some("sanitized".to_string()),
    };
    let entry = BenchmarkFixture {
        name: "sql-injection".to_string(),
        category: "security".to_string(),
        language: "python".to_string(),
        difficulty: Difficulty::Easy,
        diff_content: "diff --git a/app.py b/app.py".to_string(),
        expected_findings: vec![expected],
        negative_findings: vec![negative],
        description: None,
        source: None,
    };
    let pack = CommunityFixturePack {
        name: "owasp-top10".to_string(),
        author: "community".to_string(),
        version: "1.0.0".to_string(),
        description: "security regressions".to_string(),
        languages: vec!["python".to_string()],
        categories: vec!["security".to_string()],
        fixtures: vec![entry],
    };

    // Write the pack as JSON into a temporary directory and load it back.
    let dir = tempdir().unwrap();
    let pack_path = dir.path().join("pack.json");
    let serialized = serde_json::to_string(&pack).unwrap();
    std::fs::write(&pack_path, serialized).unwrap();

    let fixtures = load_eval_fixtures_from_path(&pack_path).unwrap();

    assert_eq!(fixtures.len(), 1);
    let loaded = &fixtures[0];
    assert_eq!(loaded.name.as_deref(), Some("owasp-top10/sql-injection"));
    assert_eq!(
        loaded.diff.as_deref(),
        Some("diff --git a/app.py b/app.py")
    );

    let expectations = &loaded.expect;
    assert_eq!(expectations.must_find.len(), 1);
    assert_eq!(expectations.must_not_find.len(), 1);

    let pattern = &expectations.must_find[0];
    assert!(pattern.require_rule_id);
    assert_eq!(pattern.rule_id.as_deref(), Some("sec.sql.injection"));
}
1076+
1077+
/// A plain (non-pack) YAML fixture must load as exactly one fixture with its fields intact.
#[test]
fn test_load_eval_fixtures_from_path_keeps_standard_fixture_shape() {
    let dir = tempdir().unwrap();
    let fixture_path = dir.path().join("standard.yml");

    // The document is assembled line by line so its YAML nesting is explicit and does not
    // depend on how the surrounding source happens to be indented.
    let yaml = [
        "name: standard",
        "diff: |",
        "  diff --git a/lib.rs b/lib.rs",
        "expect:",
        "  must_find:",
        "    - contains: injection",
        "      severity: error",
        "",
    ]
    .join("\n");
    std::fs::write(&fixture_path, yaml).unwrap();

    let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();

    assert_eq!(fixtures.len(), 1);
    assert_eq!(fixtures[0].name.as_deref(), Some("standard"));
    assert_eq!(
        fixtures[0].expect.must_find[0].contains.as_deref(),
        Some("injection")
    );
}
1103+
1104+
/// A directory holding both a pack file and a plain fixture yields the pack's expanded entries
/// and the plain fixture, with `a-pack.json` ahead of `b-standard.yml`.
#[test]
fn test_collect_eval_fixtures_expands_pack_entries_in_sorted_order() {
    let dir = tempdir().unwrap();

    // The document is assembled line by line so its YAML nesting is explicit and does not
    // depend on how the surrounding source happens to be indented.
    let standard_path = dir.path().join("b-standard.yml");
    let yaml = [
        "name: standard",
        "diff: |",
        "  diff --git a/lib.rs b/lib.rs",
        "expect:",
        "  must_find:",
        "    - contains: unwrap",
        "",
    ]
    .join("\n");
    std::fs::write(&standard_path, yaml).unwrap();

    let pack_path = dir.path().join("a-pack.json");
    let pack = CommunityFixturePack {
        name: "community".to_string(),
        author: "tester".to_string(),
        version: "1.0.0".to_string(),
        description: "regressions".to_string(),
        languages: vec!["rust".to_string()],
        categories: vec!["correctness".to_string()],
        fixtures: vec![BenchmarkFixture {
            name: "panic".to_string(),
            category: "correctness".to_string(),
            language: "rust".to_string(),
            difficulty: Difficulty::Medium,
            diff_content: "diff --git a/lib.rs b/lib.rs".to_string(),
            expected_findings: vec![],
            negative_findings: vec![],
            description: None,
            source: None,
        }],
    };
    std::fs::write(&pack_path, serde_json::to_string(&pack).unwrap()).unwrap();

    let fixtures = collect_eval_fixtures(dir.path()).unwrap();

    assert_eq!(fixtures.len(), 2);
    assert_eq!(fixtures[0].1.name.as_deref(), Some("community/panic"));
    assert_eq!(fixtures[1].1.name.as_deref(), Some("standard"));
}
1148+
1149+
#[test]
fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() {
    // A drop-based threshold is configured, but no baseline report is supplied below.
    let thresholds = EvalThresholdOptions {
        max_micro_f1_drop: Some(0.05),
        min_micro_f1: None,
        min_macro_f1: None,
        min_rule_f1: vec![],
        max_rule_f1_drop: vec![],
    };
    let perfect_scores = EvalRuleScoreSummary {
        micro_precision: 1.0,
        micro_recall: 1.0,
        micro_f1: 1.0,
        macro_precision: 1.0,
        macro_recall: 1.0,
        macro_f1: 1.0,
    };
    let current = EvalReport {
        fixtures_total: 1,
        fixtures_passed: 1,
        fixtures_failed: 0,
        rule_metrics: vec![],
        rule_summary: Some(perfect_scores),
        threshold_failures: vec![],
        results: vec![],
    };

    let failures = evaluate_eval_thresholds(&current, None, &thresholds);

    let expected =
        vec![String::from("baseline report is required for drop-based thresholds (--baseline)")];
    assert_eq!(failures, expected);
}
1182+
1183+
#[test]
fn test_evaluate_eval_thresholds_checks_rule_specific_drop() {
    // Allow the rule's F1 to drop by at most 0.2 relative to the baseline.
    let thresholds = EvalThresholdOptions {
        max_micro_f1_drop: None,
        min_micro_f1: None,
        min_macro_f1: None,
        min_rule_f1: vec![],
        max_rule_f1_drop: vec![EvalRuleThreshold {
            rule_id: "sec.sql.injection".to_string(),
            value: 0.2,
        }],
    };

    // Baseline: the rule scored a perfect F1.
    let baseline_metrics = EvalRuleMetrics {
        rule_id: "sec.sql.injection".to_string(),
        expected: 1,
        predicted: 1,
        true_positives: 1,
        false_positives: 0,
        false_negatives: 0,
        precision: 1.0,
        recall: 1.0,
        f1: 1.0,
    };
    let baseline = EvalReport {
        fixtures_total: 1,
        fixtures_passed: 1,
        fixtures_failed: 0,
        rule_metrics: vec![baseline_metrics],
        rule_summary: Some(EvalRuleScoreSummary::default()),
        threshold_failures: vec![],
        results: vec![],
    };

    // Current run: the same rule now scores an F1 of zero.
    let current_metrics = EvalRuleMetrics {
        rule_id: "sec.sql.injection".to_string(),
        expected: 1,
        predicted: 1,
        true_positives: 0,
        false_positives: 1,
        false_negatives: 1,
        precision: 0.0,
        recall: 0.0,
        f1: 0.0,
    };
    let current = EvalReport {
        fixtures_total: 1,
        fixtures_passed: 1,
        fixtures_failed: 0,
        rule_metrics: vec![current_metrics],
        rule_summary: Some(EvalRuleScoreSummary::default()),
        threshold_failures: vec![],
        results: vec![],
    };

    let failures = evaluate_eval_thresholds(&current, Some(&baseline), &thresholds);

    assert_eq!(failures.len(), 1);
    let message = &failures[0];
    assert!(message.contains("sec.sql.injection"));
    assert!(message.contains("exceeded max 0.200"));
}
9521240
}

0 commit comments

Comments
 (0)