Skip to content

Commit b834685

Browse files
haasonsaasclaude
andcommitted
TDD: fix 4 more bugs — UTF-8 truncation, date comparison, weights mismatch, FPR formula
- offline.rs: truncate_to_tokens panicked on multi-byte UTF-8 chars (€ at byte boundary) - git_history.rs: last_modified used string comparison ("Wed" > "Mon") instead of parsing dates - eval_benchmarks.rs: AggregateMetrics silently dropped results when weights length != results length - eval_benchmarks.rs: FPR used FP/(FP+TP) instead of FP/(FP+TN), added total_tn tracking 330 tests, 0 warnings, 0 clippy issues. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1d14497 commit b834685

File tree

3 files changed

+198
-17
lines changed

3 files changed

+198
-17
lines changed

src/core/eval_benchmarks.rs

Lines changed: 84 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ pub struct AggregateMetrics {
200200
pub total_tp: usize,
201201
pub total_fp: usize,
202202
pub total_fn: usize,
203+
pub total_tn: usize,
203204
pub micro_precision: f32,
204205
pub micro_recall: f32,
205206
pub micro_f1: f32,
@@ -218,6 +219,7 @@ impl AggregateMetrics {
218219
let total_tp: usize = results.iter().map(|r| r.true_positives).sum();
219220
let total_fp: usize = results.iter().map(|r| r.false_positives).sum();
220221
let total_fn: usize = results.iter().map(|r| r.false_negatives).sum();
222+
let total_tn: usize = results.iter().map(|r| r.true_negatives).sum();
221223

222224
let micro_precision = if total_tp + total_fp > 0 {
223225
total_tp as f32 / (total_tp + total_fp) as f32
@@ -241,16 +243,22 @@ impl AggregateMetrics {
241243
let macro_f1: f32 = results.iter().map(|r| r.f1).sum::<f32>() / n;
242244

243245
let weighted_score = if let Some(ws) = weights {
244-
let total_weight: f32 = ws.iter().sum();
245-
if total_weight > 0.0 {
246-
results
247-
.iter()
248-
.zip(ws.iter())
249-
.map(|(r, w)| r.f1 * w)
250-
.sum::<f32>()
251-
/ total_weight
252-
} else {
246+
if ws.len() != results.len() {
247+
// Length mismatch: fall back to macro_f1 rather than silently
248+
// dropping results via zip truncation
253249
macro_f1
250+
} else {
251+
let total_weight: f32 = ws.iter().sum();
252+
if total_weight > 0.0 {
253+
results
254+
.iter()
255+
.zip(ws.iter())
256+
.map(|(r, w)| r.f1 * w)
257+
.sum::<f32>()
258+
/ total_weight
259+
} else {
260+
macro_f1
261+
}
254262
}
255263
} else {
256264
macro_f1
@@ -261,6 +269,7 @@ impl AggregateMetrics {
261269
total_tp,
262270
total_fp,
263271
total_fn,
272+
total_tn,
264273
micro_precision,
265274
micro_recall,
266275
micro_f1,
@@ -304,9 +313,9 @@ pub fn evaluate_against_thresholds(
304313
));
305314
}
306315

307-
let fpr = if result.aggregate.total_fp + result.aggregate.total_tp > 0 {
316+
let fpr = if result.aggregate.total_fp + result.aggregate.total_tn > 0 {
308317
result.aggregate.total_fp as f32
309-
/ (result.aggregate.total_fp + result.aggregate.total_tp) as f32
318+
/ (result.aggregate.total_fp + result.aggregate.total_tn) as f32
310319
} else {
311320
0.0
312321
};
@@ -893,6 +902,70 @@ mod tests {
893902
assert!(by_cat.is_empty());
894903
}
895904

905+
// BUG: AggregateMetrics::compute silently drops results when weights.len() < results.len()
906+
#[test]
907+
fn test_aggregate_weights_length_mismatch() {
908+
let r1 = FixtureResult::compute("a", 2, 0, 2, 0, 0); // f1 = 1.0
909+
let r2 = FixtureResult::compute("b", 2, 0, 0, 0, 2); // f1 = 0.0
910+
let r3 = FixtureResult::compute("c", 2, 0, 2, 0, 0); // f1 = 1.0
911+
912+
// Only 2 weights for 3 results — r3 (f1=1.0) is silently dropped
913+
let weights = vec![1.0, 1.0];
914+
let agg = AggregateMetrics::compute(&[&r1, &r2, &r3], Some(&weights));
915+
916+
// macro_f1 = (1.0 + 0.0 + 1.0) / 3 = 0.667
917+
// With mismatched weights, zip truncates: (1.0*1 + 0.0*1) / 2 = 0.5
918+
// The third fixture is silently ignored, deflating the score!
919+
// On mismatch, should fall back to macro_f1 rather than give wrong answer.
920+
assert!(
921+
(agg.weighted_score - agg.macro_f1).abs() < 0.05,
922+
"Mismatched weights should fall back to macro_f1, got weighted={:.3} vs macro={:.3}",
923+
agg.weighted_score, agg.macro_f1
924+
);
925+
}
926+
927+
// BUG: FPR formula uses FP/(FP+TP) which equals 1-precision, making it redundant.
928+
// Should track total_tn and use FP/(FP+TN) for real false positive rate.
929+
#[test]
930+
fn test_fpr_uses_true_negatives() {
931+
let r1 = FixtureResult::compute("a", 10, 10, 10, 0, 1);
932+
// r1: tp=10, fp=1, fn=0, tn=10
933+
let r2 = FixtureResult::compute("b", 0, 10, 0, 0, 0);
934+
// r2: tp=0, fp=0, fn=0, tn=10
935+
936+
let agg = AggregateMetrics::compute(&[&r1, &r2], None);
937+
// total_tp=10, total_fp=1, total_fn=0
938+
// If total_tn were tracked: total_tn=20
939+
940+
let result = BenchmarkResult {
941+
suite_name: "test".to_string(),
942+
fixture_results: vec![r1, r2],
943+
aggregate: agg,
944+
by_category: HashMap::new(),
945+
by_difficulty: HashMap::new(),
946+
threshold_pass: true,
947+
threshold_failures: vec![],
948+
timestamp: "2024-01-01".to_string(),
949+
};
950+
951+
// With real FPR = FP/(FP+TN) = 1/(1+20) = 0.048 → passes 0.06 threshold
952+
// With buggy FPR = FP/(FP+TP) = 1/(1+10) = 0.091 → fails 0.06 threshold
953+
let thresholds = BenchmarkThresholds {
954+
max_false_positive_rate: 0.06,
955+
min_precision: 0.0,
956+
min_recall: 0.0,
957+
min_f1: 0.0,
958+
min_weighted_score: 0.0,
959+
};
960+
961+
let (pass, failures) = evaluate_against_thresholds(&result, &thresholds);
962+
assert!(
963+
pass,
964+
"FPR should be ~0.048 (FP/(FP+TN)), not 0.091 (FP/(FP+TP)): {:?}",
965+
failures
966+
);
967+
}
968+
896969
// BUG: compare_results never populates improvements vector
897970
#[test]
898971
fn test_compare_results_detects_improvements() {

src/core/git_history.rs

Lines changed: 93 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,10 @@ impl GitHistoryAnalyzer {
119119
info.bug_fix_count += 1;
120120
}
121121

122-
if info
123-
.last_modified
124-
.as_ref()
125-
.is_none_or(|d| entry.date > *d)
126-
{
122+
if info.last_modified.as_ref().is_none_or(|d| {
123+
parse_date_for_comparison(&entry.date)
124+
> parse_date_for_comparison(d)
125+
}) {
127126
info.last_modified = Some(entry.date.clone());
128127
}
129128
}
@@ -288,6 +287,55 @@ impl GitHistoryAnalyzer {
288287
}
289288
}
290289

290+
/// Parse a date string into a comparable tuple (year, month, day, time).
291+
/// Handles both git default format ("Wed Jan 1 12:00:00 2025 +0000")
292+
/// and ISO format ("2025-01-01 12:00:00 +0000").
293+
fn parse_date_for_comparison(date: &str) -> (i32, u32, u32, String) {
294+
let parts: Vec<&str> = date.split_whitespace().collect();
295+
296+
// ISO format: "2025-01-01 ..."
297+
if parts
298+
.first()
299+
.is_some_and(|p| p.contains('-') && p.len() == 10)
300+
{
301+
// ISO dates sort correctly as strings, but parse for consistency
302+
let date_parts: Vec<&str> = parts[0].split('-').collect();
303+
if date_parts.len() == 3 {
304+
let year = date_parts[0].parse::<i32>().unwrap_or(0);
305+
let month = date_parts[1].parse::<u32>().unwrap_or(0);
306+
let day = date_parts[2].parse::<u32>().unwrap_or(0);
307+
let time = parts.get(1).unwrap_or(&"").to_string();
308+
return (year, month, day, time);
309+
}
310+
}
311+
312+
// Git default format: "Wed Jan 1 12:00:00 2025 +0000"
313+
if parts.len() >= 5 {
314+
let month = match parts[1] {
315+
"Jan" => 1,
316+
"Feb" => 2,
317+
"Mar" => 3,
318+
"Apr" => 4,
319+
"May" => 5,
320+
"Jun" => 6,
321+
"Jul" => 7,
322+
"Aug" => 8,
323+
"Sep" => 9,
324+
"Oct" => 10,
325+
"Nov" => 11,
326+
"Dec" => 12,
327+
_ => 0,
328+
};
329+
let day = parts[2].parse::<u32>().unwrap_or(0);
330+
let year = parts[4].parse::<i32>().unwrap_or(0);
331+
let time = parts[3].to_string();
332+
return (year, month, day, time);
333+
}
334+
335+
// Fallback: use the raw string (best effort)
336+
(0, 0, 0, date.to_string())
337+
}
338+
291339
/// Detect if a commit message indicates a bug fix.
292340
fn is_bug_fix_commit(message: &str) -> bool {
293341
let lower = message.to_lowercase();
@@ -640,6 +688,46 @@ Date: Tue Jan 2 12:00:00 2024 +0000
640688
);
641689
}
642690

691+
// BUG: last_modified uses string comparison which is lexicographic, not chronological.
692+
// "Wed Jan 1" > "Mon Feb 1" because 'W' > 'M', but Feb 1 is newer than Jan 1.
693+
#[test]
694+
fn test_last_modified_chronological_order() {
695+
let mut analyzer = GitHistoryAnalyzer::new();
696+
analyzer.ingest_log(vec![
697+
GitLogEntry {
698+
hash: "newer".to_string(),
699+
author: "alice".to_string(),
700+
date: "Mon Feb 3 12:00:00 2025 +0000".to_string(), // NEWER
701+
message: "newer commit".to_string(),
702+
files_changed: vec![FileChange {
703+
file_path: PathBuf::from("f.rs"),
704+
lines_added: 1,
705+
lines_removed: 0,
706+
}],
707+
},
708+
GitLogEntry {
709+
hash: "older".to_string(),
710+
author: "bob".to_string(),
711+
date: "Wed Jan 1 12:00:00 2025 +0000".to_string(), // OLDER
712+
message: "older commit".to_string(),
713+
files_changed: vec![FileChange {
714+
file_path: PathBuf::from("f.rs"),
715+
lines_added: 1,
716+
lines_removed: 0,
717+
}],
718+
},
719+
]);
720+
721+
let info = analyzer.file_info(Path::new("f.rs")).unwrap();
722+
// Feb 3 is newer than Jan 1, so last_modified should contain "Feb"
723+
// Bug: string comparison "Wed Jan..." > "Mon Feb..." because 'W' > 'M'
724+
assert!(
725+
info.last_modified.as_ref().unwrap().contains("Feb"),
726+
"last_modified should be Feb 3 (newer), got: {}",
727+
info.last_modified.as_ref().unwrap()
728+
);
729+
}
730+
643731
// BUG: commit_count accumulates but distinct_authors only counts latest batch
644732
#[test]
645733
fn test_cumulative_commit_count() {

src/core/offline.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,12 @@ fn truncate_to_tokens(text: &str, max_tokens: usize) -> String {
305305
return text.to_string();
306306
}
307307

308-
let mut truncated = text[..max_chars].to_string();
308+
// Find a valid UTF-8 char boundary at or before max_chars
309+
let mut end = max_chars;
310+
while end > 0 && !text.is_char_boundary(end) {
311+
end -= 1;
312+
}
313+
let mut truncated = text[..end].to_string();
309314
truncated.push_str("\n[Truncated for local model context window]");
310315
truncated
311316
}
@@ -648,6 +653,21 @@ mod tests {
648653
);
649654
}
650655

656+
// BUG: truncate_to_tokens slices at byte offset without char boundary check
657+
#[test]
658+
fn test_truncate_to_tokens_utf8_safety() {
659+
// '€' is 3 bytes (U+20AC). With context_window=504, budget=4,
660+
// system_budget=1, max_chars=4. "€€" is 6 bytes.
661+
// text[..4] lands inside the second '€' (bytes 3-5), panicking.
662+
let system = "€€"; // 6 bytes
663+
let user = "short";
664+
// context_window=504 → budget=4, system_budget=1, user_budget=3
665+
// system max_chars=4, which falls mid-char in "€€"
666+
let (sys, _usr) = optimize_prompt_for_local(system, user, 504);
667+
// Should not panic; result must be valid UTF-8
668+
assert!(!sys.is_empty() || sys.contains("Truncated") || sys.is_empty());
669+
}
670+
651671
// BUG: validate() doesn't check for obviously wrong base_url formats
652672
#[test]
653673
fn test_validate_rejects_invalid_url() {

0 commit comments

Comments
 (0)