Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.

Commit cccf154

Browse files
authored
Fix/toploc routing w similar model (#386)
* improve toploc config matching
1 parent add3c1a commit cccf154

1 file changed

Lines changed: 98 additions & 12 deletions

File tree

  • crates/validator/src/validators/synthetic_data

crates/validator/src/validators/synthetic_data/toploc.rs

Lines changed: 98 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,12 @@ impl Toploc {
7171
/// Reports whether `file_name` is accepted by this instance's `file_prefix_filter`.
///
/// The name is first passed through `self.normalize_path`. When no filter is configured,
/// every name matches. When a filter is configured, the version added by this change accepts
/// the normalized name only if it equals the prefix exactly or continues with `/` immediately
/// after the prefix; the comparison is a plain string comparison, so it is case sensitive.
pub fn matches_file_name(&self, file_name: &str) -> bool {
7272
let normalized_name = self.normalize_path(file_name);
7373
match &self.config.file_prefix_filter {
74-
// Removed by this change: a bare string-prefix test with no boundary check.
Some(prefix) => normalized_name.starts_with(prefix),
74+
// Added by this change: exact match, or the prefix followed by a `/` separator.
Some(prefix) => {
75+
normalized_name == *prefix || {
76+
normalized_name.starts_with(prefix)
77+
// `&&` short-circuits, so the slice below is only taken after `starts_with` succeeded;
// `prefix.len()` is then within bounds and on a char boundary of `normalized_name`.
&& normalized_name[prefix.len()..].starts_with('/')
78+
}
79+
}
7580
// No filter configured: accept every file name.
None => true,
7681
}
7782
}
@@ -588,20 +593,101 @@ mod tests {
588593
assert_eq!(group_result.failing_indices, vec![1, 3, 5]);
589594
Ok(())
590595
}
591-
592596
// Checks `matches_file_name` against several configs whose `file_prefix_filter` values share
// leading characters. For every test case the configs are tried in order and the first one
// that matches is recorded; the case expects either that index or no match at all.
#[tokio::test]
593597
async fn test_file_prefix_filter_matching() {
594-
// Removed by this change: the previous version of the test used a single config.
let config = ToplocConfig {
595-
server_url: "http://test".to_string(),
596-
auth_token: None,
597-
file_prefix_filter: Some("Qwen3".to_string()),
598-
};
599-
let toploc = Toploc::new(config, None);
598+
// Position in this vec is the index that `test_cases` refers to via `Some(idx)`.
let configs = vec![
599+
ToplocConfig {
600+
server_url: "http://test".to_string(),
601+
auth_token: None,
602+
file_prefix_filter: Some("Qwen/Qwen3-235B-A22B".to_string()),
603+
},
604+
ToplocConfig {
605+
server_url: "http://test".to_string(),
606+
auth_token: None,
607+
file_prefix_filter: Some("Qwen/Qwen3-32B".to_string()),
608+
},
609+
ToplocConfig {
610+
server_url: "http://test".to_string(),
611+
auth_token: None,
612+
file_prefix_filter: Some("Qwen/Qwen3-30B-A3B".to_string()),
613+
},
614+
ToplocConfig {
615+
server_url: "http://test".to_string(),
616+
auth_token: None,
617+
file_prefix_filter: Some("Qwen/Qwen3-14B".to_string()),
618+
},
619+
// Indices 4 and 5: the first filter is a leading substring of the second.
ToplocConfig {
620+
server_url: "http://test".to_string(),
621+
auth_token: None,
622+
file_prefix_filter: Some("deepseek-ai/DeepSeek-R1-0528".to_string()),
623+
},
624+
ToplocConfig {
625+
server_url: "http://test".to_string(),
626+
auth_token: None,
627+
file_prefix_filter: Some("deepseek-ai/DeepSeek-R1-0528-Qwen3-8B".to_string()),
628+
},
629+
];
630+
631+
// Each entry is (file name, expected index into `configs`), with `None` meaning that no
// config should match.
let test_cases = vec![
632+
// Test Qwen 235B model
633+
("Qwen/Qwen3-235B-A22B/data.parquet", Some(0)),
634+
("Qwen/Qwen3-235B-A22B", Some(0)),
635+
("Qwen/Qwen3-235B-A22B-extra/data.parquet", None),
636+
("qwen/qwen3-235b-a22b/data.parquet", None), // Case sensitive
637+
// Test Qwen 32B model
638+
("Qwen/Qwen3-32B/data.parquet", Some(1)),
639+
("Qwen/Qwen3-32B", Some(1)),
640+
("Qwen/Qwen3-32B-extra/data.parquet", None),
641+
// Test Qwen 30B model
642+
("Qwen/Qwen3-30B-A3B/data.parquet", Some(2)),
643+
("Qwen/Qwen3-30B-A3B", Some(2)),
644+
("Qwen/Qwen3-30B-A3B-extra/data.parquet", None),
645+
// Test Qwen 14B model
646+
("Qwen/Qwen3-14B/data.parquet", Some(3)),
647+
("Qwen/Qwen3-14B", Some(3)),
648+
("Qwen/Qwen3-14B-extra/data.parquet", None),
649+
// Test DeepSeek base model
650+
("deepseek-ai/DeepSeek-R1-0528/data.parquet", Some(4)),
651+
("deepseek-ai/DeepSeek-R1-0528", Some(4)),
652+
// Config 4 is tried before config 5 in the loop below, so this case only passes if the
// shorter filter at index 4 rejects the longer model name.
(
653+
"deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/data.parquet",
654+
Some(5),
655+
),
656+
("deepseek-ai/deepseek-r1-0528/data.parquet", None), // Case sensitive
657+
];
658+
659+
for (test_file, expected_match) in test_cases {
660+
let mut matched = false;
661+
let mut matched_idx = None;
662+
663+
// The first matching config wins; the `break` means later configs are not consulted.
for (idx, config) in configs.iter().enumerate() {
664+
let toploc = Toploc::new(config.clone(), None);
665+
if toploc.matches_file_name(test_file) {
666+
matched = true;
667+
matched_idx = Some(idx);
668+
break;
669+
}
670+
}
600671

601-
// Removed by this change: assertions for the old single-config test.
assert!(toploc.matches_file_name("Qwen3-model-data.parquet"));
602-
assert!(toploc.matches_file_name("Qwen3"));
603-
assert!(!toploc.matches_file_name("GPT4-model-data.parquet"));
604-
assert!(!toploc.matches_file_name("qwen3-lowercase.parquet")); // Case sensitive
672+
match expected_match {
673+
Some(expected_idx) => {
674+
assert!(
675+
matched,
676+
"Expected file {} to match config {}",
677+
test_file, expected_idx
678+
);
679+
assert_eq!(
680+
matched_idx,
681+
Some(expected_idx),
682+
"File {} matched config {} but expected {}",
683+
test_file,
684+
684+
// `matched_idx` is set together with `matched`, which the assert above already
// required to be true, so this `unwrap` sees a `Some`.
matched_idx.unwrap(),
685+
expected_idx
686+
);
687+
}
688+
None => assert!(!matched, "File {} should not match any config", test_file),
689+
}
690+
}
605691
}
606692

607693
#[tokio::test]

0 commit comments

Comments
 (0)