Skip to content

Commit a24cf55

Browse files
fix: avoid unicode filepath suffix panic (#393)
* fix: avoid unicode filepath suffix panic
* fix: prevent unicode char boundary panics in all constraint functions

The previous fix (a09292e) only guarded path_ends_with_suffix with path.get(start..), but three problems remained:

1. path_ends_with_suffix: path_bytes[start - 1] reads inside a multi-byte char when start is a valid boundary but start - 1 is not. Fixed by scanning backward to find the preceding ASCII byte.
2. path_contains_segment: path[..segment_len] and path[start..end] slice at non-char-boundary offsets when segment is ASCII but the path contains multi-byte UTF-8 (Korean, etc.). Fixed with is_char_boundary() checks before each slice.
3. file_has_extension: same byte-offset issue for dot_pos. Fixed with is_char_boundary() check.

Adds regression tests with the exact Korean filenames that caused panics (커리큘럼, 세부_커리큘럼_최종, 설치-및-기본-사용, etc.). Merges upstream unicode tests (apostrophe, narrow-space mismatches).

---------

Co-authored-by: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com>
1 parent 464f9d8 commit a24cf55

1 file changed

Lines changed: 128 additions & 4 deletions

File tree

crates/fff-core/src/constraints.rs

Lines changed: 128 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,34 @@ pub fn path_ends_with_suffix(path: &str, suffix: &str) -> bool {
5656
if path_bytes.len() < suffix_bytes.len() {
5757
return false;
5858
}
59-
let start = path_bytes.len() - suffix_bytes.len();
60-
if !path_bytes[start..].eq_ignore_ascii_case(suffix_bytes) {
59+
60+
let start = path.len() - suffix.len();
61+
62+
// Must land on a char boundary — multi-byte UTF-8 can make
63+
// `path.len() - suffix.len()` land inside a char.
64+
if !path.is_char_boundary(start) {
65+
return false;
66+
}
67+
68+
if !path[start..].eq_ignore_ascii_case(suffix) {
6169
return false;
6270
}
63-
// Exact match, or the character before is /
64-
start == 0 || path_bytes[start - 1] == b'/'
71+
72+
// Exact match, or the character before is '/'.
73+
// `start` is a char boundary but `start - 1` may be inside a multi-byte
74+
// char, so scan backward to the nearest preceding ASCII byte (any non-ASCII bytes in between are skipped).
75+
if start == 0 {
76+
return true;
77+
}
78+
let mut i = start;
79+
while i > 0 {
80+
i -= 1;
81+
// ASCII bytes (0..128) are single-byte UTF-8 code units
82+
if path_bytes[i] < 128 {
83+
return path_bytes[i] == b'/';
84+
}
85+
}
86+
false
6587
}
6688

6789
#[inline]
@@ -72,6 +94,11 @@ pub fn file_has_extension(file_name: &str, ext: &str) -> bool {
7294
return false;
7395
}
7496
let start = name_bytes.len() - ext_bytes.len() - 1;
97+
// `.` is ASCII (single byte), so `start` must be a char boundary.
98+
// If it lands inside a multi-byte char the extension can't match.
99+
if start > 0 && !file_name.is_char_boundary(start) {
100+
return false;
101+
}
75102
name_bytes.get(start) == Some(&b'.') && name_bytes[start + 1..].eq_ignore_ascii_case(ext_bytes)
76103
}
77104

@@ -85,6 +112,7 @@ pub fn path_contains_segment(path: &str, segment: &str) -> bool {
85112
// Check segment/ at start of path
86113
if path_bytes.len() > segment_len
87114
&& path_bytes.get(segment_len) == Some(&b'/')
115+
&& path.is_char_boundary(segment_len)
88116
&& path_bytes[..segment_len].eq_ignore_ascii_case(segment_bytes)
89117
{
90118
return true;
@@ -101,6 +129,8 @@ pub fn path_contains_segment(path: &str, segment: &str) -> bool {
101129
let end = start + segment_len;
102130
if end < path_bytes.len()
103131
&& path_bytes[end] == b'/'
132+
&& path.is_char_boundary(start)
133+
&& path.is_char_boundary(end)
104134
&& path_bytes[start..end].eq_ignore_ascii_case(segment_bytes)
105135
{
106136
return true;
@@ -410,6 +440,28 @@ fn match_glob_pattern(pattern: &str, paths: &[&str]) -> AHashSet<usize> {
410440
mod tests {
411441
use super::*;
412442

443+
#[derive(Clone)]
444+
struct TestItem {
445+
relative_path: &'static str,
446+
file_name: &'static str,
447+
}
448+
449+
impl Constrainable for TestItem {
450+
fn write_file_name(&self, _arena: ArenaPtr, out: &mut String) {
451+
out.clear();
452+
out.push_str(self.file_name);
453+
}
454+
455+
fn write_relative_path(&self, _arena: ArenaPtr, out: &mut String) {
456+
out.clear();
457+
out.push_str(self.relative_path);
458+
}
459+
460+
fn git_status(&self) -> Option<git2::Status> {
461+
None
462+
}
463+
}
464+
413465
#[test]
414466
fn test_file_has_extension() {
415467
assert!(file_has_extension("file.rs", "rs"));
@@ -524,6 +576,15 @@ mod tests {
524576
assert!(path_ends_with_suffix("crates/src/main.rs", "src/main.rs"));
525577
}
526578

579+
#[test]
580+
fn test_path_ends_with_suffix_does_not_panic_on_unicode_suffix() {
581+
assert!(!path_ends_with_suffix("유니코드_파일_테스트.csv", "트.c"));
582+
assert!(path_ends_with_suffix(
583+
"data/유니코드_파일_테스트.csv",
584+
"유니코드_파일_테스트.csv"
585+
));
586+
}
587+
527588
#[test]
528589
fn test_path_ends_with_suffix_unicode_apostrophe_mismatch() {
529590
assert!(!path_ends_with_suffix(
@@ -540,6 +601,12 @@ mod tests {
540601
));
541602
}
542603

604+
#[test]
605+
fn test_path_contains_segment_does_not_panic_on_unicode_segment() {
606+
assert!(!path_contains_segment("문서/notes.txt", "문x"));
607+
assert!(path_contains_segment("프로젝트/문서/notes.txt", "문서"));
608+
}
609+
543610
#[test]
544611
fn test_path_contains_segment_unicode_no_panic() {
545612
assert!(!path_contains_segment(
@@ -552,4 +619,61 @@ mod tests {
552619
fn test_file_has_extension_unicode_no_panic() {
553620
assert!(!file_has_extension("cat\u{00e9}.rs", "s"));
554621
}
622+
623+
#[test]
624+
fn test_file_has_extension_unicode_filename() {
625+
assert!(file_has_extension("운영-가이드.md", "md"));
626+
assert!(file_has_extension("테스트.csv", "csv"));
627+
assert!(!file_has_extension("테스트.csv", "md"));
628+
}
629+
630+
#[test]
631+
fn test_apply_constraints_file_path_with_unicode_suffix() {
632+
let arena_ptr = ArenaPtr(std::ptr::null());
633+
634+
let item = TestItem {
635+
relative_path: "data/유니코드_파일_테스트.csv",
636+
file_name: "유니코드_파일_테스트.csv",
637+
};
638+
639+
let exact = [Constraint::FilePath("유니코드_파일_테스트.csv")];
640+
let mismatch = [Constraint::FilePath("트.c")];
641+
642+
let exact_items = [item.clone()];
643+
let exact_matches =
644+
apply_constraints(&exact_items, &exact, arena_ptr).expect("constraints applied");
645+
assert_eq!(exact_matches.len(), 1);
646+
647+
let mismatch_items = [item];
648+
let mismatch_matches =
649+
apply_constraints(&mismatch_items, &mismatch, arena_ptr).expect("constraints applied");
650+
assert!(mismatch_matches.is_empty());
651+
}
652+
653+
#[test]
654+
fn test_unicode_path_no_panic_real_korean_cases() {
655+
// Real Korean paths that caused panics
656+
let path1 = "Downloads/(커리큘럼) hermes agent_정승현님 - 1차 커리큘럼 (강사님 작성).csv";
657+
let path2 = "hermes-agent-lecture-materials/세부_커리큘럼_최종.csv";
658+
let path3 = "projects/fastcampus-hermes-agent-curriculum/chapters/part-02-Hermes-설치-및-기본-사용/section-02-doctor로-설치-상태-검증/research/03-fix가-자동-수정하는-것과-못하는-것.md";
659+
660+
// These must not panic regardless of segment/suffix used
661+
assert!(!path_contains_segment(path1, "작성"));
662+
assert!(!path_ends_with_suffix(path1, "작성.csv"));
663+
assert!(!path_contains_segment(path2, "최종"));
664+
assert!(!path_ends_with_suffix(path2, "최종.csv"));
665+
assert!(!path_contains_segment(path3, "수정"));
666+
assert!(!path_ends_with_suffix(path3, "것.md"));
667+
668+
// Positive cases should still work
669+
assert!(path_contains_segment(
670+
path2,
671+
"hermes-agent-lecture-materials"
672+
));
673+
assert!(path_ends_with_suffix(
674+
path1,
675+
"(커리큘럼) hermes agent_정승현님 - 1차 커리큘럼 (강사님 작성).csv"
676+
));
677+
assert!(path_ends_with_suffix(path2, "세부_커리큘럼_최종.csv"));
678+
}
555679
}

0 commit comments

Comments
 (0)