Skip to content

Commit 91cd463

Browse files
committed
fix: prevent unicode char boundary panics in all constraint functions
The previous fix (a09292e) only guarded path_ends_with_suffix with path.get(start..), but three problems remained: 1. path_ends_with_suffix: path_bytes[start - 1] reads inside a multi-byte char when start is a valid boundary but start-1 is not. Fixed by scanning backward to find the preceding ASCII byte. 2. path_contains_segment: path[..segment_len] and path[start..end] slice at non-char-boundary offsets when segment is ASCII but the path contains multi-byte UTF-8 (Korean, etc). Fixed with is_char_boundary() checks before each slice. 3. file_has_extension: same byte-offset issue for dot_pos. Fixed with is_char_boundary() check. Adds regression tests with the exact Korean filenames that caused panics (커리큘럼, 세부_커리큘럼_최종, 설치-및-기본-사용, etc). Merges upstream unicode tests (apostrophe, narrow-space mismatches).
1 parent 4586c24 commit 91cd463

1 file changed

Lines changed: 73 additions & 42 deletions

File tree

crates/fff-core/src/constraints.rs

Lines changed: 73 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -58,16 +58,32 @@ pub fn path_ends_with_suffix(path: &str, suffix: &str) -> bool {
5858
}
5959

6060
let start = path.len() - suffix.len();
61-
let Some(path_suffix) = path.get(start..) else {
61+
62+
// Must land on a char boundary — multi-byte UTF-8 can make
63+
// `path.len() - suffix.len()` land inside a char.
64+
if !path.is_char_boundary(start) {
6265
return false;
63-
};
66+
}
6467

65-
if !path_suffix.eq_ignore_ascii_case(suffix) {
68+
if !path[start..].eq_ignore_ascii_case(suffix) {
6669
return false;
6770
}
6871

69-
// Exact match, or the character before is /
70-
start == 0 || path_bytes[start - 1] == b'/'
72+
// Exact match, or the character before is '/'.
73+
// `start` is a char boundary but `start - 1` may be inside a multi-byte
74+
// char, so scan backward to find the preceding ASCII byte.
75+
if start == 0 {
76+
return true;
77+
}
78+
let mut i = start;
79+
while i > 0 {
80+
i -= 1;
81+
// ASCII bytes (0..128) are single-byte UTF-8 code units
82+
if path_bytes[i] < 128 {
83+
return path_bytes[i] == b'/';
84+
}
85+
}
86+
false
7187
}
7288

7389
#[inline]
@@ -78,6 +94,11 @@ pub fn file_has_extension(file_name: &str, ext: &str) -> bool {
7894
return false;
7995
}
8096
let start = name_bytes.len() - ext_bytes.len() - 1;
97+
// `.` is ASCII (single byte), so `start` must be a char boundary.
98+
// If it lands inside a multi-byte char the extension can't match.
99+
if start > 0 && !file_name.is_char_boundary(start) {
100+
return false;
101+
}
81102
name_bytes.get(start) == Some(&b'.') && name_bytes[start + 1..].eq_ignore_ascii_case(ext_bytes)
82103
}
83104

@@ -91,6 +112,7 @@ pub fn path_contains_segment(path: &str, segment: &str) -> bool {
91112
// Check segment/ at start of path
92113
if path_bytes.len() > segment_len
93114
&& path_bytes.get(segment_len) == Some(&b'/')
115+
&& path.is_char_boundary(segment_len)
94116
&& path_bytes[..segment_len].eq_ignore_ascii_case(segment_bytes)
95117
{
96118
return true;
@@ -107,6 +129,8 @@ pub fn path_contains_segment(path: &str, segment: &str) -> bool {
107129
let end = start + segment_len;
108130
if end < path_bytes.len()
109131
&& path_bytes[end] == b'/'
132+
&& path.is_char_boundary(start)
133+
&& path.is_char_boundary(end)
110134
&& path_bytes[start..end].eq_ignore_ascii_case(segment_bytes)
111135
{
112136
return true;
@@ -422,12 +446,14 @@ mod tests {
422446
}
423447

424448
impl Constrainable for TestItem {
425-
fn relative_path(&self) -> &str {
426-
self.relative_path
449+
fn write_file_name(&self, _arena: ArenaPtr, out: &mut String) {
450+
out.clear();
451+
out.push_str(self.file_name);
427452
}
428453

429-
fn file_name(&self) -> &str {
430-
self.file_name
454+
fn write_relative_path(&self, _arena: ArenaPtr, out: &mut String) {
455+
out.clear();
456+
out.push_str(self.relative_path);
431457
}
432458

433459
fn git_status(&self) -> Option<git2::Status> {
@@ -550,58 +576,36 @@ mod tests {
550576
}
551577

552578
#[test]
553-
<<<<<<< HEAD
554579
fn test_path_ends_with_suffix_does_not_panic_on_unicode_suffix() {
555580
assert!(!path_ends_with_suffix("유니코드_파일_테스트.csv", "트.c"));
556581
assert!(path_ends_with_suffix(
557582
"data/유니코드_파일_테스트.csv",
558583
"유니코드_파일_테스트.csv"
559-
=======
584+
));
585+
}
586+
587+
#[test]
560588
fn test_path_ends_with_suffix_unicode_apostrophe_mismatch() {
561589
assert!(!path_ends_with_suffix(
562590
"dir/\u{2019}bar/file.txt",
563591
"'bar/file.txt"
564-
>>>>>>> origin/main
565592
));
566593
}
567594

568595
#[test]
569-
<<<<<<< HEAD
570-
fn test_path_contains_segment_does_not_panic_on_unicode_segment() {
571-
assert!(!path_contains_segment("문서/notes.txt", "문x"));
572-
assert!(path_contains_segment("프로젝트/문서/notes.txt", "문서"));
573-
}
574-
575-
#[test]
576-
fn test_apply_constraints_file_path_with_unicode_suffix() {
577-
let item = TestItem {
578-
relative_path: "data/유니코드_파일_테스트.csv",
579-
file_name: "유니코드_파일_테스트.csv",
580-
};
581-
582-
let exact = [Constraint::FilePath("유니코드_파일_테스트.csv")];
583-
let mismatch = [Constraint::FilePath("트.c")];
584-
585-
let exact_items = [item];
586-
let exact_matches = apply_constraints(&exact_items, &exact).expect("constraints applied");
587-
assert_eq!(exact_matches.len(), 1);
588-
589-
let mismatch_item = TestItem {
590-
relative_path: "data/유니코드_파일_테스트.csv",
591-
file_name: "유니코드_파일_테스트.csv",
592-
};
593-
let mismatch_items = [mismatch_item];
594-
let mismatch_matches =
595-
apply_constraints(&mismatch_items, &mismatch).expect("constraints applied");
596-
assert!(mismatch_matches.is_empty());
597-
=======
598596
fn test_path_ends_with_suffix_unicode_space_mismatch() {
599597
assert!(!path_ends_with_suffix(
600598
"dir/\u{202f}am/file.txt",
601599
" am/file.txt"
602600
));
603601
}
604602

603+
#[test]
604+
fn test_path_contains_segment_does_not_panic_on_unicode_segment() {
605+
assert!(!path_contains_segment("문서/notes.txt", "문x"));
606+
assert!(path_contains_segment("프로젝트/문서/notes.txt", "문서"));
607+
}
608+
605609
#[test]
606610
fn test_path_contains_segment_unicode_no_panic() {
607611
assert!(!path_contains_segment(
@@ -613,6 +617,33 @@ mod tests {
613617
#[test]
614618
fn test_file_has_extension_unicode_no_panic() {
615619
assert!(!file_has_extension("cat\u{00e9}.rs", "s"));
616-
>>>>>>> origin/main
620+
}
621+
622+
#[test]
623+
fn test_file_has_extension_unicode_filename() {
624+
assert!(file_has_extension("운영-가이드.md", "md"));
625+
assert!(file_has_extension("테스트.csv", "csv"));
626+
assert!(!file_has_extension("테스트.csv", "md"));
627+
}
628+
629+
#[test]
630+
fn test_unicode_path_no_panic_real_korean_cases() {
631+
// Real Korean paths that caused panics
632+
let path1 = "Downloads/(커리큘럼) hermes agent_정승현님 - 1차 커리큘럼 (강사님 작성).csv";
633+
let path2 = "hermes-agent-lecture-materials/세부_커리큘럼_최종.csv";
634+
let path3 = "projects/fastcampus-hermes-agent-curriculum/chapters/part-02-Hermes-설치-및-기본-사용/section-02-doctor로-설치-상태-검증/research/03-fix가-자동-수정하는-것과-못하는-것.md";
635+
636+
// These must not panic regardless of segment/suffix used
637+
assert!(!path_contains_segment(path1, "작성"));
638+
assert!(!path_ends_with_suffix(path1, "작성.csv"));
639+
assert!(!path_contains_segment(path2, "최종"));
640+
assert!(!path_ends_with_suffix(path2, "최종.csv"));
641+
assert!(!path_contains_segment(path3, "수정"));
642+
assert!(!path_ends_with_suffix(path3, "것.md"));
643+
644+
// Positive cases should still work
645+
assert!(path_contains_segment(path2, "hermes-agent-lecture-materials"));
646+
assert!(path_ends_with_suffix(path1, "(커리큘럼) hermes agent_정승현님 - 1차 커리큘럼 (강사님 작성).csv"));
647+
assert!(path_ends_with_suffix(path2, "세부_커리큘럼_최종.csv"));
617648
}
618649
}

0 commit comments

Comments
 (0)