Skip to content

Commit 98fd934

Browse files
committed
fix: Unicode segmentation crash
closes #369
1 parent ea1f980 commit 98fd934

2 files changed

Lines changed: 49 additions & 16 deletions

File tree

crates/fff-core/src/bigram_query.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ impl HirInfo {
164164
}
165165

166166
/// Prefilter fuzzy query. The algorithm is the following:
167-
/// we allow max_typs = min(len/3,2) every typo destroys at most 2 consecutive bigrams
167+
/// we allow max_typos = min(len/3, 2); every typo destroys at most 2 consecutive bigrams.
168168
/// So out of N bigrams at least N - 2 * max_typos have to be present in the matching file
169169
pub(crate) fn fuzzy_to_bigram_query(query: &str, num_probes: usize) -> BigramQuery {
170170
let lower: Vec<u8> = query.bytes().map(|b| b.to_ascii_lowercase()).collect();

crates/fff-core/src/constraints.rs

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,26 +66,29 @@ pub trait Constrainable {
6666
/// - `path_ends_with_suffix("xlibswscale/input.c", "libswscale/input.c")` → false (no boundary)
6767
#[inline]
6868
pub fn path_ends_with_suffix(path: &str, suffix: &str) -> bool {
69-
if path.len() < suffix.len() {
69+
let path_bytes = path.as_bytes();
70+
let suffix_bytes = suffix.as_bytes();
71+
if path_bytes.len() < suffix_bytes.len() {
7072
return false;
7173
}
72-
let start = path.len() - suffix.len();
73-
if !path[start..].eq_ignore_ascii_case(suffix) {
74+
let start = path_bytes.len() - suffix_bytes.len();
75+
if !path_bytes[start..].eq_ignore_ascii_case(suffix_bytes) {
7476
return false;
7577
}
7678
// Exact match, or the character before is /
77-
start == 0 || path.as_bytes()[start - 1] == b'/'
79+
start == 0 || path_bytes[start - 1] == b'/'
7880
}
7981

8082
/// Check if file extension matches (without allocation)
8183
#[inline]
8284
pub fn file_has_extension(file_name: &str, ext: &str) -> bool {
83-
if file_name.len() <= ext.len() + 1 {
85+
let name_bytes = file_name.as_bytes();
86+
let ext_bytes = ext.as_bytes();
87+
if name_bytes.len() <= ext_bytes.len() + 1 {
8488
return false;
8589
}
86-
let start = file_name.len() - ext.len() - 1;
87-
file_name.as_bytes().get(start) == Some(&b'.')
88-
&& file_name[start + 1..].eq_ignore_ascii_case(ext)
90+
let start = name_bytes.len() - ext_bytes.len() - 1;
91+
name_bytes.get(start) == Some(&b'.') && name_bytes[start + 1..].eq_ignore_ascii_case(ext_bytes)
8992
}
9093

9194
/// Check if path contains segment (without allocation)
@@ -94,28 +97,29 @@ pub fn file_has_extension(file_name: &str, ext: &str) -> bool {
9497
#[inline]
9598
pub fn path_contains_segment(path: &str, segment: &str) -> bool {
9699
let path_bytes = path.as_bytes();
97-
let segment_len = segment.len();
100+
let segment_bytes = segment.as_bytes();
101+
let segment_len = segment_bytes.len();
98102

99103
// Check segment/ at start of path
100-
if path.len() > segment_len
104+
if path_bytes.len() > segment_len
101105
&& path_bytes.get(segment_len) == Some(&b'/')
102-
&& path[..segment_len].eq_ignore_ascii_case(segment)
106+
&& path_bytes[..segment_len].eq_ignore_ascii_case(segment_bytes)
103107
{
104108
return true;
105109
}
106110

107111
// Check /segment/ anywhere using byte scanning
108-
if path.len() < segment_len + 2 {
112+
if path_bytes.len() < segment_len + 2 {
109113
return false;
110114
}
111115

112-
for i in 0..path.len().saturating_sub(segment_len + 1) {
116+
for i in 0..path_bytes.len().saturating_sub(segment_len + 1) {
113117
if path_bytes[i] == b'/' {
114118
let start = i + 1;
115119
let end = start + segment_len;
116-
if end < path.len()
120+
if end < path_bytes.len()
117121
&& path_bytes[end] == b'/'
118-
&& path[start..end].eq_ignore_ascii_case(segment)
122+
&& path_bytes[start..end].eq_ignore_ascii_case(segment_bytes)
119123
{
120124
return true;
121125
}
@@ -469,4 +473,33 @@ mod tests {
469473
assert!(path_ends_with_suffix("src/main.rs", "src/main.rs"));
470474
assert!(path_ends_with_suffix("crates/src/main.rs", "src/main.rs"));
471475
}
476+
477+
#[test]
478+
fn test_path_ends_with_suffix_unicode_apostrophe_mismatch() {
479+
assert!(!path_ends_with_suffix(
480+
"dir/\u{2019}bar/file.txt",
481+
"'bar/file.txt"
482+
));
483+
}
484+
485+
#[test]
486+
fn test_path_ends_with_suffix_unicode_space_mismatch() {
487+
assert!(!path_ends_with_suffix(
488+
"dir/\u{202f}am/file.txt",
489+
" am/file.txt"
490+
));
491+
}
492+
493+
#[test]
494+
fn test_path_contains_segment_unicode_no_panic() {
495+
assert!(!path_contains_segment(
496+
"Library/Cloud/Project\u{2019}s Folder/books.ttl",
497+
"Project's Folder"
498+
));
499+
}
500+
501+
#[test]
502+
fn test_file_has_extension_unicode_no_panic() {
503+
assert!(!file_has_extension("cat\u{00e9}.rs", "s"));
504+
}
472505
}

0 commit comments

Comments
 (0)