-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathtokenizer.rs
More file actions
101 lines (93 loc) · 3.47 KB
/
Copy pathtokenizer.rs
File metadata and controls
101 lines (93 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
//! ASCII tokenizer for FTS — splits on `[^A-Za-z0-9]+` and lowercases.
//!
//! Resolves Phase 8 plan Q3 (ASCII MVP). Unicode-aware tokenization is
//! deferred to Phase 8.1 behind a `unicode` cargo feature; the limitation
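//! here is intentional. Non-ASCII bytes are treated as separators, which
//! means accented Latin words are split at the accent (`café` is indexed
//! as `caf`) and text written purely in CJK or other non-ASCII scripts
//! produces no tokens at all, so such content won't be properly searchable
//! until that follow-up lands.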
//!
//! No stemming and no stop-word removal (Q4 + Q5). BM25's IDF naturally
//! downweights common terms, and modern RAG pipelines rely on exact
//! lexical matches for technical retrieval.
/// Break `text` into lowercase ASCII-alphanumeric terms.
///
/// Every character outside `[A-Za-z0-9]` — whitespace, punctuation, and
/// any non-ASCII character — acts as a separator. Runs of separators
/// collapse, so no empty terms are ever produced; an empty string, or one
/// made up solely of separators, yields an empty `Vec`.
///
/// Terms are returned as owned `String`s rather than `&str` because the
/// posting list owns its term strings (see
/// [`super::posting_list::PostingList`]); handing back owned values matches
/// how the index stores them.
pub fn tokenize(text: &str) -> Vec<String> {
    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Adjacent, leading, or trailing separators leave empty slices behind.
        .filter(|piece| !piece.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
#[cfg(test)]
mod tests {
    //! Unit tests for [`tokenize`]. Expected tokens are written as `&str`
    //! array literals; `Vec<String>` compares against them directly.

    use super::*;

    /// Empty input and separator-only input both produce no tokens.
    #[test]
    fn empty_input_returns_empty_vec() {
        for input in ["", " ", "!!!---???"] {
            assert!(tokenize(input).is_empty());
        }
    }

    /// Punctuation and every kind of whitespace delimit tokens.
    #[test]
    fn splits_on_punctuation_and_whitespace() {
        assert_eq!(tokenize("hello, world!"), ["hello", "world"]);
        assert_eq!(tokenize("a\tb\nc d"), ["a", "b", "c", "d"]);
    }

    /// Uppercase ASCII letters are folded to lowercase.
    #[test]
    fn lowercases_ascii_letters() {
        assert_eq!(tokenize("FooBar BAZ"), ["foobar", "baz"]);
    }

    /// Letters and digits belong to the same token; a hyphen does not.
    #[test]
    fn alphanumeric_runs_stay_together() {
        // Digits count as alphanumeric, so "rust2026" stays in one piece.
        assert_eq!(tokenize("rust2026"), ["rust2026"]);
        // The hyphen is a separator, so "co-op" becomes two tokens.
        assert_eq!(tokenize("co-op"), ["co", "op"]);
    }

    /// Non-ASCII input is handled as separators and never panics.
    #[test]
    fn non_ascii_bytes_act_as_separators_without_panicking() {
        // ASCII MVP per Q3: `é` (0xC3 0xA9 in UTF-8) is a separator, so
        // "café" yields just ["caf"]. This is the documented limitation.
        assert_eq!(tokenize("café"), ["caf"]);
        // Pure CJK text contains no ASCII alphanumerics, hence no tokens.
        assert!(tokenize("日本語").is_empty());
    }

    /// Guards the public module path to the tokenizer.
    #[test]
    fn smoke_module_path_reaches_through_lib() {
        // `sqlrite::sql::fts::tokenize` must stay reachable through the
        // public `sql` module from the crate root. Should 8b ever move the
        // module behind a feature gate, this test fails loudly.
        let via_public_path = crate::sql::fts::tokenize("Hello, world!");
        assert_eq!(via_public_path, ["hello", "world"]);
    }
}