Skip to content

Commit 8274a68

Browse files
committed
refactor(tests): move heap bpe into test utils
1 parent 50fbf9d commit 8274a68

6 files changed

Lines changed: 24 additions & 21 deletions

File tree

src/inc_impl.rs renamed to src/inc_bpe.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,7 @@ mod tests {
213213

214214
use crate::{
215215
Dictionary, IncBpeToken, IncBpeTokenChainIter, IncBpeTokenizer, NormalizedDict, TokenId,
216-
Vocab, sp_impl::sentence_piece_impl,
216+
Vocab, test_utils::bpe_with_heap,
217217
};
218218

219219
fn inc_bpe_short_any_case(vocab: &[&str], rules: &[(&str, &str)], sequences: &[&str]) {
@@ -244,7 +244,7 @@ mod tests {
244244

245245
fn validate(dict: &Dictionary, seq: &[TokenId], inc_res: &[IncBpeToken]) {
246246
for i in 0..seq.len() {
247-
let expected = sentence_piece_impl::<false>(dict, &seq[0..i + 1]);
247+
let expected = bpe_with_heap::<false>(dict, &seq[0..i + 1]);
248248
let output = IncBpeTokenChainIter::new(inc_res, i).token_ids();
249249
let output = output.chain(std::iter::repeat(TokenId::MAX));
250250
assert!(expected.into_iter().rev().zip(output).all(|(i, j)| i == j));

src/lib.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
mod aho_corasick;
22
mod centroid;
33
mod dict;
4-
mod heap;
5-
mod inc_impl;
4+
mod inc_bpe;
65
mod normalize;
7-
mod sp_impl;
86
mod successor;
97
mod suf_suc;
108
mod typed_vec;
119
mod vocab;
1210

1311
pub use crate::{
1412
dict::{DictBuildError, Dictionary, Rule, RuleId, UnknownToken},
15-
inc_impl::{IncBpeToken, IncBpeTokenChainIter, IncBpeTokenization, IncBpeTokenizer},
13+
inc_bpe::{IncBpeToken, IncBpeTokenChainIter, IncBpeTokenization, IncBpeTokenizer},
1614
normalize::{NormalizedDict, NormalizedDictBuildError},
1715
successor::SkipLen,
1816
vocab::{Token, TokenId, Vocab, VocabBuildError},
1917
};
18+
19+
#[cfg(test)]
20+
mod test_utils;

src/normalize.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ impl NormalizedDict {
163163

164164
#[cfg(test)]
165165
mod tests {
166-
use crate::{Dictionary, NormalizedDict, RuleId, Vocab, sp_impl::sentence_piece_impl};
166+
use crate::{Dictionary, NormalizedDict, RuleId, Vocab, test_utils::bpe_with_heap};
167167

168168
fn build_dict<T: AsRef<[u8]>, R: IntoIterator<Item = (T, T)>>(
169169
vocab: &Vocab,
@@ -178,7 +178,7 @@ mod tests {
178178
let token_id = rule.merged;
179179
assert!(!dict.is_single(token_id));
180180
let seq = &dict[token_id];
181-
let res = sentence_piece_impl::<false>(&dict, dict.split_bytes_to_tokens(seq, 0usize));
181+
let res = bpe_with_heap::<false>(&dict, dict.split_bytes_to_tokens(seq, 0usize));
182182
assert!(dict.is_useful(token_id) ^ (res != vec![token_id]));
183183
}
184184
dict
@@ -196,7 +196,7 @@ mod tests {
196196
}
197197
};
198198
assert!(!dict.is_single(token_id));
199-
let res = sentence_piece_impl::<false>(&dict, dict.split_utf8_to_tokens(seq, 0usize));
199+
let res = bpe_with_heap::<false>(&dict, dict.split_utf8_to_tokens(seq, 0usize));
200200
assert!(dict.is_useful(token_id) ^ (res != vec![token_id]));
201201
}
202202
dict

src/sp_impl.rs renamed to src/test_utils/bpe.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
#![cfg(test)]
21
use crate::{
32
Dictionary, RuleId, TokenId,
4-
heap::AdjustableHeap,
3+
test_utils::heap::AdjustableHeap,
54
typed_vec::{TypedVec, typed_vec_index},
65
};
76

87
typed_vec_index!(pub(super) InputTextPos, u32);
98

109
type Heap = AdjustableHeap<InputTextPos, RuleId>;
1110

12-
pub fn sentence_piece_impl<const ALLOW_IMPROPER_RULES: bool>(
11+
pub fn bpe_with_heap<const ALLOW_IMPROPER_RULES: bool>(
1312
dict: &Dictionary,
1413
seq: impl Into<Vec<TokenId>>,
1514
) -> Vec<TokenId> {
@@ -98,7 +97,7 @@ pub fn sentence_piece_impl<const ALLOW_IMPROPER_RULES: bool>(
9897

9998
#[cfg(test)]
10099
mod tests {
101-
use crate::{Dictionary, TokenId, Vocab, sp_impl::sentence_piece_impl};
100+
use crate::{Dictionary, TokenId, Vocab, test_utils::bpe::bpe_with_heap};
102101

103102
fn build_dict<T: AsRef<[u8]>, R: IntoIterator<Item = (T, T)>>(
104103
vocab: &Vocab,
@@ -114,7 +113,7 @@ mod tests {
114113
) {
115114
let tokens: Vec<_> = tokens.into_iter().map(I::into).collect();
116115
let inputs = dict.split_bytes_to_tokens(seq.as_ref(), 0usize);
117-
assert_eq!(sentence_piece_impl::<true>(dict, inputs), tokens);
116+
assert_eq!(bpe_with_heap::<true>(dict, inputs), tokens);
118117
assert!(dict.is_proper_in_bytes().is_ok());
119118
check_properly_in_bytes(dict, seq, tokens);
120119
}
@@ -126,7 +125,7 @@ mod tests {
126125
) {
127126
let tokens: Vec<_> = tokens.into_iter().map(I::into).collect();
128127
let inputs = dict.split_utf8_to_tokens(seq.as_ref(), 0usize);
129-
assert_eq!(sentence_piece_impl::<true>(dict, inputs), tokens);
128+
assert_eq!(bpe_with_heap::<true>(dict, inputs), tokens);
130129
assert!(dict.is_proper_in_utf8().is_ok());
131130
check_properly_in_utf8(dict, seq, tokens);
132131
}
@@ -138,7 +137,7 @@ mod tests {
138137
) {
139138
let tokens: Vec<_> = tokens.into_iter().map(I::into).collect();
140139
let inputs = dict.split_bytes_to_tokens(seq.as_ref(), 0usize);
141-
assert_eq!(sentence_piece_impl::<false>(dict, inputs), tokens);
140+
assert_eq!(bpe_with_heap::<false>(dict, inputs), tokens);
142141
}
143142

144143
fn check_properly_in_utf8<S: AsRef<str>, I: Into<TokenId>, T: IntoIterator<Item = I>>(
@@ -148,11 +147,11 @@ mod tests {
148147
) {
149148
let tokens: Vec<_> = tokens.into_iter().map(I::into).collect();
150149
let inputs = dict.split_utf8_to_tokens(seq.as_ref(), 0usize);
151-
assert_eq!(sentence_piece_impl::<false>(dict, inputs), tokens);
150+
assert_eq!(bpe_with_heap::<false>(dict, inputs), tokens);
152151
}
153152

154153
#[test]
155-
fn test_sp_impl() {
154+
fn test_bpe_with_heap() {
156155
let vocab = Vocab::new([
157156
b"<unk>" as &[_],
158157
b"a",

src/heap.rs renamed to src/test_utils/heap.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1-
#![cfg(test)]
21
use crate::typed_vec::{TypedVec, TypedVecIndex, typed_vec_index};
32

43
typed_vec_index!(NodeId, u32);
54

65
const ROOT: NodeId = NodeId::ZERO.next();
76

87
#[derive(Debug)]
9-
pub(crate) struct AdjustableHeap<Pos, Key> {
8+
pub(super) struct AdjustableHeap<Pos, Key> {
109
nodes: TypedVec<NodeId, (Key, Pos)>,
1110
pos_to_node_id: TypedVec<Pos, NodeId>,
1211
}
@@ -186,7 +185,7 @@ mod tests {
186185
use rand::{Rng, SeedableRng, rngs::StdRng};
187186

188187
use crate::{
189-
heap::AdjustableHeap,
188+
test_utils::heap::AdjustableHeap,
190189
typed_vec::{TypedVec, typed_vec_index},
191190
};
192191

src/test_utils/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
mod bpe;
2+
mod heap;
3+
4+
pub use bpe::bpe_with_heap;

0 commit comments

Comments (0)