Skip to content

Commit 6cd72a8

Browse files
committed
Bucket to explicit vocab conversion.
Implement conversion from bucket subword vocabularies to explicitly indexed subword vocabularies.
1 parent a854134 commit 6cd72a8

2 files changed

Lines changed: 54 additions & 6 deletions

File tree

src/chunks/vocab/subword.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,33 @@ where
351351

352352
Ok(())
353353
}
354+
355+
/// Convert this hash-based subword vocabulary into an `ExplicitSubwordVocab`.
356+
///
357+
/// N-grams of length `min_n..=max_n` (both bounds are included) are extracted from the
358+
/// bracketed words in the vocabulary. Each n-gram is paired with the index that this
359+
/// vocabulary's indexer assigns to it; n-grams for which the indexer yields no index are skipped.
///
/// The resulting `n-gram -> index` map is handed to `ExplicitIndexer::new_with_indices`, and
/// the returned vocabulary is built from a copy of the word list, the same `min_n`/`max_n`,
/// and that explicit indexer. `self` is left untouched.
///
/// N-grams that shared an index in the source vocabulary are recorded with the same index
/// here, so they keep sharing one in the explicit vocabulary.
360+
pub fn to_explicit(&self) -> ExplicitSubwordVocab {
361+
let mut ngram_index = HashMap::new();
362+
let SubwordVocab {
363+
words,
364+
indices: _,
365+
indexer,
366+
min_n,
367+
max_n,
368+
} = &self;
369+
370+
// `Self::bracket` is defined outside this hunk; presumably it adds the word boundary
// markers (the conversion test looks up the n-gram `<gro`) -- TODO confirm.
for word in words.iter().map(Self::bracket) {
371+
for (ngram, idx) in word
372+
.subword_indices_with_ngrams(*min_n as usize, *max_n as usize, indexer)
373+
// Keep only the n-grams for which the indexer actually produced an index.
.filter_map(|(ngram, idx)| idx.map(|idx| (ngram, idx)))
374+
{
375+
// An n-gram can occur in several words; the first index recorded for it is kept.
ngram_index.entry(ngram.into()).or_insert(idx);
376+
}
377+
}
378+
// NOTE(review): if `HashMap` here is std's, its iteration order is unspecified. Should
// `new_with_indices` number indices in the order it receives the pairs (as the visible part
// of its body suggests), the concrete index values may differ between runs -- confirm
// whether callers rely on a stable numbering.
let indexer = ExplicitIndexer::new_with_indices(ngram_index);
379+
ExplicitSubwordVocab::new(words.to_owned(), *min_n, *max_n, indexer)
380+
}
354381
}
355382

356383
impl SubwordVocab<ExplicitIndexer> {
@@ -491,9 +518,11 @@ mod tests {
491518

492519
use super::{BucketSubwordVocab, FastTextSubwordVocab, SubwordVocab};
493520
use crate::chunks::io::{ReadChunk, WriteChunk};
494-
use crate::chunks::vocab::{read_chunk_size, ExplicitSubwordVocab};
521+
use crate::chunks::vocab::{read_chunk_size, ExplicitSubwordVocab, Vocab};
495522
use crate::compat::fasttext::FastTextIndexer;
496-
use crate::subword::{BucketIndexer, ExplicitIndexer, FinalfusionHashIndexer};
523+
use crate::subword::{
524+
BucketIndexer, ExplicitIndexer, FinalfusionHashIndexer, Indexer, StrWithCharLen,
525+
};
497526

498527
fn test_fasttext_subword_vocab() -> FastTextSubwordVocab {
499528
let words = vec![
@@ -534,6 +563,24 @@ mod tests {
534563
ExplicitSubwordVocab::new(words, 2, 3, ExplicitIndexer::new_with_indices(ngrams))
535564
}
536565

566+
// Checks that converting a bucket vocabulary with `to_explicit` preserves index sharing
// between n-grams and yields the expected number of explicit subword indices.
#[test]
567+
fn test_conversion() {
568+
let words = vec!["groß".to_owned(), "allerdings".to_owned()];
569+
let indexer = FinalfusionHashIndexer::new(21);
570+
// N-grams of length 3 through 6 are used for the subword units.
let bucket_vocab = SubwordVocab::new(words, 3, 6, indexer);
571+
let explicit = bucket_vocab.to_explicit();
572+
// Two different n-grams, one from each word, that are expected to share an index.
let dings = StrWithCharLen::new("dings");
573+
let gro = StrWithCharLen::new("<gro");
574+
let dings_expl_idx = explicit.indexer().index_ngram(&dings);
575+
let gro_expl_idx = explicit.indexer().index_ngram(&gro);
576+
// The explicit indexer must map both n-grams to the same index ...
assert_eq!(dings_expl_idx, gro_expl_idx);
577+
let dings_buck_idx = bucket_vocab.indexer().index_ngram(&dings);
578+
let gro_buck_idx = bucket_vocab.indexer().index_ngram(&gro);
579+
// ... because the bucket indexer already maps them to the same index.
assert_eq!(gro_buck_idx, dings_buck_idx);
580+
// Presumably 43 = 44 distinct n-grams of length 3..=6 in `<groß>` and `<allerdings>`
// minus the one shared index checked above -- TODO confirm against `bracket`.
assert_eq!(explicit.vocab_len(), explicit.words_len() + 43);
581+
assert_eq!(explicit.indexer().upper_bound(), 43);
582+
}
583+
537584
#[test]
538585
fn fasttext_subword_vocab_write_read_roundtrip() {
539586
let check_vocab = test_fasttext_subword_vocab();

src/subword.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,10 +186,11 @@ impl ExplicitIndexer {
186186
/// `subword -> index` mapping.
187187
///
188188
/// Panics when there are duplicate ngrams.
189-
pub fn new_with_indices(ngram_tuples: Vec<(String, u64)>) -> Self {
190-
let mut old_to_new_indices = HashMap::new();
191-
let mut index = HashMap::with_capacity(ngram_tuples.len());
192-
let mut ngrams = Vec::with_capacity(ngram_tuples.len());
189+
pub fn new_with_indices(ngram_tuples: impl IntoIterator<Item = (String, u64)>) -> Self {
190+
let ngram_tuples = ngram_tuples.into_iter();
191+
let mut old_to_new_indices = HashMap::with_capacity(ngram_tuples.size_hint().0);
192+
let mut index = HashMap::with_capacity(ngram_tuples.size_hint().0);
193+
let mut ngrams = Vec::with_capacity(ngram_tuples.size_hint().0);
193194
for (ngram, bucket) in ngram_tuples {
194195
let cur_idx = old_to_new_indices.len();
195196
let new_idx = *old_to_new_indices.entry(bucket).or_insert(cur_idx);

0 commit comments

Comments
 (0)