@@ -351,6 +351,33 @@ where
351351
352352 Ok ( ( ) )
353353 }
354+
355+ /// Convert the hash-based vocabulary to an Explicit Vocabulary.
356+ ///
357+ /// N-grams in the range `(self.min_n..self.max_n)` are extracted from the words in the
358+ /// vocabulary, each of these gets assigned an index from the `BucketIndexer` which is used to
359+ /// determine the index in the explicit subword vocab.
360+ pub fn to_explicit ( & self ) -> ExplicitSubwordVocab {
361+ let mut ngram_index = HashMap :: new ( ) ;
362+ let SubwordVocab {
363+ words,
364+ indices : _,
365+ indexer,
366+ min_n,
367+ max_n,
368+ } = & self ;
369+
370+ for word in words. iter ( ) . map ( Self :: bracket) {
371+ for ( ngram, idx) in word
372+ . subword_indices_with_ngrams ( * min_n as usize , * max_n as usize , indexer)
373+ . filter_map ( |( ngram, idx) | idx. map ( |idx| ( ngram, idx) ) )
374+ {
375+ ngram_index. entry ( ngram. into ( ) ) . or_insert ( idx) ;
376+ }
377+ }
378+ let indexer = ExplicitIndexer :: new_with_indices ( ngram_index) ;
379+ ExplicitSubwordVocab :: new ( words. to_owned ( ) , * min_n, * max_n, indexer)
380+ }
354381}
355382
356383impl SubwordVocab < ExplicitIndexer > {
@@ -491,9 +518,11 @@ mod tests {
491518
492519 use super :: { BucketSubwordVocab , FastTextSubwordVocab , SubwordVocab } ;
493520 use crate :: chunks:: io:: { ReadChunk , WriteChunk } ;
494- use crate :: chunks:: vocab:: { read_chunk_size, ExplicitSubwordVocab } ;
521+ use crate :: chunks:: vocab:: { read_chunk_size, ExplicitSubwordVocab , Vocab } ;
495522 use crate :: compat:: fasttext:: FastTextIndexer ;
496- use crate :: subword:: { BucketIndexer , ExplicitIndexer , FinalfusionHashIndexer } ;
523+ use crate :: subword:: {
524+ BucketIndexer , ExplicitIndexer , FinalfusionHashIndexer , Indexer , StrWithCharLen ,
525+ } ;
497526
498527 fn test_fasttext_subword_vocab ( ) -> FastTextSubwordVocab {
499528 let words = vec ! [
@@ -534,6 +563,24 @@ mod tests {
534563 ExplicitSubwordVocab :: new ( words, 2 , 3 , ExplicitIndexer :: new_with_indices ( ngrams) )
535564 }
536565
/// Converting a bucket vocabulary to an explicit one must keep n-grams that share a bucket
/// index on a shared explicit index.
#[test]
fn test_conversion() {
    let bucket_vocab = SubwordVocab::new(
        vec!["groß".to_owned(), "allerdings".to_owned()],
        3,
        6,
        FinalfusionHashIndexer::new(21),
    );
    let explicit = bucket_vocab.to_explicit();

    let dings = StrWithCharLen::new("dings");
    let gro = StrWithCharLen::new("<gro");

    // The two n-grams are expected to map to the same index in the explicit vocabulary...
    let explicit_dings = explicit.indexer().index_ngram(&dings);
    let explicit_gro = explicit.indexer().index_ngram(&gro);
    assert_eq!(explicit_dings, explicit_gro);

    // ...because they already map to the same index in the bucket vocabulary.
    let bucket_dings = bucket_vocab.indexer().index_ngram(&dings);
    let bucket_gro = bucket_vocab.indexer().index_ngram(&gro);
    assert_eq!(bucket_gro, bucket_dings);

    // The explicit vocabulary is expected to hold 43 subword indices on top of its words.
    assert_eq!(explicit.vocab_len(), explicit.words_len() + 43);
    assert_eq!(explicit.indexer().upper_bound(), 43);
}
583+
537584 #[ test]
538585 fn fasttext_subword_vocab_write_read_roundtrip ( ) {
539586 let check_vocab = test_fasttext_subword_vocab ( ) ;
0 commit comments