@@ -192,6 +192,30 @@ cnt_info = automaton.prepend_feed(state, '来')
192192assert state.is_nil()
193193```
194194
195+ ### `GreedyTokenizer`
196+
197+ ``` python
# Vocabulary of tokens; a token's id is its index in this list.
vocab = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词', '听歌', '曲折']
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token -> trie-node table into trie-node -> token id.
# Nodes that are not listed in `token_to_trie_node` keep the placeholder -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
    trie_node_to_token[j] = i

sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)

def tokenize(s: str):
    """Tokenize ``s`` and return a list of ``(token_id, length)`` pairs.

    ``tokenizer.tokenize_str`` yields pairs whose first item is used here as
    an index into ``trie_node_to_token``; it is translated to a ``vocab``
    index, and the second item is passed through unchanged. In the
    assertions below, spans that match no vocabulary entry come back with
    token id -1, and the second items add up to the number of characters
    in the input.
    """
    return [(trie_node_to_token[i], j) for i, j in tokenizer.tokenize_str(s)]

assert tokenize('歌曲折') == [(0, 2), (-1, 1)]
assert tokenize('听歌曲') == [(5, 2), (-1, 1)]
assert tokenize('听歌曲折') == [(5, 2), (6, 2)]
assert tokenize('聆听歌曲折') == [(1, 4), (-1, 1)]
assert tokenize('查看歌词歌曲') == [(4, 4), (0, 2)]
assert tokenize('一起播放歌曲并共享歌词') == [
    (-1, 2),
    (2, 4),
    (-1, 3),
    (3, 2),
]
217+ ```
218+
195219## License
196220
197221- &copy; 2023 Chielo Newctle \<[ChieloNewctle@gmail.com](mailto:ChieloNewctle@gmail.com)\>
0 commit comments