1- from general_sam import GeneralSAM , GreedyTokenizer , build_trie_from_chars
1+ from general_sam import (
2+ GeneralSAM ,
3+ GreedyTokenizer ,
4+ build_trie_from_bytes ,
5+ build_trie_from_chars ,
6+ )
7+
8+
def test_english_chars_tokenize():
    """Check GreedyTokenizer output on a small ASCII vocabulary (char trie)."""
    vocab = ['a', 'ab', 'b', 'bc', 'c', 'd', 'e', 'f', 'cd', 'abcde']
    trie, token_to_trie_node = build_trie_from_chars(vocab)

    # Invert the token-index -> trie-node mapping. Nodes that do not
    # correspond to any vocabulary entry keep the sentinel -1.
    node_to_token = [-1] * trie.num_of_nodes()
    for token_id, node_id in enumerate(token_to_trie_node):
        node_to_token[node_id] = token_id

    sam = GeneralSAM.from_trie(trie)
    tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)
    assert tokenizer.is_in_chars()

    def tokenize(s: str):
        # Each item from the tokenizer is a (trie node id, span) pair; the
        # span is presumably the matched length -- confirm against the
        # library docs. Only the node id is translated to a vocab index.
        pieces = []
        for node_id, span in tokenizer.tokenize_str(s):
            pieces.append((node_to_token[node_id], span))
        return pieces

    expectations = (
        ('abcde', [(9, 5)]),
        ('abcdf', [(1, 2), (8, 2), (7, 1)]),
        ('abca', [(1, 2), (4, 1), (0, 1)]),
    )
    for text, expected in expectations:
        assert tokenize(text) == expected
227
328
429def test_chinese_chars_tokenize ():
@@ -11,6 +36,7 @@ def test_chinese_chars_tokenize():
1136
1237 sam = GeneralSAM .from_trie (trie )
1338 tokenizer = GreedyTokenizer .from_sam_and_trie (sam , trie )
39+ assert tokenizer .is_in_chars ()
1440
1541 def tokenize (s : str ):
1642 return [(trie_node_to_token [i ], j ) for i , j in tokenizer .tokenize_str (s )]
@@ -21,3 +47,36 @@ def tokenize(s: str):
2147 assert tokenize ('聆听歌曲折' ) == [(1 , 4 ), (- 1 , 1 )]
2248 assert tokenize ('查看歌词歌曲' ) == [(4 , 4 ), (0 , 2 )]
2349 assert tokenize ('一起播放歌曲并共享歌词' ) == [(- 1 , 2 ), (2 , 4 ), (- 1 , 3 ), (3 , 2 )]
50+
51+
def test_chinese_bytes_tokenize():
    """Check a byte-trie GreedyTokenizer on Chinese text.

    The same input is tokenized through both ``tokenize_str`` and
    ``tokenize_bytes``; the two results must agree before being compared
    with the expected vocabulary indices.
    """
    words = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词', '听歌', '曲折']
    encoded_vocab = [word.encode() for word in words]
    trie, token_to_trie_node = build_trie_from_bytes(encoded_vocab)

    # Invert the token-index -> trie-node mapping. Nodes that do not
    # correspond to any vocabulary entry keep the sentinel -1.
    node_to_token = [-1] * trie.num_of_nodes()
    for token_id, node_id in enumerate(token_to_trie_node):
        node_to_token[node_id] = token_id

    sam = GeneralSAM.from_trie(trie)
    tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)
    assert tokenizer.is_in_bytes()

    def to_token_ids(pairs):
        # Keep only the trie node of each pair, mapped to its vocab index.
        return [node_to_token[node_id] for node_id, _ in pairs]

    def tokenize(s: str):
        via_str = to_token_ids(tokenizer.tokenize_str(s))
        via_bytes = to_token_ids(tokenizer.tokenize_bytes(s.encode()))
        assert via_str == via_bytes
        return via_str

    expectations = (
        ('歌曲折', [0, -1]),
        ('听歌曲', [5, -1]),
        ('听歌曲折', [5, 6]),
        ('聆听歌曲折', [1, -1]),
        ('查看歌词歌曲', [4, 0]),
        ('一起播放歌曲并共享歌词', [-1, 2, -1, 3]),
    )
    for text, expected in expectations:
        assert tokenize(text) == expected
0 commit comments