55< head >
66 < meta charset ="utf-8 " />
77 < meta name ="viewport " content ="width=device-width, initial-scale=1.0 " />
8- < title > pythainlp.benchmarks.word_tokenization — PyThaiNLP 8a188f2 documentation</ title >
8+ < title > pythainlp.benchmarks.word_tokenization — PyThaiNLP f22c110 documentation</ title >
99 < link rel ="stylesheet " type ="text/css " href ="../../../_static/pygments.css?v=03e43079 " />
1010 < link rel ="stylesheet " type ="text/css " href ="../../../_static/css/theme.css?v=9edc463e " />
1111 < link rel ="stylesheet " type ="text/css " href ="../../../_static/copybutton.css?v=76b2166b " />
1414
1515 < script src ="../../../_static/jquery.js?v=5d32c60e "> </ script >
1616 < script src ="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c "> </ script >
17- < script src ="../../../_static/documentation_options.js?v=214d532b "> </ script >
17+ < script src ="../../../_static/documentation_options.js?v=472f9f16 "> </ script >
1818 < script src ="../../../_static/doctools.js?v=fd6eb6e6 "> </ script >
1919 < script src ="../../../_static/sphinx_highlight.js?v=6ffebe34 "> </ script >
2020 < script src ="../../../_static/clipboard.min.js?v=a7894cd8 "> </ script >
@@ -116,7 +116,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
116116
117117< span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> re</ span >
118118< span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> sys</ span >
119- < span class ="kn "> from</ span > < span class ="w "> </ span > < span class ="nn "> typing</ span > < span class ="w "> </ span > < span class ="kn "> import</ span > < span class ="n "> TYPE_CHECKING</ span > < span class ="p "> ,</ span > < span class ="n "> Any </ span >
119+ < span class ="kn "> from</ span > < span class ="w "> </ span > < span class ="nn "> typing</ span > < span class ="w "> </ span > < span class ="kn "> import</ span > < span class ="n "> TYPE_CHECKING</ span > < span class ="p "> ,</ span > < span class ="n "> Union </ span >
120120
121121< span class ="k "> if</ span > < span class ="n "> TYPE_CHECKING</ span > < span class ="p "> :</ span >
122122 < span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> numpy</ span > < span class ="w "> </ span > < span class ="k "> as</ span > < span class ="w "> </ span > < span class ="nn "> np</ span >
@@ -153,7 +153,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
153153 < span class ="k "> return</ span > < span class ="mi "> 2</ span > < span class ="o "> *</ span > < span class ="n "> precision</ span > < span class ="o "> *</ span > < span class ="n "> recall</ span > < span class ="o "> /</ span > < span class ="p "> (</ span > < span class ="n "> precision</ span > < span class ="o "> +</ span > < span class ="n "> recall</ span > < span class ="p "> )</ span >
154154
155155
156- < span class ="k "> def</ span > < span class ="w "> </ span > < span class ="nf "> _flatten_result</ span > < span class ="p "> (</ span > < span class ="n "> my_dict</ span > < span class ="p "> :</ span > < span class ="nb "> dict</ span > < span class ="p "> ,</ span > < span class ="n "> sep</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="o "> =</ span > < span class ="s2 "> ":"</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="nb "> dict</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> Any </ span > < span class ="p "> ]:</ span >
156+ < span class ="k "> def</ span > < span class ="w "> </ span > < span class ="nf "> _flatten_result</ span > < span class ="p "> (</ span > < span class ="n "> my_dict</ span > < span class ="p "> :</ span > < span class ="nb "> dict</ span > < span class ="p "> ,</ span > < span class ="n "> sep</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="o "> =</ span > < span class ="s2 "> ":"</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="nb "> dict</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> Union </ span > < span class ="p "> [ </ span > < span class =" nb " > int </ span > < span class =" p " > , </ span > < span class =" nb " > str </ span > < span class =" p " > ] ]:</ span >
157157< span class ="w "> </ span > < span class ="sd "> """Flatten two-dimension dictionary.</ span >
158158
159159< span class ="sd "> Use keys in the first dimension as a prefix for keys in the second dimension.</ span >
@@ -167,7 +167,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
167167< span class ="sd "> :param str sep: separator between the two keys (default: ":")</ span >
168168
169169< span class ="sd "> :return: a one-dimension dictionary with keys combined</ span >
170- < span class ="sd "> :rtype: dict[str, Any ]</ span >
170+ < span class ="sd "> :rtype: dict[str, Union[int, str] ]</ span >
171171< span class ="sd "> """</ span >
172172 < span class ="k "> return</ span > < span class ="p "> {</ span >
173173 < span class ="sa "> f</ span > < span class ="s2 "> "</ span > < span class ="si "> {</ span > < span class ="n "> k1</ span > < span class ="si "> }{</ span > < span class ="n "> sep</ span > < span class ="si "> }{</ span > < span class ="n "> k2</ span > < span class ="si "> }</ span > < span class ="s2 "> "</ span > < span class ="p "> :</ span > < span class ="n "> v</ span >
@@ -252,7 +252,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
252252
253253< div class ="viewcode-block " id ="compute_stats ">
254254< a class ="viewcode-back " href ="../../../api/benchmarks.html#pythainlp.benchmarks.word_tokenization.compute_stats "> [docs]</ a >
255- < span class ="k "> def</ span > < span class ="w "> </ span > < span class ="nf "> compute_stats</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> raw_sample</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="nb "> dict</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> Any </ span > < span class ="p "> ]:</ span >
255+ < span class ="k "> def</ span > < span class ="w "> </ span > < span class ="nf "> compute_stats</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> raw_sample</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="nb "> dict</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="nb " > dict </ span > < span class =" p " > [ </ span > < span class =" nb " > str </ span > < span class =" p " > , </ span > < span class =" n "> Union </ span > < span class ="p "> [ </ span > < span class =" nb " > int </ span > < span class =" p " > , </ span > < span class =" nb " > str </ span > < span class =" p " > ]] ]:</ span >
256256< span class ="w "> </ span > < span class ="sd "> """Compute statistics for tokenization quality</ span >
257257
258258< span class ="sd "> These statistics include:</ span >
@@ -269,7 +269,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
269269< span class ="sd "> :param str samples: samples that we want to evaluate</ span >
270270
271271< span class ="sd "> :return: metrics at character- and word-level and indicators of correctly tokenized words</ span >
272- < span class ="sd "> :rtype: dict[str, Any ]</ span >
272+ < span class ="sd "> :rtype: dict[str, dict[str, Union[int, str]] ]</ span >
273273< span class ="sd "> """</ span >
274274 < span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> numpy</ span > < span class ="w "> </ span > < span class ="k "> as</ span > < span class ="w "> </ span > < span class ="nn "> np</ span >
275275
@@ -285,11 +285,11 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
285285 < span class ="n "> c_pos_pred</ span > < span class ="o "> =</ span > < span class ="n "> c_pos_pred</ span > < span class ="p "> [</ span > < span class ="n "> c_pos_pred</ span > < span class ="o "> <</ span > < span class ="n "> ref_sample_arr</ span > < span class ="o "> .</ span > < span class ="n "> shape</ span > < span class ="p "> [</ span > < span class ="mi "> 0</ span > < span class ="p "> ]]</ span >
286286 < span class ="n "> c_neg_pred</ span > < span class ="o "> =</ span > < span class ="n "> c_neg_pred</ span > < span class ="p "> [</ span > < span class ="n "> c_neg_pred</ span > < span class ="o "> <</ span > < span class ="n "> ref_sample_arr</ span > < span class ="o "> .</ span > < span class ="n "> shape</ span > < span class ="p "> [</ span > < span class ="mi "> 0</ span > < span class ="p "> ]]</ span >
287287
288- < span class ="n "> c_tp</ span > < span class ="p "> :</ span > < span class ="n " > np </ span > < span class ="o "> . </ span > < span class ="n " > intp </ span > < span class ="o " > = </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_pos_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 1</ span > < span class ="p "> )</ span >
289- < span class ="n "> c_fp</ span > < span class ="p "> :</ span > < span class ="n " > np </ span > < span class ="o "> . </ span > < span class ="n " > intp </ span > < span class ="o " > = </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_pos_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 0</ span > < span class ="p "> )</ span >
288+ < span class ="n "> c_tp</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class ="o "> = </ span > < span class ="nb " > int </ span > < span class ="p " > ( </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_pos_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 1</ span > < span class ="p "> ) )</ span >
289+ < span class ="n "> c_fp</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class ="o "> = </ span > < span class ="nb " > int </ span > < span class ="p " > ( </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_pos_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 0</ span > < span class ="p "> ) )</ span >
290290
291- < span class ="n "> c_tn</ span > < span class ="p "> :</ span > < span class ="n " > np </ span > < span class ="o "> . </ span > < span class ="n " > intp </ span > < span class ="o " > = </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_neg_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 0</ span > < span class ="p "> )</ span >
292- < span class ="n "> c_fn</ span > < span class ="p "> :</ span > < span class ="n " > np </ span > < span class ="o "> . </ span > < span class ="n " > intp </ span > < span class ="o " > = </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_neg_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 1</ span > < span class ="p "> )</ span >
291+ < span class ="n "> c_tn</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class ="o "> = </ span > < span class ="nb " > int </ span > < span class ="p " > ( </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_neg_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 0</ span > < span class ="p "> ) )</ span >
292+ < span class ="n "> c_fn</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class ="o "> = </ span > < span class ="nb " > int </ span > < span class ="p " > ( </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> [</ span > < span class ="n "> c_neg_pred</ span > < span class ="p "> ]</ span > < span class ="o "> ==</ span > < span class ="mi "> 1</ span > < span class ="p "> ) )</ span >
293293
294294 < span class ="c1 "> # Compute word-level statistics</ span >
295295
@@ -302,7 +302,7 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
302302 < span class ="n "> word_boundaries</ span > < span class ="p "> ,</ span > < span class ="n "> ss_boundaries</ span >
303303 < span class ="p "> )</ span >
304304
305- < span class ="n "> correctly_tokenised_words</ span > < span class ="p "> :</ span > < span class ="n " > np </ span > < span class ="o "> . </ span > < span class ="n " > intp </ span > < span class ="o " > = </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> tokenization_indicators</ span > < span class ="p "> )</ span >
305+ < span class ="n "> correctly_tokenised_words</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class ="o "> = </ span > < span class ="nb " > int </ span > < span class ="p " > ( </ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> tokenization_indicators</ span > < span class ="p "> ) )</ span >
306306
307307 < span class ="n "> tokenization_indicators_str</ span > < span class ="o "> =</ span > < span class ="nb "> list</ span > < span class ="p "> (</ span > < span class ="nb "> map</ span > < span class ="p "> (</ span > < span class ="nb "> str</ span > < span class ="p "> ,</ span > < span class ="n "> tokenization_indicators</ span > < span class ="p "> ))</ span >
308308
@@ -315,8 +315,8 @@ <h1>Source code for pythainlp.benchmarks.word_tokenization</h1><div class="highl
315315 < span class ="p "> },</ span >
316316 < span class ="s2 "> "word_level"</ span > < span class ="p "> :</ span > < span class ="p "> {</ span >
317317 < span class ="s2 "> "correctly_tokenised_words"</ span > < span class ="p "> :</ span > < span class ="n "> correctly_tokenised_words</ span > < span class ="p "> ,</ span >
318- < span class ="s2 "> "total_words_in_sample"</ span > < span class ="p "> :</ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> sample_arr</ span > < span class ="p "> ),</ span >
319- < span class ="s2 "> "total_words_in_ref_sample"</ span > < span class ="p "> :</ span > < span class ="n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> ),</ span >
318+ < span class ="s2 "> "total_words_in_sample"</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class =" p " > ( </ span > < span class =" n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> sample_arr</ span > < span class ="p "> ) ),</ span >
319+ < span class ="s2 "> "total_words_in_ref_sample"</ span > < span class ="p "> :</ span > < span class ="nb " > int </ span > < span class =" p " > ( </ span > < span class =" n "> np</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> (</ span > < span class ="n "> ref_sample_arr</ span > < span class ="p "> ) ),</ span >
320320 < span class ="p "> },</ span >
321321 < span class ="s2 "> "global"</ span > < span class ="p "> :</ span > < span class ="p "> {</ span >
322322 < span class ="s2 "> "tokenisation_indicators"</ span > < span class ="p "> :</ span > < span class ="s2 "> ""</ span > < span class ="o "> .</ span > < span class ="n "> join</ span > < span class ="p "> (</ span > < span class ="n "> tokenization_indicators_str</ span > < span class ="p "> )</ span >
0 commit comments