1414
1515from __future__ import annotations
1616
17+ import re
1718from typing import Optional
1819
1920from google .genai import types as genai_types
@@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float):
9293 return EvalStatus .PASSED if score >= threshold else EvalStatus .FAILED
9394
9495
95- def _unicode_tokenize ( text : str ) :
96- """Tokenizes text using Unicode-aware word boundaries .
96+ class _UnicodeTokenizer :
97+ """Tokenizer that handles Unicode text with word-boundary awareness .
9798
98- The default RougeScorer tokenizer uses r' \\ w+' which only matches ASCII
99- [a-zA-Z0-9_]. For non- Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
100- this returns zero tokens, causing ROUGE scores of 0.0 on matching responses .
99+ The default RougeScorer tokenizer splits on whitespace, which works for
100+ ASCII and Latin-script text but produces zero tokens for text in scripts
101+ without word boundaries (Chinese, Japanese, Thai, etc.) .
101102
102- This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
103- character-level tokenization for non-ASCII text.
103+ For ASCII-majority text this tokenizer uses Unicode-aware word-character
104+ matching (``\\ w+`` in re). For non-ASCII text it falls back to whitespace
105+ splitting, then character-level tokenization.
104106 """
105- import re
106- # For primarily non-ASCII text, tokenize by Unicode-aware patterns
107- ascii_chars = sum (1 for c in text if ord (c ) < 128 )
108- if ascii_chars > len (text ) * 0.5 :
109- return re .findall (r'[\\w]+' , text .lower (), re .UNICODE )
110- # For non-Latin scripts, use whitespace splitting with Unicode support
111- tokens = re .split (r'[\\s\\p{P}]+' , text , flags = re .UNICODE )
112- tokens = [t .lower () for t in tokens if t ]
113- if tokens :
114- return tokens
115- # Character-level fallback for scripts without word boundaries
116- return list (text .lower ())
107+
108+ def tokenize (self , text : str ) -> list [str ]:
109+ """Tokenizes text using Unicode-aware word boundaries."""
110+ ascii_chars = sum (1 for c in text if ord (c ) < 128 )
111+ if ascii_chars > len (text ) * 0.5 :
112+ return re .findall (r'\w+' , text .lower ())
113+ tokens = text .lower ().split ()
114+ if tokens :
115+ return tokens
116+ return list (text .lower ())
117117
118118
119119def _calculate_rouge_1_scores (candidate : str , reference : str ):
@@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
135135 A dictionary containing the ROUGE-1 precision, recall, and f-measure.
136136 """
137137 scorer = rouge_scorer .RougeScorer (
138- ["rouge1" ],
139- use_stemmer = True ,
140- tokenizer = _unicode_tokenize ,
141- )
138+ ["rouge1" ],
139+ use_stemmer = True ,
140+ tokenizer = _UnicodeTokenizer () ,
141+ )
142142
143143 # The score method returns a dictionary where keys are the ROUGE types
144144 # and values are Score objects (tuples) with precision, recall, and fmeasure.
0 commit comments