fix: use proper tokenizer class for Unicode RougeScorer

tcconnally · tcconnally · commit 5e7d5573c620 · 2026-06-17T18:39:59.000Z
- Replace function _unicode_tokenize with _UnicodeTokenizer class
  implementing the tokenize() method expected by RougeScorer
- Move import re to module level
- Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P})
- Add return type annotation for tokenize() to satisfy mypy strict mode
- Fix RougeScorer constructor indentation
diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import re
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
-def _unicode_tokenize(text: str):
-  """Tokenizes text using Unicode-aware word boundaries.
+class _UnicodeTokenizer:
+  """Tokenizer that handles Unicode text with word-boundary awareness.
 
-  The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
-  [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
-  this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
+  The default RougeScorer tokenizer keeps only ASCII alphanumeric tokens, so
+  text in non-Latin scripts (Chinese, Japanese, Thai, etc.) produces zero
+  tokens and ROUGE scores of 0.0 even on matching responses.
 
-  This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
-  character-level tokenization for non-ASCII text.
+  For ASCII-majority text this tokenizer uses Unicode-aware word-character
+  matching (``\\w+`` in re). Otherwise it splits on whitespace, and uses
+  character-level tokenization only when that split yields no tokens.
   """
-  import re
-  # For primarily non-ASCII text, tokenize by Unicode-aware patterns
-  ascii_chars = sum(1 for c in text if ord(c) < 128)
-  if ascii_chars > len(text) * 0.5:
-    return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
-  # For non-Latin scripts, use whitespace splitting with Unicode support
-  tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
-  tokens = [t.lower() for t in tokens if t]
-  if tokens:
-    return tokens
-  # Character-level fallback for scripts without word boundaries
-  return list(text.lower())
+
+  def tokenize(self, text: str) -> list[str]:
+    """Tokenizes text using Unicode-aware word boundaries."""
+    ascii_chars = sum(1 for c in text if ord(c) < 128)  # code points < 128
+    if ascii_chars > len(text) * 0.5:  # strictly more than half is ASCII
+      return re.findall(r'\w+', text.lower())
+    tokens = text.lower().split()  # unsegmented text stays one token
+    if tokens:
+      return tokens
+    return list(text.lower())  # only reached for empty/whitespace-only text
 
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):
@@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
   scorer = rouge_scorer.RougeScorer(
-        ["rouge1"],
-        use_stemmer=True,
-        tokenizer=_unicode_tokenize,
-    )
+      ["rouge1"],
+      use_stemmer=True,
+      tokenizer=_UnicodeTokenizer(),
+  )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.