Skip to content

Commit 5e7d557

Browse files
committed
fix: use proper tokenizer class for Unicode RougeScorer
- Replace function _unicode_tokenize with _UnicodeTokenizer class implementing the tokenize() method expected by RougeScorer
- Move import re to module level
- Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P})
- Add return type annotation for tokenize() to satisfy mypy strict mode
- Fix RougeScorer constructor indentation
1 parent c4f04ff commit 5e7d557

1 file changed

Lines changed: 23 additions & 23 deletions

File tree

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
import re
1718
from typing import Optional
1819

1920
from google.genai import types as genai_types
@@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float):
9293
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
9394

9495

95-
def _unicode_tokenize(text: str):
96-
"""Tokenizes text using Unicode-aware word boundaries.
96+
class _UnicodeTokenizer:
97+
"""Tokenizer that handles Unicode text with word-boundary awareness.
9798
98-
The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
99-
[a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
100-
this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
99+
The default RougeScorer tokenizer discards everything except ASCII letters
100+
and digits before splitting on whitespace, so it produces zero tokens for
101+
text in non-Latin scripts (Chinese, Japanese, Thai, etc.).
101102
102-
This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
103-
character-level tokenization for non-ASCII text.
103+
For ASCII-majority text this tokenizer uses Unicode-aware word-character
104+
matching (``\\w+`` in re). For other text it splits on whitespace and falls
105+
back to character-level tokenization only when that yields no tokens.
104106
"""
105-
import re
106-
# For primarily non-ASCII text, tokenize by Unicode-aware patterns
107-
ascii_chars = sum(1 for c in text if ord(c) < 128)
108-
if ascii_chars > len(text) * 0.5:
109-
return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
110-
# For non-Latin scripts, use whitespace splitting with Unicode support
111-
tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
112-
tokens = [t.lower() for t in tokens if t]
113-
if tokens:
114-
return tokens
115-
# Character-level fallback for scripts without word boundaries
116-
return list(text.lower())
107+
108+
def tokenize(self, text: str) -> list[str]:
109+
"""Tokenizes text using Unicode-aware word boundaries."""
110+
ascii_chars = sum(1 for c in text if ord(c) < 128)
111+
if ascii_chars > len(text) * 0.5:
112+
return re.findall(r'\w+', text.lower())
113+
tokens = text.lower().split()
114+
if tokens:
115+
return tokens
116+
return list(text.lower())
117117

118118

119119
def _calculate_rouge_1_scores(candidate: str, reference: str):
@@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
135135
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
136136
"""
137137
scorer = rouge_scorer.RougeScorer(
138-
["rouge1"],
139-
use_stemmer=True,
140-
tokenizer=_unicode_tokenize,
141-
)
138+
["rouge1"],
139+
use_stemmer=True,
140+
tokenizer=_UnicodeTokenizer(),
141+
)
142142

143143
# The score method returns a dictionary where keys are the ROUGE types
144144
# and values are Score objects (tuples) with precision, recall, and fmeasure.

0 commit comments

Comments
 (0)