Skip to content

Commit f538de1

Browse files
author
yoda
committed
fix: use Unicode-aware keyword extraction in InMemoryMemoryService
Replace [A-Za-z]+ with \w+ so token extraction includes Unicode word characters (note that \w also matches digits and underscores, which the old pattern ignored). Add a non-ASCII substring-containment fallback in search_memory() for scripts without whitespace word boundaries (Japanese, Chinese). Fixes #5501.
1 parent c87ee1e commit f538de1

2 files changed

Lines changed: 58 additions & 5 deletions

File tree

src/google/adk/memory/in_memory_memory_service.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def _user_key(app_name: str, user_id: str) -> str:
3838

3939

4040
def _extract_words_lower(text: str) -> set[str]:
41-
"""Extracts words from a string and converts them to lowercase."""
42-
return set([word.lower() for word in re.findall(r'[A-Za-z]+', text)])
41+
"""Extracts Unicode-aware tokens from a string in lowercase."""
42+
return set(word.lower() for word in re.findall(r'\w+', text))
4343

4444

4545
class InMemoryMemoryService(BaseMemoryService):
@@ -116,13 +116,19 @@ async def search_memory(
116116
for event in session_events:
117117
if not event.content or not event.content.parts:
118118
continue
119-
words_in_event = _extract_words_lower(
120-
' '.join([part.text for part in event.content.parts if part.text])
119+
event_text = ' '.join(
120+
[part.text for part in event.content.parts if part.text]
121121
)
122+
words_in_event = _extract_words_lower(event_text)
122123
if not words_in_event:
123124
continue
124125

125-
if any(query_word in words_in_event for query_word in words_in_query):
126+
event_text_lower = event_text.lower()
127+
if any(
128+
query_word in words_in_event
129+
or (not query_word.isascii() and query_word in event_text_lower)
130+
for query_word in words_in_query
131+
):
126132
response.memories.append(
127133
MemoryEntry(
128134
content=event.content,

tests/unittests/memory/test_in_memory_memory_service.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,3 +327,50 @@ async def test_search_memory_is_scoped_by_user():
327327
assert (
328328
result_other_user.memories[0].content.parts[0].text == 'This is a secret.'
329329
)
330+
331+
332+
# --- Non-Latin language tests ---
333+
334+
335+
@pytest.mark.asyncio
336+
@pytest.mark.parametrize('event_text,query,expected_count', [
337+
# Japanese (no space delimiters — substring fallback)
338+
('私の名前は太郎です', '太郎', 1),
339+
('私の名前は太郎です', '天気', 0),
340+
# Chinese (no space delimiters — substring fallback)
341+
('我喜欢机器学习', '机器学习', 1),
342+
('我喜欢机器学习', '天气预报', 0),
343+
# Korean (space-delimited — token match)
344+
('제 이름은 민수입니다', '민수입니다', 1),
345+
# Cyrillic (space-delimited — token match)
346+
('Меня зовут Алексей', 'Алексей', 1),
347+
# Mixed: non-Latin substring + Latin token in same event
348+
('太郎 works at ABC Corp', '太郎', 1),
349+
('太郎 works at ABC Corp', 'ABC', 1),
350+
# Latin partial-word must NOT match (regression guard)
351+
('I like to code in Python.', 'thon', 0),
352+
])
353+
async def test_search_memory_non_latin(event_text, query, expected_count):
354+
"""Tests search_memory with non-Latin scripts and mixed content."""
355+
session = Session(
356+
app_name=MOCK_APP_NAME,
357+
user_id=MOCK_USER_ID,
358+
id='session-i18n',
359+
last_update_time=7000,
360+
events=[
361+
Event(
362+
id='event-i18n',
363+
invocation_id='inv-i18n',
364+
author='user',
365+
timestamp=90000,
366+
content=types.Content(parts=[types.Part(text=event_text)]),
367+
),
368+
],
369+
)
370+
memory_service = InMemoryMemoryService()
371+
await memory_service.add_session_to_memory(session)
372+
373+
result = await memory_service.search_memory(
374+
app_name=MOCK_APP_NAME, user_id=MOCK_USER_ID, query=query
375+
)
376+
assert len(result.memories) == expected_count

0 commit comments

Comments
 (0)