Skip to content

Commit fbe87e5

Browse files
yodeee9yoda
authored and committed
fix: use Unicode-aware keyword extraction in InMemoryMemoryService
Replace `[A-Za-z]+` with `\w+` so that token extraction includes Unicode word characters. Add a non-ASCII substring-containment fallback in `search_memory()` for scripts that do not separate words with whitespace (Japanese, Chinese). Fixes #5501.
1 parent: 684a6e7 · commit: fbe87e5

2 files changed

Lines changed: 61 additions & 5 deletions

File tree

src/google/adk/memory/in_memory_memory_service.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,8 @@ def _user_key(app_name: str, user_id: str) -> str:
3838

3939

4040
def _extract_words_lower(text: str) -> set[str]:
41-
"""Extracts words from a string and converts them to lowercase."""
42-
return set([word.lower() for word in re.findall(r'[A-Za-z]+', text)])
41+
"""Extracts Unicode-aware tokens from a string in lowercase."""
42+
return set(word.lower() for word in re.findall(r'\w+', text))
4343

4444

4545
class InMemoryMemoryService(BaseMemoryService):
@@ -116,13 +116,19 @@ async def search_memory(
116116
for event in session_events:
117117
if not event.content or not event.content.parts:
118118
continue
119-
words_in_event = _extract_words_lower(
120-
' '.join([part.text for part in event.content.parts if part.text])
119+
event_text = ' '.join(
120+
[part.text for part in event.content.parts if part.text]
121121
)
122+
words_in_event = _extract_words_lower(event_text)
122123
if not words_in_event:
123124
continue
124125

125-
if any(query_word in words_in_event for query_word in words_in_query):
126+
event_text_lower = event_text.lower()
127+
if any(
128+
query_word in words_in_event
129+
or (not query_word.isascii() and query_word in event_text_lower)
130+
for query_word in words_in_query
131+
):
126132
response.memories.append(
127133
MemoryEntry(
128134
content=event.content,

tests/unittests/memory/test_in_memory_memory_service.py

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -327,3 +327,53 @@ async def test_search_memory_is_scoped_by_user():
327327
assert (
328328
result_other_user.memories[0].content.parts[0].text == 'This is a secret.'
329329
)
330+
331+
332+
# --- Non-Latin language tests ---
333+
334+
335+
@pytest.mark.asyncio
336+
@pytest.mark.parametrize(
337+
'event_text,query,expected_count',
338+
[
339+
# Japanese (no space delimiters — substring fallback)
340+
('私の名前は太郎です', '太郎', 1),
341+
('私の名前は太郎です', '天気', 0),
342+
# Chinese (no space delimiters — substring fallback)
343+
('我喜欢机器学习', '机器学习', 1),
344+
('我喜欢机器学习', '天气预报', 0),
345+
# Korean (space-delimited — token match)
346+
('제 이름은 민수입니다', '민수입니다', 1),
347+
# Cyrillic (space-delimited — token match)
348+
('Меня зовут Алексей', 'Алексей', 1),
349+
# Mixed: non-Latin substring + Latin token in same event
350+
('太郎 works at ABC Corp', '太郎', 1),
351+
('太郎 works at ABC Corp', 'ABC', 1),
352+
# Latin partial-word must NOT match (regression guard)
353+
('I like to code in Python.', 'thon', 0),
354+
],
355+
)
356+
async def test_search_memory_non_latin(event_text, query, expected_count):
357+
"""Tests search_memory with non-Latin scripts and mixed content."""
358+
session = Session(
359+
app_name=MOCK_APP_NAME,
360+
user_id=MOCK_USER_ID,
361+
id='session-i18n',
362+
last_update_time=7000,
363+
events=[
364+
Event(
365+
id='event-i18n',
366+
invocation_id='inv-i18n',
367+
author='user',
368+
timestamp=90000,
369+
content=types.Content(parts=[types.Part(text=event_text)]),
370+
),
371+
],
372+
)
373+
memory_service = InMemoryMemoryService()
374+
await memory_service.add_session_to_memory(session)
375+
376+
result = await memory_service.search_memory(
377+
app_name=MOCK_APP_NAME, user_id=MOCK_USER_ID, query=query
378+
)
379+
assert len(result.memories) == expected_count

0 commit comments

Comments
 (0)