fix: use Unicode-aware keyword extraction in InMemoryMemoryService

yodeee9 · r-yoda-nri · commit fbe87e5ebd4f · 2026-04-28T09:53:14.000+09:00
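Replace [A-Za-z]+ with \w+ so token extraction includes Unicode word characters (note that \w also matches digits and underscores, which the old pattern dropped). Add a substring-containment fallback in search_memory() that applies only to non-ASCII query tokens, so scripts without whitespace word boundaries (Japanese, Chinese) can still match while ASCII queries keep whole-token matching. Fixes #5501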
diff --git a/src/google/adk/memory/in_memory_memory_service.py b/src/google/adk/memory/in_memory_memory_service.py
@@ -38,8 +38,8 @@ def _user_key(app_name: str, user_id: str) -> str:
 
 
 def _extract_words_lower(text: str) -> set[str]:
-  """Extracts words from a string and converts them to lowercase."""
-  return set([word.lower() for word in re.findall(r'[A-Za-z]+', text)])
+  """Extracts Unicode-aware tokens from a string in lowercase."""
+  return set(word.lower() for word in re.findall(r'\w+', text))
 
 
 class InMemoryMemoryService(BaseMemoryService):
@@ -116,13 +116,19 @@ async def search_memory(
       for event in session_events:
         if not event.content or not event.content.parts:
           continue
-        words_in_event = _extract_words_lower(
-            ' '.join([part.text for part in event.content.parts if part.text])
+        event_text = ' '.join(
+            [part.text for part in event.content.parts if part.text]
         )
+        words_in_event = _extract_words_lower(event_text)
         if not words_in_event:
           continue
 
-        if any(query_word in words_in_event for query_word in words_in_query):
+        event_text_lower = event_text.lower()
+        if any(
+            query_word in words_in_event
+            or (not query_word.isascii() and query_word in event_text_lower)
+            for query_word in words_in_query
+        ):
           response.memories.append(
               MemoryEntry(
                   content=event.content,
diff --git a/tests/unittests/memory/test_in_memory_memory_service.py b/tests/unittests/memory/test_in_memory_memory_service.py
@@ -327,3 +327,53 @@ async def test_search_memory_is_scoped_by_user():
   assert (
       result_other_user.memories[0].content.parts[0].text == 'This is a secret.'
   )
+
+
+# --- Non-Latin language tests ---
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    'event_text,query,expected_count',
+    [
+        # Japanese (no space delimiters — substring fallback)
+        ('私の名前は太郎です', '太郎', 1),
+        ('私の名前は太郎です', '天気', 0),
+        # Chinese (no space delimiters — substring fallback)
+        ('我喜欢机器学习', '机器学习', 1),
+        ('我喜欢机器学习', '天气预报', 0),
+        # Korean (space-delimited — token match)
+        ('제 이름은 민수입니다', '민수입니다', 1),
+        # Cyrillic (space-delimited — token match)
+        ('Меня зовут Алексей', 'Алексей', 1),
+        # Mixed: space-delimited non-Latin token + Latin token in same event
+        ('太郎 works at ABC Corp', '太郎', 1),
+        ('太郎 works at ABC Corp', 'ABC', 1),
+        # Latin partial-word must NOT match (regression guard)
+        ('I like to code in Python.', 'thon', 0),
+    ],
+)
+async def test_search_memory_non_latin(event_text, query, expected_count):
+  """Tests search_memory with non-Latin scripts and mixed content."""
+  session = Session(
+      app_name=MOCK_APP_NAME,
+      user_id=MOCK_USER_ID,
+      id='session-i18n',
+      last_update_time=7000,
+      events=[
+          Event(
+              id='event-i18n',
+              invocation_id='inv-i18n',
+              author='user',
+              timestamp=90000,
+              content=types.Content(parts=[types.Part(text=event_text)]),
+          ),
+      ],
+  )
+  memory_service = InMemoryMemoryService()
+  await memory_service.add_session_to_memory(session)
+
+  result = await memory_service.search_memory(
+      app_name=MOCK_APP_NAME, user_id=MOCK_USER_ID, query=query
+  )
+  assert len(result.memories) == expected_count