google
diff --git a/src/google/adk/memory/vertex_ai_memory_bank_service.py b/src/google/adk/memory/vertex_ai_memory_bank_service.py
Lines changed: 2 additions & 11 deletions
diff --git a/src/google/adk/models/gemini_context_cache_manager.py b/src/google/adk/models/gemini_context_cache_manager.py
Lines changed: 64 additions & 8 deletions
diff --git a/src/google/adk/models/interactions_utils.py b/src/google/adk/models/interactions_utils.py
Lines changed: 122 additions & 6 deletions
@@ -26,7 +26,6 @@
 from google.genai import types
 from typing_extensions import override
 
-from ..utils._google_client_headers import get_tracking_headers
 from ..utils.vertex_ai_utils import get_express_mode_api_key
 from .base_memory_service import BaseMemoryService
 from .base_memory_service import SearchMemoryResponse
@@ -617,17 +616,9 @@ def _get_api_client(self) -> vertexai.AsyncClient:
     """
     import vertexai
 
-    http_options = types.HttpOptions(headers=get_tracking_headers())
     if self._express_mode_api_key:
-      return vertexai.Client(
-          http_options=http_options,
-          api_key=self._express_mode_api_key,
-      ).aio
-    return vertexai.Client(
-        project=self._project,
-        location=self._location,
-        http_options=http_options,
-    ).aio
+      return vertexai.Client(api_key=self._express_mode_api_key).aio
+    return vertexai.Client(project=self._project, location=self._location).aio
 
 
 def _log_ingest_task_error(task: asyncio.Task) -> None:
 
@@ -326,11 +326,21 @@ async def _create_new_cache_with_contents(
       )
       return None
 
-    # Check client-side to avoid unnecessary API round-trips.
-    if llm_request.cacheable_contents_token_count < _GEMINI_MIN_CACHE_TOKENS:
+    # `cacheable_contents_token_count` is the token count of the whole previous
+    # prompt (system instruction + tools + every content). The cache, however,
+    # only stores the prefix `contents[:cache_contents_count]` plus the system
+    # instruction and tools (see `_create_gemini_cache`). On a long conversation
+    # the full-prompt count can clear Gemini's minimum while the cached prefix
+    # is far smaller, which makes `caches.create` fail with 400
+    # INVALID_ARGUMENT.
+    # Gate on the estimated prefix size so we never send a sub-minimum payload.
+    cacheable_prefix_tokens = self._estimate_cacheable_prefix_tokens(
+        llm_request, cache_contents_count
+    )
+    if cacheable_prefix_tokens < _GEMINI_MIN_CACHE_TOKENS:
       logger.info(
-          "Request below Gemini minimum cache size (%d < %d tokens)",
-          llm_request.cacheable_contents_token_count,
+          "Cacheable prefix below Gemini minimum cache size (%d < %d tokens)",
+          cacheable_prefix_tokens,
           _GEMINI_MIN_CACHE_TOKENS,
       )
       return None
@@ -342,13 +352,20 @@ async def _create_new_cache_with_contents(
       logger.warning("Failed to create cache: %s", e)
       return None
 
-  def _estimate_request_tokens(self, llm_request: LlmRequest) -> int:
-    """Estimate token count for the request.
+  def _estimate_request_tokens(
+      self,
+      llm_request: LlmRequest,
+      cache_contents_count: Optional[int] = None,
+  ) -> int:
+    """Estimate token count for the request (or its cacheable prefix).
 
     This is a rough estimation based on content text length.
 
     Args:
         llm_request: Request to estimate tokens for
+        cache_contents_count: When provided, only the first
+            ``cache_contents_count`` contents are counted (the prefix that gets
+            cached); the system instruction and tools are always included.
 
     Returns:
         Estimated token count
@@ -366,15 +383,54 @@ def _estimate_request_tokens(self, llm_request: LlmRequest) -> int:
           tool_str = json.dumps(tool.model_dump())
           total_chars += len(tool_str)
 
-    # Contents
-    for content in llm_request.contents:
+    # Contents (optionally limited to the cacheable prefix)
+    contents = llm_request.contents
+    if cache_contents_count is not None:
+      contents = contents[:cache_contents_count]
+    for content in contents:
       for part in content.parts:
         if part.text:
           total_chars += len(part.text)
 
     # Rough estimate: 4 characters per token
     return total_chars // 4
 
+  def _estimate_cacheable_prefix_tokens(
+      self, llm_request: LlmRequest, cache_contents_count: int
+  ) -> int:
+    """Estimate the token count of the prefix that will actually be cached.
+
+    ``cacheable_contents_token_count`` covers the entire previous prompt, but
+    the cache stores only ``contents[:cache_contents_count]`` (plus system
+    instruction and tools), so that count is scaled by the prefix's estimated
+    share of the request. Integer arithmetic keeps the scaling exact: a float
+    ratio can truncate one token low (``int(100 * (29 / 100)) == 28``).
+
+    Args:
+        llm_request: Request to estimate the cacheable prefix tokens for
+        cache_contents_count: Number of leading contents that get cached
+
+    Returns:
+        Estimated token count of the cacheable prefix
+    """
+    full_tokens = llm_request.cacheable_contents_token_count
+    if not full_tokens:
+      return 0
+    if cache_contents_count >= len(llm_request.contents):
+      # Prefix spans the whole request: the scale factor is 1, skip estimating.
+      return full_tokens
+
+    full_estimate = self._estimate_request_tokens(llm_request)
+    if full_estimate <= 0:
+      # No text to estimate from (e.g. non-text parts); fall back to the
+      # accurate full count rather than incorrectly skipping the cache.
+      return full_tokens
+
+    prefix_estimate = self._estimate_request_tokens(
+        llm_request, cache_contents_count
+    )
+    return full_tokens * min(prefix_estimate, full_estimate) // full_estimate
+
   async def _create_gemini_cache(
       self, llm_request: LlmRequest, cache_contents_count: int
   ) -> CacheMetadata:
 
@@ -706,13 +706,23 @@ def convert_interaction_to_llm_response(
 
 @dataclasses.dataclass
 class _StreamState:
-  """Accumulates streamed parts across SSE events.
+  """Accumulates streamed parts and grounding data across SSE events.
 
   ``parts`` collects ``types.Part``s in arrival order to assemble the final
-  ``Content``.
+  ``Content``. The grounding fields accumulate google_search / citation data
+  that maps to ``grounding_metadata`` (a top-level ``LlmResponse`` field, not a
+  part) so it can be reattached to the final, persisted event.
   """
 
   parts: list[types.Part] = dataclasses.field(default_factory=list)
+  web_search_queries: list[str] = dataclasses.field(default_factory=list)
+  grounding_chunks: list[types.GroundingChunk] = dataclasses.field(
+      default_factory=list
+  )
+  grounding_supports: list[types.GroundingSupport] = dataclasses.field(
+      default_factory=list
+  )
+  search_entry_point: types.SearchEntryPoint | None = None
 
 
 def _partial_part_response(
@@ -727,6 +737,18 @@ def _partial_part_response(
   )
 
 
+def _partial_grounding_response(
+    grounding_metadata: types.GroundingMetadata, interaction_id: str | None
+) -> LlmResponse:
+  """Wrap a grounding increment in a partial, non-final LlmResponse."""
+  return LlmResponse(
+      interaction_id=interaction_id,
+      partial=True,
+      turn_complete=False,
+      grounding_metadata=grounding_metadata,
+  )
+
+
 def _handle_text(
     delta: StepDeltaData, state: _StreamState, interaction_id: str | None
 ) -> LlmResponse | None:
@@ -862,6 +884,69 @@ def _handle_code_execution_result(
   return _partial_part_response(part, interaction_id)
 
 
+def _handle_google_search_call(
+    delta: StepDeltaData, state: _StreamState, interaction_id: str | None
+) -> LlmResponse | None:
+  """Accumulate streamed search queries; emit them as partial grounding."""
+  if not (queries := delta.arguments and delta.arguments.queries):
+    return None
+  state.web_search_queries.extend(queries)
+  metadata = types.GroundingMetadata(web_search_queries=list(queries))
+  return _partial_grounding_response(metadata, interaction_id)
+
+
+def _handle_google_search_result(
+    delta: StepDeltaData, state: _StreamState, interaction_id: str | None
+) -> LlmResponse | None:
+  """Capture the first rendered search suggestion as the search entry point."""
+  # Only the first non-empty `search_suggestions` in the delta is used; any
+  # later results in the same delta are not inspected.
+  candidates = (r.search_suggestions for r in delta.result or [])
+  rendered = next((s for s in candidates if s), None)
+  if rendered is None:
+    return None
+  entry_point = types.SearchEntryPoint(rendered_content=rendered)
+  state.search_entry_point = entry_point
+  metadata = types.GroundingMetadata(search_entry_point=entry_point)
+  return _partial_grounding_response(metadata, interaction_id)
+
+
+def _handle_text_annotation(
+    delta: StepDeltaData, state: _StreamState, interaction_id: str | None
+) -> LlmResponse | None:
+  """Turn streamed `url_citation` annotations into grounding chunks/supports.
+
+  Annotations of any other type are ignored. Returns None, leaving `state`
+  untouched, when the delta carries no `url_citation` annotation.
+  """
+  citations = [
+      annotation
+      for annotation in delta.annotations or []
+      if getattr(annotation, 'type', None) == 'url_citation'
+  ]
+  if not citations:
+    return None
+  chunks: list[types.GroundingChunk] = []
+  supports: list[types.GroundingSupport] = []
+  # Chunk indices continue from the chunks already accumulated in `state`.
+  # Each citation yields one chunk and one support pointing at that chunk.
+  for index, cite in enumerate(citations, start=len(state.grounding_chunks)):
+    web = types.GroundingChunkWeb(uri=cite.url, title=cite.title)
+    chunks.append(types.GroundingChunk(web=web))
+    span = types.Segment(
+        start_index=cite.start_index, end_index=cite.end_index
+    )
+    supports.append(
+        types.GroundingSupport(segment=span, grounding_chunk_indices=[index])
+    )
+  state.grounding_chunks.extend(chunks)
+  state.grounding_supports.extend(supports)
+  metadata = types.GroundingMetadata(
+      grounding_chunks=chunks, grounding_supports=supports
+  )
+  return _partial_grounding_response(metadata, interaction_id)
+
+
 def _handle_function_result(
     delta: StepDeltaData, state: _StreamState, interaction_id: str | None
 ) -> LlmResponse | None:
@@ -875,6 +960,24 @@ def _handle_function_result(
   return _partial_part_response(part, interaction_id)
 
 
+def _build_grounding_metadata(
+    state: _StreamState,
+) -> types.GroundingMetadata | None:
+  """Assemble the grounding accumulated in `state` into one GroundingMetadata.
+
+  Returns None when nothing was collected; empty lists are passed as None.
+  """
+  fields = {
+      'web_search_queries': state.web_search_queries or None,
+      'grounding_chunks': state.grounding_chunks or None,
+      'grounding_supports': state.grounding_supports or None,
+      'search_entry_point': state.search_entry_point,
+  }
+  if not any(fields.values()):
+    return None
+  return types.GroundingMetadata(**fields)
+
+
 def convert_interaction_event_to_llm_response(
     event: InteractionSSEEvent,
     state: _StreamState,
@@ -931,6 +1034,12 @@ def convert_interaction_event_to_llm_response(
       return _handle_code_execution_call(delta, state, interaction_id)
     elif delta_type == 'code_execution_result':
       return _handle_code_execution_result(delta, state, interaction_id)
+    elif delta_type == 'google_search_call':
+      return _handle_google_search_call(delta, state, interaction_id)
+    elif delta_type == 'google_search_result':
+      return _handle_google_search_result(delta, state, interaction_id)
+    elif delta_type == 'text_annotation_delta':
+      return _handle_text_annotation(delta, state, interaction_id)
     elif delta_type == 'function_result':
       return _handle_function_result(delta, state, interaction_id)
     else:
@@ -968,16 +1077,23 @@ def convert_interaction_event_to_llm_response(
     return None
 
   elif isinstance(event, InteractionCompletedEvent):
-    # Final aggregated response
-    if state.parts:
+    grounding_metadata = _build_grounding_metadata(state)
+    if state.parts or grounding_metadata is not None:
+      content = (
+          types.Content(role='model', parts=state.parts)
+          if state.parts
+          else None
+      )
       return LlmResponse(
-          content=types.Content(role='model', parts=state.parts),
+          content=content,
+          grounding_metadata=grounding_metadata,
+          usage_metadata=_usage_metadata_from_interaction(event.interaction),
           partial=False,
           turn_complete=True,
           finish_reason=types.FinishReason.STOP,
           interaction_id=interaction_id,
       )
-    # If no streaming parts were collected, convert the final interaction directly
+    # No streaming parts or grounding collected: convert the final interaction.
     return convert_interaction_to_llm_response(event.interaction)
 
   elif isinstance(event, Interaction):