fix(workflow): Resolve raw Content output crash on rehydration

wyf7107 · copybara-github · commit 428e7899d722 · 2026-06-08T15:59:02.000-07:00
Port of GitHub PR #5909. Centralizes text extraction from Content in a shared helper, `extract_text_from_content`, and validates rehydrated output against the node's output schema in `_process_rehydrated_output`. If a stored output fails validation against the node's schema due to schema drift, gracefully fall back to parsing it as unvalidated JSON to avoid blocking resumption, rather than crashing. Co-authored-by: Yifan Wang <wanyif@google.com> PiperOrigin-RevId: 927408084
diff --git a/src/google/adk/utils/content_utils.py b/src/google/adk/utils/content_utils.py
@@ -36,3 +36,10 @@ def filter_audio_parts(content: types.Content) -> types.Content | None:
   if not filtered_parts:
     return None
   return types.Content(role=content.role, parts=filtered_parts)
+
+
+def extract_text_from_content(content: types.Content | None) -> str:
+  """Extracts text from a Content object, filtering out thoughts."""
+  if not content or not content.parts:
+    return ''
+  return ''.join(p.text for p in content.parts if p.text and not p.thought)
diff --git a/src/google/adk/workflow/_base_node.py b/src/google/adk/workflow/_base_node.py
@@ -28,6 +28,7 @@
 from pydantic import ValidationError
 
 from ..utils._schema_utils import SchemaType
+from ..utils.content_utils import extract_text_from_content
 from ._retry_config import RetryConfig
 
 if TYPE_CHECKING:
@@ -143,7 +144,7 @@ def _validate_input_data(self, data: Any) -> Any:
     """Validates data against input_schema if set."""
     if self.input_schema and isinstance(data, types.Content):
       # Extract text from Content (e.g. user input from START node).
-      text = ''.join(part.text for part in data.parts if part.text)
+      text = extract_text_from_content(data)
       if self.input_schema is str:
         return text
       # If schema is defined, try to parse the text as JSON.
@@ -168,7 +169,7 @@ def _validate_output_data(self, data: Any) -> Any:
     except ValidationError as e:
       # 2. If failed, try to parse JSON ONLY if it's Content
       if isinstance(data, types.Content):
-        text = ''.join(part.text for part in data.parts if part.text)
+        text = extract_text_from_content(data)
         if self.output_schema is str:
           return text
         if text.strip():
diff --git a/src/google/adk/workflow/utils/_rehydration_utils.py b/src/google/adk/workflow/utils/_rehydration_utils.py
@@ -20,15 +20,23 @@
 from dataclasses import dataclass
 from dataclasses import field
 import json
+import logging
 from typing import Any
+from typing import TYPE_CHECKING
 
+from google.genai import types
 from pydantic import TypeAdapter
 from pydantic import ValidationError
 
 from ...events._node_path_builder import _NodePathBuilder
 from ...events.event import Event
 from ._workflow_hitl_utils import REQUEST_INPUT_FUNCTION_CALL_NAME
 
+if TYPE_CHECKING:
+  from .._base_node import BaseNode
+
+logger = logging.getLogger('google_adk.' + __name__)
+
 _RESULT_KEY = 'result'
 
 
@@ -96,6 +104,49 @@ def _extract_schema_from_event(event: Event, interrupt_id: str) -> Any | None:
   return None
 
 
+def _process_rehydrated_output(node: BaseNode, output: Any) -> Any:
+  """Process rehydrated output from event.content using the node's output schema.
+
+  Protects type consistency between fresh runs and rehydrated runs by
+  properly respecting output schemas, handling model reasoning thought
+  blocks, and ensuring raw strings are returned when no output schema is
+  configured.
+  """
+  if not isinstance(output, types.Content):
+    return output
+
+  from google.adk.utils.content_utils import extract_text_from_content
+
+  text = extract_text_from_content(output).strip()
+
+  if not text:
+    return None
+
+  if node.output_schema:
+    if node.output_schema is str:
+      return text
+    try:
+      validated = TypeAdapter(node.output_schema).validate_json(text)
+      return node._to_serializable(validated)
+    except ValidationError as e:
+      # Fall back to unvalidated JSON parsing on validation failure
+      # to prevent blocking resumption on schema drift.
+      try:
+        parsed = json.loads(text)
+        logger.warning(
+            'Validation failed for rehydrated output against schema: %s. '
+            'Falling back to unvalidated JSON output to allow resumption.',
+            e,
+        )
+        return parsed
+      except ValueError:
+        raise ValueError(
+            f'Validation failed for rehydrated output against schema: {e}'
+        ) from e
+  else:
+    return text
+
+
 def _validate_resume_response(response_data: Any, schema: Any) -> Any:
   """Validates and coerces resume response data against a schema.
 
diff --git a/src/google/adk/workflow/utils/_replay_interceptor.py b/src/google/adk/workflow/utils/_replay_interceptor.py
@@ -27,6 +27,7 @@
 from .._node_state import NodeState
 from .._node_status import NodeStatus
 from ._rehydration_utils import _ChildScanState
+from ._rehydration_utils import _process_rehydrated_output
 
 if TYPE_CHECKING:
   from .._dynamic_node_scheduler import DynamicNodeRun
@@ -112,7 +113,7 @@ def check_interception(
   ):
     # Case 3: Cross-turn successfully completed in a prior turn (fast-forward).
     # Bypass execution completely and return the cached output and route.
-    output = recovered.output
+    output = _process_rehydrated_output(node, recovered.output)
     route = recovered.route
 
   elif recovered.interrupt_ids:
diff --git a/tests/unittests/workflow/utils/test_rehydration_utils.py b/tests/unittests/workflow/utils/test_rehydration_utils.py
@@ -17,7 +17,9 @@
 from google.adk.events.event import Event
 from google.adk.events.event import NodeInfo
 from google.adk.events.request_input import RequestInput
+from google.adk.workflow._base_node import BaseNode
 from google.adk.workflow.utils._rehydration_utils import _ChildScanState
+from google.adk.workflow.utils._rehydration_utils import _process_rehydrated_output
 from google.adk.workflow.utils._rehydration_utils import _reconstruct_node_states
 from google.adk.workflow.utils._rehydration_utils import _unwrap_response
 from google.adk.workflow.utils._rehydration_utils import _validate_resume_response
@@ -103,6 +105,84 @@ def test_roundtrip_wrap_unwrap_dict(self):
     assert _unwrap_response(_wrap_response(d)) == d
 
 
+# --- _process_rehydrated_output ---
+
+
+class TestProcessRehydratedOutput:
+
+  def test_extracts_plain_text_without_schema(self):
+    node = BaseNode(name="dummy")
+    content = types.Content(parts=[types.Part(text="hello world")])
+    assert _process_rehydrated_output(node, content) == "hello world"
+
+  def test_returns_plain_text_even_if_json_when_no_schema(self):
+    node = BaseNode(name="dummy")
+    content = types.Content(parts=[types.Part(text='{"foo": "bar"}')])
+    assert _process_rehydrated_output(node, content) == '{"foo": "bar"}'
+
+  def test_parses_json_text_with_output_schema(self):
+    class MySchema(BaseModel):
+      foo: str
+
+    node = BaseNode(name="dummy", output_schema=MySchema)
+    content = types.Content(parts=[types.Part(text='{"foo": "bar"}')])
+    assert _process_rehydrated_output(node, content) == {"foo": "bar"}
+
+  def test_joins_multiple_parts(self):
+    node = BaseNode(name="dummy")
+    content = types.Content(
+        parts=[types.Part(text="hello "), types.Part(text="world")]
+    )
+    assert _process_rehydrated_output(node, content) == "hello world"
+
+  def test_filters_thought_parts(self):
+    class MySchema(BaseModel):
+      answer: int
+
+    node = BaseNode(name="dummy", output_schema=MySchema)
+    content = types.Content(
+        parts=[
+            types.Part(text="thinking...", thought=True),
+            types.Part(text='{"answer": 42}'),
+        ]
+    )
+    assert _process_rehydrated_output(node, content) == {"answer": 42}
+
+  def test_returns_none_for_empty_text(self):
+    node = BaseNode(name="dummy")
+    content = types.Content(parts=[types.Part(text="  ")])
+    assert _process_rehydrated_output(node, content) is None
+
+  def test_gracefully_falls_back_on_schema_mismatch(self, caplog):
+    class MySchema(BaseModel):
+      foo: str
+      bar: int  # Required field that is missing in the stored output
+
+    node = BaseNode(name="dummy", output_schema=MySchema)
+    content = types.Content(parts=[types.Part(text='{"foo": "only"}')])
+
+    # Should NOT raise ValueError, but fall back to the unvalidated parsed dict
+    res = _process_rehydrated_output(node, content)
+    assert res == {"foo": "only"}
+    assert (
+        "Validation failed for rehydrated output against schema" in caplog.text
+    )
+
+  def test_raises_value_error_if_not_valid_json_on_schema_mismatch(self):
+    class MySchema(BaseModel):
+      foo: str
+
+    node = BaseNode(name="dummy", output_schema=MySchema)
+    content = types.Content(parts=[types.Part(text="invalid json")])
+
+    # Should raise ValueError because it's not valid JSON
+    with pytest.raises(
+        ValueError,
+        match="Validation failed for rehydrated output against schema",
+    ):
+      _process_rehydrated_output(node, content)
+
+
 # --- _validate_resume_response ---