fix(responses-durable): normalize agent_reference before persisting durable task input

RaviPidaparthi · Copilot · RaviPidaparthi · commit 06b6aa0fd77a · 2026-06-18T20:44:35.000Z
On hosted, the platform injects `agent_reference` as an AgentReference model
(a Mapping but not json.dumps-serializable). It leaked through
_split_runtime_refs into the persisted durable-task input, so
create_and_start -> _resolve_input_storage raised
`TypeError: Object of type AgentReference is not JSON serializable` and the
durable background start silently fell back to a non-durable
asyncio.create_task — meaning NO durable task was created and crash recovery
never happened on hosted.

_split_runtime_refs now normalizes a model-typed agent_reference to a plain
dict (consumers all accept AgentReference | dict and read it as a mapping; the
dict also survives cross-process recovery). Absent agent_reference stays the
{} sentinel.

This was invisible to the conformance suite because local/conformance requests
carry no agent_reference (-> {} sentinel -> serializable). Adds
TestSplitRuntimeRefsSerializable asserting the persisted durable input is
JSON-serializable when agent_reference is a model.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/agentserver/azure-ai-agentserver-responses/azure/ai/agentserver/responses/hosting/_durable_orchestrator.py b/sdk/agentserver/azure-ai-agentserver-responses/azure/ai/agentserver/responses/hosting/_durable_orchestrator.py
@@ -120,7 +120,9 @@ def _build_server_error_payload(
 )
 
 
-def _split_runtime_refs(ctx_params: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+def _split_runtime_refs(
+    ctx_params: dict[str, Any],
+) -> tuple[dict[str, Any], dict[str, Any]]:
     """Split ``ctx_params`` into refs (memory-only) and persisted params.
 
     :param ctx_params: The orchestrator's combined params dict.
@@ -137,6 +139,29 @@ def _split_runtime_refs(ctx_params: dict[str, Any]) -> tuple[dict[str, Any], dic
             refs[k] = v
         else:
             persisted[k] = v
+    # The hosted gateway injects ``agent_reference`` as an ``AgentReference``
+    # model. That model is a Mapping but is NOT ``json.dumps``-serializable, so
+    # if it leaks into the persisted durable-task input the underlying
+    # ``create_and_start`` -> ``_resolve_input_storage`` size check raises
+    # ``TypeError`` and the whole durable start silently falls back to a
+    # non-durable ``asyncio.create_task`` (no crash recovery). Normalize it to a
+    # plain dict here: the durable input must be JSON-serializable AND survive
+    # cross-process recovery, and every consumer accepts ``AgentReference | dict``
+    # (and reads it as a mapping). Absent agent_reference is the ``{}`` sentinel,
+    # which is already serializable.
+    agent_reference = persisted.get("agent_reference")
+    if agent_reference is not None and not isinstance(agent_reference, dict):
+        if hasattr(agent_reference, "as_dict"):
+            persisted["agent_reference"] = agent_reference.as_dict()
+        else:
+            try:
+                persisted["agent_reference"] = dict(agent_reference)
+            except (TypeError, ValueError):
+                persisted["agent_reference"] = {
+                    "type": getattr(agent_reference, "type", "agent_reference"),
+                    "name": getattr(agent_reference, "name", None),
+                    "version": getattr(agent_reference, "version", None),
+                }
     return refs, persisted
 
 
@@ -160,7 +185,9 @@ def _reconstruct_parsed_from_params(params: dict[str, Any]) -> Any:
             "missing. Ensure the orchestrator stamps it at fresh-entry."
         )
     # Late import to avoid circular dependency on hosting/_request_parsing.
-    from ..models._generated import CreateResponse  # pylint: disable=import-outside-toplevel
+    from ..models._generated import (
+        CreateResponse,
+    )  # pylint: disable=import-outside-toplevel
 
     if isinstance(payload, dict):
         return CreateResponse(payload)
@@ -196,8 +223,14 @@ def _reconstruct_from_params(
     :rtype: tuple[ResponseExecution, ResponseContext]
     """
     # Late imports to avoid module-level circular dependencies.
-    from .._response_context import IsolationContext, ResponseContext  # pylint: disable=import-outside-toplevel
-    from ..models.runtime import ResponseExecution, ResponseModeFlags  # pylint: disable=import-outside-toplevel
+    from .._response_context import (
+        IsolationContext,
+        ResponseContext,
+    )  # pylint: disable=import-outside-toplevel
+    from ..models.runtime import (
+        ResponseExecution,
+        ResponseModeFlags,
+    )  # pylint: disable=import-outside-toplevel
 
     parsed = _reconstruct_parsed_from_params(params)
 
@@ -226,7 +259,9 @@ def _reconstruct_from_params(
         input_items=record.input_items,
         previous_response_id=record.previous_response_id,
         conversation_id=record.conversation_id,
-        history_limit=int(params.get("history_limit", runtime_options.default_fetch_history_count)),
+        history_limit=int(
+            params.get("history_limit", runtime_options.default_fetch_history_count)
+        ),
         # Client headers / query params are not preserved across recovery
         # — they were specific to the original HTTP request and are not
         # meaningful for the recovered handler.
@@ -509,7 +544,9 @@ def _ref(key: str) -> Any:
         # next-lifetime recovery can dispatch correctly without needing to
         # reconstruct the routing decisions from input params.
         if _RESP_DISPOSITION not in responses_ns:
-            responses_ns[_RESP_DISPOSITION] = params.get("disposition", DISPOSITION_REINVOKE)
+            responses_ns[_RESP_DISPOSITION] = params.get(
+                "disposition", DISPOSITION_REINVOKE
+            )
             # Force-flush so the disposition is durable BEFORE the body
             # could be killed — without an explicit flush the recovered
             # task would default to ``re-invoke`` and skip the mark-failed
@@ -581,8 +618,12 @@ def _ref(key: str) -> Any:
                 runtime_state=self._runtime_state,
                 runtime_options=self._options,
             )
-            assert record is not None, "_reconstruct_from_params guarantees non-None record"
-            assert self._runtime_state is not None, "runtime_state always wired at orchestrator init"
+            assert (
+                record is not None
+            ), "_reconstruct_from_params guarantees non-None record"
+            assert (
+                self._runtime_state is not None
+            ), "runtime_state always wired at orchestrator init"
             await self._runtime_state.add(record)
 
         # After the reconstruction block, context and record are both
@@ -646,7 +687,8 @@ def _ref(key: str) -> Any:
                     return
                 except Exception:  # pylint: disable=broad-exception-caught
                     logger.debug(
-                        "persisted_response pre-fetch failed for %s " "(recovery, transient — not dropping)",
+                        "persisted_response pre-fetch failed for %s "
+                        "(recovery, transient — not dropping)",
                         context.response_id,
                         exc_info=True,
                     )
@@ -772,7 +814,11 @@ async def _bridge() -> None:
             # mid-handler with grace exhausted) silently loses the
             # response because the one-shot ephemeral record is deleted
             # on cancel.
-            if ctx.shutdown.is_set() and record is not None and record.status in {"queued", "in_progress"}:
+            if (
+                ctx.shutdown.is_set()
+                and record is not None
+                and record.status in {"queued", "in_progress"}
+            ):
                 logger.info(
                     "Response %s handler returned during shutdown without "
                     "terminal; calling ctx.exit_for_recovery() so task stays "
@@ -950,11 +996,16 @@ async def _persist_crash_failed(
         # happened after terminal persistence, and overwriting would corrupt
         # the result.
         try:
-            existing = await self._provider.get_response(response_id, isolation=isolation)
+            existing = await self._provider.get_response(
+                response_id, isolation=isolation
+            )
             existing_status = getattr(existing, "status", None) or (
                 existing.get("status") if isinstance(existing, dict) else None
             )
-            if isinstance(existing_status, str) and existing_status in _TERMINAL_STATUSES:
+            if (
+                isinstance(existing_status, str)
+                and existing_status in _TERMINAL_STATUSES
+            ):
                 logger.info(
                     "_persist_crash_failed: response %s already terminal "
                     "(status=%s) — skipping overwrite (race avoidance)",
@@ -977,7 +1028,9 @@ async def _persist_crash_failed(
         )
 
         try:
-            await self._provider.update_response(ResponseObject(failed_response), isolation=isolation)
+            await self._provider.update_response(
+                ResponseObject(failed_response), isolation=isolation
+            )
         except KeyError:
             # Response was never persisted at response.created — try
             # create instead so the failed terminal still lands.
diff --git a/sdk/agentserver/azure-ai-agentserver-responses/tests/unit/test_durable_orchestrator.py b/sdk/agentserver/azure-ai-agentserver-responses/tests/unit/test_durable_orchestrator.py
@@ -13,6 +13,7 @@
 from azure.ai.agentserver.responses.hosting._durable_orchestrator import (
     DurableResponseOrchestrator,
     _is_recovered_entry,
+    _split_runtime_refs,
 )
 
 
@@ -162,7 +163,9 @@ async def test_calls_run_background_non_stream(self) -> None:
         ctx.entry_mode = "fresh"
         ctx.retry_attempt = 0
         ctx.is_steered_turn = False  # Spec 016 FR-020: was_steered renamed
-        ctx.pending_input_count = 0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        ctx.pending_input_count = (
+            0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        )
         ctx.metadata = _FakeTaskMetadata()
         ctx._cancellation_signal = asyncio.Event()
         ctx.shutdown = asyncio.Event()
@@ -195,7 +198,9 @@ async def test_calls_run_background_non_stream(self) -> None:
         assert kwargs["model"] == "gpt-4o"
 
     @pytest.mark.asyncio
-    async def test_recovery_and_steering_fields_flattened_on_response_context(self) -> None:
+    async def test_recovery_and_steering_fields_flattened_on_response_context(
+        self,
+    ) -> None:
         """(Spec 024 Phase 5 — Proposal #10/#13) Recovery + steering
         classifiers land directly on ``ResponseContext`` flat fields.
         The pre-Phase-5 ``DurabilityContext`` indirection is deleted —
@@ -210,7 +215,10 @@ async def test_recovery_and_steering_fields_flattened_on_response_context(self)
             options=MagicMock(steerable_conversations=False),
         )
 
-        from azure.ai.agentserver.responses._response_context import IsolationContext, ResponseContext
+        from azure.ai.agentserver.responses._response_context import (
+            IsolationContext,
+            ResponseContext,
+        )
         from azure.ai.agentserver.responses.models.runtime import ResponseModeFlags
 
         real_context = ResponseContext(
@@ -250,9 +258,13 @@ async def test_recovery_and_steering_fields_flattened_on_response_context(self)
         assert real_context.pending_input_count == 2
         assert not hasattr(real_context, "durability")
         # The metadata facade was swapped in to back the task metadata.
-        from azure.ai.agentserver.responses._durability_context import _DeveloperMetadataFacade
+        from azure.ai.agentserver.responses._durability_context import (
+            _DeveloperMetadataFacade,
+        )
 
-        assert isinstance(real_context.conversation_chain_metadata, _DeveloperMetadataFacade)
+        assert isinstance(
+            real_context.conversation_chain_metadata, _DeveloperMetadataFacade
+        )
 
     @pytest.mark.asyncio
     async def test_steerable_returns_none_for_implicit_suspend(self) -> None:
@@ -270,7 +282,9 @@ async def test_steerable_returns_none_for_implicit_suspend(self) -> None:
         ctx.entry_mode = "fresh"
         ctx.retry_attempt = 0
         ctx.is_steered_turn = False  # Spec 016 FR-020: was_steered renamed
-        ctx.pending_input_count = 0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        ctx.pending_input_count = (
+            0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        )
         ctx.metadata = _FakeTaskMetadata()
         ctx._cancellation_signal = asyncio.Event()
         ctx.shutdown = asyncio.Event()
@@ -310,7 +324,9 @@ async def test_non_steerable_returns_none_too(self) -> None:
         ctx.entry_mode = "fresh"
         ctx.retry_attempt = 0
         ctx.is_steered_turn = False  # Spec 016 FR-020: was_steered renamed
-        ctx.pending_input_count = 0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        ctx.pending_input_count = (
+            0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        )
         ctx.metadata = _FakeTaskMetadata()
         ctx._cancellation_signal = asyncio.Event()
         ctx.shutdown = asyncio.Event()
@@ -350,7 +366,9 @@ async def test_cancel_bridge_propagates(self) -> None:
         ctx.entry_mode = "fresh"
         ctx.retry_attempt = 0
         ctx.is_steered_turn = False  # Spec 016 FR-020: was_steered renamed
-        ctx.pending_input_count = 0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        ctx.pending_input_count = (
+            0  # Spec 016 FR-019: pending_inputs Sequence renamed to live int count
+        )
         ctx.metadata = _FakeTaskMetadata()
         ctx._cancellation_signal = asyncio.Event()
         ctx.shutdown = asyncio.Event()
@@ -441,8 +459,12 @@ def test_pick_primitive_matrix(
         )
 
         # Both primitives must exist (precondition for the matrix).
-        assert hasattr(orch, "_one_shot_task_fn"), f"{case_id}: orchestrator must register a one-shot primitive."
-        assert hasattr(orch, "_multi_turn_task_fn"), f"{case_id}: orchestrator must register a multi-turn primitive."
+        assert hasattr(
+            orch, "_one_shot_task_fn"
+        ), f"{case_id}: orchestrator must register a one-shot primitive."
+        assert hasattr(
+            orch, "_multi_turn_task_fn"
+        ), f"{case_id}: orchestrator must register a multi-turn primitive."
 
         ctx_params = {
             "response_id": "resp_test",
@@ -472,22 +494,31 @@ def test_orchestrator_registers_both_primitives_on_construction(self) -> None:
         deployment that mis-imports the core wheel fails fast at
         server startup instead of per-request.
         """
-        opts = MagicMock(steerable_conversations=False, max_pending=10, default_fetch_history_count=100)
+        opts = MagicMock(
+            steerable_conversations=False,
+            max_pending=10,
+            default_fetch_history_count=100,
+        )
         orch = DurableResponseOrchestrator(
             create_fn=AsyncMock(),
             provider=MagicMock(),
             options=opts,
         )
 
         # Both registrations are present.
-        assert hasattr(orch, "_one_shot_task_fn"), "Construction must register the one-shot primitive."
-        assert hasattr(orch, "_multi_turn_task_fn"), "Construction must register the multi-turn primitive."
+        assert hasattr(
+            orch, "_one_shot_task_fn"
+        ), "Construction must register the one-shot primitive."
+        assert hasattr(
+            orch, "_multi_turn_task_fn"
+        ), "Construction must register the multi-turn primitive."
 
         # Names are distinct and well-formed.
         one_shot_name = orch._one_shot_task_fn._opts.name
         multi_turn_name = orch._multi_turn_task_fn._opts.name
         assert one_shot_name != multi_turn_name, (
-            f"Primitives must have distinct registration names " f"(both got {one_shot_name!r})."
+            f"Primitives must have distinct registration names "
+            f"(both got {one_shot_name!r})."
         )
         assert (
             "one_shot" in one_shot_name or "oneshot" in one_shot_name
@@ -499,13 +530,18 @@ def test_orchestrator_registers_both_primitives_on_construction(self) -> None:
         # The multi-turn primitive's steerable flag MUST match the
         # deployment's steerable_conversations option (per SOT §6.6).
         assert orch._multi_turn_task_fn._opts.steerable is False, (
-            "Multi-turn primitive's steerable flag must match " "options.steerable_conversations."
+            "Multi-turn primitive's steerable flag must match "
+            "options.steerable_conversations."
         )
 
     def test_orchestrator_multi_turn_steerable_flag_propagated(self) -> None:
         """With ``steerable_conversations=True``, the multi-turn primitive
         is registered with ``steerable=True``."""
-        opts = MagicMock(steerable_conversations=True, max_pending=10, default_fetch_history_count=100)
+        opts = MagicMock(
+            steerable_conversations=True,
+            max_pending=10,
+            default_fetch_history_count=100,
+        )
         orch = DurableResponseOrchestrator(
             create_fn=AsyncMock(),
             provider=MagicMock(),
@@ -514,3 +550,66 @@ def test_orchestrator_multi_turn_steerable_flag_propagated(self) -> None:
         assert (
             orch._multi_turn_task_fn._opts.steerable is True
         ), "Steerable flag must propagate from options to multi-turn primitive."
+
+
+class TestSplitRuntimeRefsSerializable:
+    """The persisted durable-task input MUST be JSON-serializable.
+
+    Regression for the hosted bug where the gateway-injected
+    ``agent_reference`` (an ``AgentReference`` model — a Mapping but not
+    ``json.dumps``-serializable) leaked into the persisted params, making
+    ``create_and_start`` raise ``TypeError`` and silently degrade the durable
+    background run to a non-durable ``asyncio.create_task`` (no crash recovery).
+    """
+
+    def test_persisted_params_json_serializable_with_agent_reference_model(
+        self,
+    ) -> None:
+        import json
+
+        from azure.ai.agentserver.responses.models import AgentReference
+
+        ctx_params = {
+            "response_id": "caresp_abc",
+            "agent_name": "durable-responses-agent-demo",
+            "session_id": "sess_1",
+            "agent_reference": AgentReference(
+                name="durable-responses-agent-demo", version="29"
+            ),
+            # a runtime-only object ref that must be stripped, never persisted
+            "_record_ref": object(),
+        }
+
+        refs, persisted = _split_runtime_refs(ctx_params)
+
+        # refs hold the non-serializable object reference; not persisted
+        assert "_record_ref" in refs
+        assert "_record_ref" not in persisted
+
+        # agent_reference survives in the persisted input (needed across
+        # cross-process recovery) but normalized to a plain dict
+        assert isinstance(persisted["agent_reference"], dict)
+        assert (
+            persisted["agent_reference"].get("name") == "durable-responses-agent-demo"
+        )
+        assert persisted["agent_reference"].get("version") == "29"
+
+        # the whole persisted input must JSON-serialize (this is what the
+        # core durable-task size check does and what previously raised)
+        json.dumps(persisted)  # must not raise
+
+    def test_empty_agent_reference_sentinel_passthrough(self) -> None:
+        import json
+
+        # absent agent_reference is the ``{}`` sentinel — already serializable
+        _, persisted = _split_runtime_refs({"response_id": "r", "agent_reference": {}})
+        assert persisted["agent_reference"] == {}
+        json.dumps(persisted)
+
+    def test_dict_agent_reference_unchanged(self) -> None:
+        import json
+
+        ar = {"type": "agent_reference", "name": "x", "version": "1"}
+        _, persisted = _split_runtime_refs({"response_id": "r", "agent_reference": ar})
+        assert persisted["agent_reference"] == ar
+        json.dumps(persisted)