fix(ext-workflow): retry transient gRPC errors in wait_for_orchestration_*

javier-aliaga · javier-aliaga · commit e58655d23b96 · 2026-06-01T16:15:55.000+02:00
wait_for_orchestration_start and wait_for_orchestration_completion call
the workflow runtime through the local Dapr sidecar. Immediately after a
sidecar restart (placement re-dissemination not yet applied, actor
registration still propagating, etc.), the sidecar can return
FAILED_PRECONDITION or UNAVAILABLE for an instance whose persistent
state is intact. The previous implementation surfaced these as a hard
error to the caller, so a client polling a long-running workflow would
fail permanently even though the workflow itself was recoverable.

Wrap both wait methods in a single _call_with_transient_retry helper:

  - Retry FAILED_PRECONDITION and UNAVAILABLE with exponential backoff
    (0.5s, doubling, capped at 5s).
  - Respect the caller's timeout. A timeout of 0 or None means unbounded.
    The first call passes the user's timeout verbatim so behavior on a
    healthy runtime is unchanged. On retry, the per-call gRPC deadline
    is the remaining budget against a monotonic deadline anchored to the
    start of the loop.
  - DEADLINE_EXCEEDED and budget exhaustion both surface as the public
    TimeoutError (preserved through a private _TransientTimeout
    sentinel).
  - Non-transient RpcErrors propagate immediately, unchanged.

Behavior on a healthy runtime is unchanged: the first call succeeds and
no retry loop runs.

Signed-off-by: Javier Aliaga <javier@diagrid.io>
diff --git a/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/client.py b/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/client.py
@@ -10,12 +10,18 @@
 # limitations under the License.
 
 import logging
+import time
 import uuid
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from typing import Any, Optional, Sequence, TypeVar, Union
 
+
+class _TransientTimeout(Exception):
+    """Internal sentinel: a wait call hit gRPC DEADLINE_EXCEEDED or the retry
+    loop exhausted the timeout budget. Callers convert it to ``TimeoutError``."""
+
 import dapr.ext.workflow._durabletask.internal.helpers as helpers
 import dapr.ext.workflow._durabletask.internal.orchestrator_service_pb2_grpc as stubs
 import dapr.ext.workflow._durabletask.internal.protos as pb
@@ -220,29 +226,28 @@ def wait_for_orchestration_start(
         self, instance_id: str, *, fetch_payloads: bool = False, timeout: int = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
-        try:
-            grpc_timeout = None if timeout == 0 else timeout
-            self._logger.info(
-                f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to start."
-            )
+        self._logger.info(
+            f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to start."
+        )
+
+        def _call(grpc_timeout):
             res: pb.GetInstanceResponse = self._stub.WaitForInstanceStart(req, timeout=grpc_timeout)
             return new_orchestration_state(req.instanceId, res)
-        except grpc.RpcError as rpc_error:
-            if rpc_error.code() == grpc.StatusCode.DEADLINE_EXCEEDED:  # type: ignore
-                # Replace gRPC error with the built-in TimeoutError
-                raise TimeoutError('Timed-out waiting for the orchestration to start')
-            else:
-                raise
+
+        try:
+            return self._call_with_transient_retry(instance_id, timeout, _call)
+        except _TransientTimeout:
+            raise TimeoutError('Timed-out waiting for the orchestration to start')
 
     def wait_for_orchestration_completion(
         self, instance_id: str, *, fetch_payloads: bool = True, timeout: int = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
-        try:
-            grpc_timeout = None if timeout == 0 else timeout
-            self._logger.info(
-                f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to complete."
-            )
+        self._logger.info(
+            f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to complete."
+        )
+
+        def _call(grpc_timeout):
             res: pb.GetInstanceResponse = self._stub.WaitForInstanceCompletion(
                 req, timeout=grpc_timeout
             )
@@ -262,14 +267,60 @@ def wait_for_orchestration_completion(
                 self._logger.info(f"Instance '{instance_id}' was terminated.")
             elif state.runtime_status == OrchestrationStatus.COMPLETED:
                 self._logger.info(f"Instance '{instance_id}' completed.")
-
             return state
-        except grpc.RpcError as rpc_error:
-            if rpc_error.code() == grpc.StatusCode.DEADLINE_EXCEEDED:  # type: ignore
-                # Replace gRPC error with the built-in TimeoutError
-                raise TimeoutError('Timed-out waiting for the orchestration to complete')
-            else:
-                raise
+
+        try:
+            return self._call_with_transient_retry(instance_id, timeout, _call)
+        except _TransientTimeout:
+            raise TimeoutError('Timed-out waiting for the orchestration to complete')
+
+    # Transient gRPC codes that indicate the workflow runtime is temporarily
+    # unable to locate the workflow actor — typically immediately after a Dapr
+    # sidecar restart (e.g. recovery from chaos). The placement service has the
+    # actor registration, but local daprd hasn't received the dissemination yet.
+    # Without retry, every poll fails permanently with FAILED_PRECONDITION even
+    # though the workflow runtime state is intact.
+    _TRANSIENT_RPC_CODES = (
+        grpc.StatusCode.FAILED_PRECONDITION,
+        grpc.StatusCode.UNAVAILABLE,
+    )
+
+    def _call_with_transient_retry(self, instance_id, timeout, call_fn):
+        """Run ``call_fn(grpc_timeout)``, retrying transient gRPC errors.
+
+        ``timeout`` in (0, None) means unbounded; transients are still retried
+        with backoff. The first call passes ``timeout`` verbatim so behavior
+        without transients matches a non-retrying client. On a retry, the
+        backoff sleep and the per-call gRPC deadline are both bounded by the
+        budget left on a monotonic deadline anchored to the start of the loop.
+
+        Raises ``_TransientTimeout`` on DEADLINE_EXCEEDED or when the budget
+        is exhausted; any other non-transient ``RpcError`` propagates as is.
+        """
+        unbounded = timeout in (0, None)
+        deadline = float('inf') if unbounded else time.monotonic() + timeout
+        grpc_timeout = None if unbounded else timeout
+        backoff = 0.5
+        while True:
+            try:
+                return call_fn(grpc_timeout)
+            except grpc.RpcError as rpc_error:
+                code = rpc_error.code()  # type: ignore
+                if code == grpc.StatusCode.DEADLINE_EXCEEDED:
+                    raise _TransientTimeout()
+                if code not in self._TRANSIENT_RPC_CODES:
+                    raise
+                remaining = deadline - time.monotonic()
+                if remaining <= 0:
+                    raise _TransientTimeout()
+                delay = min(backoff, remaining)  # never sleep past the deadline
+                self._logger.warning(
+                    f"Transient gRPC error {code.name} waiting on instance '{instance_id}'; "
+                    f"retrying in {delay:.1f}s"
+                )
+                time.sleep(delay)
+                backoff = min(backoff * 2, 5.0)
+                grpc_timeout = None if unbounded else max(0.1, deadline - time.monotonic())
 
     def raise_orchestration_event(
         self, instance_id: str, event_name: str, *, data: Optional[Any] = None