fix(ext-workflow): bound transient retries and address review feedback

javier-aliaga · javier-aliaga · commit f63311d281e2 · 2026-06-01T17:43:58.000+02:00
Cap continuous transient-error retries in unbounded mode (timeout=0/None)
at 30s via _MAX_TRANSIENT_RETRY_SECONDS, then re-raise the original
RpcError. This preserves the pre-retry contract: timeout=0 still waits
indefinitely for a healthy workflow and never raises TimeoutError, but a
permanently-unavailable sidecar now surfaces the original error instead
of retrying forever.

Also address review feedback:
  - Type wait_for_orchestration_* timeout as Optional[int] (None is a
    supported, tested input meaning unbounded).
  - Fix the sync log message, which rendered "up to Nones" for
    timeout=None; None is now treated as indefinite, matching the async client.
  - Correct the retry-helper docstring: the first call passes grpc_timeout
    (None when unbounded), not the timeout value verbatim.

Add a test covering unbounded-mode transient exhaustion surfacing as the
original RpcError (not TimeoutError, not a hang).

Signed-off-by: Javier Aliaga <javier@diagrid.io>
diff --git a/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/aio/client.py b/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/aio/client.py
@@ -123,7 +123,7 @@ async def get_orchestration_state(
         return new_orchestration_state(req.instanceId, res)
 
     async def wait_for_orchestration_start(
-        self, instance_id: str, *, fetch_payloads: bool = False, timeout: int = 0
+        self, instance_id: str, *, fetch_payloads: bool = False, timeout: Optional[int] = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
         self._logger.info(
@@ -142,7 +142,7 @@ async def _call(grpc_timeout):
             raise TimeoutError('Timed-out waiting for the orchestration to start')
 
     async def wait_for_orchestration_completion(
-        self, instance_id: str, *, fetch_payloads: bool = True, timeout: int = 0
+        self, instance_id: str, *, fetch_payloads: bool = True, timeout: Optional[int] = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
         self._logger.info(
@@ -187,17 +187,26 @@ async def _call(grpc_timeout):
         grpc.StatusCode.UNAVAILABLE,
     )
 
+    # See TaskHubGrpcClient._MAX_TRANSIENT_RETRY_SECONDS — same grace window for
+    # unbounded (timeout=0) callers so a down sidecar surfaces the original
+    # error instead of retrying forever.
+    _MAX_TRANSIENT_RETRY_SECONDS = 30.0
+
     async def _call_with_transient_retry(self, instance_id, timeout, call_fn):
         """Async mirror of TaskHubGrpcClient._call_with_transient_retry.
         Retries FAILED_PRECONDITION/UNAVAILABLE with capped exponential
         backoff while clamping sleep and per-call gRPC timeout to the
-        remaining budget. The first call passes ``timeout`` verbatim so
-        callers observe identical behavior on a healthy runtime.
+        remaining budget. The first call passes ``grpc_timeout`` (the caller's
+        timeout, or ``None`` when unbounded, i.e. timeout 0/None) so callers
+        observe identical behavior on a healthy runtime. In unbounded mode,
+        continuous transient retries are capped at
+        ``_MAX_TRANSIENT_RETRY_SECONDS`` before the original error propagates.
         """
         unbounded = timeout in (0, None)
         deadline = None if unbounded else time.monotonic() + timeout
         grpc_timeout = None if unbounded else timeout
         backoff = 0.5
+        transient_deadline = None  # unbounded mode only; anchored on first transient
         while True:
             try:
                 return await call_fn(grpc_timeout)
@@ -208,16 +217,26 @@ async def _call_with_transient_retry(self, instance_id, timeout, call_fn):
                 if code not in self._TRANSIENT_RPC_CODES:
                     raise
 
+                now = time.monotonic()
+
+                if unbounded:
+                    if transient_deadline is None:
+                        transient_deadline = now + self._MAX_TRANSIENT_RETRY_SECONDS
+                    elif now >= transient_deadline:
+                        raise
+
                 if deadline is None:
                     remaining = None
                 else:
-                    remaining = deadline - time.monotonic()
+                    remaining = deadline - now
                     if remaining <= 0:
                         raise _TransientTimeout()
 
                 sleep_for = min(backoff, 5.0)
                 if remaining is not None:
                     sleep_for = min(sleep_for, remaining)
+                if transient_deadline is not None:
+                    sleep_for = min(sleep_for, transient_deadline - now)
                 self._logger.warning(
                     f"Transient gRPC error {code.name} waiting on instance '{instance_id}'; "
                     f'retrying in {sleep_for:.2f}s'
diff --git a/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/client.py b/ext/dapr-ext-workflow/dapr/ext/workflow/_durabletask/client.py
@@ -224,11 +224,11 @@ def get_orchestration_state(
         return new_orchestration_state(req.instanceId, res)
 
     def wait_for_orchestration_start(
-        self, instance_id: str, *, fetch_payloads: bool = False, timeout: int = 0
+        self, instance_id: str, *, fetch_payloads: bool = False, timeout: Optional[int] = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
         self._logger.info(
-            f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to start."
+            f"Waiting {'indefinitely' if timeout in (0, None) else f'up to {timeout}s'} for instance '{instance_id}' to start."
         )
 
         def _call(grpc_timeout):
@@ -241,11 +241,11 @@ def _call(grpc_timeout):
             raise TimeoutError('Timed-out waiting for the orchestration to start')
 
     def wait_for_orchestration_completion(
-        self, instance_id: str, *, fetch_payloads: bool = True, timeout: int = 0
+        self, instance_id: str, *, fetch_payloads: bool = True, timeout: Optional[int] = 0
     ) -> Optional[WorkflowState]:
         req = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
         self._logger.info(
-            f"Waiting {'indefinitely' if timeout == 0 else f'up to {timeout}s'} for instance '{instance_id}' to complete."
+            f"Waiting {'indefinitely' if timeout in (0, None) else f'up to {timeout}s'} for instance '{instance_id}' to complete."
         )
 
         def _call(grpc_timeout):
@@ -286,15 +286,25 @@ def _call(grpc_timeout):
         grpc.StatusCode.UNAVAILABLE,
     )
 
+    # When the caller sets no timeout (timeout=0), bound how long we keep
+    # retrying *consecutive* transient errors so a permanently-unavailable
+    # sidecar surfaces the original error instead of hanging forever. This
+    # window comfortably covers placement re-dissemination after a restart;
+    # a slow-but-healthy workflow never enters this path (it just blocks in
+    # the long-poll), so its indefinite wait is preserved.
+    _MAX_TRANSIENT_RETRY_SECONDS = 30.0
+
     def _call_with_transient_retry(self, instance_id, timeout, call_fn):
         """Run a gRPC wait call, retrying transient errors until the user
         timeout deadline. Re-raises non-transient errors immediately.
-        timeout in (0, None) means unbounded; we still retry transients with
-        backoff.
-
-        The first call passes ``timeout`` verbatim to ``call_fn`` so callers
-        observe identical behavior to a non-retrying client when no transient
-        occurs (preserves prior public behavior). On a retry, both the sleep
+        timeout in (0, None) means unbounded; transients are still retried,
+        but only for up to ``_MAX_TRANSIENT_RETRY_SECONDS`` of continuous
+        failures, after which the original transient error propagates.
+
+        The first call passes ``grpc_timeout`` (the caller's timeout, or
+        ``None`` when unbounded) to ``call_fn`` so callers observe identical
+        behavior to a non-retrying client when no transient occurs (preserves
+        prior public behavior). On a retry, both the sleep
         and the per-call gRPC deadline are clamped to the remaining budget so
         the helper never sleeps past ``timeout`` or starts a gRPC call with
         no time left.
@@ -303,6 +313,7 @@ def _call_with_transient_retry(self, instance_id, timeout, call_fn):
         deadline = None if unbounded else time.monotonic() + timeout
         grpc_timeout = None if unbounded else timeout
         backoff = 0.5
+        transient_deadline = None  # unbounded mode only; anchored on first transient
         while True:
             try:
                 return call_fn(grpc_timeout)
@@ -313,18 +324,31 @@ def _call_with_transient_retry(self, instance_id, timeout, call_fn):
                 if code not in self._TRANSIENT_RPC_CODES:
                     raise
 
+                now = time.monotonic()
+
+                # In unbounded mode the user budget can't end the loop, so cap
+                # continuous transient retries and re-raise the original error
+                # (matching pre-retry behavior) once the grace window elapses.
+                if unbounded:
+                    if transient_deadline is None:
+                        transient_deadline = now + self._MAX_TRANSIENT_RETRY_SECONDS
+                    elif now >= transient_deadline:
+                        raise
+
                 # Compute remaining budget once and reuse so the sleep and the
                 # next per-call grpc_timeout agree on "how much time is left".
                 if deadline is None:
                     remaining = None
                 else:
-                    remaining = deadline - time.monotonic()
+                    remaining = deadline - now
                     if remaining <= 0:
                         raise _TransientTimeout()
 
                 sleep_for = min(backoff, 5.0)
                 if remaining is not None:
                     sleep_for = min(sleep_for, remaining)
+                if transient_deadline is not None:
+                    sleep_for = min(sleep_for, transient_deadline - now)
                 self._logger.warning(
                     f"Transient gRPC error {code.name} waiting on instance '{instance_id}'; "
                     f'retrying in {sleep_for:.2f}s'
diff --git a/ext/dapr-ext-workflow/tests/durabletask/test_orchestration_wait.py b/ext/dapr-ext-workflow/tests/durabletask/test_orchestration_wait.py
@@ -156,3 +156,33 @@ def test_wait_for_orchestration_start_non_transient_propagates(monkeypatch):
     with pytest.raises(grpc.RpcError):
         c.wait_for_orchestration_start(instance_id, timeout=10)
     assert c._stub.WaitForInstanceStart.call_count == 1
+
+
+def test_wait_for_orchestration_start_unbounded_transient_gives_up_with_rpc_error(monkeypatch):
+    """With timeout=0 (unbounded), persistent transient errors are retried only
+    for the grace window, then the original RpcError propagates — NOT a hang and
+    NOT a TimeoutError, preserving the pre-retry contract that timeout=0 surfaces
+    the gRPC error rather than TimeoutError."""
+    instance_id = 'test-instance'
+
+    # Advance the fake clock by 20s on every monotonic() call so the 30s
+    # _MAX_TRANSIENT_RETRY_SECONDS grace window is exhausted within a couple of retries.
+    fake_time = [0.0]
+
+    def fake_monotonic():
+        fake_time[0] += 20.0  # 20, 40, 60, ... — anchors at 20, deadline 50
+        return fake_time[0]
+
+    monkeypatch.setattr('dapr.ext.workflow._durabletask.client.time.monotonic', fake_monotonic)
+    monkeypatch.setattr('dapr.ext.workflow._durabletask.client.time.sleep', lambda s: None)
+
+    c = TaskHubGrpcClient()
+    c._stub = Mock()
+    c._stub.WaitForInstanceStart.side_effect = _make_rpc_error(grpc.StatusCode.UNAVAILABLE)
+
+    with pytest.raises(grpc.RpcError) as exc_info:
+        c.wait_for_orchestration_start(instance_id, timeout=0)
+    assert not isinstance(exc_info.value, TimeoutError)
+    # Retried at least once before giving up (proves it didn't fail-fast like the
+    # non-transient path, and didn't loop forever).
+    assert c._stub.WaitForInstanceStart.call_count >= 2