1010# limitations under the License.
1111
1212import logging
13+ import time
1314import uuid
1415from dataclasses import dataclass
1516from datetime import datetime
2627from dapr .ext .workflow ._durabletask import task
2728from dapr .ext .workflow ._durabletask .internal .grpc_interceptor import DefaultClientInterceptorImpl
2829
30+
class _TransientTimeout(Exception):
    """Internal sentinel raised by the transient-retry loop on timeout.

    The retry helper raises it when gRPC reports ``DEADLINE_EXCEEDED`` or when
    the caller-provided timeout budget runs out while transient errors are
    being retried. It is not part of the public API: the public wait methods
    catch it and raise the built-in ``TimeoutError`` instead.
    """
34+
35+
2936TInput = TypeVar ('TInput' )
3037TOutput = TypeVar ('TOutput' )
3138
@@ -218,32 +225,31 @@ def get_orchestration_state(
218225 return new_orchestration_state (req .instanceId , res )
219226
def wait_for_orchestration_start(
    self, instance_id: str, *, fetch_payloads: bool = False, timeout: Optional[int] = 0
) -> Optional[WorkflowState]:
    """Block until the given workflow instance has started, then return its state.

    Args:
        instance_id: ID of the workflow instance to wait on.
        fetch_payloads: Forwarded as ``getInputsAndOutputs`` on the request.
        timeout: Seconds to wait; ``0`` or ``None`` waits indefinitely.

    Returns:
        The value ``new_orchestration_state`` builds from the gRPC response.

    Raises:
        TimeoutError: The retry helper signalled that the wait timed out.
    """
    request = pb.GetInstanceRequest(instanceId=instance_id, getInputsAndOutputs=fetch_payloads)
    wait_desc = 'indefinitely' if timeout in (0, None) else f'up to {timeout}s'
    self._logger.info(f"Waiting {wait_desc} for instance '{instance_id}' to start.")

    def _poll_once(grpc_timeout):
        # One long-poll attempt; the retry helper picks the per-call deadline.
        response: pb.GetInstanceResponse = self._stub.WaitForInstanceStart(
            request, timeout=grpc_timeout
        )
        return new_orchestration_state(request.instanceId, response)

    try:
        return self._call_with_transient_retry(instance_id, timeout, _poll_once)
    except _TransientTimeout:
        # Swap the internal sentinel for the public built-in exception.
        raise TimeoutError('Timed-out waiting for the orchestration to start')
237243
238244 def wait_for_orchestration_completion (
239- self , instance_id : str , * , fetch_payloads : bool = True , timeout : int = 0
245+ self , instance_id : str , * , fetch_payloads : bool = True , timeout : Optional [ int ] = 0
240246 ) -> Optional [WorkflowState ]:
241247 req = pb .GetInstanceRequest (instanceId = instance_id , getInputsAndOutputs = fetch_payloads )
242- try :
243- grpc_timeout = None if timeout == 0 else timeout
244- self . _logger . info (
245- f"Waiting { 'indefinitely' if timeout == 0 else f'up to { timeout } s' } for instance ' { instance_id } ' to complete."
246- )
248+ self . _logger . info (
249+ f"Waiting { 'indefinitely' if timeout in ( 0 , None ) else f'up to { timeout } s' } for instance ' { instance_id } ' to complete."
250+ )
251+
252+ def _call ( grpc_timeout ):
247253 res : pb .GetInstanceResponse = self ._stub .WaitForInstanceCompletion (
248254 req , timeout = grpc_timeout
249255 )
@@ -263,14 +269,100 @@ def wait_for_orchestration_completion(
263269 self ._logger .info (f"Instance '{ instance_id } ' was terminated." )
264270 elif state .runtime_status == OrchestrationStatus .COMPLETED :
265271 self ._logger .info (f"Instance '{ instance_id } ' completed." )
266-
267272 return state
268- except grpc .RpcError as rpc_error :
269- if rpc_error .code () == grpc .StatusCode .DEADLINE_EXCEEDED : # type: ignore
270- # Replace gRPC error with the built-in TimeoutError
271- raise TimeoutError ('Timed-out waiting for the orchestration to complete' )
272- else :
273- raise
273+
274+ try :
275+ return self ._call_with_transient_retry (instance_id , timeout , _call )
276+ except _TransientTimeout :
277+ raise TimeoutError ('Timed-out waiting for the orchestration to complete' )
278+
279+ # Transient gRPC codes that indicate the workflow runtime is temporarily
280+ # unable to locate the workflow actor — typically immediately after a Dapr
281+ # sidecar restart (e.g. recovery from chaos). The placement service has the
282+ # actor registration, but local daprd hasn't received the dissemination yet.
283+ # Without retry, every poll fails permanently with FAILED_PRECONDITION even
284+ # though the workflow runtime state is intact.
285+ _TRANSIENT_RPC_CODES = (
286+ grpc .StatusCode .FAILED_PRECONDITION ,
287+ grpc .StatusCode .UNAVAILABLE ,
288+ )
289+
    # When the caller sets no timeout (timeout is 0 or None), bound how long
    # we keep retrying *consecutive* transient errors so a permanently
    # unavailable sidecar surfaces the original error instead of hanging
    # forever. This window comfortably covers placement re-dissemination
    # after a restart; a slow-but-healthy workflow never enters this path
    # (it just blocks in the long-poll), so its indefinite wait is preserved.
296+ _MAX_TRANSIENT_RETRY_SECONDS = 30.0
297+
def _call_with_transient_retry(self, instance_id, timeout, call_fn):
    """Run a gRPC wait call, retrying transient errors within the time budget.

    The first attempt passes the caller's timeout straight through to
    ``call_fn`` (``None`` when unbounded), so a call that never hits a
    transient error behaves exactly like a non-retrying client. On a retry,
    both the sleep and the next per-call gRPC deadline are clamped to the
    remaining budget, so the helper never sleeps past ``timeout`` and never
    starts a gRPC call with no time left.

    With ``timeout`` of ``0`` or ``None`` the wait is unbounded. Transient
    errors are then retried for at most ``_MAX_TRANSIENT_RETRY_SECONDS`` of
    back-to-back failures, after which the latest transient error propagates.
    An attempt that itself ran for at least that long before failing is
    treated as a healthy long-poll that was interrupted, so its failure
    starts a fresh grace window instead of being charged to an earlier,
    already-recovered one.

    Args:
        instance_id: Workflow instance ID; used only in the retry log message.
        timeout: Caller's budget in seconds; ``0`` or ``None`` means unbounded.
        call_fn: Callable taking the per-call gRPC timeout (``None`` when
            unbounded) and returning the call's result.

    Returns:
        Whatever ``call_fn`` returns on its first successful attempt.

    Raises:
        _TransientTimeout: gRPC reported ``DEADLINE_EXCEEDED``, or the
            caller's budget ran out while retrying transient errors.
        grpc.RpcError: A non-transient error (re-raised immediately), or the
            latest transient error once the unbounded-mode grace window has
            elapsed.
    """
    initial_backoff = 0.5
    max_backoff = 5.0

    unbounded = timeout in (0, None)
    deadline = None if unbounded else time.monotonic() + timeout
    grpc_timeout = None if unbounded else timeout
    backoff = initial_backoff
    transient_deadline = None  # unbounded mode only; end of the current grace window
    while True:
        # Only unbounded mode needs the attempt duration, so bounded mode
        # performs no extra clock reads.
        attempt_started = time.monotonic() if unbounded else None
        try:
            return call_fn(grpc_timeout)
        except grpc.RpcError as rpc_error:
            code = rpc_error.code()  # type: ignore
            if code == grpc.StatusCode.DEADLINE_EXCEEDED:
                raise _TransientTimeout()
            if code not in self._TRANSIENT_RPC_CODES:
                raise

            now = time.monotonic()

            # In unbounded mode the user budget cannot end the loop, so cap
            # back-to-back transient retries and re-raise the original error
            # once the grace window elapses.
            if unbounded:
                if now - attempt_started >= self._MAX_TRANSIENT_RETRY_SECONDS:
                    # The attempt outlived a whole grace window before it
                    # failed, so this failure is not a continuation of the
                    # previous streak: begin a new one with a short backoff.
                    transient_deadline = None
                    backoff = initial_backoff
                if transient_deadline is None:
                    transient_deadline = now + self._MAX_TRANSIENT_RETRY_SECONDS
                elif now >= transient_deadline:
                    raise

            # Compute the remaining budget once and reuse it so the sleep and
            # the next per-call grpc_timeout agree on how much time is left.
            if deadline is None:
                remaining = None
            else:
                remaining = deadline - now
                if remaining <= 0:
                    raise _TransientTimeout()

            # backoff never exceeds max_backoff: it starts below the cap and
            # is clamped every time it is doubled.
            sleep_for = backoff
            if remaining is not None:
                sleep_for = min(sleep_for, remaining)
            if transient_deadline is not None:
                sleep_for = min(sleep_for, transient_deadline - now)
            self._logger.warning(
                f"Transient gRPC error {code.name} waiting on instance '{instance_id}'; "
                f'retrying in {sleep_for:.2f}s'
            )
            time.sleep(sleep_for)
            backoff = min(backoff * 2, max_backoff)

            if deadline is None:
                grpc_timeout = None
            else:
                # Re-read the clock: the sleep consumed part of the budget.
                grpc_timeout = deadline - time.monotonic()
                if grpc_timeout <= 0:
                    raise _TransientTimeout()
274366
275367 def raise_orchestration_event (
276368 self , instance_id : str , event_name : str , * , data : Optional [Any ] = None
0 commit comments