Skip to content

Commit d943d16

Browse files
committed
Retry
stack-info: PR: #2363, branch: GarrettBeatty/stack/3 Retry stack-info: PR: #2363, branch: GarrettBeatty/stack/3
1 parent 9e5113d commit d943d16

19 files changed

Lines changed: 1165 additions & 86 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
namespace Amazon.Lambda.DurableExecution;
2+
3+
/// <summary>
4+
/// Determines whether a failed step should be retried and with what delay.
5+
/// </summary>
6+
public interface IRetryStrategy
7+
{
8+
/// <summary>
9+
/// Evaluates whether the given exception warrants a retry.
10+
/// </summary>
11+
/// <param name="exception">The exception that caused the step to fail.</param>
12+
/// <param name="attemptNumber">The 1-based attempt number that just failed.</param>
13+
/// <returns>A decision indicating whether to retry and the delay before the next attempt.</returns>
14+
RetryDecision ShouldRetry(Exception exception, int attemptNumber);
15+
}
16+
17+
/// <summary>
18+
/// The outcome of a retry evaluation.
19+
/// </summary>
20+
public readonly struct RetryDecision
21+
{
22+
/// <summary>Whether the step should be retried.</summary>
23+
public bool ShouldRetry { get; }
24+
25+
/// <summary>The delay before the next retry attempt.</summary>
26+
public TimeSpan Delay { get; }
27+
28+
private RetryDecision(bool shouldRetry, TimeSpan delay)
29+
{
30+
ShouldRetry = shouldRetry;
31+
Delay = delay;
32+
}
33+
34+
/// <summary>Indicates the step should not be retried.</summary>
35+
public static RetryDecision DoNotRetry() => new(false, TimeSpan.Zero);
36+
37+
/// <summary>Indicates the step should be retried after the specified delay.</summary>
38+
public static RetryDecision RetryAfter(TimeSpan delay) => new(true, delay);
39+
}
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
using System.Text.RegularExpressions;
2+
3+
namespace Amazon.Lambda.DurableExecution;
4+
5+
/// <summary>
6+
/// Jitter strategy for exponential backoff to prevent thundering-herd scenarios.
7+
/// </summary>
8+
public enum JitterStrategy
9+
{
10+
/// <summary>No randomization — delay is exactly the calculated backoff value.</summary>
11+
None,
12+
/// <summary>Random delay between 0 and the calculated backoff value (recommended).</summary>
13+
Full,
14+
/// <summary>Random delay between 50% and 100% of the calculated backoff value.</summary>
15+
Half
16+
}
17+
18+
/// <summary>
19+
/// Controls whether a step re-executes if the Lambda is re-invoked mid-attempt.
20+
/// </summary>
21+
public enum StepSemantics
22+
{
23+
/// <summary>
24+
/// Default. The step may re-execute if the Lambda is re-invoked during execution.
25+
/// Use for idempotent operations.
26+
/// </summary>
27+
AtLeastOncePerRetry,
28+
29+
/// <summary>
30+
/// The step executes at most once per retry attempt. A START checkpoint is written
31+
/// before execution; on replay with an existing START, the SDK skips re-execution
32+
/// and proceeds to the retry handler.
33+
/// </summary>
34+
AtMostOncePerRetry
35+
}
36+
37+
/// <summary>
38+
/// Factory methods for common retry strategies.
39+
/// </summary>
40+
public static class RetryStrategy
41+
{
42+
/// <summary>6 attempts, 2x backoff, 5s initial delay, 60s max, Full jitter.</summary>
43+
public static IRetryStrategy Default { get; } = Exponential(
44+
maxAttempts: 6,
45+
initialDelay: TimeSpan.FromSeconds(5),
46+
maxDelay: TimeSpan.FromSeconds(60),
47+
backoffRate: 2.0,
48+
jitter: JitterStrategy.Full);
49+
50+
/// <summary>3 attempts, 2x backoff, 1s initial delay, 5s max, Half jitter.</summary>
51+
public static IRetryStrategy Transient { get; } = Exponential(
52+
maxAttempts: 3,
53+
initialDelay: TimeSpan.FromSeconds(1),
54+
maxDelay: TimeSpan.FromSeconds(5),
55+
backoffRate: 2.0,
56+
jitter: JitterStrategy.Half);
57+
58+
/// <summary>No retry — 1 attempt only.</summary>
59+
public static IRetryStrategy None { get; } = Exponential(maxAttempts: 1);
60+
61+
/// <summary>
62+
/// Creates an exponential backoff retry strategy.
63+
/// </summary>
64+
public static IRetryStrategy Exponential(
65+
int maxAttempts = 3,
66+
TimeSpan? initialDelay = null,
67+
TimeSpan? maxDelay = null,
68+
double backoffRate = 2.0,
69+
JitterStrategy jitter = JitterStrategy.Full,
70+
Type[]? retryableExceptions = null,
71+
string[]? retryableMessagePatterns = null)
72+
{
73+
return new ExponentialRetryStrategy(
74+
maxAttempts,
75+
initialDelay ?? TimeSpan.FromSeconds(5),
76+
maxDelay ?? TimeSpan.FromSeconds(300),
77+
backoffRate,
78+
jitter,
79+
retryableExceptions,
80+
retryableMessagePatterns);
81+
}
82+
83+
/// <summary>
84+
/// Creates a retry strategy from a delegate.
85+
/// </summary>
86+
public static IRetryStrategy FromDelegate(Func<Exception, int, RetryDecision> strategy)
87+
=> new DelegateRetryStrategy(strategy);
88+
}
89+
90+
internal sealed class ExponentialRetryStrategy : IRetryStrategy
91+
{
92+
private readonly int _maxAttempts;
93+
private readonly TimeSpan _initialDelay;
94+
private readonly TimeSpan _maxDelay;
95+
private readonly double _backoffRate;
96+
private readonly JitterStrategy _jitter;
97+
private readonly Type[]? _retryableExceptions;
98+
private readonly Regex[]? _retryableMessagePatterns;
99+
100+
[ThreadStatic]
101+
private static Random? t_random;
102+
private static Random Random => t_random ??= new Random();
103+
104+
public ExponentialRetryStrategy(
105+
int maxAttempts,
106+
TimeSpan initialDelay,
107+
TimeSpan maxDelay,
108+
double backoffRate,
109+
JitterStrategy jitter,
110+
Type[]? retryableExceptions,
111+
string[]? retryableMessagePatterns)
112+
{
113+
_maxAttempts = maxAttempts;
114+
_initialDelay = initialDelay;
115+
_maxDelay = maxDelay;
116+
_backoffRate = backoffRate;
117+
_jitter = jitter;
118+
_retryableExceptions = retryableExceptions;
119+
_retryableMessagePatterns = retryableMessagePatterns?
120+
.Select(p => new Regex(p, RegexOptions.Compiled))
121+
.ToArray();
122+
}
123+
124+
public RetryDecision ShouldRetry(Exception exception, int attemptNumber)
125+
{
126+
if (attemptNumber >= _maxAttempts)
127+
return RetryDecision.DoNotRetry();
128+
129+
if (!IsRetryable(exception))
130+
return RetryDecision.DoNotRetry();
131+
132+
var delay = CalculateDelay(attemptNumber);
133+
return RetryDecision.RetryAfter(delay);
134+
}
135+
136+
private bool IsRetryable(Exception exception)
137+
{
138+
if (_retryableExceptions == null && _retryableMessagePatterns == null)
139+
return true;
140+
141+
if (_retryableExceptions != null)
142+
{
143+
var exType = exception.GetType();
144+
if (_retryableExceptions.Any(t => t.IsAssignableFrom(exType)))
145+
return true;
146+
}
147+
148+
if (_retryableMessagePatterns != null)
149+
{
150+
var message = exception.Message;
151+
if (_retryableMessagePatterns.Any(p => p.IsMatch(message)))
152+
return true;
153+
}
154+
155+
return false;
156+
}
157+
158+
internal TimeSpan CalculateDelay(int attemptNumber)
159+
{
160+
var baseDelay = _initialDelay.TotalSeconds * Math.Pow(_backoffRate, attemptNumber - 1);
161+
var cappedDelay = Math.Min(baseDelay, _maxDelay.TotalSeconds);
162+
163+
var finalDelay = _jitter switch
164+
{
165+
JitterStrategy.Full => Random.NextDouble() * cappedDelay,
166+
JitterStrategy.Half => cappedDelay * (0.5 + 0.5 * Random.NextDouble()),
167+
_ => cappedDelay
168+
};
169+
170+
return TimeSpan.FromSeconds(Math.Max(1, Math.Ceiling(finalDelay)));
171+
}
172+
}
173+
174+
internal sealed class DelegateRetryStrategy : IRetryStrategy
175+
{
176+
private readonly Func<Exception, int, RetryDecision> _strategy;
177+
178+
public DelegateRetryStrategy(Func<Exception, int, RetryDecision> strategy)
179+
{
180+
_strategy = strategy;
181+
}
182+
183+
public RetryDecision ShouldRetry(Exception exception, int attemptNumber)
184+
=> _strategy(exception, attemptNumber);
185+
}

Libraries/src/Amazon.Lambda.DurableExecution/Config/StepConfig.cs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,14 @@ namespace Amazon.Lambda.DurableExecution;
55
/// </summary>
66
public sealed class StepConfig
77
{
8-
// TODO: Retry support is deferred to a follow-up PR. When added, this is
9-
// where RetryStrategy and Semantics (AtLeastOncePerRetry / AtMostOncePerRetry)
10-
// will live. The follow-up needs to use service-mediated retries (checkpoint
11-
// a RETRY operation + suspend the Lambda) rather than an in-process Task.Delay
12-
// loop, to avoid billing Lambda compute time during retry backoff.
8+
/// <summary>
9+
/// Retry strategy for failed steps. When null (default), failures are not retried.
10+
/// </summary>
11+
public IRetryStrategy? RetryStrategy { get; set; }
12+
13+
/// <summary>
14+
/// Controls whether a step may re-execute if the Lambda is re-invoked mid-attempt.
15+
/// Default is <see cref="StepSemantics.AtLeastOncePerRetry"/>.
16+
/// </summary>
17+
public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry;
1318
}

Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ namespace Amazon.Lambda.DurableExecution.Internal;
1111
/// call awaits the flush of its containing batch (sync semantics).
1212
/// </summary>
1313
/// <remarks>
14-
/// TODO: when Map / Parallel / ChildContext / WaitForCondition land — or when
15-
/// AtLeastOncePerRetry step START gets a non-blocking variant — they will need
16-
/// a fire-and-forget overload like
17-
/// <c>Task EnqueueAsync(SdkOperationUpdate update, bool sync)</c> where
18-
/// <c>sync=false</c> returns as soon as the item is queued. Java's
19-
/// <c>sendOperationUpdate</c> vs <c>sendOperationUpdateAsync</c> is the model.
20-
/// Today every call site is sync, so the API stays minimal.
14+
/// Fire-and-forget semantics are achieved by simply not awaiting the returned
15+
/// Task — matching Java/Python/JS SDKs which use the same one-method pattern.
16+
/// Errors still surface deterministically via <c>_terminalError</c>: the next
17+
/// sync <see cref="EnqueueAsync"/> or <see cref="DrainAsync"/> rethrows.
18+
/// Callers using fire-and-forget should observe the discarded Task's exception
19+
/// (see <c>StepOperation.FireAndForget</c>) so it doesn't trip the runtime's
20+
/// <c>UnobservedTaskException</c> event.
2121
/// </remarks>
2222
internal sealed class CheckpointBatcher : IAsyncDisposable
2323
{

0 commit comments

Comments
 (0)