Skip to content

Commit 0efad1e

Browse files
authored
refactor: reconcile retry defaults and backoff across the two retry steps (#97)
PR: #97
1 parent 7b0138b commit 0efad1e

8 files changed

Lines changed: 453 additions & 128 deletions

File tree

docs/pipelines.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -389,13 +389,21 @@ and carries:
389389
| `maxDelay` | `8s` | Cap on the scaled delay |
390390
| `maxAttempts` | `3` | Total attempts including the first send; `1` disables retries |
391391
| `jitter` | `0.2` | Symmetric jitter fraction in `[0.0, 1.0]` |
392-
| `retryableStatuses` | `{429, 500, 502, 503, 504}` | Status codes that trigger a retry on an `HttpException` |
392+
| `retryableStatuses` | `{408, 429, 500, 502, 503, 504}` | Status codes that trigger a retry on an `HttpException` |
393393
| `retryableMethods` | `{GET, HEAD, OPTIONS, PUT, DELETE}` | Methods retryable by RFC 9110; others need a replayable body |
394394
| `scheduler` | `null` | Optional caller scheduler; `null` uses a daemon scheduler |
395395

396-
`408` (Request Timeout) is intentionally excluded from the default `retryableStatuses` — a
397-
server-side 408 usually means the client was slow to send and is unlikely to improve on retry.
398-
Callers that disagree can opt in via the builder.
396+
These are the SDK's canonical retry defaults: the stage-based `DefaultRetryStep` (and its
397+
`HttpRetryOptions`) share the same base delay (`200ms`), max delay (`8s`), multiplier (`2.0`),
398+
jitter (`0.2`), retryable-status policy, and total send budget (3 attempts). Both stacks compute
399+
their exponential schedule through the one `BackoffCalculator`, so the two cannot drift apart;
400+
the only intentional difference is that the stage-based step has no `totalTimeout` deadline.
401+
`HttpRetryOptions` counts *retries* (`maxRetries`, default `2`) while `RetrySettings` counts
402+
*total attempts* (`maxAttempts`, default `3`) — both default to the same 3 sends.
403+
404+
`408` (Request Timeout) **is** retryable by default, matching
405+
`RetryUtils.isRetryable`/`HttpException.retryable` and the stage-based step. Callers wanting a
406+
stricter posture can pass a tighter `retryableStatuses` set to the builder.
399407

400408
---
401409

sdk-core/api/sdk-core.api

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -793,7 +793,6 @@ public class org/dexpace/sdk/core/http/pipeline/steps/DefaultRedirectStep : org/
793793
public class org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep : org/dexpace/sdk/core/http/pipeline/steps/RetryStep {
794794
public static final field Companion Lorg/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep$Companion;
795795
public static final field DEFAULT_MAX_RETRIES I
796-
public static final field MAX_SHIFT_TRY_COUNT I
797796
public fun <init> ()V
798797
public fun <init> (Lorg/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions;)V
799798
public fun <init> (Lorg/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions;Lorg/dexpace/sdk/core/util/Clock;)V
@@ -2163,7 +2162,10 @@ public final class org/dexpace/sdk/core/pipeline/step/retry/RetryAfterParser {
21632162

21642163
public final class org/dexpace/sdk/core/pipeline/step/retry/RetrySettings {
21652164
public static final field Companion Lorg/dexpace/sdk/core/pipeline/step/retry/RetrySettings$Companion;
2165+
public static final field DEFAULT_DELAY_MULTIPLIER D
21662166
public static final field DEFAULT_INITIAL_DELAY Ljava/time/Duration;
2167+
public static final field DEFAULT_JITTER D
2168+
public static final field DEFAULT_MAX_ATTEMPTS I
21672169
public static final field DEFAULT_MAX_DELAY Ljava/time/Duration;
21682170
public static final field DEFAULT_RETRYABLE_METHODS Ljava/util/Set;
21692171
public static final field DEFAULT_RETRYABLE_STATUSES Ljava/util/Set;

sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep.kt

Lines changed: 53 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@ import org.dexpace.sdk.core.http.request.Method
1313
import org.dexpace.sdk.core.http.request.Request
1414
import org.dexpace.sdk.core.http.response.Response
1515
import org.dexpace.sdk.core.instrumentation.ClientLogger
16+
import org.dexpace.sdk.core.pipeline.step.retry.BackoffCalculator
1617
import org.dexpace.sdk.core.pipeline.step.retry.RetryAfterParser
18+
import org.dexpace.sdk.core.pipeline.step.retry.RetrySettings
1719
import org.dexpace.sdk.core.util.Clock
1820
import java.io.IOException
1921
import java.io.InterruptedIOException
2022
import java.time.Duration
21-
import java.util.concurrent.ThreadLocalRandom
2223

2324
/**
2425
* Default [RetryStep]. Drives an iterative retry loop with classified failure detection,
@@ -40,9 +41,8 @@ import java.util.concurrent.ThreadLocalRandom
4041
* 4. Sleeps via [Clock.sleep]; an interrupt during sleep throws [InterruptedIOException]
4142
* with the original [InterruptedException] and any accumulated prior failures attached
4243
* as suppressed — retries are NOT resumed after an interrupt.
43-
* 5. Caps `tryCount` at [MAX_SHIFT_TRY_COUNT] before computing `1L shl tryCount` so the
44-
* left-shift can never overflow (the resulting delay is clamped to [HttpRetryOptions.maxDelay]
45-
* anyway, so the cap is invisible to callers).
44+
* 5. Computes the exponential delay through the shared [BackoffCalculator], which saturates
45+
* rather than overflows on extreme attempt counts and clamps to [HttpRetryOptions.maxDelay].
4646
*
4747
* ## Body replayability
4848
*
@@ -69,8 +69,12 @@ import java.util.concurrent.ThreadLocalRandom
6969
* parsed from the response (response path only). A negative or unparseable value
7070
* falls through; a value of zero produces an immediate retry.
7171
* 3. [HttpRetryOptions.fixedDelay] — if set, every retry waits exactly this duration.
72-
* 4. Exponential backoff: `baseDelay * (1L shl tryCount)` clamped to `maxDelay`, with
73-
* ±5% random jitter via [ThreadLocalRandom].
72+
* 4. Exponential backoff computed by the shared [BackoffCalculator]:
73+
* `baseDelay * 2.0^tryCount` clamped to `maxDelay`, with symmetric ±10% jitter
74+
* ([RetrySettings.DEFAULT_JITTER]). This is the same calculator the recovery-aware
75+
* `pipeline.step.retry.RetryStep` uses, so both stacks share one backoff formula and one
76+
* set of defaults. The deadline-shrinking that the calculator also offers is disabled here
77+
* (this stage-based step carries no total-timeout budget).
7478
*
7579
* ## Failure handling
7680
*
@@ -146,6 +150,33 @@ public open class DefaultRetryStep
146150
*/
147151
private val options: HttpRetryOptions = clampOptions(options)
148152

153+
/**
154+
* The [options]' exponential parameters expressed as a [RetrySettings] view so the shared
155+
* [BackoffCalculator] can compute this stack's schedule. Built once per step instance:
156+
* - `initialDelay` / `maxDelay` come from the options.
157+
* - `delayMultiplier` (2.0) and `jitter` (0.2) are the canonical shared constants — the
158+
* options object does not expose its own multiplier/jitter, so the SDK defaults apply.
159+
* If [HttpRetryOptions] ever gains configurable multiplier/jitter knobs, this view must
160+
* read them from the options instead of the constants, or the new knobs are silently
161+
* ignored on this stack.
162+
* - `totalTimeout = ZERO` disables the deadline cap: the stage-based step has no budget.
163+
* The `fixedDelay` path never consults this view; it short-circuits in [backoffOrFixed].
164+
*
165+
* Building this view also validates the delay magnitudes eagerly: [RetrySettings.builder]
166+
* rejects a negative `baseDelay`/`maxDelay` and one larger than the calculator's
167+
* ~292-year nanosecond ceiling. [HttpRetryOptions] performs no such range check, so a
168+
* pathological delay surfaces as an [IllegalArgumentException] here, at step construction,
169+
* rather than later at delay-computation time.
170+
*/
171+
private val backoffSettings: RetrySettings =
172+
RetrySettings.builder()
173+
.initialDelay(this.options.baseDelay)
174+
.maxDelay(this.options.maxDelay)
175+
.delayMultiplier(RetrySettings.DEFAULT_DELAY_MULTIPLIER)
176+
.jitter(RetrySettings.DEFAULT_JITTER)
177+
.totalTimeout(Duration.ZERO)
178+
.build()
179+
149180
/**
150181
* Sends [request] through the downstream pipeline with automatic retry on retryable failures.
151182
*
@@ -437,51 +468,16 @@ public open class DefaultRetryStep
437468
}
438469

439470
/**
440-
* Returns [HttpRetryOptions.fixedDelay] if set, otherwise the exponential-backoff
441-
* delay for [tryCount]. The shift count is capped at [MAX_SHIFT_TRY_COUNT] so the
442-
* `1L shl tryCount` term never overflows; the result is always clamped to [HttpRetryOptions.maxDelay]
443-
* anyway, so the cap is invisible in practice.
444-
*/
445-
private fun backoffOrFixed(tryCount: Int): Duration = options.fixedDelay ?: exponentialBackoff(tryCount)
446-
447-
/**
448-
* `baseDelay * (1L shl tryCount)` clamped to `maxDelay`, plus a ±5% jitter sampled
449-
* from [ThreadLocalRandom]. Pure function of [tryCount] and the configured options.
450-
*/
451-
private fun exponentialBackoff(tryCount: Int): Duration {
452-
val baseNanos = options.baseDelay.toNanos()
453-
if (baseNanos == 0L) return Duration.ZERO
454-
val maxNanos = options.maxDelay.toNanos()
455-
val safeShift = tryCount.coerceAtMost(MAX_SHIFT_TRY_COUNT)
456-
// 1L shl 30 ~= 1e9 — multiplying by 800ms (8e8 ns) overflows. Cap on the long
457-
// multiply itself: if `baseNanos * (1L shl safeShift)` would overflow, clamp.
458-
val multiplier = 1L shl safeShift
459-
val scaled =
460-
if (baseNanos > 0 && multiplier > Long.MAX_VALUE / baseNanos) {
461-
Long.MAX_VALUE
462-
} else {
463-
baseNanos * multiplier
464-
}
465-
val clamped = scaled.coerceAtMost(maxNanos)
466-
val jittered = applyJitter(clamped)
467-
// Guarantee a non-negative result — jitter could push us under zero if the caller
468-
// configured pathological options (e.g. baseDelay equal to negative epsilon).
469-
return Duration.ofNanos(jittered.coerceAtLeast(0L))
470-
}
471-
472-
/**
473-
* Applies a ±5% jitter to [nanos]. Sample is drawn from [ThreadLocalRandom] which is
474-
* per-thread, so there is no cross-thread contention on the retry hot path.
471+
* Returns [HttpRetryOptions.fixedDelay] if set, otherwise the exponential-backoff delay
472+
* for [tryCount]. The backoff is computed by the shared [BackoffCalculator] from
473+
* [backoffSettings] so this stack and the recovery-aware `RetryStep` share one formula.
474+
*
475+
* [tryCount] is 0-indexed here (`0` = the delay before the first retry), whereas
476+
* [BackoffCalculator.computeDelay] is 1-indexed (`1` = first retry); the `+ 1` bridges
477+
* the two so both produce `baseDelay`, `2·baseDelay`, `4·baseDelay`, … capped at `maxDelay`.
475478
*/
476-
private fun applyJitter(nanos: Long): Long {
477-
if (nanos == 0L) return 0L
478-
// 5% of nanos, used as the magnitude bound on the random sample.
479-
val jitterMagnitude = nanos / JITTER_DIVISOR
480-
if (jitterMagnitude == 0L) return nanos
481-
// ThreadLocalRandom.nextLong(origin, bound) is inclusive of origin, exclusive of bound.
482-
val offset = ThreadLocalRandom.current().nextLong(-jitterMagnitude, jitterMagnitude + 1L)
483-
return nanos + offset
484-
}
479+
private fun backoffOrFixed(tryCount: Int): Duration =
480+
options.fixedDelay ?: BackoffCalculator.computeDelay(tryCount + 1, backoffSettings)
485481

486482
// --------------- Retry-After parsing ---------------
487483

@@ -581,22 +577,14 @@ public open class DefaultRetryStep
581577

582578
public companion object {
583579
/**
584-
* Default [HttpRetryOptions.maxRetries] applied when the caller passes a negative
585-
* value. Matches Azure Core's `RetryOptions` default.
580+
* Default retry count applied when the caller passes a negative
581+
* [HttpRetryOptions.maxRetries], and the value baked into the no-arg
582+
* [HttpRetryOptions] default. `2` retries on top of the initial send is the SDK's
583+
* canonical budget — `initial + DEFAULT_MAX_RETRIES == 3`, matching
584+
* [RetrySettings.DEFAULT_MAX_ATTEMPTS] so both retry stacks default to the same
585+
* number of total sends.
586586
*/
587-
public const val DEFAULT_MAX_RETRIES: Int = 3
588-
589-
/**
590-
* Upper bound on `tryCount` used for the `1L shl tryCount` term in
591-
* [DefaultRetryStep.exponentialBackoff]. `1L shl 30` ~= 1.07e9 — the scaled delay is
592-
* always clamped to [HttpRetryOptions.maxDelay] long before this bound is hit, so the
593-
* cap is a paranoid guard against integer overflow rather than a behavior knob.
594-
*/
595-
public const val MAX_SHIFT_TRY_COUNT: Int = 30
596-
597-
// Jitter is ±5% of the computed delay; expressed as the divisor (nanos / 20) to
598-
// avoid an extra multiplication on the hot path. See [applyJitter].
599-
private const val JITTER_DIVISOR = 20L
587+
public const val DEFAULT_MAX_RETRIES: Int = 2
600588

601589
// Nanoseconds in one millisecond — used to convert monotonic-clock deltas to ms
602590
// for retry log events.

sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions.kt

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package org.dexpace.sdk.core.http.pipeline.steps
99

1010
import org.dexpace.sdk.core.http.common.HttpHeaderName
11+
import org.dexpace.sdk.core.pipeline.step.retry.RetrySettings
1112
import org.dexpace.sdk.core.util.RetryUtils
1213
import java.time.Duration
1314
import java.util.Collections
@@ -40,12 +41,15 @@ public fun interface HttpRetryDelayProvider {
4041
}
4142

4243
/**
43-
* Configuration for [DefaultRetryStep]. Defaults mirror Azure Core's retry policy:
44-
* - [maxRetries] = 3
45-
* - [baseDelay] = 800ms (exponentially scaled per attempt)
46-
* - [maxDelay] = 8s (cap on the scaled delay)
44+
* Configuration for [DefaultRetryStep]. The numeric defaults are the SDK's canonical retry
45+
* defaults, shared with the recovery-aware [RetrySettings] so both retry stacks compute the
46+
* same backoff schedule (via [org.dexpace.sdk.core.pipeline.step.retry.BackoffCalculator]):
47+
* - [maxRetries] = 2 (initial send + 2 retries = 3 total attempts, matching
48+
* [RetrySettings.DEFAULT_MAX_ATTEMPTS]).
49+
* - [baseDelay] = 200ms (= [RetrySettings.DEFAULT_INITIAL_DELAY]; exponentially scaled per attempt).
50+
* - [maxDelay] = 8s (= [RetrySettings.DEFAULT_MAX_DELAY]; cap on the scaled delay).
4751
* - [fixedDelay] = null (exponential backoff is used; when non-null it overrides the
48-
* backoff entirely and every retry waits exactly [fixedDelay])
52+
* backoff entirely and every retry waits exactly [fixedDelay]).
4953
* - [retryAfterHeaders] = `Retry-After`, `retry-after-ms`, `x-ms-retry-after-ms` —
5054
* parsed in declared order; the first present wins. Drop the Microsoft-specific
5155
* variants by passing a tighter list for stricter posture.
@@ -54,15 +58,20 @@ public fun interface HttpRetryDelayProvider {
5458
* anywhere in the cause chain, per [RetryUtils.isRetryable].
5559
* - [delayFromCondition] = null delay (falls through to `Retry-After` parsing, then backoff).
5660
*
61+
* The exponential schedule, multiplier (2.0), and symmetric jitter (0.2) are sourced from the
62+
* shared [RetrySettings] constants and applied by `BackoffCalculator`, so this stack and the
63+
* recovery-aware stack cannot drift apart. The one intentional difference is the deadline: this
64+
* stage-based step has no total-timeout budget, so it never shrinks a delay against one.
65+
*
5766
* The companion [HttpRetryOptions.fixed] factory builds an options instance whose delay
5867
* never grows — useful for test injection or high-throughput retry against flaky endpoints.
5968
*/
6069
public class HttpRetryOptions
6170
@JvmOverloads
6271
constructor(
63-
public val maxRetries: Int = 3,
64-
public val baseDelay: Duration = Duration.ofMillis(DEFAULT_BASE_DELAY_MS),
65-
public val maxDelay: Duration = Duration.ofSeconds(DEFAULT_MAX_DELAY_SECONDS),
72+
public val maxRetries: Int = DEFAULT_MAX_RETRIES,
73+
public val baseDelay: Duration = RetrySettings.DEFAULT_INITIAL_DELAY,
74+
public val maxDelay: Duration = RetrySettings.DEFAULT_MAX_DELAY,
6675
public val fixedDelay: Duration? = null,
6776
public val retryAfterHeaders: List<HttpHeaderName> = DEFAULT_RETRY_AFTER_HEADERS,
6877
public val shouldRetryCondition: HttpRetryConditionPredicate =
@@ -72,10 +81,9 @@ public class HttpRetryOptions
7281
public val delayFromCondition: HttpRetryDelayProvider = HttpRetryDelayProvider { null },
7382
) {
7483
public companion object {
75-
// Default exponential-backoff parameters tuned to favour fast first-retry while
76-
// bounding cumulative latency. Aligned with Azure Core's RetryOptions defaults.
77-
private const val DEFAULT_BASE_DELAY_MS = 800L
78-
private const val DEFAULT_MAX_DELAY_SECONDS = 8L
84+
// The default retry count is the canonical SDK budget, kept in one place on
85+
// DefaultRetryStep (initial send + DEFAULT_MAX_RETRIES == RetrySettings.DEFAULT_MAX_ATTEMPTS).
86+
private const val DEFAULT_MAX_RETRIES = DefaultRetryStep.DEFAULT_MAX_RETRIES
7987

8088
/**
8189
* The three `Retry-After` header forms parsed by [DefaultRetryStep]. Order matters —

0 commit comments

Comments
 (0)