Skip to content

Commit 29fe15d

Browse files
authored
bump the number of retries for r2 cache uploads (#1252)
1 parent 975833d commit 29fe15d

3 files changed

Lines changed: 63 additions & 62 deletions

File tree

.changeset/poor-peaches-jump.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@opennextjs/cloudflare": patch
3+
---
4+
5+
bump the number of retries for r2 cache uploads
6+
7+
Increase the retry count from 5 to 15 with a capped exponential backoff to improve resilience against transient R2 write failures during cache population.

packages/cloudflare/src/cli/commands/populate-cache.spec.ts

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ import { unstable_startWorker } from "wrangler";
1010
import { defineCloudflareConfig } from "../../api/config.js";
1111
import r2IncrementalCache from "../../api/overrides/incremental-cache/r2-incremental-cache.js";
1212
import { ensureR2Bucket } from "../utils/ensure-r2-bucket.js";
13-
import { getCacheAssets, populateCache, PopulateCacheOptions } from "./populate-cache.js";
13+
import {
14+
getCacheAssets,
15+
MAX_REQUEST_RETRIES,
16+
MAX_RETRY_DELAY_MS,
17+
populateCache,
18+
PopulateCacheOptions,
19+
} from "./populate-cache.js";
1420
import { WorkerEnvVar } from "./utils/helpers.js";
1521

1622
describe("getCacheAssets", () => {
@@ -75,6 +81,19 @@ describe("getCacheAssets", () => {
7581
});
7682
});
7783

84+
// Mock `node:timers/promises` so that `setTimeout` delegates to `globalThis.setTimeout`,
85+
// which is correctly intercepted by `vi.useFakeTimers()`. Without this, the destructured
86+
// import in the source code holds a reference to the original native `setTimeout`, making
87+
// `vi.advanceTimersByTimeAsync()` ineffective and causing tests to wait real wall-clock time.
88+
vi.mock("node:timers/promises", async (importOriginal) => {
89+
const original = await importOriginal<typeof import("node:timers/promises")>();
90+
return {
91+
...original,
92+
setTimeout: (ms: number, value?: unknown) =>
93+
new Promise((resolve) => globalThis.setTimeout(() => resolve(value), ms)),
94+
};
95+
});
96+
7897
vi.mock("./utils/run-wrangler.js", () => ({
7998
runWrangler: vi.fn(() => ({ success: true, stdout: "", stderr: "" })),
8099
}));
@@ -432,26 +451,16 @@ describe("populateCache", () => {
432451
envVars
433452
);
434453

435-
await vi.waitFor(() => {
436-
expect(fetchMock).toHaveBeenCalledTimes(1);
437-
});
438-
439-
await vi.advanceTimersByTimeAsync(249);
440-
expect(fetchMock).toHaveBeenCalledTimes(1);
441-
442-
await vi.advanceTimersByTimeAsync(1);
443-
444-
await vi.waitFor(() => {
445-
expect(fetchMock).toHaveBeenCalledTimes(2);
446-
});
447-
448-
await vi.advanceTimersByTimeAsync(500 + 1000 + 2000);
449-
450-
await expect(result).rejects.toThrow(
451-
/Failed to populate the local R2 cache: Failed to write "incremental-cache\/buildID\/[A-Za-z0-9]+.cache" to R2 after 5 attempts/
454+
const resultExpectation = expect(result).rejects.toThrow(
455+
new RegExp(
456+
`Failed to populate the local R2 cache: Failed to write "incremental-cache\\/buildID\\/[A-Za-z0-9]+\\.cache" to R2 after ${MAX_REQUEST_RETRIES} attempts`
457+
)
452458
);
453459

454-
expect(fetchMock).toHaveBeenCalledTimes(5);
460+
await vi.advanceTimersByTimeAsync(MAX_REQUEST_RETRIES * MAX_RETRY_DELAY_MS);
461+
await resultExpectation;
462+
463+
expect(fetchMock).toHaveBeenCalledTimes(MAX_REQUEST_RETRIES);
455464
expect(mockWorkerDispose).toHaveBeenCalled();
456465
});
457466
});

packages/cloudflare/src/cli/commands/populate-cache.ts

Lines changed: 28 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ import {
5656
withWranglerPassthroughArgs,
5757
} from "./utils/utils.js";
5858

59+
// Maximum number of attempts to send the request
60+
export const MAX_REQUEST_RETRIES = 15;
61+
// Base delay for retries
62+
export const BASE_RETRY_DELAY_MS = 250;
63+
// Maximum delay for retries, used to calculate the backoff factor
64+
export const MAX_RETRY_DELAY_MS = 10_000;
65+
// Backoff factor for retries, calculated to ensure that the delay grows exponentially up to the maximum delay
66+
export const BACKOFF_FACTOR = (MAX_RETRY_DELAY_MS / BASE_RETRY_DELAY_MS) ** (1 / (MAX_REQUEST_RETRIES - 1));
67+
5968
/**
6069
* Implementation of the `opennextjs-cloudflare populateCache` command.
6170
*
@@ -324,14 +333,11 @@ async function populateR2IncrementalCache(
324333
/**
325334
* Sends cache entries to the R2 worker, one entry per request.
326335
*
327-
* Up to `concurrency` requests are in-flight at any given time.
328-
* Retry logic for transient R2 write failures is handled by the worker.
329-
*
330336
* @param options
331337
* @param options.workerUrl - The URL of the local R2 worker's `/populate` endpoint.
332338
* @param options.assets - The cache assets to write, as collected by {@link getCacheAssets}.
333339
* @param options.prefix - Optional prefix prepended to each R2 key.
334-
* @param options.concurrency - Maximum number of concurrent in-flight requests.
340+
* @param options.maxConcurrency - Maximum number of concurrent in-flight requests.
335341
* @returns Resolves when all entries have been written successfully.
336342
* @throws {Error} If any entry fails after all retries or encounters a non-retryable error.
337343
*/
@@ -343,41 +349,26 @@ async function sendEntriesToR2Worker(options: {
343349
}): Promise<void> {
344350
const { workerUrl, assets, prefix, maxConcurrency } = options;
345351

346-
// Build the list of entries to send (key + filename).
347-
// File contents are read lazily in sendEntryToR2Worker to avoid
348-
// loading all cache values into memory at once.
349-
const entries = assets.map(({ fullPath, key, buildId, isFetch }) => ({
350-
key: computeCacheKey(key, {
351-
prefix,
352-
buildId,
353-
cacheType: isFetch ? "fetch" : "cache",
354-
}),
355-
filename: fullPath,
356-
}));
357-
358-
// Use a concurrency-limited loop with a progress bar.
359-
// `pending` tracks in-flight promises so we can cap concurrency.
360352
const pending = new Set<Promise<void>>();
361353

362-
let concurrency = 1;
354+
for (const asset of tqdm(assets)) {
355+
const { fullPath, key, buildId, isFetch } = asset;
363356

364-
for (const entry of tqdm(entries)) {
365357
const task = sendEntryToR2Worker({
366358
workerUrl,
367-
key: entry.key,
368-
filename: entry.filename,
359+
key: computeCacheKey(key, {
360+
prefix,
361+
buildId,
362+
cacheType: isFetch ? "fetch" : "cache",
363+
}),
364+
filename: fullPath,
369365
}).finally(() => pending.delete(task));
370366

371367
pending.add(task);
372368

373369
// If we've reached the concurrency limit, wait for one to finish.
374-
if (pending.size >= concurrency) {
370+
if (pending.size >= maxConcurrency) {
375371
await Promise.race(pending);
376-
// Increase concurrency gradually to avoid overwhelming the worker
377-
// with too many requests at once.
378-
if (concurrency < maxConcurrency) {
379-
concurrency++;
380-
}
381372
}
382373
}
383374

@@ -404,10 +395,7 @@ async function sendEntryToR2Worker(options: {
404395
}): Promise<void> {
405396
const { workerUrl, key, filename } = options;
406397

407-
const CLIENT_RETRY_ATTEMPTS = 5;
408-
const CLIENT_RETRY_BASE_DELAY_MS = 250;
409-
410-
for (let attempt = 0; attempt < CLIENT_RETRY_ATTEMPTS; attempt++) {
398+
for (let attempt = 0; attempt < MAX_REQUEST_RETRIES; attempt++) {
411399
try {
412400
let response: Response;
413401

@@ -426,9 +414,7 @@ async function sendEntryToR2Worker(options: {
426414
} catch (e) {
427415
throw new RetryableWorkerError(
428416
`Failed to send request to R2 worker: ${e instanceof Error ? e.message : String(e)}`,
429-
{
430-
cause: e,
431-
}
417+
{ cause: e }
432418
);
433419
}
434420

@@ -438,6 +424,7 @@ async function sendEntryToR2Worker(options: {
438424
try {
439425
result = JSON.parse(body) as R2Response;
440426
} catch (e) {
427+
// https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-1xxx-errors/error-1102
441428
if (body.includes("Worker exceeded resource limits")) {
442429
throw new RetryableWorkerError("Worker exceeded resource limits", { cause: e });
443430
}
@@ -454,25 +441,23 @@ async function sendEntryToR2Worker(options: {
454441
});
455442
}
456443

457-
if (!result.success && response.status >= 500) {
458-
throw new RetryableWorkerError(result.error);
459-
}
460-
461444
if (!result.success) {
462-
throw new Error(`Failed to write "${key}" to R2: ${result.error}`);
445+
throw response.status >= 500
446+
? new RetryableWorkerError(result.error)
447+
: new Error(`Failed to write "${key}" to R2: ${result.error}`);
463448
}
464449

465450
return;
466451
} catch (e) {
467-
if (e instanceof RetryableWorkerError && attempt < CLIENT_RETRY_ATTEMPTS - 1) {
452+
if (e instanceof RetryableWorkerError && attempt < MAX_REQUEST_RETRIES - 1) {
468453
logger.error(
469454
`Attempt ${attempt + 1} to write "${key}" failed with a retryable error: ${e.message}. Retrying...`
470455
);
471-
await setTimeout(CLIENT_RETRY_BASE_DELAY_MS * Math.pow(2, attempt));
456+
await setTimeout(BASE_RETRY_DELAY_MS * Math.pow(BACKOFF_FACTOR, attempt));
472457
continue;
473458
}
474459

475-
throw new Error(`Failed to write "${key}" to R2 after ${CLIENT_RETRY_ATTEMPTS} attempts`, {
460+
throw new Error(`Failed to write "${key}" to R2 after ${MAX_REQUEST_RETRIES} attempts`, {
476461
cause: e,
477462
});
478463
}

0 commit comments

Comments
 (0)