Skip to content

Commit 08ca397

Browse files
authored
Add retries for persistence (tldraw#7050)
Add a lengthy retry system with Sentry error reporting to see how many retries we max out at.

### Change type

- [x] `other`

### API changes

Internal changes only.

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> Introduce persistence retry with thresholds, analytics logging, and client notifications; add custom server events and update retry util signature.
>
> - **Sync Worker**:
>   - **Persistence Retries**: Wrap `persistToDatabase` in `retry(...)` with up to `PERSIST_RETRIES_MAX` attempts; log `persist_success` with attempt count.
>   - **Client Notifications**: Broadcast `TLCustomServerEvent` (`persistence_bad` at threshold, `persistence_good` on recovery) via `broadcastPersistenceEvent`.
>   - **Analytics**: Extend `logEvent` to handle `persist_success`; improved error logging on DB update.
>   - **Constants**: Add `PERSIST_RETRIES_NOTIFY_THRESHOLD` and `PERSIST_RETRIES_MAX`.
> - **Client**:
>   - Track custom server events by adding `onCustomMessageReceived` in `useSync` and calling `trackEvent(message.type)`.
> - **Shared/Utils**:
>   - Add `TLCustomServerEvent` type in `@tldraw/dotcom-shared`.
>   - Change `retry` signature to pass `{ attempt, remaining, total }`; update API report accordingly.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 6980329. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>

<!-- /CURSOR_SUMMARY -->
1 parent baaac2b commit 08ca397

6 files changed

Lines changed: 89 additions & 40 deletions

File tree

apps/dotcom/client/src/tla/components/TlaEditor/TlaEditor.tsx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { getLicenseKey } from '@tldraw/dotcom-shared'
1+
import { TLCustomServerEvent, getLicenseKey } from '@tldraw/dotcom-shared'
22
import { FairyEntity } from '@tldraw/fairy-shared'
33
import { useSync } from '@tldraw/sync'
44
import { Suspense, lazy, useCallback, useEffect, useMemo, useRef, useState } from 'react'
@@ -30,7 +30,7 @@ import {
3030
import { ThemeUpdater } from '../../../components/ThemeUpdater/ThemeUpdater'
3131
import { useOpenUrlAndTrack } from '../../../hooks/useOpenUrlAndTrack'
3232
import { useRoomLoadTracking } from '../../../hooks/useRoomLoadTracking'
33-
import { useHandleUiEvents } from '../../../utils/analytics'
33+
import { trackEvent, useHandleUiEvents } from '../../../utils/analytics'
3434
import { assetUrls } from '../../../utils/assetUrls'
3535
import { MULTIPLAYER_SERVER } from '../../../utils/config'
3636
import { createAssetFromUrl } from '../../../utils/createAssetFromUrl'
@@ -209,6 +209,9 @@ function TlaEditorInner({ fileSlug, deepLinks }: TlaEditorProps) {
209209
}, [fileSlug, hasUser, getUserToken]),
210210
assets,
211211
userInfo: app?.tlUser.userPreferences,
212+
onCustomMessageReceived: useCallback((message: TLCustomServerEvent) => {
213+
trackEvent(message.type)
214+
}, []),
212215
getUserPresence: useCallback(
213216
(store: TLStore, userInfo: TLPresenceUserInfo): TLPresenceStateInfo | null => {
214217
const defaultPresence = getDefaultUserPresence(store, userInfo)

apps/dotcom/sync-worker/src/TLDrawDurableObject.ts

Lines changed: 71 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/// <reference no-default-lib="true"/>
2-
/// <reference types="@cloudflare/workers-types" />
32

3+
import { R2Bucket } from '@cloudflare/workers-types'
44
import { SupabaseClient } from '@supabase/supabase-js'
55
import {
66
APP_ASSET_UPLOAD_ENDPOINT,
@@ -14,6 +14,7 @@ import {
1414
ROOM_PREFIX,
1515
ROOM_SIZE_LIMIT_MB,
1616
SNAPSHOT_PREFIX,
17+
TLCustomServerEvent,
1718
TlaFile,
1819
type RoomOpenMode,
1920
} from '@tldraw/dotcom-shared'
@@ -528,6 +529,10 @@ export class TLDrawDurableObject extends DurableObject {
528529

529530
logEvent(event: TLServerEvent) {
530531
switch (event.type) {
532+
case 'persist_success': {
533+
this.writeEvent(event.type, { doubles: [event.attempts] })
534+
break
535+
}
531536
case 'room': {
532537
// we would add user/connection ids here if we could
533538
this.writeEvent(event.name, { blobs: [event.roomId] })
@@ -801,43 +806,71 @@ export class TLDrawDurableObject extends DurableObject {
801806
})
802807
}
803808

809+
broadcastPersistenceEvent(event: TLCustomServerEvent) {
810+
this._room?.then((r) => {
811+
for (const session of r.getSessions()) {
812+
r.sendCustomMessage(session.sessionId, event)
813+
}
814+
})
815+
}
816+
804817
// Save the room to r2
805818
async persistToDatabase() {
806-
try {
807-
await this.executionQueue.push(async () => {
808-
// check whether the worker was woken up to persist after having gone to sleep
809-
if (!this._room) return
810-
const slug = this.documentInfo.slug
811-
const room = await this.getRoom()
812-
const clock = room.getCurrentDocumentClock()
813-
if (this._lastPersistedClock === clock) return
814-
if (this._isRestoring) return
815-
816-
const snapshot = room.getCurrentSnapshot()
817-
this.maybeAssociateFileAssets()
818-
819-
const key = getR2KeyForRoom({ slug: slug, isApp: this.documentInfo.isApp })
820-
await this._uploadSnapshotToR2(room, snapshot, key)
821-
822-
this._lastPersistedClock = clock
823-
824-
// Update the updatedAt timestamp in the database
825-
if (this.documentInfo.isApp) {
826-
// don't await on this because otherwise
827-
// if this logic is invoked during another db transaction
828-
// (e.g. when publishing a file)
829-
// that transaction will deadlock
830-
this.db
831-
.updateTable('file')
832-
.set({ updatedAt: new Date().getTime() })
833-
.where('id', '=', this.documentInfo.slug)
834-
.execute()
835-
.catch((e) => this.reportError(e))
836-
}
819+
await this.executionQueue
820+
.push(async () => {
821+
await retry(
822+
async ({ attempt }) => {
823+
if (attempt === PERSIST_RETRIES_NOTIFY_THRESHOLD) {
824+
this.broadcastPersistenceEvent({ type: 'persistence_bad' })
825+
}
826+
// check whether the worker was woken up to persist after having gone to sleep
827+
if (!this._room) return
828+
const slug = this.documentInfo.slug
829+
const room = await this.getRoom()
830+
const clock = room.getCurrentDocumentClock()
831+
if (this._lastPersistedClock === clock) return
832+
if (this._isRestoring) return
833+
834+
const snapshot = room.getCurrentSnapshot()
835+
this.maybeAssociateFileAssets()
836+
837+
const key = getR2KeyForRoom({ slug: slug, isApp: this.documentInfo.isApp })
838+
await this._uploadSnapshotToR2(room, snapshot, key)
839+
840+
this.logEvent({ type: 'persist_success', attempts: attempt })
841+
this._lastPersistedClock = clock
842+
if (attempt >= PERSIST_RETRIES_NOTIFY_THRESHOLD) {
843+
this.broadcastPersistenceEvent({ type: 'persistence_good' })
844+
}
845+
846+
// Update the updatedAt timestamp in the database
847+
if (this.documentInfo.isApp) {
848+
// don't await on this because otherwise
849+
// if this logic is invoked during another db transaction
850+
// (e.g. when publishing a file)
851+
// that transaction will deadlock
852+
this.db
853+
.updateTable('file')
854+
.set({ updatedAt: new Date().getTime() })
855+
.where('id', '=', this.documentInfo.slug)
856+
.execute()
857+
.catch((e) => {
858+
this.logEvent({
859+
type: 'room',
860+
roomId: this.documentInfo.slug,
861+
name: 'failed_persist_to_db',
862+
})
863+
this.reportError(e)
864+
})
865+
}
866+
},
867+
{ attempts: PERSIST_RETRIES_MAX, waitDuration: 2000 }
868+
)
869+
})
870+
.catch((e) => {
871+
this.logEvent({ type: 'room', roomId: this.documentInfo.slug, name: 'fail_persist' })
872+
this.reportError(e)
837873
})
838-
} catch (e) {
839-
this.reportError(e)
840-
}
841874
}
842875

843876
private async _uploadSnapshotToR2(
@@ -1143,3 +1176,6 @@ async function listAllObjectKeys(bucket: R2Bucket, prefix: string): Promise<stri
11431176

11441177
return keys
11451178
}
1179+
1180+
const PERSIST_RETRIES_NOTIFY_THRESHOLD = 10
1181+
const PERSIST_RETRIES_MAX = 100

apps/dotcom/sync-worker/src/types.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,10 @@ export type TLServerEvent =
124124
messageType: string
125125
messageLength: number
126126
}
127+
| {
128+
type: 'persist_success'
129+
attempts: number
130+
}
127131

128132
export type TLPostgresReplicatorRebootSource =
129133
| 'constructor'

packages/dotcom-shared/src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,5 @@ export interface SubmitFeedbackRequestBody {
224224
}
225225

226226
export const MAX_PROBLEM_DESCRIPTION_LENGTH = 2000
227+
228+
export type TLCustomServerEvent = { type: 'persistence_good' } | { type: 'persistence_bad' }

packages/utils/api-report.api.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,11 @@ export const Result: {
421421
};
422422

423423
// @internal
424-
export function retry<T>(fn: () => Promise<T>, { attempts, waitDuration, abortSignal, matchError, }?: {
424+
export function retry<T>(fn: (args: {
425+
attempt: number;
426+
remaining: number;
427+
total: number;
428+
}) => Promise<T>, { attempts, waitDuration, abortSignal, matchError, }?: {
425429
abortSignal?: AbortSignal;
426430
attempts?: number;
427431
matchError?(error: unknown): boolean;

packages/utils/src/lib/retry.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ import { sleep } from './control'
4848
* @internal
4949
*/
5050
export async function retry<T>(
51-
fn: () => Promise<T>,
51+
fn: (args: { attempt: number; remaining: number; total: number }) => Promise<T>,
5252
{
5353
attempts = 3,
5454
waitDuration = 1000,
@@ -65,7 +65,7 @@ export async function retry<T>(
6565
for (let i = 0; i < attempts; i++) {
6666
if (abortSignal?.aborted) throw new Error('aborted')
6767
try {
68-
return await fn()
68+
return await fn({ attempt: i, remaining: attempts - i, total: attempts })
6969
} catch (e) {
7070
if (matchError && !matchError(e)) throw e
7171
error = e

0 commit comments

Comments
 (0)