Skip to content

Commit d426ffe

Browse files
fix(zero): add replicator health check and statement timeout (tldraw#8437)
After the 16+ hour Zero replication stall with no automated detection, this adds a health check endpoint and a safety net for stuck queries.

**Health check** (`/health-check/zero-replicator`): queries `pg_stat_replication` for the `zero-replicator` application and returns 500 if it's disconnected, stalled (`write_lsn IS NULL`), or lagging (`write_lag > 1 minute`). Uses the existing Kysely pool, same pattern as `/health-check/db`. Configure Updown.io to hit this endpoint every 60s with the `HEALTH_CHECK_BEARER_TOKEN`.

**Statement timeout**: changes `statement_timeout=0` (infinite) to `statement_timeout=1800000` (30 min) on Zero's connection strings. Prevents stuck queries from blocking forever while still allowing initial sync to complete.

### Change type

- [x] `improvement`

### Test plan

1. Deploy to staging
2. Hit `/health-check/zero-replicator` with bearer token — should return 200
3. Stop the zero-replicator process — endpoint should return 500 after Updown confirmation
4. Verify Zero still boots and completes initial sync with the 30-min statement timeout

### Code changes

| Section        | LOC change |
| -------------- | ---------- |
| Apps           | +25 / -0   |
| Config/tooling | +1 / -1    |
1 parent 309af39 commit d426ffe

2 files changed

Lines changed: 33 additions & 2 deletions

File tree

apps/dotcom/sync-worker/src/healthCheckRoutes.ts

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { createRouter, notFound } from '@tldraw/worker-shared'
2+
import { sql } from 'kysely'
23
import { createPostgresConnectionPool } from './postgres'
34
import { isDebugLogging, type Environment } from './types'
45
import { getStatsDurableObjct } from './utils/durableObjects'
@@ -48,8 +49,9 @@ export const healthCheckRoutes = createRouter<Environment>()
4849
}
4950
})
5051
.get('/health-check/db', async (_, env) => {
52+
const db = createPostgresConnectionPool(env, '/health-check/db')
5153
try {
52-
await createPostgresConnectionPool(env, '/health-check/db')
54+
await db
5355
.selectFrom('user')
5456
.select('name')
5557
.where('email', '=', 'mitja@tldraw.com')
@@ -58,6 +60,35 @@ export const healthCheckRoutes = createRouter<Environment>()
5860
return new Response('ok', { status: 200 })
5961
} catch (_e) {
6062
return new Response('Could not reach the database', { status: 500 })
63+
} finally {
64+
await db.destroy()
65+
}
66+
})
67+
.get('/health-check/zero-replicator', async (_, env) => {
68+
const db = createPostgresConnectionPool(env, '/health-check/zero-replicator')
69+
try {
70+
const result = await sql<{ status: string }>`
71+
SELECT
72+
CASE
73+
WHEN write_lsn IS NULL THEN 'STALLED'
74+
WHEN write_lag > interval '1 minute' THEN 'LAGGING'
75+
ELSE 'HEALTHY'
76+
END AS status
77+
FROM pg_stat_replication
78+
WHERE application_name = 'zero-replicator'
79+
`.execute(db)
80+
if (result.rows.length === 0) {
81+
return new Response('zero-replicator not connected', { status: 500 })
82+
}
83+
const status = result.rows[0].status
84+
if (status !== 'HEALTHY') {
85+
return new Response(`zero-replicator: ${status}`, { status: 500 })
86+
}
87+
return new Response('ok', { status: 200 })
88+
} catch (_e) {
89+
return new Response('Could not check zero-replicator status', { status: 500 })
90+
} finally {
91+
await db.destroy()
6192
}
6293
})
6394
.all('*', notFound)

internal/scripts/deploy-dotcom.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,7 @@ async function vercelCli(command: string, args: string[], opts?: ExecOpts) {
669669

670670
function withStatementTimeout(connString: string): string {
671671
const separator = connString.includes('?') ? '&' : '?'
672-
return `${connString}${separator}statement_timeout=0`
672+
return `${connString}${separator}statement_timeout=1800000`
673673
}
674674

675675
function updateFlyioToml(appName: string): void {

0 commit comments

Comments
 (0)