Skip to content

Commit def1ea5

Browse files
committed
fix(db): prevent connection pool exhaustion causing 500 errors
## Problem

After ~3 days of uptime, rest-api containers became unhealthy due to database connection timeouts, causing 'Network error while fetching document' (500) errors for users.

## Root Cause

- Default pool size was 20 connections per container
- 5 backend containers × 20 = 100 potential DB connections
- DigitalOcean Managed Postgres limit: 25-50 connections
- Once the limit is hit → new connections time out → health checks fail (503)
- Docker marks containers unhealthy → Traefik stops routing

## Solution

1. Reduced default DB_POOL_SIZE from 20 → 5 per container
2. Reduced idle timeout from 60s → 30s to release connections faster
3. Added explicit DB_POOL_SIZE env vars to docker-compose.prod.yml

New connection math:

- 2× rest-api × 5 = 10
- 2× hocuspocus-server × 5 = 10
- 1× hocuspocus-worker × 8 = 8
- Total: 28 connections (fits managed DB limits)

## Files Changed

- docker-compose.prod.yml: Added DB_POOL_SIZE and DB_IDLE_TIMEOUT
- packages/hocuspocus.server/src/lib/prisma.ts: Safer defaults

Fixes: Database connection exhaustion after extended uptime
1 parent 9b58ef7 commit def1ea5

2 files changed

Lines changed: 96 additions & 58 deletions

File tree

docker-compose.prod.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ services:
130130
DATABASE_URL: ${DATABASE_URL}
131131
REDIS_HOST: docsplus-redis
132132
REDIS_PORT: 6379
133+
# Limit DB pool to prevent connection exhaustion (4 containers × 5 + 1 worker × 8 = 28 max)
134+
DB_POOL_SIZE: ${DB_POOL_SIZE:-5}
135+
DB_IDLE_TIMEOUT: 30000
133136
SUPABASE_URL: ${SUPABASE_URL}
134137
SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY}
135138
SUPABASE_SERVICE_ROLE_KEY: ${SUPABASE_SERVICE_ROLE_KEY}
@@ -210,6 +213,9 @@ services:
210213
DATABASE_URL: ${DATABASE_URL}
211214
REDIS_HOST: docsplus-redis
212215
REDIS_PORT: 6379
216+
# Limit DB pool to prevent connection exhaustion
217+
DB_POOL_SIZE: ${DB_POOL_SIZE:-5}
218+
DB_IDLE_TIMEOUT: 30000
213219
SUPABASE_URL: ${SUPABASE_URL}
214220
SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY}
215221
SUPABASE_SERVICE_ROLE_KEY: ${SUPABASE_SERVICE_ROLE_KEY}
@@ -295,6 +301,9 @@ services:
295301
DATABASE_URL: ${DATABASE_URL}
296302
REDIS_HOST: docsplus-redis
297303
REDIS_PORT: 6379
304+
# Worker needs slightly higher pool for background jobs
305+
DB_POOL_SIZE: ${DB_POOL_SIZE:-8}
306+
DB_IDLE_TIMEOUT: 30000
298307
SUPABASE_URL: ${SUPABASE_URL}
299308
SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY}
300309
SUPABASE_SERVICE_ROLE_KEY: ${SUPABASE_SERVICE_ROLE_KEY}

packages/hocuspocus.server/src/lib/prisma.ts

Lines changed: 87 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,17 @@ const getDatabaseUrl = (): string => {
3939
}
4040

4141
// Connection pool configuration
42+
// IMPORTANT: Keep pool size LOW to prevent connection exhaustion with multiple containers
43+
// At the default of 5 connections each, 5 containers use at most 25 connections in total (fits most managed DB limits)
4244
const poolConfig = {
4345
connectionString: getDatabaseUrl(),
44-
max: parseInt(process.env.DB_POOL_SIZE || process.env.DB_CONNECTION_LIMIT || '20', 10),
45-
// Increased idle timeout to prevent connections from closing too quickly
46-
// Health checks and low-traffic scenarios shouldn't cause connection churn
47-
idleTimeoutMillis: parseInt(process.env.DB_IDLE_TIMEOUT || (process.env.NODE_ENV === 'development' ? '300000' : '60000'), 10), // 5min dev, 1min prod
46+
max: parseInt(process.env.DB_POOL_SIZE || process.env.DB_CONNECTION_LIMIT || '5', 10),
47+
// Reduced idle timeout to release connections faster (30s prod, 5min dev)
48+
// This prevents connection accumulation over time
49+
idleTimeoutMillis: parseInt(
50+
process.env.DB_IDLE_TIMEOUT || (process.env.NODE_ENV === 'development' ? '300000' : '30000'),
51+
10
52+
),
4853
connectionTimeoutMillis: parseInt(process.env.DB_CONNECT_TIMEOUT || '10000', 10),
4954
// Enable keep-alive to prevent connection drops
5055
keepAlive: true,
@@ -56,13 +61,11 @@ const poolConfig = {
5661
// SSL config:
5762
// - Production: Enable SSL for managed DBs (DigitalOcean, etc.) - rejectUnauthorized: false for self-signed certs
5863
// - Development: undefined = no SSL override, uses connectionString defaults (local Docker Postgres typically has no SSL)
59-
ssl: process.env.NODE_ENV === 'production' ? { rejectUnauthorized: false } : undefined,
64+
ssl: process.env.NODE_ENV === 'production' ? { rejectUnauthorized: false } : undefined
6065
}
6166

6267
// Create PostgreSQL connection pool
63-
const pool =
64-
globalForPrisma.pool ??
65-
new Pool(poolConfig)
68+
const pool = globalForPrisma.pool ?? new Pool(poolConfig)
6669

6770
// Pool event handlers for monitoring
6871
pool.on('connect', () => {
@@ -82,13 +85,16 @@ pool.on('acquire', () => {
8285

8386
// Production: warn if pool is getting exhausted (80%+ utilization)
8487
if (process.env.NODE_ENV === 'production' && stats.total >= poolConfig.max * 0.8) {
85-
dbLogger.warn({
86-
total: stats.total,
87-
idle: stats.idle,
88-
waiting: stats.waiting,
89-
max: poolConfig.max,
90-
utilization: `${Math.round((stats.total / poolConfig.max) * 100)}%`
91-
}, 'Database pool utilization high - consider increasing DB_POOL_SIZE')
88+
dbLogger.warn(
89+
{
90+
total: stats.total,
91+
idle: stats.idle,
92+
waiting: stats.waiting,
93+
max: poolConfig.max,
94+
utilization: `${Math.round((stats.total / poolConfig.max) * 100)}%`
95+
},
96+
'Database pool utilization high - consider increasing DB_POOL_SIZE'
97+
)
9298
}
9399
})
94100

@@ -99,53 +105,71 @@ pool.on('error', (err: any) => {
99105
// 57P01: terminating connection due to administrator command
100106
// This is normal during database restarts/maintenance - don't log as error
101107
if (errorCode === '57P01') {
102-
dbLogger.debug({
103-
code: errorCode,
104-
message: 'Database connection terminated (likely during maintenance/restart)'
105-
}, 'Connection terminated by database - will be automatically replaced')
108+
dbLogger.debug(
109+
{
110+
code: errorCode,
111+
message: 'Database connection terminated (likely during maintenance/restart)'
112+
},
113+
'Connection terminated by database - will be automatically replaced'
114+
)
106115
return
107116
}
108117

109118
// 57P02: crash_shutdown - connection terminated because the database server crashed
110119
if (errorCode === '57P02') {
111-
dbLogger.debug({
112-
code: errorCode,
113-
message: 'Database connection terminated'
114-
}, 'Connection terminated - will be automatically replaced')
120+
dbLogger.debug(
121+
{
122+
code: errorCode,
123+
message: 'Database connection terminated'
124+
},
125+
'Connection terminated - will be automatically replaced'
126+
)
115127
return
116128
}
117129

118130
// 57P03: cannot connect now (database startup/shutdown)
119131
if (errorCode === '57P03') {
120-
dbLogger.warn({
121-
code: errorCode,
122-
message: 'Database temporarily unavailable - connection will retry'
123-
}, 'Database connection unavailable')
132+
dbLogger.warn(
133+
{
134+
code: errorCode,
135+
message: 'Database temporarily unavailable - connection will retry'
136+
},
137+
'Database connection unavailable'
138+
)
124139
return
125140
}
126141

127142
// Connection errors that are recoverable
128143
const recoverableErrors = ['ECONNREFUSED', 'ETIMEDOUT', 'ENOTFOUND']
129-
if (recoverableErrors.some(code => err.code === code || err.message?.includes(code))) {
130-
dbLogger.warn({
131-
code: err.code,
132-
message: err.message
133-
}, 'Database connection error - will retry automatically')
144+
if (recoverableErrors.some((code) => err.code === code || err.message?.includes(code))) {
145+
dbLogger.warn(
146+
{
147+
code: err.code,
148+
message: err.message
149+
},
150+
'Database connection error - will retry automatically'
151+
)
134152
return
135153
}
136154

137155
// Log other errors as actual errors
138-
dbLogger.error({
139-
err,
140-
code: errorCode,
141-
message: err.message
142-
}, 'Unexpected database pool error')
156+
dbLogger.error(
157+
{
158+
err,
159+
code: errorCode,
160+
message: err.message
161+
},
162+
'Unexpected database pool error'
163+
)
143164
})
144165

145166
pool.on('remove', () => {
146167
// Only log in development - connection removal is normal (idle timeout, errors, etc.)
147168
if (process.env.NODE_ENV === 'development') {
148-
dbLogger.debug({ poolSize: pool.totalCount, idle: pool.idleCount }, 'Connection removed from pool')
169+
dbLogger.debug(
170+
{ poolSize: pool.totalCount, idle: pool.idleCount },
171+
'Connection removed from pool'
172+
)
149173
}
150174
})
151175

@@ -157,15 +181,14 @@ export const prisma =
157181
globalForPrisma.prisma ??
158182
new PrismaClient({
159183
adapter,
160-
log: process.env.NODE_ENV === 'development'
161-
? [
162-
{ level: 'query', emit: 'event' },
163-
{ level: 'error', emit: 'event' },
164-
{ level: 'warn', emit: 'event' }
165-
]
166-
: [
167-
{ level: 'error', emit: 'event' }
168-
]
184+
log:
185+
process.env.NODE_ENV === 'development'
186+
? [
187+
{ level: 'query', emit: 'event' },
188+
{ level: 'error', emit: 'event' },
189+
{ level: 'warn', emit: 'event' }
190+
]
191+
: [{ level: 'error', emit: 'event' }]
169192
})
170193

171194
// Log Prisma events
@@ -190,11 +213,14 @@ if (process.env.NODE_ENV !== 'production') {
190213
}
191214

192215
// Log pool configuration on startup
193-
dbLogger.info({
194-
maxConnections: poolConfig.max,
195-
idleTimeout: poolConfig.idleTimeoutMillis,
196-
connectionTimeout: poolConfig.connectionTimeoutMillis
197-
}, 'Database connection pool initialized')
216+
dbLogger.info(
217+
{
218+
maxConnections: poolConfig.max,
219+
idleTimeout: poolConfig.idleTimeoutMillis,
220+
connectionTimeout: poolConfig.connectionTimeoutMillis
221+
},
222+
'Database connection pool initialized'
223+
)
198224

199225
/**
200226
* Get current pool statistics
@@ -220,12 +246,15 @@ export const checkDatabaseHealth = async (): Promise<boolean> => {
220246
// Log pool stats in production if pool is getting exhausted
221247
const stats = getPoolStats()
222248
if (process.env.NODE_ENV === 'production' && stats.waiting > 0) {
223-
dbLogger.warn({
224-
waiting: stats.waiting,
225-
total: stats.total,
226-
idle: stats.idle,
227-
max: poolConfig.max
228-
}, 'Database pool has waiting connections - potential bottleneck')
249+
dbLogger.warn(
250+
{
251+
waiting: stats.waiting,
252+
total: stats.total,
253+
idle: stats.idle,
254+
max: poolConfig.max
255+
},
256+
'Database pool has waiting connections - potential bottleneck'
257+
)
229258
}
230259

231260
return true

0 commit comments

Comments
 (0)