Skip to content

Commit eccb1e5

Browse files
authored
feat(observability): add UX readiness health check endpoint (#639)
1 parent 683b27d commit eccb1e5

12 files changed

Lines changed: 109 additions & 27 deletions

File tree

apps/lfx-one/otel.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ if (!otlpEndpoint) {
105105
new HttpInstrumentation({
106106
ignoreIncomingRequestHook: (req) => {
107107
const url = req.url || '';
108-
return url === '/health' || url === '/api/health' || url.startsWith('/.well-known');
108+
return url === '/livez' || url === '/readyz' || url.startsWith('/.well-known');
109109
},
110110
applyCustomAttributesOnSpan: (span, request, response) => {
111111
const req = 'req' in response ? response.req : undefined;

apps/lfx-one/src/server/middleware/auth.middleware.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ const TOKEN_EXPIRY_BUFFER_SECONDS = 300;
1717
* Ordered by specificity - more specific patterns should come first
1818
*/
1919
const DEFAULT_ROUTE_CONFIG: RouteAuthConfig[] = [
20-
// Health check - completely public
21-
{ pattern: '/health', type: 'api', auth: 'public' },
20+
// Liveness and readiness probes - completely public
21+
{ pattern: '/livez', type: 'api', auth: 'public' },
22+
{ pattern: '/readyz', type: 'api', auth: 'public' },
2223

2324
// Public API routes - optional authentication with token benefits
2425
{ pattern: '/public/api', type: 'api', auth: 'optional', tokenRequired: false },

apps/lfx-one/src/server/server.ts

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,30 @@ app.use(
7373
app.use(express.json({ limit: '15mb' }));
7474
app.use(express.urlencoded({ extended: true, limit: '15mb' }));
7575

76+
// Liveness and readiness endpoints registered before the static handler,
77+
// logger, auth, and rate-limit middleware so:
78+
// - probes are served directly with no filesystem lookup (no I/O overhead
79+
// on frequent Kubernetes probe traffic)
80+
// - probe traffic is not request-logged
81+
// - endpoints are always reachable unauthenticated
82+
// auth.middleware.ts also lists /livez and /readyz as public.
83+
app.get('/livez', (_req: Request, res: Response) => {
84+
res.send('OK');
85+
});
86+
87+
// Readiness endpoint for Kubernetes (LFXV2-1640).
88+
// Signals that this pod can accept HTTP traffic: Express is listening and the
89+
// Angular SSR engine loaded successfully (constructed at module load above —
90+
// a load failure crashes the process before reaching this point).
91+
// Intentionally does NOT probe NATS / Snowflake / microservice-proxy: those
92+
// clients are lazy-initialized and report not-connected at startup even
93+
// though many SSR pages render fine without them. Per-feature dependency
94+
// failures are handled at the route level, not by pulling the whole pod out
95+
// of the Service endpoints list.
96+
app.get('/readyz', (_req: Request, res: Response) => {
97+
res.status(200).json({ status: 'ready' });
98+
});
99+
76100
app.get(
77101
'**',
78102
express.static(browserDistFolder, {
@@ -81,11 +105,6 @@ app.get(
81105
})
82106
);
83107

84-
// Health endpoint before logger middleware so health checks aren't logged.
85-
app.get('/health', (_req: Request, res: Response) => {
86-
res.send('OK');
87-
});
88-
89108
const httpLogger = pinoHttp({
90109
logger: serverLogger,
91110
serializers: {

charts/lfx-v2-ui/templates/deployment.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,15 @@ spec:
5353
{{- toYaml $config.valueFrom | nindent 14 }}
5454
{{- end }}
5555
{{- end }}
56+
{{- with .Values.startupProbe }}
57+
startupProbe:
58+
{{- toYaml . | nindent 12 }}
59+
{{- end }}
60+
{{- with .Values.livenessProbe }}
61+
livenessProbe:
62+
{{- toYaml . | nindent 12 }}
63+
{{- end }}
64+
{{- with .Values.readinessProbe }}
65+
readinessProbe:
66+
{{- toYaml . | nindent 12 }}
67+
{{- end }}

charts/lfx-v2-ui/values.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,53 @@ resources:
152152
cpu: 500m
153153
memory: 512Mi
154154

155+
# Startup probe — gates liveness and readiness during the initial startup window.
156+
# Once it succeeds, Kubernetes hands off to livenessProbe and readinessProbe.
157+
# Budget: 10s initial delay + (10s × 30 failures) = 310s (~5 min) maximum,
158+
# giving headroom over the observed ~4 min cold-start time.
159+
# Set to null to disable: startupProbe: null
160+
startupProbe:
161+
httpGet:
162+
path: /livez
163+
port: http
164+
initialDelaySeconds: 10
165+
periodSeconds: 10
166+
timeoutSeconds: 3
167+
failureThreshold: 30
168+
169+
# Liveness probe — restarts the pod if it becomes unresponsive at runtime.
170+
# startupProbe handles the startup window, so initialDelaySeconds is 0 here.
171+
# Targets GET /livez (plain text "OK"), registered before logging, auth, and
172+
# rate-limit middleware so it is always reachable if the process is alive.
173+
# Set to null to disable: livenessProbe: null
174+
livenessProbe:
175+
httpGet:
176+
path: /livez
177+
port: http
178+
initialDelaySeconds: 0
179+
periodSeconds: 15
180+
timeoutSeconds: 5
181+
failureThreshold: 3
182+
183+
# Readiness probe — controls when the pod receives traffic from the Service.
184+
# startupProbe handles the startup window, so initialDelaySeconds is 0 here.
185+
# Targets GET /readyz (JSON {"status":"ready"}) which is registered before
186+
# pino-http so probes don't pollute request logs. The endpoint confirms that
187+
# Express is listening and the Angular SSR engine loaded; it intentionally
188+
# does not probe lazy-initialized clients (NATS, Snowflake) because those
189+
# report not-connected at startup even when most SSR pages render fine without
190+
# them. See apps/lfx-one/src/server/server.ts for implementation details.
191+
# Set to null to disable: readinessProbe: null
192+
readinessProbe:
193+
httpGet:
194+
path: /readyz
195+
port: http
196+
initialDelaySeconds: 0
197+
periodSeconds: 10
198+
timeoutSeconds: 3
199+
failureThreshold: 3
200+
successThreshold: 1
201+
155202
# Environment variables for the application
156203
# Uses map/object format for deep merging support
157204
environment:

docs/architecture/backend/ai-service.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,8 +321,8 @@ serverLogger.error('Failed to generate meeting agenda', { error });
321321
### Health Monitoring
322322

323323
```typescript
324-
// Health check integration
325-
app.get('/api/health', (req, res) => {
324+
// Readiness check integration
325+
app.get('/readyz', (req, res) => {
326326
const healthStatus = {
327327
ai_service: {
328328
configured: !!process.env['AI_PROXY_URL'] && !!process.env['AI_API_KEY'],

docs/architecture/backend/logging-monitoring.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -672,22 +672,25 @@ logger.success(req, 'user_login', startTime, {
672672

673673
## 📊 Health Check Filtering
674674

675-
Health check endpoints are excluded from automatic HTTP logging:
675+
Liveness and readiness endpoints are excluded from automatic HTTP logging because they are registered before the `pino-http` middleware:
676676

677677
```typescript
678-
// Health check endpoint (added before logger middleware)
679-
app.get('/health', (_req: Request, res: Response) => {
678+
// Liveness and readiness endpoints (registered before logger middleware)
679+
app.get('/livez', (_req: Request, res: Response) => {
680680
res.send('OK');
681681
});
682+
app.get('/readyz', (_req: Request, res: Response) => {
683+
res.status(200).json({ status: 'ready' });
684+
});
682685

683-
// HTTP logger middleware (added after health endpoint)
686+
// HTTP logger middleware (added after probe endpoints)
684687
app.use(httpLogger);
685688
```
686689

687690
URLs excluded from logging:
688691

689-
- `/health`
690-
- `/api/health`
692+
- `/livez`
693+
- `/readyz`
691694
- `/.well-known/*`
692695

693696
## 🎯 Best Practices

docs/architecture/backend/nats-integration.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,8 @@ logger.info(undefined, 'nats_connect', 'Connecting to NATS server on demand', {
245245
### Health Check Integration
246246

247247
```typescript
248-
// Health endpoint includes NATS status
249-
app.get('/api/health', (req, res) => {
248+
// Readiness endpoint includes NATS status
249+
app.get('/readyz', (req, res) => {
250250
const healthStatus = {
251251
nats: {
252252
connected: natsService.isConnected(),

docs/architecture/backend/observability.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Tracing activates only when `OTEL_EXPORTER_OTLP_ENDPOINT` is set — leaving it
4040

4141
## What gets instrumented
4242

43-
- **HTTP** (`HttpInstrumentation`) — incoming and outgoing HTTP requests, with `/health`, `/api/health`, and `/.well-known/*` excluded so health checks don't flood spans.
43+
- **HTTP** (`HttpInstrumentation`) — incoming and outgoing HTTP requests, with `/livez`, `/readyz`, and `/.well-known/*` excluded so health-check probes don't generate a flood of spans.
4444
- **Express** (`ExpressInstrumentation`) — per-middleware spans so you can see which middleware runs for a given request.
4545
- **Undici** (`UndiciInstrumentation`) — spans for `fetch` / `undici` calls (used by `api-client.service.ts` for microservice calls). Content-type headers are captured.
4646
- **Propagators** — W3C Trace Context + W3C Baggage (`traceparent`, `tracestate`, `baggage` headers) so spans correlate across the NATS boundary and into downstream microservices.

docs/architecture/backend/snowflake-integration.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -774,8 +774,8 @@ Track these metrics for operational visibility:
774774
### Health Check Integration
775775

776776
```typescript
777-
// Add to server health endpoint
778-
app.get('/health', (req, res) => {
777+
// Add to server liveness endpoint
778+
app.get('/livez', (req, res) => {
779779
const snowflakeStats = snowflakeService.getPoolStats();
780780
const lockStats = snowflakeService.getLockStats();
781781

0 commit comments

Comments
 (0)