From 498aab2c5ca478f2b60abd6eab4e56ab5e83d8a1 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Wed, 3 Jun 2026 16:25:51 -0400 Subject: [PATCH 1/2] =?UTF-8?q?Instrument=20the=20realm-server=20search=20?= =?UTF-8?q?path=20with=20a=20request=E2=86=92response=20timeline=20(CS-113?= =?UTF-8?q?63)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During a from-scratch index, prerendered cards block for tens of seconds on in-render `_search` round-trips while the SQL behind them runs in milliseconds. We could see the client wait (host-side `boxel_index.diagnostics`) but had no server-side view of where those seconds went. This adds that view: - A client-minted `x-boxel-request-id` correlation id, stamped on the prerendered host's `_federated-search` fetch (prerender-gated; live SPA traffic is untouched). Join key only — no new client measurement. - A `realm:search-timing` line keyed by that id with the request→response stage breakdown: parse, resolveRealms, sql, loadLinks, populate (relationship/query-field assembly), per-instance cache read/write + hit/miss, stringify, and result count. Emitted by `handle-search` for the full handler picture, and standalone by `searchRealms` on the host-test path. Gated on a correlation id, so normal traffic allocates nothing. - Request duration + the correlation id on the `realm:requests` log lines (the outermost request→response bound, incl. body read + send). - A `realm:health` sampler: event-loop lag (mean/p99/max) alongside the in-flight `_search` count and heap, logged only during saturation windows — the signal that confirms or refutes single-thread saturation directly. The timing collector is threaded through the shared `SearchOpts` and is a no-op without a correlation id; `RealmIndexQueryEngine` only ever runs in the realm-server, so the host stays unaffected. A host integration test exercises the whole path end-to-end: the in-realm browser mints the id, issues a real `store.search`, and the test asserts the id lands in the server's `realm:search-timing` line (and that live, non-prerender traffic stamps nothing and emits nothing). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../host/app/lib/prerender-fetch-headers.ts | 31 +++ packages/host/app/services/store.ts | 2 + .../tests/helpers/realm-server-mock/routes.ts | 10 + .../search-correlation-id-test.gts | 195 ++++++++++++++++++ .../realm-server/handlers/handle-search.ts | 74 ++++++- packages/realm-server/health-sampler.ts | 65 ++++++ packages/realm-server/main.ts | 7 + packages/realm-server/middleware/index.ts | 56 ++++- packages/realm-server/search-inflight.ts | 21 ++ packages/runtime-common/index.ts | 1 + packages/runtime-common/prerender-headers.ts | 24 +++ .../realm-index-query-engine.ts | 95 ++++++--- packages/runtime-common/realm.ts | 1 + packages/runtime-common/request-timings.ts | 64 ++++++ packages/runtime-common/search-utils.ts | 67 +++++- 15 files changed, 668 insertions(+), 45 deletions(-) create mode 100644 packages/host/tests/integration/search-correlation-id-test.gts create mode 100644 packages/realm-server/health-sampler.ts create mode 100644 packages/realm-server/search-inflight.ts create mode 100644 packages/runtime-common/request-timings.ts diff --git a/packages/host/app/lib/prerender-fetch-headers.ts b/packages/host/app/lib/prerender-fetch-headers.ts index a17823bcf68..a4b810acc31 100644 --- a/packages/host/app/lib/prerender-fetch-headers.ts +++ b/packages/host/app/lib/prerender-fetch-headers.ts @@ -2,6 +2,7 @@ import { DURING_PRERENDER_HEADER, X_BOXEL_CONSUMING_REALM_HEADER, X_BOXEL_JOB_ID_HEADER, + X_BOXEL_REQUEST_ID_HEADER, } from '@cardstack/runtime-common'; // Set by the prerender server's `evaluateOnNewDocument` before the @@ -46,3 +47,33 @@ export function jobIdHeader(): Record { let j = (globalThis as unknown as { __boxelJobId?: string }).__boxelJobId; return j ? { [X_BOXEL_JOB_ID_HEADER]: j } : {}; } + +// Per-search correlation id. Minted fresh for each `_federated-search` +// fetch the SPA issues while rendering inside a prerender tab, and stamped +// as `x-boxel-request-id`. The realm-server reads it back out and keys its +// `realm:search-timing` line on it, so a search the prerender observes as +// slow (surfaced in its `queryLoadsInFlight` diagnostics) can be joined to +// the realm-server's stage-by-stage view of the same request. Gated on the +// prerender context — exactly like the job-id / consuming-realm headers — +// so live SPA traffic is unaffected and emits no server-side timing line. +export function requestIdHeader(): Record { + let flag = (globalThis as unknown as { __boxelRenderContext?: boolean }) + .__boxelRenderContext; + if (flag !== true) { + return {}; + } + return { [X_BOXEL_REQUEST_ID_HEADER]: newCorrelationId() }; +} + +function newCorrelationId(): string { + let c = (globalThis as { crypto?: { randomUUID?: () => string } }).crypto; + if (c?.randomUUID) { + return c.randomUUID(); + } + // Fallback for the rare environment without `crypto.randomUUID` — the id + // only needs to disambiguate concurrent searches in a log line, not be + // cryptographically strong. + return `r-${Date.now().toString(36)}-${Math.floor( + Math.random() * 1e9, + ).toString(36)}`; +} diff --git a/packages/host/app/services/store.ts b/packages/host/app/services/store.ts index cabb1df446b..1b105f79960 100644 --- a/packages/host/app/services/store.ts +++ b/packages/host/app/services/store.ts @@ -89,6 +89,7 @@ import { consumingRealmHeader, duringPrerenderHeaders, jobIdHeader, + requestIdHeader, } from '../lib/prerender-fetch-headers'; import { searchCacheKey } from '../lib/search-cache-key'; import { searchInFlightKey } from '../lib/search-in-flight-key'; @@ -1153,6 +1154,7 @@ export default class StoreService extends Service implements StoreInterface { ...consumingRealmHeader(), ...jobIdHeader(), ...jobPriorityHeader(), + ...requestIdHeader(), }, body: JSON.stringify({ ...query, realms }), }, diff --git a/packages/host/tests/helpers/realm-server-mock/routes.ts b/packages/host/tests/helpers/realm-server-mock/routes.ts index 83005da512c..266e36318ed 100644 --- a/packages/host/tests/helpers/realm-server-mock/routes.ts +++ b/packages/host/tests/helpers/realm-server-mock/routes.ts @@ -7,9 +7,11 @@ import { parseSearchQueryFromPayload, parseSearchRequestPayload, SearchRequestError, + sanitizeRequestId, searchPrerenderedRealms, searchRealms, SupportedMimeType, + X_BOXEL_REQUEST_ID_HEADER, type RealmInfo, type Query, } from '@cardstack/runtime-common'; @@ -118,9 +120,17 @@ function registerSearchRoutes() { throw e; } + // Mirror the realm-server's `handle-search`: read the client's + // correlation id off the request and thread it into searchRealms, so + // the real `realm:search-timing` line is emitted (and observable by + // host integration tests) keyed by the id the client minted. + let requestId = sanitizeRequestId( + req.headers.get(X_BOXEL_REQUEST_ID_HEADER), + ); let combined = await searchRealms( realmList.map((realmURL) => getSearchableRealmForURL(realmURL)), cardsQuery, + requestId ? { requestId } : undefined, ); return new Response(JSON.stringify(combined), { diff --git a/packages/host/tests/integration/search-correlation-id-test.gts b/packages/host/tests/integration/search-correlation-id-test.gts new file mode 100644 index 00000000000..e215722cdc6 --- /dev/null +++ b/packages/host/tests/integration/search-correlation-id-test.gts @@ -0,0 +1,195 @@ +import type { RenderingTestContext } from '@ember/test-helpers'; +import { settled } from '@ember/test-helpers'; + +import { getService } from '@universal-ember/test-support'; +import { module, test } from 'qunit'; + +import { + baseRealm, + rri, + setSearchTimingSinkForTests, + X_BOXEL_REQUEST_ID_HEADER, +} from '@cardstack/runtime-common'; +import type { Loader } from '@cardstack/runtime-common/loader'; +import type { Query } from '@cardstack/runtime-common/query'; + +import type NetworkService from '@cardstack/host/services/network'; +import type StoreService from '@cardstack/host/services/store'; + +import { + testRealmURL, + setupCardLogs, + setupLocalIndexing, + setupIntegrationTestRealm, +} from '../helpers'; +import { setupMockMatrix } from '../helpers/mock-matrix'; +import { setupRenderingTest } from '../helpers/setup'; + +// End-to-end coverage for the search correlation id: the in-realm browser +// (the prerendered host SPA) mints `x-boxel-request-id` on its +// `_federated-search` fetch, and the realm-server's search path emits a +// `realm:search-timing` line keyed by that same id. This proves the id +// threads all the way from the client that originated it through to the +// server log a triage would join against. +// +// The host test exercises the *real* code on both ends: the SPA's +// `requestIdHeader()` stamps the header, and the realm-server-mock hands it +// to the real `searchRealms`, which emits the line. Only the prerender +// context flag is simulated (the host normally sets it inside a prerender +// tab). + +const personModule = ` + import { contains, field, CardDef } from 'https://cardstack.com/base/card-api'; + import StringField from 'https://cardstack.com/base/string'; + + export class Person extends CardDef { + static displayName = 'Person'; + @field name = contains(StringField); + } +`; + +let loader: Loader; + +module('Integration | search correlation id', function (hooks) { + setupRenderingTest(hooks); + setupLocalIndexing(hooks); + let mockMatrixUtils = setupMockMatrix(hooks); + + hooks.beforeEach(function (this: RenderingTestContext) { + loader = getService('loader-service').loader; + }); + + setupCardLogs( + hooks, + async () => await loader.import(`${baseRealm.url}card-api`), + ); + + hooks.beforeEach(async function () { + await setupIntegrationTestRealm({ + mockMatrixUtils, + contents: { + 'person.gts': personModule, + 'person-1.json': { + data: { + attributes: { name: 'Alice' }, + meta: { adoptsFrom: { module: './person', name: 'Person' } }, + }, + }, + 'person-2.json': { + data: { + attributes: { name: 'Bob' }, + meta: { adoptsFrom: { module: './person', name: 'Person' } }, + }, + }, + }, + }); + }); + + // Restore globals + sink between tests so a failure can't leak into the + // next test or the rest of the suite. + hooks.afterEach(function () { + delete (globalThis as Record).__boxelRenderContext; + setSearchTimingSinkForTests(undefined); + }); + + const personQuery: Query = { + filter: { type: { module: rri(`${testRealmURL}person`), name: 'Person' } }, + }; + + test('a client-issued search threads its correlation id into the server timing log', async function (assert) { + let store = getService('store') as StoreService; + let network = getService('network') as NetworkService; + + // Capture the realm-server's `realm:search-timing` emissions. + let timingLines: string[] = []; + setSearchTimingSinkForTests((line) => timingLines.push(line)); + + // Capture the correlation id the client actually puts on the wire. + let sentRequestIds: string[] = []; + let spy = async (request: Request) => { + if (new URL(request.url).pathname.endsWith('/_federated-search')) { + let id = request.headers.get(X_BOXEL_REQUEST_ID_HEADER); + if (id) { + sentRequestIds.push(id); + } + } + // Return null to fall through to the realm-server-mock route. + return null; + }; + network.virtualNetwork.mount(spy, { prepend: true }); + + // Simulate the prerender context, which is what gates the host's + // correlation-id stamping (mirrors a card rendering inside a prerender + // tab issuing a query-backed search). + (globalThis as Record).__boxelRenderContext = true; + + let results = await store.search(personQuery, [testRealmURL]); + await settled(); + + assert.strictEqual(results.length, 2, 'the search returned both people'); + + assert.strictEqual( + sentRequestIds.length, + 1, + 'the client stamped exactly one correlation id on its _federated-search fetch', + ); + let sentId = sentRequestIds[0]; + assert.ok( + /^[A-Za-z0-9._:-]{8,}$/.test(sentId), + `client-minted correlation id looks well-formed (${sentId})`, + ); + + let matching = timingLines.filter((line) => line.includes(`req=${sentId}`)); + assert.strictEqual( + matching.length, + 1, + `the server emitted exactly one realm:search-timing line keyed by the client's id (lines: ${JSON.stringify( + timingLines, + )})`, + ); + assert.ok( + /\bsql=\d+\b/.test(matching[0]), + `the timing line carries the sql stage (${matching[0]})`, + ); + assert.ok( + /\bloadLinks=\d+\b/.test(matching[0]), + `the timing line carries the loadLinks stage (${matching[0]})`, + ); + }); + + test('a non-prerender search stamps no id and emits no timing line', async function (assert) { + let store = getService('store') as StoreService; + let network = getService('network') as NetworkService; + + let timingLines: string[] = []; + setSearchTimingSinkForTests((line) => timingLines.push(line)); + + let sawHeader = false; + let spy = async (request: Request) => { + if ( + new URL(request.url).pathname.endsWith('/_federated-search') && + request.headers.get(X_BOXEL_REQUEST_ID_HEADER) + ) { + sawHeader = true; + } + return null; + }; + network.virtualNetwork.mount(spy, { prepend: true }); + + // No __boxelRenderContext: live SPA traffic must not stamp the header + // (so it pays nothing and the server emits no timing line). + let results = await store.search(personQuery, [testRealmURL]); + await settled(); + + assert.strictEqual(results.length, 2, 'the search still returns results'); + assert.false( + sawHeader, + 'live (non-prerender) traffic sends no x-boxel-request-id header', + ); + assert.strictEqual( + timingLines.length, + 0, + 'no realm:search-timing line is emitted without a correlation id', + ); + }); +}); diff --git a/packages/realm-server/handlers/handle-search.ts b/packages/realm-server/handlers/handle-search.ts index 1ec88e752f2..8a5a7579c63 100644 --- a/packages/realm-server/handlers/handle-search.ts +++ b/packages/realm-server/handlers/handle-search.ts @@ -10,6 +10,10 @@ import { sanitizeConsumingRealmHeader, SearchRequestError, searchRealms, + sanitizeRequestId, + X_BOXEL_REQUEST_ID_HEADER, + RequestTimings, + emitSearchTiming, } from '@cardstack/runtime-common'; import { fetchRequestFromContext, @@ -36,16 +40,29 @@ export default function handleSearch(opts: { }): (ctxt: Koa.Context) => Promise { let { reconciler, searchCache } = opts; return async function (ctxt: Koa.Context) { + // Hoof-to-snout server-side timing for one search: from the moment the + // handler is entered through to the response being assembled. Stamped + // only when the client supplied a correlation id (prerender traffic); + // live / external callers allocate nothing and emit no line. The + // outermost request→response bound (incl. body read + send) is the + // `realm:requests` middleware's `dur=`, keyed by the same id. + let handlerStart = Date.now(); + let requestId = sanitizeRequestId(ctxt.get(X_BOXEL_REQUEST_ID_HEADER)); + let timings = requestId !== null ? new RequestTimings() : undefined; + let { realmList } = getMultiRealmAuthorization(ctxt); let cardsQuery; let request = await fetchRequestFromContext(ctxt); try { let payload = getSearchRequestPayload(ctxt); - cardsQuery = + let parseQuery = async () => payload !== undefined ? parseSearchQueryFromPayload(payload) : await parseSearchQueryFromRequest(request); + cardsQuery = timings + ? await timings.time('parse', parseQuery) + : await parseQuery(); } catch (e) { if (e instanceof SearchRequestError) { if (e.code === 'invalid-query') { @@ -106,6 +123,21 @@ export default function handleSearch(opts: { if (prerenderJobId) searchOpts.jobIdentity = prerenderJobId; let normalizedSearchOpts = Object.keys(searchOpts).length > 0 ? searchOpts : undefined; + // `requestId` / `timings` are deliberately kept OUT of `searchOpts`: + // that object is the job-scoped search cache's key material (see + // `computeETag` / `getOrPopulate` below), and per-request values would + // make every key unique and defeat the cache. They only need to reach + // `searchRealms` (which stamps the SQL + loadLinks stages onto the same + // collector this handler emits), so they ride on the run-time opts and + // never touch the cache key. + let runSearchOpts = + requestId !== null + ? { + ...(normalizedSearchOpts ?? {}), + requestId, + ...(timings ? { timings } : {}), + } + : normalizedSearchOpts; // `consumingRealm` is read unconditionally — even when the // job-scoped search cache is disabled, `resolveRealmsForFederatedRequest` // uses it to scope CS-11259's self-mount fast-path. The cache gate @@ -116,15 +148,34 @@ export default function handleSearch(opts: { // Lazy-mount inside runSearch so cache hits (304 / cached body) // skip the lazy-mount work entirely. let runSearch = async () => { - let realmInstances = await resolveRealmsForFederatedRequest( - reconciler, - realmList, - { consumingRealm }, - ); - return JSON.stringify( - await searchRealms(realmInstances, cardsQuery, normalizedSearchOpts), - null, - 2, + let resolveRealms = () => + resolveRealmsForFederatedRequest(reconciler, realmList, { + consumingRealm, + }); + let realmInstances = timings + ? await timings.time('resolveRealms', resolveRealms) + : await resolveRealms(); + // `searchRealms` stamps `sql` / `loadLinks` / `populate` / cache stages + // onto `runSearchOpts.timings`; because the handler passed a collector + // it won't emit its own line — this handler emits the complete one. + let doc = await searchRealms(realmInstances, cardsQuery, runSearchOpts); + let stringify = async () => JSON.stringify(doc, null, 2); + return timings ? await timings.time('stringify', stringify) : stringify(); + }; + + // Emit the complete request→response stage breakdown. Called on every + // terminal path that produced a response (skipped only on the parse- + // error early returns above, which never reach here). No-op without a + // correlation id. + let emitTimeline = () => { + if (!timings || requestId === null) { + return; + } + emitSearchTiming( + `req=${requestId}` + + (prerenderJobId ? ` job=${prerenderJobId}` : '') + + ` handler=${Date.now() - handlerStart}ms ` + + timings.toLogFragment(), ); }; @@ -181,6 +232,7 @@ export default function handleSearch(opts: { if (cached !== undefined) { ctxt.status = 304; ctxt.set('ETag', expectedEtag); + emitTimeline(); return; } } @@ -200,6 +252,7 @@ export default function handleSearch(opts: { }, }), ); + emitTimeline(); return; } @@ -210,5 +263,6 @@ export default function handleSearch(opts: { headers: { 'content-type': SupportedMimeType.CardJson }, }), ); + emitTimeline(); }; } diff --git a/packages/realm-server/health-sampler.ts b/packages/realm-server/health-sampler.ts new file mode 100644 index 00000000000..ca9deb1ccc5 --- /dev/null +++ b/packages/realm-server/health-sampler.ts @@ -0,0 +1,65 @@ +import { monitorEventLoopDelay, type IntervalHistogram } from 'node:perf_hooks'; +import { logger } from '@cardstack/runtime-common'; +import { getSearchInFlight } from './search-inflight'; + +// Periodically samples the realm-server process's event-loop health and +// in-flight search count, logging a `realm:health` line whenever there's a +// saturation signal worth capturing. +// +// Why: during a from-scratch index, prerendered cards block in +// `waiting-stability` on `_search` round-trips that the realm-server is slow +// to answer — yet the SQL behind them runs in milliseconds. The missing +// piece is whether the single-threaded realm-server's event loop is starved +// (by the synchronous, CPU-bound post-SQL JSON serialization across many +// concurrent searches) so requests sit unserviced. Event-loop lag rising in +// lockstep with `inFlightSearch` is the fingerprint of exactly that. +// +// `monitorEventLoopDelay` measures the delay between when a timer was +// scheduled and when it actually fired — i.e. how long synchronous work kept +// the loop from turning. Values are nanoseconds. +export interface HealthSamplerOptions { + // How often to sample + maybe log. Defaults to 5s. + intervalMs?: number; + // Only log when peak loop lag in the window exceeds this (or a search is + // in flight). Keeps the line quiet on an idle/healthy server. Defaults to + // 200ms. + lagThresholdMs?: number; +} + +export function startHealthSampler( + opts: HealthSamplerOptions = {}, +): () => void { + let intervalMs = opts.intervalMs ?? 5000; + let lagThresholdMs = opts.lagThresholdMs ?? 200; + // Created here (not at module load) to avoid racing the circular import + // that installs the logger factory; startup calls this well after boot. + let log = logger('realm:health'); + let histogram: IntervalHistogram = monitorEventLoopDelay({ resolution: 20 }); + histogram.enable(); + + let timer = setInterval(() => { + let toMs = (ns: number) => (Number.isFinite(ns) ? ns / 1e6 : 0); + let maxLagMs = toMs(histogram.max); + let meanLagMs = toMs(histogram.mean); + let p99LagMs = toMs(histogram.percentile(99)); + histogram.reset(); + let inFlightSearch = getSearchInFlight(); + // Stay silent when the loop is healthy and nothing is in flight — only + // the saturation windows are interesting. + if (maxLagMs < lagThresholdMs && inFlightSearch === 0) { + return; + } + let heapMB = Math.round(process.memoryUsage().heapUsed / (1024 * 1024)); + log.info( + `eventLoopLagMs(mean/p99/max)=${meanLagMs.toFixed(0)}/${p99LagMs.toFixed(0)}/${maxLagMs.toFixed(0)} ` + + `inFlightSearch=${inFlightSearch} heapMB=${heapMB}`, + ); + }, intervalMs); + // Don't keep the process alive solely for sampling. + timer.unref?.(); + + return () => { + clearInterval(timer); + histogram.disable(); + }; +} diff --git a/packages/realm-server/main.ts b/packages/realm-server/main.ts index abab0bca538..8d8d327d634 100644 --- a/packages/realm-server/main.ts +++ b/packages/realm-server/main.ts @@ -43,6 +43,7 @@ import { ModuleCacheCoordinator } from './lib/module-cache-coordination'; import { JobsFinishedListener } from './lib/jobs-finished-listener'; import { JobScopedSearchCache } from './job-scoped-search-cache'; import { JobScopedInstanceCache } from './job-scoped-instance-cache'; +import { startHealthSampler } from './health-sampler'; import { resolveFullIndexOnStartup } from './lib/full-index-on-startup'; import { PUBLISHED_DIRECTORY_NAME } from '@cardstack/runtime-common'; @@ -391,6 +392,11 @@ const smokeTestHostApp = async () => { // JobsFinishedListener below) and the age-based janitor backstop. let instanceCache = new JobScopedInstanceCache(dbAdapter); instanceCache.startJanitor(); + // Periodic event-loop-lag + in-flight-search sampler. Emits a + // `realm:health` line only during saturation windows, so a stalled + // `_search` can be checked against whether the process's event loop was + // starved at the time. + let stopHealthSampler = startHealthSampler(); let reconciler: RealmRegistryReconciler | undefined; let fileChangesListener: RealmFileChangesListener | undefined; let indexUpdatedListener: RealmIndexUpdatedListener | undefined; @@ -642,6 +648,7 @@ const smokeTestHostApp = async () => { } searchCache.stopJanitor(); instanceCache.stopJanitor(); + stopHealthSampler(); httpServer.close(() => { (async () => { await Promise.all([ diff --git a/packages/realm-server/middleware/index.ts b/packages/realm-server/middleware/index.ts index 49a01db2304..f850aa803cb 100644 --- a/packages/realm-server/middleware/index.ts +++ b/packages/realm-server/middleware/index.ts @@ -4,6 +4,8 @@ import type { ResponseWithNodeStream } from '@cardstack/runtime-common'; import { logger as getLogger, webStreamToText, + sanitizeRequestId, + X_BOXEL_REQUEST_ID_HEADER, } from '@cardstack/runtime-common'; import type Koa from 'koa'; import mime from 'mime-types'; @@ -18,6 +20,16 @@ import { PRERENDER_JOB_ID_HEADER, sanitizePrerenderJobId, } from '../prerender/prerender-constants'; +import { + incrementSearchInFlight, + decrementSearchInFlight, +} from '../search-inflight'; + +// Matches the realm-server's search endpoints (`/_search`, +// `/_search-prerendered`, `/_federated-search`, +// `/_federated-search-prerendered`) so the request middleware can track how +// many searches are in flight for the health sampler. +const SEARCH_PATH_PATTERN = /(^|\/)_(federated-)?search(-prerendered)?$/; const REQUEST_BODY_STATE = 'requestBody'; @@ -156,20 +168,58 @@ export function httpLogging(ctxt: Koa.Context, next: Koa.Next) { // realm-server lines for an indexing job alongside worker lines. let jobId = sanitizePrerenderJobId(ctxt.get(PRERENDER_JOB_ID_HEADER)); let jobTag = jobId ? ` [job: ${jobId}]` : ''; + // Correlation id minted by the client; echoed onto both request log + // lines (and into the response header) so a client-observed slow search + // joins to the realm-server's view of the same request. The matching + // `realm:search-timing` line (emitted by `searchRealms`) is keyed by the + // same value. + let requestId = sanitizeRequestId(ctxt.get(X_BOXEL_REQUEST_ID_HEADER)); + let reqTag = requestId ? ` req=${requestId}` : ''; + if (requestId) { + ctxt.set(X_BOXEL_REQUEST_ID_HEADER, requestId); + } + let startedAt = Date.now(); + + // Track in-flight search load for the health sampler across the request's + // full lifecycle (queue → parse → SQL → serialize → send), which is the + // window during which a saturated event loop would leave it unserviced. + let isSearch = SEARCH_PATH_PATTERN.test(ctxt.path); + let releasedInFlight = false; + let releaseInFlight = () => { + if (releasedInFlight) { + return; + } + releasedInFlight = true; + decrementSearchInFlight(); + }; + if (isSearch) { + incrementSearchInFlight(); + } logger.info( `<-- ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }${jobTag}`, + }${jobTag}${reqTag}`, ); - ctxt.res.on('finish', () => { + let onSettled = () => { + if (isSearch) { + releaseInFlight(); + } logger.info( `--> ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }: ${ctxt.status}${jobTag}`, + }: ${ctxt.status}${jobTag}${reqTag} dur=${Date.now() - startedAt}ms`, ); logger.debug(JSON.stringify(ctxt.req.headers)); + }; + // `finish` fires on a fully-sent response; `close` covers a connection + // torn down before that (so the in-flight count can't leak on aborts). + ctxt.res.on('finish', onSettled); + ctxt.res.on('close', () => { + if (isSearch) { + releaseInFlight(); + } }); return next(); } diff --git a/packages/realm-server/search-inflight.ts b/packages/realm-server/search-inflight.ts new file mode 100644 index 00000000000..79549e5d277 --- /dev/null +++ b/packages/realm-server/search-inflight.ts @@ -0,0 +1,21 @@ +// Count of `_federated-search` requests the realm-server is currently +// handling. Incremented at the search handler's entry and decremented when +// it settles. Read by the health sampler so a spike in concurrent searches +// can be correlated with event-loop lag — the signature of the realm-server +// process being saturated while prerenders wait on in-render `_search` +// round-trips. A plain module-level counter (the realm-server is a single +// process) kept separate from the sampler so the search handler doesn't pull +// in `perf_hooks`. +let inFlight = 0; + +export function incrementSearchInFlight(): void { + inFlight++; +} + +export function decrementSearchInFlight(): void { + inFlight = inFlight > 0 ? inFlight - 1 : 0; +} + +export function getSearchInFlight(): number { + return inFlight; +} diff --git a/packages/runtime-common/index.ts b/packages/runtime-common/index.ts index 23b27c8ce4d..d18b15ae3d2 100644 --- a/packages/runtime-common/index.ts +++ b/packages/runtime-common/index.ts @@ -705,6 +705,7 @@ export * from './resource-types'; export * from './prerender-headers'; export * from './query'; export * from './search-utils'; +export * from './request-timings'; export * from './prerendered-html-format'; export * from './query-field-utils'; export * from './relationship-utils'; diff --git a/packages/runtime-common/prerender-headers.ts b/packages/runtime-common/prerender-headers.ts index e1e8b7dc19d..b72eae1b409 100644 --- a/packages/runtime-common/prerender-headers.ts +++ b/packages/runtime-common/prerender-headers.ts @@ -93,3 +93,27 @@ export function sanitizeJobPriorityHeader( if (!Number.isSafeInteger(n) || n < 0 || n > JOB_PRIORITY_MAX) return null; return n; } + +// Per-search correlation id minted by the host SPA on each +// `_federated-search` fetch it issues while rendering inside a prerender +// tab. The realm-server's search path reads it back out and emits a +// `realm:search-timing` line keyed by it, so a client-observed slow +// search (visible in the prerender's `queryLoadsInFlight` diagnostics) +// can be joined to where the realm-server actually spent the time. Lives +// here so the host SPA can import the header name without depending on +// the realm-server package. +export const X_BOXEL_REQUEST_ID_HEADER = 'x-boxel-request-id'; + +// Sanitize the inbound request-id header. It's echoed into log lines, so +// admit only a bounded run of URL-safe id characters (covers a UUID and +// then some) and reject anything with whitespace or control characters +// that could forge a log line. +const REQUEST_ID_PATTERN = /^[A-Za-z0-9._:-]{1,128}$/; +export function sanitizeRequestId( + raw: string | null | undefined, +): string | null { + if (typeof raw !== 'string') return null; + let trimmed = raw.trim(); + if (!trimmed) return null; + return REQUEST_ID_PATTERN.test(trimmed) ? trimmed : null; +} diff --git a/packages/runtime-common/realm-index-query-engine.ts b/packages/runtime-common/realm-index-query-engine.ts index 336683c9ec1..66db0ae7297 100644 --- a/packages/runtime-common/realm-index-query-engine.ts +++ b/packages/runtime-common/realm-index-query-engine.ts @@ -35,6 +35,7 @@ import type { Realm } from './realm'; import type { VirtualNetwork } from './virtual-network'; import { FILE_META_RESERVED_KEYS } from './realm'; import { RealmPaths } from './paths'; +import type { RequestTimings } from './request-timings'; import type { RealmResourceIdentifier, RealmIdentifier, @@ -168,6 +169,13 @@ type Options = { // for live / external callers, which therefore never read or write the // cache. jobIdentity?: string; + // Per-request wall-clock collector, threaded from `searchRealms` when a + // request carries a correlation id. The post-SQL stages here — the SQL + // query, the `loadLinks` relationship assembly, and the per-instance + // cache reads/writes — stamp their elapsed time on it so the handler can + // attribute the request's server-side time across stages. Absent (and so + // a no-op) for everything except instrumented `_federated-search` calls. + timings?: RequestTimings; } & QueryOptions; type SearchResult = SearchResultDoc | SearchResultError; @@ -376,11 +384,15 @@ export class RealmIndexQueryEngine { meta, }; } else { - let { cards, meta } = await this.#indexQueryEngine.searchCards( - new URL(this.#realm.url), - query, - opts, - ); + let runCardSql = () => + this.#indexQueryEngine.searchCards( + new URL(this.#realm.url), + query, + opts, + ); + let { cards, meta } = opts?.timings + ? await opts.timings.time('sql', runCardSql) + : await runCardSql(); let cardResources = cards.map((resource) => ({ ...resource, ...{ links: { self: resource.id } }, @@ -408,14 +420,18 @@ export class RealmIndexQueryEngine { // Process all root resources together so a single batched DB query // resolves their first-level links (1+1 instead of N+M sequential // round-trips). See CS-11038. - let included = await this.loadLinks( - { - realmURL: this.realmURL, - rootResources: doc.data, - omit, - }, - linkOpts, - ); + let runLoadLinks = () => + this.loadLinks( + { + realmURL: this.realmURL, + rootResources: doc.data, + omit, + }, + linkOpts, + ); + let included = opts?.timings + ? await opts.timings.time('loadLinks', runLoadLinks) + : await runLoadLinks(); if (included.length > 0) { doc.included = included; } @@ -1373,14 +1389,18 @@ export class RealmIndexQueryEngine { : opts?.linkFields ? { ...opts, linkFields: undefined } : opts; + let timings = popOpts?.timings; let cacheKey = this.#instanceCacheKey(resource, popOpts); if (cacheKey) { - let cached = await this.#readCachedRelationships( - cacheKey.jobId, - cacheKey.url, - ); + let ck = cacheKey; + let cached = timings + ? await timings.time('cacheRead', () => + this.#readCachedRelationships(ck.jobId, ck.url), + ) + : await this.#readCachedRelationships(ck.jobId, ck.url); if (cached !== undefined) { recordInstanceCacheEvent(cacheKey.jobId, true); + timings?.incr('cacheHit'); // Hit: reuse this instance's assembled query-field // relationships from an earlier occurrence in the same job, // skipping the definition lookup + field-tree walk. The @@ -1396,28 +1416,43 @@ export class RealmIndexQueryEngine { return; } recordInstanceCacheEvent(cacheKey.jobId, false); + timings?.incr('cacheMiss'); } let storedDefs = ( resource.meta as { queryFieldDefs?: Record; } )?.queryFieldDefs; - if (popOpts?.cacheOnlyDefinitions && storedDefs) { - await this.populateQueryFieldsFromMeta( - resource, - realmURL, - storedDefs, - popOpts, - ); + // The relationship/query-field assembly — the definition lookup + + // field-tree walk — is the post-SQL "wire-format prep" the timeline + // attributes under `populate`. + let runPopulate = () => + popOpts?.cacheOnlyDefinitions && storedDefs + ? this.populateQueryFieldsFromMeta( + resource, + realmURL, + storedDefs, + popOpts, + ) + : this.populateQueryFields(resource, realmURL, popOpts); + if (timings) { + await timings.time('populate', runPopulate); } else { - await this.populateQueryFields(resource, realmURL, popOpts); + await runPopulate(); } if (cacheKey) { - await this.#writeCachedRelationships( - cacheKey.jobId, - cacheKey.url, - (resource as { relationships?: unknown }).relationships, - ); + let ck = cacheKey; + let writeCache = () => + this.#writeCachedRelationships( + ck.jobId, + ck.url, + (resource as { relationships?: unknown }).relationships, + ); + if (timings) { + await timings.time('cacheWrite', writeCache); + } else { + await writeCache(); + } } }), ); diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index c1c5331e1b5..4ed0168c18b 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -5175,6 +5175,7 @@ export class Realm { // `!== undefined` so an explicit priority 0 (system-initiated) survives. ...(opts?.priority !== undefined ? { priority: opts.priority } : {}), ...(opts?.jobIdentity ? { jobIdentity: opts.jobIdentity } : {}), + ...(opts?.timings ? { timings: opts.timings } : {}), }); } diff --git a/packages/runtime-common/request-timings.ts b/packages/runtime-common/request-timings.ts new file mode 100644 index 00000000000..431d9a4e80d --- /dev/null +++ b/packages/runtime-common/request-timings.ts @@ -0,0 +1,64 @@ +// A request-scoped wall-clock collector for the realm-server search path. +// +// One instance is created per instrumented `_search` request in +// `handle-search` and threaded into search opts. Each stage of the +// server-side pipeline that runs *after* the request is received — +// the SQL query, the post-SQL `loadLinks` relationship assembly (the +// per-instance cache reads/writes), and the JSON wire-format +// serialization — stamps its elapsed time here, so the handler can emit +// a single line attributing the request's wall-clock across stages. +// +// It is deliberately plain (no Node-only APIs) so it can live in +// runtime-common alongside the search pipeline it instruments. In +// practice it is only ever instantiated by the realm-server: the host's +// store fetches search results over HTTP and never runs +// `RealmIndexQueryEngine`, so in the browser the threaded collector is +// always `undefined` and every `opts?.timings?.…` call is a no-op. +export class RequestTimings { + #stages = new Map(); + #counters = new Map(); + + // Time an async stage and accumulate its elapsed ms under `stage`. + // Repeated stages (e.g. one `loadLinks` per realm in a federated + // search) sum, so the line reports total time in each stage. + async time(stage: string, fn: () => Promise): Promise { + let start = Date.now(); + try { + return await fn(); + } finally { + this.add(stage, Date.now() - start); + } + } + + add(stage: string, ms: number): void { + this.#stages.set(stage, (this.#stages.get(stage) ?? 0) + ms); + } + + // Integer tallies that aren't durations — result counts, per-instance + // cache hit/miss counts. + incr(counter: string, n = 1): void { + this.#counters.set(counter, (this.#counters.get(counter) ?? 0) + n); + } + + stages(): Record { + let out: Record = {}; + for (let [k, v] of this.#stages) { + out[k] = Math.round(v); + } + return out; + } + + counters(): Record { + return Object.fromEntries(this.#counters); + } + + // Compact fragment for a single log line, e.g. + // `sql=41 loadLinks=120 stringify=210 | results=312 cacheHit=18 cacheMiss=3` + toLogFragment(): string { + let stages = [...this.#stages] + .map(([k, v]) => `${k}=${Math.round(v)}`) + .join(' '); + let counters = [...this.#counters].map(([k, v]) => `${k}=${v}`).join(' '); + return counters ? `${stages} | ${counters}` : stages; + } +} diff --git a/packages/runtime-common/search-utils.ts b/packages/runtime-common/search-utils.ts index b7e17b6d764..30181055c94 100644 --- a/packages/runtime-common/search-utils.ts +++ b/packages/runtime-common/search-utils.ts @@ -1,6 +1,8 @@ import type { RealmResourceIdentifier } from './card-reference-resolver'; +import { logger } from './log'; import { ensureTrailingSlash } from './paths'; import { assertQuery, InvalidQueryError, type Query } from './query'; +import { RequestTimings } from './request-timings'; import { isValidPrerenderedHtmlFormat, PRERENDERED_HTML_FORMATS, @@ -326,6 +328,18 @@ export type SearchOpts = { omitIncluded?: boolean; priority?: number; jobIdentity?: string; + // Correlation id minted by the client (the prerendered host stamps + // `x-boxel-request-id` on its `_federated-search` fetch) and read back + // out by the request handler into opts. When present, `searchRealms` + // instruments the server-side search pipeline and emits one + // `realm:search-timing` line keyed by this id, so a client-observed + // slow search can be joined to where the realm-server spent the time. + requestId?: string; + // Per-request wall-clock collector. `searchRealms` creates it when a + // `requestId` is present and threads it down through `Realm.search` → + // `searchCards` → `loadLinks` so each post-SQL stage stamps its + // elapsed time. Callers never supply this directly. + timings?: RequestTimings; }; type SearchableRealm = { @@ -336,11 +350,47 @@ type SearchableRealm = { url?: string; }; +// Indirection so a host integration test can deterministically capture +// the emitted timing line: loglevel rebinds a logger's methods on every +// `setLevel`, so a test that monkeypatches a direct logger handle would +// race the next `logger('realm:search-timing')` call. A settable sink +// sidesteps that. Defaults to the `realm:search-timing` logger. +let searchTimingSink: ((line: string) => void) | undefined; +let searchTimingLog: ReturnType | undefined; +export function setSearchTimingSinkForTests( + sink: ((line: string) => void) | undefined, +): void { + searchTimingSink = sink; +} +export function emitSearchTiming(line: string): void { + if (searchTimingSink) { + searchTimingSink(line); + return; + } + // Lazy: a module-load `logger()` call races the circular import that + // installs the logger factory. First emission happens well after boot. + (searchTimingLog ??= logger('realm:search-timing')).info(line); +} + export async function searchRealms( realms: Array, query: Query, opts?: SearchOpts, ): Promise { + // Instrument only when the caller threaded a correlation id. The + // prerendered host stamps one; live SPA / API traffic does not — so + // normal traffic allocates no collector and emits no line. + // + // Two callers: the realm-server's `handle-search` threads a collector it + // owns (so it can emit one complete request→response line itself — + // `opts.timings` is set, `ownsTimings` is false, we don't emit), and the + // host-test realm-server mock calls us with just a `requestId` (we create + // the collector and emit the line ourselves, which the host test observes). + let ownsTimings = Boolean(opts?.requestId) && !opts?.timings; + let timings = + opts?.timings ?? (ownsTimings ? new RequestTimings() : undefined); + let perRealmOpts = ownsTimings && opts ? { ...opts, timings } : opts; + let startedAt = ownsTimings ? Date.now() : 0; let realmEntries = realms .filter((realm): realm is SearchableRealm => Boolean(realm)) .map((realm) => ({ @@ -348,7 +398,7 @@ export async function searchRealms( label: realm.url ? String(realm.url) : undefined, })); let searchPromises = realmEntries.map(({ realm }) => - Promise.resolve().then(() => realm.search(query, opts)), + Promise.resolve().then(() => realm.search(query, perRealmOpts)), ); let results = await Promise.allSettled(searchPromises); let queryLabel = '[unserializable query]'; @@ -369,7 +419,20 @@ export async function searchRealms( let docs = results.flatMap((result) => result.status === 'fulfilled' ? [result.value] : [], ); - return combineSearchResults(docs); + let combined = combineSearchResults(docs); + if (timings) { + timings.incr('results', combined.data?.length ?? 0); + } + if (ownsTimings && timings) { + emitSearchTiming( + `req=${opts!.requestId}` + + (opts!.jobIdentity ? ` job=${opts!.jobIdentity}` : '') + + ` realms=${realmEntries.length}` + + ` total=${Date.now() - startedAt}ms ` + + timings.toLogFragment(), + ); + } + return combined; } type PrerenderedSearchableRealm = { From 706ed8b2bde5d7e7330f84c4d05c5b7f80f60e74 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Wed, 3 Jun 2026 16:38:36 -0400 Subject: [PATCH 2/2] Whitelist the correlation header for CORS, rename it, and fix coalesced-search timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CORS: add the correlation header to the realm-server's `Access-Control-Allow-Headers` allowlist. Without it the cross-origin preflight blocks the prerender's `_federated-search` once the host stamps the header. - Rename `x-boxel-request-id` → `x-boxel-logging-correlation-id` (and the internal field/vars to `loggingCorrelationId`, the log token to `corr=`): the old name read too much like the unrelated `x-boxel-client-request-id`, which is realm-event/write correlation, a different concern. - searchCards in-flight coalescing: exclude the per-request `timings` collector from `searchInFlightKey` so it can't perturb the dedup key, and when a follower awaits an already-running identical search, record that wait as `coalescedWait` on the follower's own collector — otherwise its timing line would show no sql/loadLinks and look misleadingly instant under concurrent search load. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../host/app/lib/prerender-fetch-headers.ts | 8 +++---- packages/host/app/services/store.ts | 4 ++-- .../tests/helpers/realm-server-mock/routes.ts | 10 ++++----- .../search-correlation-id-test.gts | 16 +++++++------- .../realm-server/handlers/handle-search.ts | 21 +++++++++++-------- packages/realm-server/middleware/index.ts | 18 +++++++++------- packages/realm-server/server.ts | 2 +- packages/runtime-common/prerender-headers.ts | 5 +++-- .../realm-index-query-engine.ts | 21 +++++++++++++++++-- packages/runtime-common/search-utils.ts | 12 +++++------ 10 files changed, 71 insertions(+), 46 deletions(-) diff --git a/packages/host/app/lib/prerender-fetch-headers.ts b/packages/host/app/lib/prerender-fetch-headers.ts index a4b810acc31..f6e7e9061a3 100644 --- a/packages/host/app/lib/prerender-fetch-headers.ts +++ b/packages/host/app/lib/prerender-fetch-headers.ts @@ -2,7 +2,7 @@ import { DURING_PRERENDER_HEADER, X_BOXEL_CONSUMING_REALM_HEADER, X_BOXEL_JOB_ID_HEADER, - X_BOXEL_REQUEST_ID_HEADER, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, } from '@cardstack/runtime-common'; // Set by the prerender server's `evaluateOnNewDocument` before the @@ -50,19 +50,19 @@ export function jobIdHeader(): Record { // Per-search correlation id. Minted fresh for each `_federated-search` // fetch the SPA issues while rendering inside a prerender tab, and stamped -// as `x-boxel-request-id`. The realm-server reads it back out and keys its +// as `x-boxel-logging-correlation-id`. The realm-server reads it back out and keys its // `realm:search-timing` line on it, so a search the prerender observes as // slow (surfaced in its `queryLoadsInFlight` diagnostics) can be joined to // the realm-server's stage-by-stage view of the same request. Gated on the // prerender context — exactly like the job-id / consuming-realm headers — // so live SPA traffic is unaffected and emits no server-side timing line. -export function requestIdHeader(): Record { +export function loggingCorrelationIdHeader(): Record { let flag = (globalThis as unknown as { __boxelRenderContext?: boolean }) .__boxelRenderContext; if (flag !== true) { return {}; } - return { [X_BOXEL_REQUEST_ID_HEADER]: newCorrelationId() }; + return { [X_BOXEL_LOGGING_CORRELATION_ID_HEADER]: newCorrelationId() }; } function newCorrelationId(): string { diff --git a/packages/host/app/services/store.ts b/packages/host/app/services/store.ts index 1b105f79960..5aa0fbad1e3 100644 --- a/packages/host/app/services/store.ts +++ b/packages/host/app/services/store.ts @@ -89,7 +89,7 @@ import { consumingRealmHeader, duringPrerenderHeaders, jobIdHeader, - requestIdHeader, + loggingCorrelationIdHeader, } from '../lib/prerender-fetch-headers'; import { searchCacheKey } from '../lib/search-cache-key'; import { searchInFlightKey } from '../lib/search-in-flight-key'; @@ -1154,7 +1154,7 @@ export default class StoreService extends Service implements StoreInterface { ...consumingRealmHeader(), ...jobIdHeader(), ...jobPriorityHeader(), - ...requestIdHeader(), + ...loggingCorrelationIdHeader(), }, body: JSON.stringify({ ...query, realms }), }, diff --git a/packages/host/tests/helpers/realm-server-mock/routes.ts b/packages/host/tests/helpers/realm-server-mock/routes.ts index 266e36318ed..0d6445c60d4 100644 --- a/packages/host/tests/helpers/realm-server-mock/routes.ts +++ b/packages/host/tests/helpers/realm-server-mock/routes.ts @@ -7,11 +7,11 @@ import { parseSearchQueryFromPayload, parseSearchRequestPayload, SearchRequestError, - sanitizeRequestId, + sanitizeLoggingCorrelationId, searchPrerenderedRealms, searchRealms, SupportedMimeType, - X_BOXEL_REQUEST_ID_HEADER, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, type RealmInfo, type Query, } from '@cardstack/runtime-common'; @@ -124,13 +124,13 @@ function registerSearchRoutes() { // correlation id off the request and thread it into searchRealms, so // the real `realm:search-timing` line is emitted (and observable by // host integration tests) keyed by the id the client minted. - let requestId = sanitizeRequestId( - req.headers.get(X_BOXEL_REQUEST_ID_HEADER), + let loggingCorrelationId = sanitizeLoggingCorrelationId( + req.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), ); let combined = await searchRealms( realmList.map((realmURL) => getSearchableRealmForURL(realmURL)), cardsQuery, - requestId ? { requestId } : undefined, + loggingCorrelationId ? { loggingCorrelationId } : undefined, ); return new Response(JSON.stringify(combined), { diff --git a/packages/host/tests/integration/search-correlation-id-test.gts b/packages/host/tests/integration/search-correlation-id-test.gts index e215722cdc6..ec378783ae6 100644 --- a/packages/host/tests/integration/search-correlation-id-test.gts +++ b/packages/host/tests/integration/search-correlation-id-test.gts @@ -8,7 +8,7 @@ import { baseRealm, rri, setSearchTimingSinkForTests, - X_BOXEL_REQUEST_ID_HEADER, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, } from '@cardstack/runtime-common'; import type { Loader } from '@cardstack/runtime-common/loader'; import type { Query } from '@cardstack/runtime-common/query'; @@ -26,14 +26,14 @@ import { setupMockMatrix } from '../helpers/mock-matrix'; import { setupRenderingTest } from '../helpers/setup'; // End-to-end coverage for the search correlation id: the in-realm browser -// (the prerendered host SPA) mints `x-boxel-request-id` on its +// (the prerendered host SPA) mints `x-boxel-logging-correlation-id` on its // `_federated-search` fetch, and the realm-server's search path emits a // `realm:search-timing` line keyed by that same id. This proves the id // threads all the way from the client that originated it through to the // server log a triage would join against. // // The host test exercises the *real* code on both ends: the SPA's -// `requestIdHeader()` stamps the header, and the realm-server-mock hands it +// `loggingCorrelationIdHeader()` stamps the header, and the realm-server-mock hands it // to the real `searchRealms`, which emits the line. Only the prerender // context flag is simulated (the host normally sets it inside a prerender // tab). @@ -108,7 +108,7 @@ module('Integration | search correlation id', function (hooks) { let sentRequestIds: string[] = []; let spy = async (request: Request) => { if (new URL(request.url).pathname.endsWith('/_federated-search')) { - let id = request.headers.get(X_BOXEL_REQUEST_ID_HEADER); + let id = request.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER); if (id) { sentRequestIds.push(id); } @@ -139,7 +139,9 @@ module('Integration | search correlation id', function (hooks) { `client-minted correlation id looks well-formed (${sentId})`, ); - let matching = timingLines.filter((line) => line.includes(`req=${sentId}`)); + let matching = timingLines.filter((line) => + line.includes(`corr=${sentId}`), + ); assert.strictEqual( matching.length, 1, @@ -168,7 +170,7 @@ module('Integration | search correlation id', function (hooks) { let spy = async (request: Request) => { if ( new URL(request.url).pathname.endsWith('/_federated-search') && - request.headers.get(X_BOXEL_REQUEST_ID_HEADER) + request.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER) ) { sawHeader = true; } @@ -184,7 +186,7 @@ module('Integration | search correlation id', function (hooks) { assert.strictEqual(results.length, 2, 'the search still returns results'); assert.false( sawHeader, - 'live (non-prerender) traffic sends no x-boxel-request-id header', + 'live (non-prerender) traffic sends no x-boxel-logging-correlation-id header', ); assert.strictEqual( timingLines.length, diff --git a/packages/realm-server/handlers/handle-search.ts b/packages/realm-server/handlers/handle-search.ts index 8a5a7579c63..300f28b6ed6 100644 --- a/packages/realm-server/handlers/handle-search.ts +++ b/packages/realm-server/handlers/handle-search.ts @@ -10,8 +10,8 @@ import { sanitizeConsumingRealmHeader, SearchRequestError, searchRealms, - sanitizeRequestId, - X_BOXEL_REQUEST_ID_HEADER, + sanitizeLoggingCorrelationId, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, RequestTimings, emitSearchTiming, } from '@cardstack/runtime-common'; @@ -47,8 +47,11 @@ export default function handleSearch(opts: { // outermost request→response bound (incl. body read + send) is the // `realm:requests` middleware's `dur=`, keyed by the same id. let handlerStart = Date.now(); - let requestId = sanitizeRequestId(ctxt.get(X_BOXEL_REQUEST_ID_HEADER)); - let timings = requestId !== null ? new RequestTimings() : undefined; + let loggingCorrelationId = sanitizeLoggingCorrelationId( + ctxt.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), + ); + let timings = + loggingCorrelationId !== null ? new RequestTimings() : undefined; let { realmList } = getMultiRealmAuthorization(ctxt); @@ -123,7 +126,7 @@ export default function handleSearch(opts: { if (prerenderJobId) searchOpts.jobIdentity = prerenderJobId; let normalizedSearchOpts = Object.keys(searchOpts).length > 0 ? searchOpts : undefined; - // `requestId` / `timings` are deliberately kept OUT of `searchOpts`: + // `loggingCorrelationId` / `timings` are deliberately kept OUT of `searchOpts`: // that object is the job-scoped search cache's key material (see // `computeETag` / `getOrPopulate` below), and per-request values would // make every key unique and defeat the cache. They only need to reach @@ -131,10 +134,10 @@ export default function handleSearch(opts: { // collector this handler emits), so they ride on the run-time opts and // never touch the cache key. let runSearchOpts = - requestId !== null + loggingCorrelationId !== null ? { ...(normalizedSearchOpts ?? {}), - requestId, + loggingCorrelationId, ...(timings ? { timings } : {}), } : normalizedSearchOpts; @@ -168,11 +171,11 @@ export default function handleSearch(opts: { // error early returns above, which never reach here). No-op without a // correlation id. let emitTimeline = () => { - if (!timings || requestId === null) { + if (!timings || loggingCorrelationId === null) { return; } emitSearchTiming( - `req=${requestId}` + + `corr=${loggingCorrelationId}` + (prerenderJobId ? ` job=${prerenderJobId}` : '') + ` handler=${Date.now() - handlerStart}ms ` + timings.toLogFragment(), diff --git a/packages/realm-server/middleware/index.ts b/packages/realm-server/middleware/index.ts index f850aa803cb..afa72c36eaa 100644 --- a/packages/realm-server/middleware/index.ts +++ b/packages/realm-server/middleware/index.ts @@ -4,8 +4,8 @@ import type { ResponseWithNodeStream } from '@cardstack/runtime-common'; import { logger as getLogger, webStreamToText, - sanitizeRequestId, - X_BOXEL_REQUEST_ID_HEADER, + sanitizeLoggingCorrelationId, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, } from '@cardstack/runtime-common'; import type Koa from 'koa'; import mime from 'mime-types'; @@ -173,10 +173,12 @@ export function httpLogging(ctxt: Koa.Context, next: Koa.Next) { // joins to the realm-server's view of the same request. The matching // `realm:search-timing` line (emitted by `searchRealms`) is keyed by the // same value. - let requestId = sanitizeRequestId(ctxt.get(X_BOXEL_REQUEST_ID_HEADER)); - let reqTag = requestId ? ` req=${requestId}` : ''; - if (requestId) { - ctxt.set(X_BOXEL_REQUEST_ID_HEADER, requestId); + let loggingCorrelationId = sanitizeLoggingCorrelationId( + ctxt.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), + ); + let corrTag = loggingCorrelationId ? ` corr=${loggingCorrelationId}` : ''; + if (loggingCorrelationId) { + ctxt.set(X_BOXEL_LOGGING_CORRELATION_ID_HEADER, loggingCorrelationId); } let startedAt = Date.now(); @@ -199,7 +201,7 @@ export function httpLogging(ctxt: Koa.Context, next: Koa.Next) { logger.info( `<-- ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }${jobTag}${reqTag}`, + }${jobTag}${corrTag}`, ); let onSettled = () => { @@ -209,7 +211,7 @@ export function httpLogging(ctxt: Koa.Context, next: Koa.Next) { logger.info( `--> ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }: ${ctxt.status}${jobTag}${reqTag} dur=${Date.now() - startedAt}ms`, + }: ${ctxt.status}${jobTag}${corrTag} dur=${Date.now() - startedAt}ms`, ); logger.debug(JSON.stringify(ctxt.req.headers)); }; diff --git a/packages/realm-server/server.ts b/packages/realm-server/server.ts index 43f60196ae3..c9fc2aace00 100644 --- a/packages/realm-server/server.ts +++ b/packages/realm-server/server.ts @@ -728,7 +728,7 @@ export class RealmServer { cors({ origin: '*', allowHeaders: - 'Authorization, Content-Type, If-Match, If-None-Match, X-Requested-With, X-Boxel-Client-Request-Id, X-Boxel-Assume-User, X-HTTP-Method-Override, X-Boxel-Disable-Module-Cache, X-Filename, X-Boxel-During-Prerender, X-Boxel-Consuming-Realm, X-Boxel-Job-Id, X-Boxel-Job-Priority, X-Grafana-Device-Id, X-Grafana-Action', + 'Authorization, Content-Type, If-Match, If-None-Match, X-Requested-With, X-Boxel-Client-Request-Id, X-Boxel-Assume-User, X-HTTP-Method-Override, X-Boxel-Disable-Module-Cache, X-Filename, X-Boxel-During-Prerender, X-Boxel-Consuming-Realm, X-Boxel-Job-Id, X-Boxel-Job-Priority, X-Boxel-Logging-Correlation-Id, X-Grafana-Device-Id, X-Grafana-Action', // Without an explicit expose list, @koa/cors only emits the // CORS-safelisted response headers (cache-control, content-*, // expires, last-modified, pragma). ETag is not on that list, diff --git a/packages/runtime-common/prerender-headers.ts b/packages/runtime-common/prerender-headers.ts index b72eae1b409..dacedb8427f 100644 --- a/packages/runtime-common/prerender-headers.ts +++ b/packages/runtime-common/prerender-headers.ts @@ -102,14 +102,15 @@ export function sanitizeJobPriorityHeader( // can be joined to where the realm-server actually spent the time. Lives // here so the host SPA can import the header name without depending on // the realm-server package. -export const X_BOXEL_REQUEST_ID_HEADER = 'x-boxel-request-id'; +export const X_BOXEL_LOGGING_CORRELATION_ID_HEADER = + 'x-boxel-logging-correlation-id'; // Sanitize the inbound request-id header. It's echoed into log lines, so // admit only a bounded run of URL-safe id characters (covers a UUID and // then some) and reject anything with whitespace or control characters // that could forge a log line. const REQUEST_ID_PATTERN = /^[A-Za-z0-9._:-]{1,128}$/; -export function sanitizeRequestId( +export function sanitizeLoggingCorrelationId( raw: string | null | undefined, ): string | null { if (typeof raw !== 'string') return null; diff --git a/packages/runtime-common/realm-index-query-engine.ts b/packages/runtime-common/realm-index-query-engine.ts index 66db0ae7297..3354c924e16 100644 --- a/packages/runtime-common/realm-index-query-engine.ts +++ b/packages/runtime-common/realm-index-query-engine.ts @@ -227,10 +227,19 @@ export function searchInFlightKey( // values inside query/opts — e.g. a `matches: 'a|b'` string — can never // collide with the delimiter and cause unrelated searches to coalesce. try { + // `timings` is a per-request diagnostic collector, not part of the + // result-shaping opts. Exclude it from the key so it can't perturb the + // in-flight coalescing — two otherwise-identical searches must still + // dedupe even though each carries its own collector. + let keyOpts = opts; + if (opts && 'timings' in opts) { + let { timings: _omitTimings, ...rest } = opts; + keyOpts = rest; + } return JSON.stringify([ realmURL, normalizeQueryForSignature(query), - opts ? sortKeysDeep(opts) : null, + keyOpts ? sortKeysDeep(keyOpts) : null, ]); } catch { return undefined; @@ -341,7 +350,15 @@ export class RealmIndexQueryEngine { if (key !== undefined) { let existing = this.#inFlightSearch.get(key); if (existing) { - return await existing; + // A concurrent identical search is already running; this follower + // awaits its result instead of re-running the work. Record that wait + // as `coalescedWait` on the follower's own collector so its + // `realm:search-timing` line reflects the time spent — otherwise the + // follower would show no `sql`/`loadLinks` and look misleadingly + // instant exactly under the concurrent search load we're diagnosing. + return opts?.timings + ? await opts.timings.time('coalescedWait', () => existing) + : await existing; } let pending = this.searchCardsUncoalesced(query, opts).finally(() => { // Identity-check before deletion: a concurrent invalidation path diff --git a/packages/runtime-common/search-utils.ts b/packages/runtime-common/search-utils.ts index 30181055c94..515d15f3a02 100644 --- a/packages/runtime-common/search-utils.ts +++ b/packages/runtime-common/search-utils.ts @@ -329,14 +329,14 @@ export type SearchOpts = { priority?: number; jobIdentity?: string; // Correlation id minted by the client (the prerendered host stamps - // `x-boxel-request-id` on its `_federated-search` fetch) and read back + // `x-boxel-logging-correlation-id` on its `_federated-search` fetch) and read back // out by the request handler into opts. When present, `searchRealms` // instruments the server-side search pipeline and emits one // `realm:search-timing` line keyed by this id, so a client-observed // slow search can be joined to where the realm-server spent the time. - requestId?: string; + loggingCorrelationId?: string; // Per-request wall-clock collector. `searchRealms` creates it when a - // `requestId` is present and threads it down through `Realm.search` → + // `loggingCorrelationId` is present and threads it down through `Realm.search` → // `searchCards` → `loadLinks` so each post-SQL stage stamps its // elapsed time. Callers never supply this directly. timings?: RequestTimings; @@ -384,9 +384,9 @@ export async function searchRealms( // Two callers: the realm-server's `handle-search` threads a collector it // owns (so it can emit one complete request→response line itself — // `opts.timings` is set, `ownsTimings` is false, we don't emit), and the - // host-test realm-server mock calls us with just a `requestId` (we create + // host-test realm-server mock calls us with just a `loggingCorrelationId` (we create // the collector and emit the line ourselves, which the host test observes). - let ownsTimings = Boolean(opts?.requestId) && !opts?.timings; + let ownsTimings = Boolean(opts?.loggingCorrelationId) && !opts?.timings; let timings = opts?.timings ?? (ownsTimings ? new RequestTimings() : undefined); let perRealmOpts = ownsTimings && opts ? { ...opts, timings } : opts; @@ -425,7 +425,7 @@ export async function searchRealms( } if (ownsTimings && timings) { emitSearchTiming( - `req=${opts!.requestId}` + + `corr=${opts!.loggingCorrelationId}` + (opts!.jobIdentity ? ` job=${opts!.jobIdentity}` : '') + ` realms=${realmEntries.length}` + ` total=${Date.now() - startedAt}ms ` +