diff --git a/packages/host/app/lib/prerender-fetch-headers.ts b/packages/host/app/lib/prerender-fetch-headers.ts index a17823bcf68..f6e7e9061a3 100644 --- a/packages/host/app/lib/prerender-fetch-headers.ts +++ b/packages/host/app/lib/prerender-fetch-headers.ts @@ -2,6 +2,7 @@ import { DURING_PRERENDER_HEADER, X_BOXEL_CONSUMING_REALM_HEADER, X_BOXEL_JOB_ID_HEADER, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, } from '@cardstack/runtime-common'; // Set by the prerender server's `evaluateOnNewDocument` before the @@ -46,3 +47,33 @@ export function jobIdHeader(): Record { let j = (globalThis as unknown as { __boxelJobId?: string }).__boxelJobId; return j ? { [X_BOXEL_JOB_ID_HEADER]: j } : {}; } + +// Per-search correlation id. Minted fresh for each `_federated-search` +// fetch the SPA issues while rendering inside a prerender tab, and stamped +// as `x-boxel-logging-correlation-id`. The realm-server reads it back out and keys its +// `realm:search-timing` line on it, so a search the prerender observes as +// slow (surfaced in its `queryLoadsInFlight` diagnostics) can be joined to +// the realm-server's stage-by-stage view of the same request. Gated on the +// prerender context — exactly like the job-id / consuming-realm headers — +// so live SPA traffic is unaffected and emits no server-side timing line. +export function loggingCorrelationIdHeader(): Record { + let flag = (globalThis as unknown as { __boxelRenderContext?: boolean }) + .__boxelRenderContext; + if (flag !== true) { + return {}; + } + return { [X_BOXEL_LOGGING_CORRELATION_ID_HEADER]: newCorrelationId() }; +} + +function newCorrelationId(): string { + let c = (globalThis as { crypto?: { randomUUID?: () => string } }).crypto; + if (c?.randomUUID) { + return c.randomUUID(); + } + // Fallback for the rare environment without `crypto.randomUUID` — the id + // only needs to disambiguate concurrent searches in a log line, not be + // cryptographically strong. + return `r-${Date.now().toString(36)}-${Math.floor( + Math.random() * 1e9, + ).toString(36)}`; +} diff --git a/packages/host/app/services/store.ts b/packages/host/app/services/store.ts index cabb1df446b..5aa0fbad1e3 100644 --- a/packages/host/app/services/store.ts +++ b/packages/host/app/services/store.ts @@ -89,6 +89,7 @@ import { consumingRealmHeader, duringPrerenderHeaders, jobIdHeader, + loggingCorrelationIdHeader, } from '../lib/prerender-fetch-headers'; import { searchCacheKey } from '../lib/search-cache-key'; import { searchInFlightKey } from '../lib/search-in-flight-key'; @@ -1153,6 +1154,7 @@ export default class StoreService extends Service implements StoreInterface { ...consumingRealmHeader(), ...jobIdHeader(), ...jobPriorityHeader(), + ...loggingCorrelationIdHeader(), }, body: JSON.stringify({ ...query, realms }), }, diff --git a/packages/host/tests/helpers/realm-server-mock/routes.ts b/packages/host/tests/helpers/realm-server-mock/routes.ts index 83005da512c..0d6445c60d4 100644 --- a/packages/host/tests/helpers/realm-server-mock/routes.ts +++ b/packages/host/tests/helpers/realm-server-mock/routes.ts @@ -7,9 +7,11 @@ import { parseSearchQueryFromPayload, parseSearchRequestPayload, SearchRequestError, + sanitizeLoggingCorrelationId, searchPrerenderedRealms, searchRealms, SupportedMimeType, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, type RealmInfo, type Query, } from '@cardstack/runtime-common'; @@ -118,9 +120,17 @@ function registerSearchRoutes() { throw e; } + // Mirror the realm-server's `handle-search`: read the client's + // correlation id off the request and thread it into searchRealms, so + // the real `realm:search-timing` line is emitted (and observable by + // host integration tests) keyed by the id the client minted. + let loggingCorrelationId = sanitizeLoggingCorrelationId( + req.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), + ); let combined = await searchRealms( realmList.map((realmURL) => getSearchableRealmForURL(realmURL)), cardsQuery, + loggingCorrelationId ? { loggingCorrelationId } : undefined, ); return new Response(JSON.stringify(combined), { diff --git a/packages/host/tests/integration/search-correlation-id-test.gts b/packages/host/tests/integration/search-correlation-id-test.gts new file mode 100644 index 00000000000..ec378783ae6 --- /dev/null +++ b/packages/host/tests/integration/search-correlation-id-test.gts @@ -0,0 +1,197 @@ +import type { RenderingTestContext } from '@ember/test-helpers'; +import { settled } from '@ember/test-helpers'; + +import { getService } from '@universal-ember/test-support'; +import { module, test } from 'qunit'; + +import { + baseRealm, + rri, + setSearchTimingSinkForTests, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, +} from '@cardstack/runtime-common'; +import type { Loader } from '@cardstack/runtime-common/loader'; +import type { Query } from '@cardstack/runtime-common/query'; + +import type NetworkService from '@cardstack/host/services/network'; +import type StoreService from '@cardstack/host/services/store'; + +import { + testRealmURL, + setupCardLogs, + setupLocalIndexing, + setupIntegrationTestRealm, +} from '../helpers'; +import { setupMockMatrix } from '../helpers/mock-matrix'; +import { setupRenderingTest } from '../helpers/setup'; + +// End-to-end coverage for the search correlation id: the in-realm browser +// (the prerendered host SPA) mints `x-boxel-logging-correlation-id` on its +// `_federated-search` fetch, and the realm-server's search path emits a +// `realm:search-timing` line keyed by that same id. This proves the id +// threads all the way from the client that originated it through to the +// server log a triage would join against. +// +// The host test exercises the *real* code on both ends: the SPA's +// `loggingCorrelationIdHeader()` stamps the header, and the realm-server-mock hands it +// to the real `searchRealms`, which emits the line. Only the prerender +// context flag is simulated (the host normally sets it inside a prerender +// tab). + +const personModule = ` + import { contains, field, CardDef } from 'https://cardstack.com/base/card-api'; + import StringField from 'https://cardstack.com/base/string'; + + export class Person extends CardDef { + static displayName = 'Person'; + @field name = contains(StringField); + } +`; + +let loader: Loader; + +module('Integration | search correlation id', function (hooks) { + setupRenderingTest(hooks); + setupLocalIndexing(hooks); + let mockMatrixUtils = setupMockMatrix(hooks); + + hooks.beforeEach(function (this: RenderingTestContext) { + loader = getService('loader-service').loader; + }); + + setupCardLogs( + hooks, + async () => await loader.import(`${baseRealm.url}card-api`), + ); + + hooks.beforeEach(async function () { + await setupIntegrationTestRealm({ + mockMatrixUtils, + contents: { + 'person.gts': personModule, + 'person-1.json': { + data: { + attributes: { name: 'Alice' }, + meta: { adoptsFrom: { module: './person', name: 'Person' } }, + }, + }, + 'person-2.json': { + data: { + attributes: { name: 'Bob' }, + meta: { adoptsFrom: { module: './person', name: 'Person' } }, + }, + }, + }, + }); + }); + + // Restore globals + sink between tests so a failure can't leak into the + // next test or the rest of the suite. + hooks.afterEach(function () { + delete (globalThis as Record).__boxelRenderContext; + setSearchTimingSinkForTests(undefined); + }); + + const personQuery: Query = { + filter: { type: { module: rri(`${testRealmURL}person`), name: 'Person' } }, + }; + + test('a client-issued search threads its correlation id into the server timing log', async function (assert) { + let store = getService('store') as StoreService; + let network = getService('network') as NetworkService; + + // Capture the realm-server's `realm:search-timing` emissions. + let timingLines: string[] = []; + setSearchTimingSinkForTests((line) => timingLines.push(line)); + + // Capture the correlation id the client actually puts on the wire. + let sentRequestIds: string[] = []; + let spy = async (request: Request) => { + if (new URL(request.url).pathname.endsWith('/_federated-search')) { + let id = request.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER); + if (id) { + sentRequestIds.push(id); + } + } + // Return null to fall through to the realm-server-mock route. + return null; + }; + network.virtualNetwork.mount(spy, { prepend: true }); + + // Simulate the prerender context, which is what gates the host's + // correlation-id stamping (mirrors a card rendering inside a prerender + // tab issuing a query-backed search). + (globalThis as Record).__boxelRenderContext = true; + + let results = await store.search(personQuery, [testRealmURL]); + await settled(); + + assert.strictEqual(results.length, 2, 'the search returned both people'); + + assert.strictEqual( + sentRequestIds.length, + 1, + 'the client stamped exactly one correlation id on its _federated-search fetch', + ); + let sentId = sentRequestIds[0]; + assert.ok( + /^[A-Za-z0-9._:-]{8,}$/.test(sentId), + `client-minted correlation id looks well-formed (${sentId})`, + ); + + let matching = timingLines.filter((line) => + line.includes(`corr=${sentId}`), + ); + assert.strictEqual( + matching.length, + 1, + `the server emitted exactly one realm:search-timing line keyed by the client's id (lines: ${JSON.stringify( + timingLines, + )})`, + ); + assert.ok( + /\bsql=\d+\b/.test(matching[0]), + `the timing line carries the sql stage (${matching[0]})`, + ); + assert.ok( + /\bloadLinks=\d+\b/.test(matching[0]), + `the timing line carries the loadLinks stage (${matching[0]})`, + ); + }); + + test('a non-prerender search stamps no id and emits no timing line', async function (assert) { + let store = getService('store') as StoreService; + let network = getService('network') as NetworkService; + + let timingLines: string[] = []; + setSearchTimingSinkForTests((line) => timingLines.push(line)); + + let sawHeader = false; + let spy = async (request: Request) => { + if ( + new URL(request.url).pathname.endsWith('/_federated-search') && + request.headers.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER) + ) { + sawHeader = true; + } + return null; + }; + network.virtualNetwork.mount(spy, { prepend: true }); + + // No __boxelRenderContext: live SPA traffic must not stamp the header + // (so it pays nothing and the server emits no timing line). + let results = await store.search(personQuery, [testRealmURL]); + await settled(); + + assert.strictEqual(results.length, 2, 'the search still returns results'); + assert.false( + sawHeader, + 'live (non-prerender) traffic sends no x-boxel-logging-correlation-id header', + ); + assert.strictEqual( + timingLines.length, + 0, + 'no realm:search-timing line is emitted without a correlation id', + ); + }); +}); diff --git a/packages/realm-server/handlers/handle-search.ts b/packages/realm-server/handlers/handle-search.ts index 1ec88e752f2..300f28b6ed6 100644 --- a/packages/realm-server/handlers/handle-search.ts +++ b/packages/realm-server/handlers/handle-search.ts @@ -10,6 +10,10 @@ import { sanitizeConsumingRealmHeader, SearchRequestError, searchRealms, + sanitizeLoggingCorrelationId, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, + RequestTimings, + emitSearchTiming, } from '@cardstack/runtime-common'; import { fetchRequestFromContext, @@ -36,16 +40,32 @@ export default function handleSearch(opts: { }): (ctxt: Koa.Context) => Promise { let { reconciler, searchCache } = opts; return async function (ctxt: Koa.Context) { + // Hoof-to-snout server-side timing for one search: from the moment the + // handler is entered through to the response being assembled. Stamped + // only when the client supplied a correlation id (prerender traffic); + // live / external callers allocate nothing and emit no line. The + // outermost request→response bound (incl. body read + send) is the + // `realm:requests` middleware's `dur=`, keyed by the same id. + let handlerStart = Date.now(); + let loggingCorrelationId = sanitizeLoggingCorrelationId( + ctxt.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), + ); + let timings = + loggingCorrelationId !== null ? new RequestTimings() : undefined; + let { realmList } = getMultiRealmAuthorization(ctxt); let cardsQuery; let request = await fetchRequestFromContext(ctxt); try { let payload = getSearchRequestPayload(ctxt); - cardsQuery = + let parseQuery = async () => payload !== undefined ? parseSearchQueryFromPayload(payload) : await parseSearchQueryFromRequest(request); + cardsQuery = timings + ? await timings.time('parse', parseQuery) + : await parseQuery(); } catch (e) { if (e instanceof SearchRequestError) { if (e.code === 'invalid-query') { @@ -106,6 +126,21 @@ export default function handleSearch(opts: { if (prerenderJobId) searchOpts.jobIdentity = prerenderJobId; let normalizedSearchOpts = Object.keys(searchOpts).length > 0 ? searchOpts : undefined; + // `loggingCorrelationId` / `timings` are deliberately kept OUT of `searchOpts`: + // that object is the job-scoped search cache's key material (see + // `computeETag` / `getOrPopulate` below), and per-request values would + // make every key unique and defeat the cache. They only need to reach + // `searchRealms` (which stamps the SQL + loadLinks stages onto the same + // collector this handler emits), so they ride on the run-time opts and + // never touch the cache key. + let runSearchOpts = + loggingCorrelationId !== null + ? { + ...(normalizedSearchOpts ?? {}), + loggingCorrelationId, + ...(timings ? { timings } : {}), + } + : normalizedSearchOpts; // `consumingRealm` is read unconditionally — even when the // job-scoped search cache is disabled, `resolveRealmsForFederatedRequest` // uses it to scope CS-11259's self-mount fast-path. The cache gate @@ -116,15 +151,34 @@ export default function handleSearch(opts: { // Lazy-mount inside runSearch so cache hits (304 / cached body) // skip the lazy-mount work entirely. let runSearch = async () => { - let realmInstances = await resolveRealmsForFederatedRequest( - reconciler, - realmList, - { consumingRealm }, - ); - return JSON.stringify( - await searchRealms(realmInstances, cardsQuery, normalizedSearchOpts), - null, - 2, + let resolveRealms = () => + resolveRealmsForFederatedRequest(reconciler, realmList, { + consumingRealm, + }); + let realmInstances = timings + ? await timings.time('resolveRealms', resolveRealms) + : await resolveRealms(); + // `searchRealms` stamps `sql` / `loadLinks` / `populate` / cache stages + // onto `runSearchOpts.timings`; because the handler passed a collector + // it won't emit its own line — this handler emits the complete one. + let doc = await searchRealms(realmInstances, cardsQuery, runSearchOpts); + let stringify = async () => JSON.stringify(doc, null, 2); + return timings ? await timings.time('stringify', stringify) : stringify(); + }; + + // Emit the complete request→response stage breakdown. Called on every + // terminal path that produced a response (skipped only on the parse- + // error early returns above, which never reach here). No-op without a + // correlation id. + let emitTimeline = () => { + if (!timings || loggingCorrelationId === null) { + return; + } + emitSearchTiming( + `corr=${loggingCorrelationId}` + + (prerenderJobId ? ` job=${prerenderJobId}` : '') + + ` handler=${Date.now() - handlerStart}ms ` + + timings.toLogFragment(), ); }; @@ -181,6 +235,7 @@ export default function handleSearch(opts: { if (cached !== undefined) { ctxt.status = 304; ctxt.set('ETag', expectedEtag); + emitTimeline(); return; } } @@ -200,6 +255,7 @@ export default function handleSearch(opts: { }, }), ); + emitTimeline(); return; } @@ -210,5 +266,6 @@ export default function handleSearch(opts: { headers: { 'content-type': SupportedMimeType.CardJson }, }), ); + emitTimeline(); }; } diff --git a/packages/realm-server/health-sampler.ts b/packages/realm-server/health-sampler.ts new file mode 100644 index 00000000000..ca9deb1ccc5 --- /dev/null +++ b/packages/realm-server/health-sampler.ts @@ -0,0 +1,65 @@ +import { monitorEventLoopDelay, type IntervalHistogram } from 'node:perf_hooks'; +import { logger } from '@cardstack/runtime-common'; +import { getSearchInFlight } from './search-inflight'; + +// Periodically samples the realm-server process's event-loop health and +// in-flight search count, logging a `realm:health` line whenever there's a +// saturation signal worth capturing. +// +// Why: during a from-scratch index, prerendered cards block in +// `waiting-stability` on `_search` round-trips that the realm-server is slow +// to answer — yet the SQL behind them runs in milliseconds. The missing +// piece is whether the single-threaded realm-server's event loop is starved +// (by the synchronous, CPU-bound post-SQL JSON serialization across many +// concurrent searches) so requests sit unserviced. Event-loop lag rising in +// lockstep with `inFlightSearch` is the fingerprint of exactly that. +// +// `monitorEventLoopDelay` measures the delay between when a timer was +// scheduled and when it actually fired — i.e. how long synchronous work kept +// the loop from turning. Values are nanoseconds. +export interface HealthSamplerOptions { + // How often to sample + maybe log. Defaults to 5s. + intervalMs?: number; + // Only log when peak loop lag in the window exceeds this (or a search is + // in flight). Keeps the line quiet on an idle/healthy server. Defaults to + // 200ms. + lagThresholdMs?: number; +} + +export function startHealthSampler( + opts: HealthSamplerOptions = {}, +): () => void { + let intervalMs = opts.intervalMs ?? 5000; + let lagThresholdMs = opts.lagThresholdMs ?? 200; + // Created here (not at module load) to avoid racing the circular import + // that installs the logger factory; startup calls this well after boot. + let log = logger('realm:health'); + let histogram: IntervalHistogram = monitorEventLoopDelay({ resolution: 20 }); + histogram.enable(); + + let timer = setInterval(() => { + let toMs = (ns: number) => (Number.isFinite(ns) ? ns / 1e6 : 0); + let maxLagMs = toMs(histogram.max); + let meanLagMs = toMs(histogram.mean); + let p99LagMs = toMs(histogram.percentile(99)); + histogram.reset(); + let inFlightSearch = getSearchInFlight(); + // Stay silent when the loop is healthy and nothing is in flight — only + // the saturation windows are interesting. + if (maxLagMs < lagThresholdMs && inFlightSearch === 0) { + return; + } + let heapMB = Math.round(process.memoryUsage().heapUsed / (1024 * 1024)); + log.info( + `eventLoopLagMs(mean/p99/max)=${meanLagMs.toFixed(0)}/${p99LagMs.toFixed(0)}/${maxLagMs.toFixed(0)} ` + + `inFlightSearch=${inFlightSearch} heapMB=${heapMB}`, + ); + }, intervalMs); + // Don't keep the process alive solely for sampling. + timer.unref?.(); + + return () => { + clearInterval(timer); + histogram.disable(); + }; +} diff --git a/packages/realm-server/main.ts b/packages/realm-server/main.ts index abab0bca538..8d8d327d634 100644 --- a/packages/realm-server/main.ts +++ b/packages/realm-server/main.ts @@ -43,6 +43,7 @@ import { ModuleCacheCoordinator } from './lib/module-cache-coordination'; import { JobsFinishedListener } from './lib/jobs-finished-listener'; import { JobScopedSearchCache } from './job-scoped-search-cache'; import { JobScopedInstanceCache } from './job-scoped-instance-cache'; +import { startHealthSampler } from './health-sampler'; import { resolveFullIndexOnStartup } from './lib/full-index-on-startup'; import { PUBLISHED_DIRECTORY_NAME } from '@cardstack/runtime-common'; @@ -391,6 +392,11 @@ const smokeTestHostApp = async () => { // JobsFinishedListener below) and the age-based janitor backstop. let instanceCache = new JobScopedInstanceCache(dbAdapter); instanceCache.startJanitor(); + // Periodic event-loop-lag + in-flight-search sampler. Emits a + // `realm:health` line only during saturation windows, so a stalled + // `_search` can be checked against whether the process's event loop was + // starved at the time. + let stopHealthSampler = startHealthSampler(); let reconciler: RealmRegistryReconciler | undefined; let fileChangesListener: RealmFileChangesListener | undefined; let indexUpdatedListener: RealmIndexUpdatedListener | undefined; @@ -642,6 +648,7 @@ const smokeTestHostApp = async () => { } searchCache.stopJanitor(); instanceCache.stopJanitor(); + stopHealthSampler(); httpServer.close(() => { (async () => { await Promise.all([ diff --git a/packages/realm-server/middleware/index.ts b/packages/realm-server/middleware/index.ts index 49a01db2304..afa72c36eaa 100644 --- a/packages/realm-server/middleware/index.ts +++ b/packages/realm-server/middleware/index.ts @@ -4,6 +4,8 @@ import type { ResponseWithNodeStream } from '@cardstack/runtime-common'; import { logger as getLogger, webStreamToText, + sanitizeLoggingCorrelationId, + X_BOXEL_LOGGING_CORRELATION_ID_HEADER, } from '@cardstack/runtime-common'; import type Koa from 'koa'; import mime from 'mime-types'; @@ -18,6 +20,16 @@ import { PRERENDER_JOB_ID_HEADER, sanitizePrerenderJobId, } from '../prerender/prerender-constants'; +import { + incrementSearchInFlight, + decrementSearchInFlight, +} from '../search-inflight'; + +// Matches the realm-server's search endpoints (`/_search`, +// `/_search-prerendered`, `/_federated-search`, +// `/_federated-search-prerendered`) so the request middleware can track how +// many searches are in flight for the health sampler. +const SEARCH_PATH_PATTERN = /(^|\/)_(federated-)?search(-prerendered)?$/; const REQUEST_BODY_STATE = 'requestBody'; @@ -156,20 +168,60 @@ export function httpLogging(ctxt: Koa.Context, next: Koa.Next) { // realm-server lines for an indexing job alongside worker lines. let jobId = sanitizePrerenderJobId(ctxt.get(PRERENDER_JOB_ID_HEADER)); let jobTag = jobId ? ` [job: ${jobId}]` : ''; + // Correlation id minted by the client; echoed onto both request log + // lines (and into the response header) so a client-observed slow search + // joins to the realm-server's view of the same request. The matching + // `realm:search-timing` line (emitted by `searchRealms`) is keyed by the + // same value. + let loggingCorrelationId = sanitizeLoggingCorrelationId( + ctxt.get(X_BOXEL_LOGGING_CORRELATION_ID_HEADER), + ); + let corrTag = loggingCorrelationId ? ` corr=${loggingCorrelationId}` : ''; + if (loggingCorrelationId) { + ctxt.set(X_BOXEL_LOGGING_CORRELATION_ID_HEADER, loggingCorrelationId); + } + let startedAt = Date.now(); + + // Track in-flight search load for the health sampler across the request's + // full lifecycle (queue → parse → SQL → serialize → send), which is the + // window during which a saturated event loop would leave it unserviced. + let isSearch = SEARCH_PATH_PATTERN.test(ctxt.path); + let releasedInFlight = false; + let releaseInFlight = () => { + if (releasedInFlight) { + return; + } + releasedInFlight = true; + decrementSearchInFlight(); + }; + if (isSearch) { + incrementSearchInFlight(); + } logger.info( `<-- ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }${jobTag}`, + }${jobTag}${corrTag}`, ); - ctxt.res.on('finish', () => { + let onSettled = () => { + if (isSearch) { + releaseInFlight(); + } logger.info( `--> ${ctxt.method} ${ctxt.req.headers.accept} ${ fullRequestURL(ctxt).href - }: ${ctxt.status}${jobTag}`, + }: ${ctxt.status}${jobTag}${corrTag} dur=${Date.now() - startedAt}ms`, ); logger.debug(JSON.stringify(ctxt.req.headers)); + }; + // `finish` fires on a fully-sent response; `close` covers a connection + // torn down before that (so the in-flight count can't leak on aborts). + ctxt.res.on('finish', onSettled); + ctxt.res.on('close', () => { + if (isSearch) { + releaseInFlight(); + } }); return next(); } diff --git a/packages/realm-server/search-inflight.ts b/packages/realm-server/search-inflight.ts new file mode 100644 index 00000000000..79549e5d277 --- /dev/null +++ b/packages/realm-server/search-inflight.ts @@ -0,0 +1,21 @@ +// Count of `_federated-search` requests the realm-server is currently +// handling. Incremented at the search handler's entry and decremented when +// it settles. Read by the health sampler so a spike in concurrent searches +// can be correlated with event-loop lag — the signature of the realm-server +// process being saturated while prerenders wait on in-render `_search` +// round-trips. A plain module-level counter (the realm-server is a single +// process) kept separate from the sampler so the search handler doesn't pull +// in `perf_hooks`. +let inFlight = 0; + +export function incrementSearchInFlight(): void { + inFlight++; +} + +export function decrementSearchInFlight(): void { + inFlight = inFlight > 0 ? inFlight - 1 : 0; +} + +export function getSearchInFlight(): number { + return inFlight; +} diff --git a/packages/realm-server/server.ts b/packages/realm-server/server.ts index 43f60196ae3..c9fc2aace00 100644 --- a/packages/realm-server/server.ts +++ b/packages/realm-server/server.ts @@ -728,7 +728,7 @@ export class RealmServer { cors({ origin: '*', allowHeaders: - 'Authorization, Content-Type, If-Match, If-None-Match, X-Requested-With, X-Boxel-Client-Request-Id, X-Boxel-Assume-User, X-HTTP-Method-Override, X-Boxel-Disable-Module-Cache, X-Filename, X-Boxel-During-Prerender, X-Boxel-Consuming-Realm, X-Boxel-Job-Id, X-Boxel-Job-Priority, X-Grafana-Device-Id, X-Grafana-Action', + 'Authorization, Content-Type, If-Match, If-None-Match, X-Requested-With, X-Boxel-Client-Request-Id, X-Boxel-Assume-User, X-HTTP-Method-Override, X-Boxel-Disable-Module-Cache, X-Filename, X-Boxel-During-Prerender, X-Boxel-Consuming-Realm, X-Boxel-Job-Id, X-Boxel-Job-Priority, X-Boxel-Logging-Correlation-Id, X-Grafana-Device-Id, X-Grafana-Action', // Without an explicit expose list, @koa/cors only emits the // CORS-safelisted response headers (cache-control, content-*, // expires, last-modified, pragma). ETag is not on that list, diff --git a/packages/runtime-common/index.ts b/packages/runtime-common/index.ts index 23b27c8ce4d..d18b15ae3d2 100644 --- a/packages/runtime-common/index.ts +++ b/packages/runtime-common/index.ts @@ -705,6 +705,7 @@ export * from './resource-types'; export * from './prerender-headers'; export * from './query'; export * from './search-utils'; +export * from './request-timings'; export * from './prerendered-html-format'; export * from './query-field-utils'; export * from './relationship-utils'; diff --git a/packages/runtime-common/prerender-headers.ts b/packages/runtime-common/prerender-headers.ts index e1e8b7dc19d..dacedb8427f 100644 --- a/packages/runtime-common/prerender-headers.ts +++ b/packages/runtime-common/prerender-headers.ts @@ -93,3 +93,28 @@ export function sanitizeJobPriorityHeader( if (!Number.isSafeInteger(n) || n < 0 || n > JOB_PRIORITY_MAX) return null; return n; } + +// Per-search correlation id minted by the host SPA on each +// `_federated-search` fetch it issues while rendering inside a prerender +// tab. The realm-server's search path reads it back out and emits a +// `realm:search-timing` line keyed by it, so a client-observed slow +// search (visible in the prerender's `queryLoadsInFlight` diagnostics) +// can be joined to where the realm-server actually spent the time. Lives +// here so the host SPA can import the header name without depending on +// the realm-server package. +export const X_BOXEL_LOGGING_CORRELATION_ID_HEADER = + 'x-boxel-logging-correlation-id'; + +// Sanitize the inbound request-id header. It's echoed into log lines, so +// admit only a bounded run of URL-safe id characters (covers a UUID and +// then some) and reject anything with whitespace or control characters +// that could forge a log line. +const REQUEST_ID_PATTERN = /^[A-Za-z0-9._:-]{1,128}$/; +export function sanitizeLoggingCorrelationId( + raw: string | null | undefined, +): string | null { + if (typeof raw !== 'string') return null; + let trimmed = raw.trim(); + if (!trimmed) return null; + return REQUEST_ID_PATTERN.test(trimmed) ? trimmed : null; +} diff --git a/packages/runtime-common/realm-index-query-engine.ts b/packages/runtime-common/realm-index-query-engine.ts index 336683c9ec1..3354c924e16 100644 --- a/packages/runtime-common/realm-index-query-engine.ts +++ b/packages/runtime-common/realm-index-query-engine.ts @@ -35,6 +35,7 @@ import type { Realm } from './realm'; import type { VirtualNetwork } from './virtual-network'; import { FILE_META_RESERVED_KEYS } from './realm'; import { RealmPaths } from './paths'; +import type { RequestTimings } from './request-timings'; import type { RealmResourceIdentifier, RealmIdentifier, @@ -168,6 +169,13 @@ type Options = { // for live / external callers, which therefore never read or write the // cache. jobIdentity?: string; + // Per-request wall-clock collector, threaded from `searchRealms` when a + // request carries a correlation id. The post-SQL stages here — the SQL + // query, the `loadLinks` relationship assembly, and the per-instance + // cache reads/writes — stamp their elapsed time on it so the handler can + // attribute the request's server-side time across stages. Absent (and so + // a no-op) for everything except instrumented `_federated-search` calls. + timings?: RequestTimings; } & QueryOptions; type SearchResult = SearchResultDoc | SearchResultError; @@ -219,10 +227,19 @@ export function searchInFlightKey( // values inside query/opts — e.g. a `matches: 'a|b'` string — can never // collide with the delimiter and cause unrelated searches to coalesce. try { + // `timings` is a per-request diagnostic collector, not part of the + // result-shaping opts. Exclude it from the key so it can't perturb the + // in-flight coalescing — two otherwise-identical searches must still + // dedupe even though each carries its own collector. + let keyOpts = opts; + if (opts && 'timings' in opts) { + let { timings: _omitTimings, ...rest } = opts; + keyOpts = rest; + } return JSON.stringify([ realmURL, normalizeQueryForSignature(query), - opts ? sortKeysDeep(opts) : null, + keyOpts ? sortKeysDeep(keyOpts) : null, ]); } catch { return undefined; @@ -333,7 +350,15 @@ export class RealmIndexQueryEngine { if (key !== undefined) { let existing = this.#inFlightSearch.get(key); if (existing) { - return await existing; + // A concurrent identical search is already running; this follower + // awaits its result instead of re-running the work. Record that wait + // as `coalescedWait` on the follower's own collector so its + // `realm:search-timing` line reflects the time spent — otherwise the + // follower would show no `sql`/`loadLinks` and look misleadingly + // instant exactly under the concurrent search load we're diagnosing. + return opts?.timings + ? await opts.timings.time('coalescedWait', () => existing) + : await existing; } let pending = this.searchCardsUncoalesced(query, opts).finally(() => { // Identity-check before deletion: a concurrent invalidation path @@ -376,11 +401,15 @@ export class RealmIndexQueryEngine { meta, }; } else { - let { cards, meta } = await this.#indexQueryEngine.searchCards( - new URL(this.#realm.url), - query, - opts, - ); + let runCardSql = () => + this.#indexQueryEngine.searchCards( + new URL(this.#realm.url), + query, + opts, + ); + let { cards, meta } = opts?.timings + ? await opts.timings.time('sql', runCardSql) + : await runCardSql(); let cardResources = cards.map((resource) => ({ ...resource, ...{ links: { self: resource.id } }, @@ -408,14 +437,18 @@ export class RealmIndexQueryEngine { // Process all root resources together so a single batched DB query // resolves their first-level links (1+1 instead of N+M sequential // round-trips). See CS-11038. - let included = await this.loadLinks( - { - realmURL: this.realmURL, - rootResources: doc.data, - omit, - }, - linkOpts, - ); + let runLoadLinks = () => + this.loadLinks( + { + realmURL: this.realmURL, + rootResources: doc.data, + omit, + }, + linkOpts, + ); + let included = opts?.timings + ? await opts.timings.time('loadLinks', runLoadLinks) + : await runLoadLinks(); if (included.length > 0) { doc.included = included; } @@ -1373,14 +1406,18 @@ export class RealmIndexQueryEngine { : opts?.linkFields ? { ...opts, linkFields: undefined } : opts; + let timings = popOpts?.timings; let cacheKey = this.#instanceCacheKey(resource, popOpts); if (cacheKey) { - let cached = await this.#readCachedRelationships( - cacheKey.jobId, - cacheKey.url, - ); + let ck = cacheKey; + let cached = timings + ? await timings.time('cacheRead', () => + this.#readCachedRelationships(ck.jobId, ck.url), + ) + : await this.#readCachedRelationships(ck.jobId, ck.url); if (cached !== undefined) { recordInstanceCacheEvent(cacheKey.jobId, true); + timings?.incr('cacheHit'); // Hit: reuse this instance's assembled query-field // relationships from an earlier occurrence in the same job, // skipping the definition lookup + field-tree walk. The @@ -1396,28 +1433,43 @@ export class RealmIndexQueryEngine { return; } recordInstanceCacheEvent(cacheKey.jobId, false); + timings?.incr('cacheMiss'); } let storedDefs = ( resource.meta as { queryFieldDefs?: Record; } )?.queryFieldDefs; - if (popOpts?.cacheOnlyDefinitions && storedDefs) { - await this.populateQueryFieldsFromMeta( - resource, - realmURL, - storedDefs, - popOpts, - ); + // The relationship/query-field assembly — the definition lookup + + // field-tree walk — is the post-SQL "wire-format prep" the timeline + // attributes under `populate`. + let runPopulate = () => + popOpts?.cacheOnlyDefinitions && storedDefs + ? this.populateQueryFieldsFromMeta( + resource, + realmURL, + storedDefs, + popOpts, + ) + : this.populateQueryFields(resource, realmURL, popOpts); + if (timings) { + await timings.time('populate', runPopulate); } else { - await this.populateQueryFields(resource, realmURL, popOpts); + await runPopulate(); } if (cacheKey) { - await this.#writeCachedRelationships( - cacheKey.jobId, - cacheKey.url, - (resource as { relationships?: unknown }).relationships, - ); + let ck = cacheKey; + let writeCache = () => + this.#writeCachedRelationships( + ck.jobId, + ck.url, + (resource as { relationships?: unknown }).relationships, + ); + if (timings) { + await timings.time('cacheWrite', writeCache); + } else { + await writeCache(); + } } }), ); diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index c1c5331e1b5..4ed0168c18b 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -5175,6 +5175,7 @@ export class Realm { // `!== undefined` so an explicit priority 0 (system-initiated) survives. ...(opts?.priority !== undefined ? { priority: opts.priority } : {}), ...(opts?.jobIdentity ? { jobIdentity: opts.jobIdentity } : {}), + ...(opts?.timings ? { timings: opts.timings } : {}), }); } diff --git a/packages/runtime-common/request-timings.ts b/packages/runtime-common/request-timings.ts new file mode 100644 index 00000000000..431d9a4e80d --- /dev/null +++ b/packages/runtime-common/request-timings.ts @@ -0,0 +1,64 @@ +// A request-scoped wall-clock collector for the realm-server search path. +// +// One instance is created per instrumented `_search` request in +// `handle-search` and threaded into search opts. Each stage of the +// server-side pipeline that runs *after* the request is received — +// the SQL query, the post-SQL `loadLinks` relationship assembly (the +// per-instance cache reads/writes), and the JSON wire-format +// serialization — stamps its elapsed time here, so the handler can emit +// a single line attributing the request's wall-clock across stages. +// +// It is deliberately plain (no Node-only APIs) so it can live in +// runtime-common alongside the search pipeline it instruments. In +// practice it is only ever instantiated by the realm-server: the host's +// store fetches search results over HTTP and never runs +// `RealmIndexQueryEngine`, so in the browser the threaded collector is +// always `undefined` and every `opts?.timings?.…` call is a no-op. +export class RequestTimings { + #stages = new Map(); + #counters = new Map(); + + // Time an async stage and accumulate its elapsed ms under `stage`. + // Repeated stages (e.g. one `loadLinks` per realm in a federated + // search) sum, so the line reports total time in each stage. + async time(stage: string, fn: () => Promise): Promise { + let start = Date.now(); + try { + return await fn(); + } finally { + this.add(stage, Date.now() - start); + } + } + + add(stage: string, ms: number): void { + this.#stages.set(stage, (this.#stages.get(stage) ?? 0) + ms); + } + + // Integer tallies that aren't durations — result counts, per-instance + // cache hit/miss counts. + incr(counter: string, n = 1): void { + this.#counters.set(counter, (this.#counters.get(counter) ?? 0) + n); + } + + stages(): Record { + let out: Record = {}; + for (let [k, v] of this.#stages) { + out[k] = Math.round(v); + } + return out; + } + + counters(): Record { + return Object.fromEntries(this.#counters); + } + + // Compact fragment for a single log line, e.g. + // `sql=41 loadLinks=120 stringify=210 | results=312 cacheHit=18 cacheMiss=3` + toLogFragment(): string { + let stages = [...this.#stages] + .map(([k, v]) => `${k}=${Math.round(v)}`) + .join(' '); + let counters = [...this.#counters].map(([k, v]) => `${k}=${v}`).join(' '); + return counters ? `${stages} | ${counters}` : stages; + } +} diff --git a/packages/runtime-common/search-utils.ts b/packages/runtime-common/search-utils.ts index b7e17b6d764..515d15f3a02 100644 --- a/packages/runtime-common/search-utils.ts +++ b/packages/runtime-common/search-utils.ts @@ -1,6 +1,8 @@ import type { RealmResourceIdentifier } from './card-reference-resolver'; +import { logger } from './log'; import { ensureTrailingSlash } from './paths'; import { assertQuery, InvalidQueryError, type Query } from './query'; +import { RequestTimings } from './request-timings'; import { isValidPrerenderedHtmlFormat, PRERENDERED_HTML_FORMATS, @@ -326,6 +328,18 @@ export type SearchOpts = { omitIncluded?: boolean; priority?: number; jobIdentity?: string; + // Correlation id minted by the client (the prerendered host stamps + // `x-boxel-logging-correlation-id` on its `_federated-search` fetch) and read back + // out by the request handler into opts. When present, `searchRealms` + // instruments the server-side search pipeline and emits one + // `realm:search-timing` line keyed by this id, so a client-observed + // slow search can be joined to where the realm-server spent the time. + loggingCorrelationId?: string; + // Per-request wall-clock collector. `searchRealms` creates it when a + // `loggingCorrelationId` is present and threads it down through `Realm.search` → + // `searchCards` → `loadLinks` so each post-SQL stage stamps its + // elapsed time. Callers never supply this directly. + timings?: RequestTimings; }; type SearchableRealm = { @@ -336,11 +350,47 @@ type SearchableRealm = { url?: string; }; +// Indirection so a host integration test can deterministically capture +// the emitted timing line: loglevel rebinds a logger's methods on every +// `setLevel`, so a test that monkeypatches a direct logger handle would +// race the next `logger('realm:search-timing')` call. A settable sink +// sidesteps that. Defaults to the `realm:search-timing` logger. +let searchTimingSink: ((line: string) => void) | undefined; +let searchTimingLog: ReturnType | undefined; +export function setSearchTimingSinkForTests( + sink: ((line: string) => void) | undefined, +): void { + searchTimingSink = sink; +} +export function emitSearchTiming(line: string): void { + if (searchTimingSink) { + searchTimingSink(line); + return; + } + // Lazy: a module-load `logger()` call races the circular import that + // installs the logger factory. First emission happens well after boot. + (searchTimingLog ??= logger('realm:search-timing')).info(line); +} + export async function searchRealms( realms: Array, query: Query, opts?: SearchOpts, ): Promise { + // Instrument only when the caller threaded a correlation id. The + // prerendered host stamps one; live SPA / API traffic does not — so + // normal traffic allocates no collector and emits no line. + // + // Two callers: the realm-server's `handle-search` threads a collector it + // owns (so it can emit one complete request→response line itself — + // `opts.timings` is set, `ownsTimings` is false, we don't emit), and the + // host-test realm-server mock calls us with just a `loggingCorrelationId` (we create + // the collector and emit the line ourselves, which the host test observes). + let ownsTimings = Boolean(opts?.loggingCorrelationId) && !opts?.timings; + let timings = + opts?.timings ?? (ownsTimings ? new RequestTimings() : undefined); + let perRealmOpts = ownsTimings && opts ? { ...opts, timings } : opts; + let startedAt = ownsTimings ? Date.now() : 0; let realmEntries = realms .filter((realm): realm is SearchableRealm => Boolean(realm)) .map((realm) => ({ @@ -348,7 +398,7 @@ export async function searchRealms( label: realm.url ? String(realm.url) : undefined, })); let searchPromises = realmEntries.map(({ realm }) => - Promise.resolve().then(() => realm.search(query, opts)), + Promise.resolve().then(() => realm.search(query, perRealmOpts)), ); let results = await Promise.allSettled(searchPromises); let queryLabel = '[unserializable query]'; @@ -369,7 +419,20 @@ export async function searchRealms( let docs = results.flatMap((result) => result.status === 'fulfilled' ? [result.value] : [], ); - return combineSearchResults(docs); + let combined = combineSearchResults(docs); + if (timings) { + timings.incr('results', combined.data?.length ?? 0); + } + if (ownsTimings && timings) { + emitSearchTiming( + `corr=${opts!.loggingCorrelationId}` + + (opts!.jobIdentity ? ` job=${opts!.jobIdentity}` : '') + + ` realms=${realmEntries.length}` + + ` total=${Date.now() - startedAt}ms ` + + timings.toLogFragment(), + ); + } + return combined; } type PrerenderedSearchableRealm = {