Skip to content

Commit 187c047

Browse files
authored
perf(webapp): shrink run trace loader payload and add trace span cap controls (#3906)
## Summary

The run trace page loader serialized every span's raw OTel events (with full properties) into the response, even though the tree UI only renders the derived `timelineEvents` and the span detail panel refetches what it needs. On event-heavy traces that inflated both the loader payload and the server-side heap copies built per request.

This PR keeps raw span events server-side and pairs that with a few related trace-view improvements:

- A new optional `TRACE_VIEW_EMERGENCY_SPAN_CAP` env var (unset by default) clamps the trace summary and detailed trace summary span limits on both event store paths, including the public run trace endpoint, so operators can bound trace query sizes in one place without retuning the per-store limits.
- The TreeView virtualizer resolved every rendered row with a linear scan over the whole tree (and `getNodeProps` did the same via `findIndex`); rows now resolve through memoized id lookup maps, which matters once traces reach tens of thousands of spans.
- The run stream SSE lookup now applies the same organization membership scoping as the rest of the run page presenters, for consistency.

Behavior is unchanged by default: the trace tree renders from the same `timelineEvents` it always has, and the new cap only takes effect when set.
1 parent 2b6d249 commit 187c047

12 files changed

Lines changed: 114 additions & 22 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
area: webapp
3+
type: improvement
4+
---
5+
6+
Shrinks the run trace page loader payload by keeping raw span events server-side and makes large trace trees render more efficiently. Also adds an optional `TRACE_VIEW_EMERGENCY_SPAN_CAP` env var that clamps trace summary and detailed summary span limits on both event store paths.

apps/webapp/app/components/primitives/TreeView/TreeView.tsx

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
import { VirtualItem, Virtualizer, useVirtualizer } from "@tanstack/react-virtual";
22
import { motion } from "framer-motion";
3-
import { MutableRefObject, RefObject, useCallback, useEffect, useReducer, useRef } from "react";
3+
import {
4+
MutableRefObject,
5+
RefObject,
6+
useCallback,
7+
useEffect,
8+
useMemo,
9+
useReducer,
10+
useRef,
11+
} from "react";
412
import { cn } from "~/utils/cn";
513
import { NodeState, NodesState, reducer } from "./reducer";
614
import { concreteStateFromInput, selectedIdFromState } from "./utils";
@@ -47,6 +55,16 @@ export function TreeView<TData>({
4755

4856
const virtualItems = virtualizer.getVirtualItems();
4957

58+
// id -> node lookup so each virtual row resolves in O(1) instead of
59+
// scanning the whole tree per row.
60+
const nodesById = useMemo(() => {
61+
const map = new Map<string, FlatTreeItem<TData>>();
62+
for (const node of tree) {
63+
map.set(node.id, node);
64+
}
65+
return map;
66+
}, [tree]);
67+
5068
const scrollCallback = useCallback(
5169
(event: Event) => {
5270
if (!onScroll) return;
@@ -99,7 +117,7 @@ export function TreeView<TData>({
99117
}}
100118
>
101119
{virtualItems.map((virtualItem) => {
102-
const node = tree.find((node) => node.id === virtualItem.key);
120+
const node = nodesById.get(virtualItem.key as string);
103121
if (!node) return null;
104122
const state = nodes[node.id];
105123
if (!state) return null;
@@ -197,6 +215,16 @@ export function useTree<TData, TFilterValue>({
197215
concreteStateFromInput({ tree, selectedId, collapsedIds, filter })
198216
);
199217

218+
// id -> index lookup so getNodeProps resolves in O(1) instead of scanning
219+
// the whole tree per rendered row.
220+
const treeIndexById = useMemo(() => {
221+
const map = new Map<string, number>();
222+
tree.forEach((node, index) => {
223+
map.set(node.id, index);
224+
});
225+
return map;
226+
}, [tree]);
227+
200228
//sync external selectedId prop into internal state
201229
useEffect(() => {
202230
const internalSelectedId = selectedIdFromState(state.nodes);
@@ -497,7 +525,7 @@ export function useTree<TData, TFilterValue>({
497525
(id: string) => {
498526
const node = state.nodes[id];
499527
if (!node) return {};
500-
const treeItemIndex = tree.findIndex((node) => node.id === id);
528+
const treeItemIndex = treeIndexById.get(id) ?? -1;
501529
const treeItem = tree[treeItemIndex];
502530
return {
503531
"aria-expanded": node.expanded,
@@ -506,7 +534,7 @@ export function useTree<TData, TFilterValue>({
506534
tabIndex: node.selected ? -1 : undefined,
507535
};
508536
},
509-
[state]
537+
[state, treeIndexById]
510538
);
511539

512540
return {

apps/webapp/app/env.server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,9 @@ const EnvironmentSchema = z
726726
MAXIMUM_LIVE_RELOADING_EVENTS: z.coerce.number().int().default(1000),
727727
MAXIMUM_TRACE_SUMMARY_VIEW_COUNT: z.coerce.number().int().default(25_000),
728728
MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT: z.coerce.number().int().default(10_000),
729+
// Emergency circuit breaker: when set, clamps the trace summary and detailed
730+
// summary span limits on both event store paths to this value. Unset = disabled.
731+
TRACE_VIEW_EMERGENCY_SPAN_CAP: z.coerce.number().int().positive().optional(),
729732
TASK_PAYLOAD_OFFLOAD_THRESHOLD: z.coerce.number().int().default(524_288), // 512KB
730733
BATCH_PAYLOAD_OFFLOAD_THRESHOLD: z.coerce.number().int().optional(), // Defaults to TASK_PAYLOAD_OFFLOAD_THRESHOLD if not set
731734
TASK_PAYLOAD_MAXIMUM_SIZE: z.coerce.number().int().default(3_145_728), // 3MB

apps/webapp/app/presenters/v3/RunPresenter.server.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,16 @@ export class RunPresenter {
255255
linkedRunIdBySpanId[n.id] = n.runId;
256256
}
257257

258+
// Raw span events are only needed server-side (to derive timelineEvents);
259+
// keep them out of the serialized loader payload.
260+
const { events: spanEvents, ...data } = n.data;
261+
258262
return {
259263
...n,
260264
data: {
261-
...n.data,
265+
...data,
262266
timelineEvents: createTimelineSpanEventsFromSpanEvents(
263-
n.data.events,
267+
spanEvents,
264268
user?.admin ?? false,
265269
treeRootStartTimeMs
266270
),

apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { type PrismaClient, prisma } from "~/db.server";
22
import { logger } from "~/services/logger.server";
3+
import { requireUserId } from "~/services/session.server";
34
import { singleton } from "~/utils/singleton";
45
import { ABORT_REASON_SEND_ERROR, createSSELoader, SendFunction } from "~/utils/sse";
56
import { throttle } from "~/utils/throttle";
@@ -30,9 +31,23 @@ export class RunStreamPresenter {
3031
throw new Response("Missing runParam", { status: 400 });
3132
}
3233

34+
const userId = await requireUserId(context.request);
35+
36+
// Scope the lookup to organizations the requesting user is a member
37+
// of, matching RunPresenter's run lookup. Unauthorized and missing
38+
// runs are indistinguishable (both 404).
3339
const run = await prismaClient.taskRun.findFirst({
3440
where: {
3541
friendlyId: runFriendlyId,
42+
project: {
43+
organization: {
44+
members: {
45+
some: {
46+
userId,
47+
},
48+
},
49+
},
50+
},
3651
},
3752
select: {
3853
traceId: true,
@@ -51,7 +66,15 @@ export class RunStreamPresenter {
5166
if (buffer) {
5267
try {
5368
const entry = await buffer.getEntry(runFriendlyId);
54-
if (entry) {
69+
// Same membership scoping as the PG lookup above — the buffer
70+
// entry carries the owning org's id.
71+
const isMember = entry
72+
? (await prismaClient.orgMember.findFirst({
73+
where: { organizationId: entry.orgId, userId },
74+
select: { id: true },
75+
})) !== null
76+
: false;
77+
if (entry && isMember) {
5578
// Go through the webapp wrapper so this read-side module
5679
// shares a single deserialisation path with readFallback —
5780
// see the contract comment in syntheticRedirectInfo.server.ts.

apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { ClickHouse } from "@internal/clickhouse";
22
import { createHash } from "crypto";
33
import { ClickhouseEventRepository } from "~/v3/eventRepository/clickhouseEventRepository.server";
44
import { env } from "~/env.server";
5+
import { clampToEmergencySpanCap } from "~/v3/eventRepository/emergencySpanCap.server";
56
import { singleton } from "~/utils/singleton";
67
import type { OrganizationDataStoresRegistry } from "~/services/dataStores/organizationDataStoresRegistry.server";
78
import { type IEventRepository } from "~/v3/eventRepository/eventRepository.types";
@@ -533,9 +534,12 @@ function buildEventRepository(store: string, clickhouse: ClickHouse): Clickhouse
533534
clickhouse,
534535
batchSize: env.EVENTS_CLICKHOUSE_BATCH_SIZE,
535536
flushInterval: env.EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS,
536-
maximumTraceSummaryViewCount: env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT,
537-
maximumTraceDetailedSummaryViewCount:
538-
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT,
537+
maximumTraceSummaryViewCount: clampToEmergencySpanCap(
538+
env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT
539+
),
540+
maximumTraceDetailedSummaryViewCount: clampToEmergencySpanCap(
541+
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT
542+
),
539543
maximumLiveReloadingSetting: env.EVENTS_CLICKHOUSE_MAX_LIVE_RELOADING_SETTING,
540544
insertStrategy: env.EVENTS_CLICKHOUSE_INSERT_STRATEGY,
541545
waitForAsyncInsert: env.EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT === "1",
@@ -557,9 +561,12 @@ function buildEventRepository(store: string, clickhouse: ClickHouse): Clickhouse
557561
clickhouse: clickhouse,
558562
batchSize: env.EVENTS_CLICKHOUSE_BATCH_SIZE,
559563
flushInterval: env.EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS,
560-
maximumTraceSummaryViewCount: env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT,
561-
maximumTraceDetailedSummaryViewCount:
562-
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT,
564+
maximumTraceSummaryViewCount: clampToEmergencySpanCap(
565+
env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT
566+
),
567+
maximumTraceDetailedSummaryViewCount: clampToEmergencySpanCap(
568+
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT
569+
),
563570
maximumLiveReloadingSetting: env.EVENTS_CLICKHOUSE_MAX_LIVE_RELOADING_SETTING,
564571
insertStrategy: env.EVENTS_CLICKHOUSE_INSERT_STRATEGY,
565572
waitForAsyncInsert: env.EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT === "1",

apps/webapp/app/utils/timelineSpanEvents.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ export function createTimelineSpanEventsFromSpanEvents(
117117
offset,
118118
timestamp,
119119
duration,
120-
properties: spanEvent.properties,
121120
helpText: getHelpTextForEvent(name),
122121
markerVariant,
123122
lineVariant: "light" as const,
apps/webapp/app/v3/eventRepository/emergencySpanCap.server.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { env } from "~/env.server";
2+
3+
// Emergency circuit breaker for trace views: when TRACE_VIEW_EMERGENCY_SPAN_CAP
4+
// is set, clamp a trace summary span limit to it. Unset = no clamping.
5+
export function clampToEmergencySpanCap(limit: number): number {
6+
const cap = env.TRACE_VIEW_EMERGENCY_SPAN_CAP;
7+
return cap === undefined ? limit : Math.min(limit, cap);
8+
}

apps/webapp/app/v3/mollifier/syntheticTrace.server.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,14 @@ export function buildSyntheticTraceForBufferedRun(run: SyntheticRun) {
4444
const offset = millisecondsToNanoseconds(
4545
n.data.startTime.getTime() - treeRootStartTimeMs
4646
);
47+
// Mirror RunPresenter: raw span events stay server-side, only
48+
// timelineEvents ship to the client.
49+
const { events: spanEvents, ...data } = n.data;
4750
return {
4851
...n,
4952
data: {
50-
...n.data,
51-
timelineEvents: createTimelineSpanEventsFromSpanEvents(n.data.events, false, treeRootStartTimeMs),
53+
...data,
54+
timelineEvents: createTimelineSpanEventsFromSpanEvents(spanEvents, false, treeRootStartTimeMs),
5255
duration: n.data.isPartial ? null : n.data.duration,
5356
offset,
5457
isRoot: n.id === spanId,

apps/webapp/app/v3/taskEventStore.server.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import { Prisma, TaskEvent } from "@trigger.dev/database";
33
import type { PrismaClient, PrismaReplicaClient } from "~/db.server";
44
import { env } from "~/env.server";
5+
import { clampToEmergencySpanCap } from "~/v3/eventRepository/emergencySpanCap.server";
56

67
export type CommonTaskEvent = Omit<TaskEvent, "id">;
78
export type TraceEvent = Pick<
@@ -192,7 +193,7 @@ export class TaskEventStore {
192193
: Prisma.empty
193194
}
194195
ORDER BY "startTime" ASC
195-
LIMIT ${env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT}
196+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT)}
196197
`;
197198
} else {
198199
return await this.readReplica.$queryRaw<TraceEvent[]>`
@@ -220,7 +221,7 @@ export class TaskEventStore {
220221
: Prisma.empty
221222
}
222223
ORDER BY "startTime" ASC
223-
LIMIT ${env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT}
224+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT)}
224225
`;
225226
}
226227
}
@@ -270,7 +271,7 @@ export class TaskEventStore {
270271
: Prisma.empty
271272
}
272273
ORDER BY "startTime" ASC
273-
LIMIT ${env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT}
274+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT)}
274275
`;
275276
} else {
276277
return await this.readReplica.$queryRaw<DetailedTraceEvent[]>`
@@ -299,7 +300,7 @@ export class TaskEventStore {
299300
: Prisma.empty
300301
}
301302
ORDER BY "startTime" ASC
302-
LIMIT ${env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT}
303+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT)}
303304
`;
304305
}
305306
}

0 commit comments

Comments
 (0)