Skip to content

Commit e08d939

Browse files
committed
fix(webapp): cap AI SDK OTel attribute size so ClickHouse JSON parse doesn't drop spans
Vercel AI SDK spans carry tens of KB of prompt/response content per attribute, producing an assembled attributes JSON that ClickHouse rejects with "Cannot parse JSON object" and silently drops the whole batch. - Add otlpAttributeLimits module with per-key overrides for ai.* / gen_ai.* content keys (tighter 1KB cap) plus a 32KB total-attributes backstop that drops AI content keys in priority order when exceeded. - Wire SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT / SERVER_OTEL_AI_CONTENT_ATTRIBUTE_VALUE_LENGTH_LIMIT / SERVER_OTEL_SPAN_TOTAL_ATTRIBUTES_LENGTH_LIMIT env vars through the OTLP exporter for spans and logs. - DynamicFlushScheduler now recognises ClickHouse JSON parse errors and recursively splits the failing batch (up to depth 4) to isolate the bad row instead of poisoning the whole 5-10k-row batch, with a droppedRows metric for rows that can't be isolated further.
1 parent 8ba067d commit e08d939

6 files changed

Lines changed: 565 additions & 96 deletions

File tree

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
---
2+
area: webapp
3+
type: fix
4+
---
5+
6+
Tighten OTel span attribute truncation for Vercel AI SDK content keys
7+
(`ai.prompt*`, `ai.response.text/object/toolCalls/reasoning*`,
8+
`gen_ai.prompt`, `gen_ai.completion`, `gen_ai.request.messages`,
9+
`gen_ai.response.text`) to a 1KB per-attribute cap, plus a 32KB per-span
10+
backstop that drops these content keys in priority order if the assembled
11+
attributes JSON still exceeds it. Cost/token metadata (`ai.usage.*`,
12+
`ai.model.*`, `gen_ai.usage.*`, `gen_ai.response.model`, etc.) keeps the
13+
default 8KB cap so LLM enrichment continues to work.
14+
15+
Adds a parse-error-aware safety net in `DynamicFlushScheduler`: when
16+
ClickHouse rejects a batch with `Cannot parse JSON object here`, the
17+
batch is split in half and retried (up to 4 split levels / 16-way
18+
isolation) instead of failing all 5–10k rows at once. Singleton rows
19+
that still fail are logged with a 1KB sample and dropped so the rest of
20+
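the queue keeps flowing. Sub-batches that still hold more than one row after the last split level fall back to the regular retry-with-backoff path.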

apps/webapp/app/env.server.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,17 @@ const EnvironmentSchema = z
498498
TRIGGER_OTEL_ATTRIBUTE_PER_LINK_COUNT_LIMIT: z.string().default("10"),
499499
TRIGGER_OTEL_ATTRIBUTE_PER_EVENT_COUNT_LIMIT: z.string().default("10"),
500500

501+
// Server-side OTel ingestion limits applied in otlpExporter.server.ts.
502+
// Default per-attribute cap (8KB) is enough for nearly all keys, but
503+
// Vercel AI SDK content keys (ai.prompt*, ai.response.text/object/etc.,
504+
// gen_ai.prompt, gen_ai.completion) carry tens of KB and have a tighter
505+
// dedicated cap. The total cap is a backstop applied to the assembled
506+
// attributes JSON; if exceeded, AI content keys are dropped in priority
507+
// order. Both prevent oversized JSON from breaking ClickHouse inserts.
508+
SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT: z.coerce.number().int().default(8192),
509+
SERVER_OTEL_AI_CONTENT_ATTRIBUTE_VALUE_LENGTH_LIMIT: z.coerce.number().int().default(1024),
510+
SERVER_OTEL_SPAN_TOTAL_ATTRIBUTES_LENGTH_LIMIT: z.coerce.number().int().default(32768),
511+
501512
CHECKPOINT_THRESHOLD_IN_MS: z.coerce.number().int().default(30000),
502513

503514
// Internal OTEL environment variables

apps/webapp/app/v3/dynamicFlushScheduler.server.ts

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,19 @@ export type DynamicFlushSchedulerConfig<T> = {
1818
isDroppableEvent?: (item: T) => boolean; // Function to determine if an event can be dropped
1919
};
2020

21+
// Bound on the recursive batch-split safety net. 4 → up to 16-way split,
22+
// which narrows a bad row down to ~1/16th of the batch; a single row is only isolated when the batch has at most 16 rows.
23+
const MAX_SPLIT_DEPTH = 4;
24+
25+
/**
 * Reports whether `error` carries the "Cannot parse JSON object" marker text
 * that identifies a ClickHouse JSON parse rejection.
 *
 * @param error - Any thrown value; falsy values are never treated as a match.
 * @returns `true` when the extracted message text contains the marker substring.
 */
function isClickHouseJsonParseError(error: unknown): boolean {
26+
if (!error) return false;
27+
// Use the `message` property when the value is an object that has one
// (a null/undefined `message` becomes ""); otherwise stringify the value itself.
const message =
28+
typeof error === "object" && error !== null && "message" in error
29+
? String((error as { message?: unknown }).message ?? "")
30+
: String(error);
31+
// Substring match, so the marker may appear anywhere in the message text.
return message.includes("Cannot parse JSON object");
32+
}
33+
2134
export class DynamicFlushScheduler<T> {
2235
private batchQueue: T[][];
2336
private currentBatch: T[];
@@ -43,6 +56,10 @@ export class DynamicFlushScheduler<T> {
4356
totalItemsFlushed: 0,
4457
droppedEvents: 0,
4558
droppedEventsByKind: new Map<string, number>(),
59+
// Rows dropped at flush time because ClickHouse rejected them and the
60+
// batch-split safety net couldn't isolate them further. Distinct from
61+
// `droppedEvents`, which counts pre-batch load-shedding drops.
62+
droppedRows: 0,
4663
};
4764
private isShuttingDown: boolean = false;
4865

@@ -196,40 +213,112 @@ export class DynamicFlushScheduler<T> {
196213
// Schedule all batches for concurrent processing
197214
const flushPromises = batchesToFlush.map((batch) =>
198215
this.limiter(async () => {
199-
const itemCount = batch.length;
200-
201216
const self = this;
202217

203-
async function tryFlush(flushId: string, batchToFlush: T[], attempt: number = 1) {
218+
async function tryFlush(
219+
flushId: string,
220+
batchToFlush: T[],
221+
attempt: number = 1,
222+
splitDepth: number = 0
223+
) {
224+
const subBatchSize = batchToFlush.length;
225+
204226
try {
205227
const startTime = Date.now();
206228
await self.callback(flushId, batchToFlush);
207229

208230
const duration = Date.now() - startTime;
209-
self.totalQueuedItems -= itemCount;
231+
self.totalQueuedItems -= subBatchSize;
210232
self.consecutiveFlushFailures = 0;
211233
self.lastFlushTime = Date.now();
212234
self.metrics.flushedBatches++;
213-
self.metrics.totalItemsFlushed += itemCount;
235+
self.metrics.totalItemsFlushed += subBatchSize;
214236

215237
self.logger.debug("Batch flushed successfully", {
216238
flushId,
217-
itemCount,
239+
itemCount: subBatchSize,
218240
duration,
219241
remainingQueueDepth: self.totalQueuedItems,
220242
activeConcurrency: self.limiter.activeCount,
221243
pendingConcurrency: self.limiter.pendingCount,
222244
});
223245
} catch (error) {
246+
// ClickHouse rejects an entire batch when a single row's
247+
// attributes JSON is unparseable. Retrying the same batch will
248+
// just fail again, so split-and-retry isolates the offender
249+
// instead of poisoning the whole 5–10k-row batch.
250+
const isParseError = isClickHouseJsonParseError(error);
251+
252+
if (isParseError && subBatchSize > 1 && splitDepth < MAX_SPLIT_DEPTH) {
253+
const mid = Math.floor(subBatchSize / 2);
254+
const left = batchToFlush.slice(0, mid);
255+
const right = batchToFlush.slice(mid);
256+
257+
self.logger.warn(
258+
"Splitting OTel batch after ClickHouse JSON parse failure",
259+
{
260+
flushId,
261+
itemCount: subBatchSize,
262+
splitDepth,
263+
leftSize: left.length,
264+
rightSize: right.length,
265+
}
266+
);
267+
268+
// Run halves concurrently and tolerate independent failures —
269+
// a rejection from one half must not prevent the other half
270+
// from completing. Each leaf's tryFlush updates totalQueuedItems
271+
// and metrics on its own success/drop paths.
272+
const results = await Promise.allSettled([
273+
tryFlush(flushId + "-L", left, 1, splitDepth + 1),
274+
tryFlush(flushId + "-R", right, 1, splitDepth + 1),
275+
]);
276+
277+
for (const [index, result] of results.entries()) {
278+
if (result.status === "rejected") {
279+
self.metrics.failedBatches++;
280+
self.logger.error(
281+
"Split half failed after exhausting retries",
282+
{
283+
flushId: flushId + (index === 0 ? "-L" : "-R"),
284+
error: result.reason,
285+
splitDepth: splitDepth + 1,
286+
}
287+
);
288+
}
289+
}
290+
return;
291+
}
292+
293+
if (isParseError && subBatchSize === 1) {
294+
// Singleton row that ClickHouse still rejects. Drop it so
295+
// the rest of the queue keeps flowing, and log enough of
296+
// the offending event for someone to investigate later
297+
// without dumping multi-KB of attributes into the log.
298+
self.metrics.droppedRows += 1;
299+
self.metrics.failedBatches++;
300+
self.totalQueuedItems -= 1;
301+
self.logger.error(
302+
"Dropping single OTel row rejected by ClickHouse JSON parser",
303+
{
304+
flushId,
305+
sample: JSON.stringify(batchToFlush[0]).slice(0, 1024),
306+
splitDepth,
307+
}
308+
);
309+
return;
310+
}
311+
224312
self.consecutiveFlushFailures++;
225313
self.metrics.failedBatches++;
226314

227315
self.logger.error("Error attempting to flush batch", {
228316
flushId,
229-
itemCount,
317+
itemCount: subBatchSize,
230318
error,
231319
consecutiveFailures: self.consecutiveFlushFailures,
232320
attempt,
321+
splitDepth,
233322
});
234323

235324
// Back off on failures
@@ -239,7 +328,7 @@ export class DynamicFlushScheduler<T> {
239328

240329
if (attempt <= 3) {
241330
await new Promise((resolve) => setTimeout(resolve, 500));
242-
return await tryFlush(flushId, batchToFlush, attempt + 1);
331+
return await tryFlush(flushId, batchToFlush, attempt + 1, splitDepth);
243332
} else {
244333
throw error;
245334
}

0 commit comments

Comments
 (0)