Skip to content

Commit 4a84b0d

Browse files
authored
Correlate Sentry errors with OTel traces (#1320)
* Correlate Sentry errors with OTel traces
* Default the Sentry OTel verification flags on in the e2e cloud stack
1 parent dfbc104 commit 4a84b0d

13 files changed

Lines changed: 382 additions & 40 deletions

File tree

apps/cloud/src/api/error-response.ts

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { Cause, Data, Effect, Predicate, Result } from "effect";
22
import { HttpServerRespondable, HttpServerResponse } from "effect/unstable/http";
33

4-
import { captureCause } from "../observability";
4+
import { captureCause, captureCauseEffect } from "../observability";
55

66
// Implements `Respondable` so the framework's default cause→response
77
// pipeline (`HttpServerRespondable.toResponseOrElse`) renders this as the
@@ -13,13 +13,14 @@ export class HttpResponseError extends Data.TaggedError("HttpResponseError")<{
1313
readonly message: string;
1414
}> {
1515
[HttpServerRespondable.symbol](): Effect.Effect<HttpServerResponse.HttpServerResponse> {
16-
if (this.status >= 500) captureCause(this);
17-
return Effect.succeed(
18-
HttpServerResponse.jsonUnsafe(
19-
{ error: this.message, code: this.code },
20-
{ status: this.status },
21-
),
22-
);
16+
const self = this;
17+
return Effect.gen(function* () {
18+
if (self.status >= 500) yield* captureCauseEffect(self);
19+
return HttpServerResponse.jsonUnsafe(
20+
{ error: self.message, code: self.code },
21+
{ status: self.status },
22+
);
23+
});
2324
}
2425
}
2526

@@ -65,3 +66,15 @@ export const toErrorServerResponse = (error: unknown): HttpServerResponse.HttpSe
6566
{ status: mapped.status },
6667
);
6768
};
69+
70+
export const toErrorServerResponseEffect = (
71+
error: unknown,
72+
): Effect.Effect<HttpServerResponse.HttpServerResponse> =>
73+
Effect.gen(function* () {
74+
const mapped = toHttpResponseError(error);
75+
if (mapped.status >= 500) yield* captureCauseEffect(mapped);
76+
return HttpServerResponse.jsonUnsafe(
77+
{ error: mapped.message, code: mapped.code },
78+
{ status: mapped.status },
79+
);
80+
});

apps/cloud/src/engine/execution-gate.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@
2424
// `CodeCompilationError` / `SandboxRuntimeError` into `ExecuteResult.error`.
2525
// ---------------------------------------------------------------------------
2626

27-
import * as Sentry from "@sentry/cloudflare";
2827
import { Data, Effect } from "effect";
2928
import type * as Cause from "effect/Cause";
3029

3130
import type { ExecutionEngine, ExecutionResult } from "@executor-js/execution";
3231

32+
import { captureCauseEffect } from "../observability";
3333
import { EXECUTION_LIMIT_BLOCKED_MESSAGE } from "./execution-limit-messages";
3434

3535
// The engine's completed-result payload (`ExecuteResult` in codemode-core),
@@ -166,10 +166,12 @@ export const makeExecutionLimitGate = (checkBalance: ExecutionBalanceCheck) => {
166166
// must never block executions. Reported like `trackExecution` so a
167167
// billing outage still pages; the error outcome is never cached.
168168
Effect.catch((error: unknown) =>
169-
Effect.sync((): GateDecision => {
170-
console.warn("[billing] execution balance check failed open:", error);
171-
Sentry.captureException(error);
172-
return { blocked: false };
169+
Effect.gen(function* () {
170+
yield* Effect.sync(() => {
171+
console.warn("[billing] execution balance check failed open:", error);
172+
});
173+
yield* captureCauseEffect(error);
174+
return { blocked: false } as const satisfies GateDecision;
173175
}),
174176
),
175177
);

apps/cloud/src/engine/execution-rate-limit.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
// ---------------------------------------------------------------------------
1515

1616
import { DurableObject, env } from "cloudflare:workers";
17-
import * as Sentry from "@sentry/cloudflare";
1817
import { Data, Effect } from "effect";
1918
import type * as Cause from "effect/Cause";
2019

2120
import type { ExecutionEngine } from "@executor-js/execution";
2221

22+
import { captureCauseEffect } from "../observability";
2323
import { withPreExecutionGate, type GateDecision } from "./execution-gate";
2424
import { RATE_LIMIT_BLOCKED_MESSAGE } from "./execution-limit-messages";
2525

@@ -149,10 +149,12 @@ export const makeExecutionRateLimiter = (
149149
// FAIL OPEN: the backstop must never block executions because its
150150
// counter is unreachable or slow.
151151
Effect.catch((error: unknown) =>
152-
Effect.sync((): GateDecision => {
153-
console.warn("[rate-limit] execution rate limit check failed open:", error);
154-
Sentry.captureException(error);
155-
return { blocked: false };
152+
Effect.gen(function* () {
153+
yield* Effect.sync(() => {
154+
console.warn("[rate-limit] execution rate limit check failed open:", error);
155+
});
156+
yield* captureCauseEffect(error);
157+
return { blocked: false } as const satisfies GateDecision;
156158
}),
157159
),
158160
);

apps/cloud/src/env-augment.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ declare global {
1111
AXIOM_TRACES_URL?: string;
1212
AXIOM_TRACES_SAMPLE_RATIO?: string;
1313
SENTRY_DSN?: string;
14+
SENTRY_OTEL_LOG_PAYLOAD?: string;
15+
SENTRY_OTEL_VERIFY?: string;
1416
VITE_PUBLIC_SENTRY_DSN?: string;
1517
VITE_PUBLIC_POSTHOG_KEY?: string;
1618
VITE_PUBLIC_POSTHOG_HOST?: string;

apps/cloud/src/extensions/billing/route.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ import { autumnHandler } from "autumn-js/backend";
55

66
import { WorkOSClient } from "../../auth/workos";
77
import { ORG_SELECTOR_HEADER, authorizeOrganizationSelector } from "../../auth/organization";
8-
import { HttpResponseError, isServerError, toErrorServerResponse } from "../../api/error-response";
8+
import {
9+
HttpResponseError,
10+
isServerError,
11+
toErrorServerResponseEffect,
12+
} from "../../api/error-response";
913

1014
type BillingSession = {
1115
readonly userId: string;
@@ -111,7 +115,7 @@ const handler = Effect.gen(function* () {
111115
if (isServerError(err)) {
112116
console.error("[autumn] request failed:", Cause.pretty(err));
113117
}
114-
return Effect.succeed(toErrorServerResponse(err));
118+
return toErrorServerResponseEffect(err);
115119
}),
116120
);
117121

apps/cloud/src/extensions/billing/service.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
// ---------------------------------------------------------------------------
44

55
import { env } from "cloudflare:workers";
6-
import * as Sentry from "@sentry/cloudflare";
76
import { Autumn } from "autumn-js";
87
import { Context, Data, Effect, Layer } from "effect";
98

9+
import { captureCauseEffect } from "../../observability";
10+
1011
// ---------------------------------------------------------------------------
1112
// Errors
1213
// ---------------------------------------------------------------------------
@@ -72,12 +73,12 @@ const make = Effect.sync(() => {
7273
).pipe(
7374
Effect.catchTag("AutumnError", (error) =>
7475
Effect.gen(function* () {
75-
// Silent billing data loss is worth paging on autumn.trackExecution
76+
// Silent billing data loss is worth paging on: autumn.trackExecution
7677
// is fire-and-forget so the caller doesn't handle it themselves.
7778
yield* Effect.sync(() => {
7879
console.error("[billing] track failed:", error);
79-
Sentry.captureException(error);
8080
});
81+
yield* captureCauseEffect(error);
8182
yield* Effect.annotateCurrentSpan({ "autumn.track.failed": true });
8283
}),
8384
),

apps/cloud/src/mcp/session-durable-object.ts

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@ import { makeExecutionStack } from "../engine/execution-stack";
6868
import { CloudMeteredExecutionStackLayer } from "../engine/execution-stack-metered";
6969
import { AutumnService } from "../extensions/billing/service";
7070
import { DoTelemetryLive, flushTracerProvider } from "../observability/telemetry";
71-
import { captureCause as reportCause } from "../observability";
71+
import {
72+
captureCause as reportCause,
73+
captureCauseEffect as reportCauseEffect,
74+
tagCurrentSentryScopeWithCurrentOtelSpan,
75+
} from "../observability";
7276

7377
// Re-export the shared types so existing cloud importers
7478
// (`auth/handlers.ts`, etc.) keep their `../mcp/session-durable-object` path.
@@ -314,6 +318,16 @@ export class McpSessionDOSqlite extends McpAgentSessionDOBase<Env, CloudSessionD
314318
reportCause(cause);
315319
}
316320

321+
protected override captureCauseEffect(
322+
cause: Cause.Cause<unknown>,
323+
): Effect.Effect<string | undefined> {
324+
return reportCauseEffect(cause);
325+
}
326+
327+
protected override prepareErrorCaptureScope(): Effect.Effect<void> {
328+
return Effect.asVoid(tagCurrentSentryScopeWithCurrentOtelSpan);
329+
}
330+
317331
// Best-effort export the DO isolate's buffered spans after the RPC settles,
318332
// so a dying init/handleRequest can ship its own spans (and the exception +
319333
// stack recorded on them) — not just the worker-side `mcp.do.*` span. Keep it

apps/cloud/src/observability/index.ts

Lines changed: 114 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
// ---------------------------------------------------------------------------
1212

1313
import * as Sentry from "@sentry/cloudflare";
14+
import type { ErrorEvent, Scope } from "@sentry/cloudflare";
1415
import { Cause, Effect, Layer } from "effect";
16+
import type * as Tracer from "effect/Tracer";
1517

1618
import { ErrorCapture } from "@executor-js/api";
1719

@@ -21,12 +23,106 @@ import { ErrorCapture } from "@executor-js/api";
2123
// error. Sentry still receives the full, untruncated cause via
2224
// `setExtra`; only the dev-console mirror is capped.
2325
const MAX_CONSOLE_CAUSE_CHARS = 4_000;
26+
const OTEL_TRACE_ID_PATTERN = /^[0-9a-f]{32}$/;
27+
const OTEL_SPAN_ID_PATTERN = /^[0-9a-f]{16}$/;
28+
29+
export const OTEL_TRACE_ID_TAG = "otel_trace_id";
30+
export const OTEL_SPAN_ID_TAG = "otel_span_id";
31+
export const SENTRY_EVENT_ID_ATTRIBUTE = "sentry.event_id";
32+
33+
export type OtelCorrelationContext = {
34+
readonly traceId: string;
35+
readonly spanId: string;
36+
};
2437

2538
const truncate = (s: string): string =>
2639
s.length <= MAX_CONSOLE_CAUSE_CHARS
2740
? s
2841
: `${s.slice(0, MAX_CONSOLE_CAUSE_CHARS)}\n…[truncated ${s.length - MAX_CONSOLE_CAUSE_CHARS} chars]`;
2942

43+
const validOtelContext = (context: OtelCorrelationContext): boolean =>
44+
OTEL_TRACE_ID_PATTERN.test(context.traceId) && OTEL_SPAN_ID_PATTERN.test(context.spanId);
45+
46+
export const otelCorrelationContextFromEffectSpan = (
47+
span: Tracer.Span,
48+
): OtelCorrelationContext | null => {
49+
const context = { traceId: span.traceId, spanId: span.spanId };
50+
return validOtelContext(context) ? context : null;
51+
};
52+
53+
export const otelCorrelationContextFromOpenTelemetrySpan = (span: {
54+
readonly spanContext: () => { readonly traceId: string; readonly spanId: string };
55+
}): OtelCorrelationContext | null => {
56+
const { traceId, spanId } = span.spanContext();
57+
const context = { traceId, spanId };
58+
return validOtelContext(context) ? context : null;
59+
};
60+
61+
export const addOtelCorrelationTags = <T extends { readonly tags?: Record<string, unknown> }>(
62+
event: T,
63+
context: OtelCorrelationContext | null,
64+
): T => {
65+
if (!context) return event;
66+
return {
67+
...event,
68+
tags: {
69+
...event.tags,
70+
[OTEL_TRACE_ID_TAG]: context.traceId,
71+
[OTEL_SPAN_ID_TAG]: context.spanId,
72+
},
73+
};
74+
};
75+
76+
export const tagSentryScopeWithOtelContext = (
77+
scope: Scope,
78+
context: OtelCorrelationContext | null,
79+
): void => {
80+
if (!context) return;
81+
scope.setTag(OTEL_TRACE_ID_TAG, context.traceId);
82+
scope.setTag(OTEL_SPAN_ID_TAG, context.spanId);
83+
};
84+
85+
export const tagCurrentSentryScopeWithOtelContext = (
86+
context: OtelCorrelationContext | null,
87+
): void => {
88+
tagSentryScopeWithOtelContext(Sentry.getCurrentScope(), context);
89+
};
90+
91+
const currentOtelContext = Effect.map(
92+
Effect.currentSpan,
93+
otelCorrelationContextFromEffectSpan,
94+
).pipe(Effect.orElseSucceed(() => null));
95+
96+
export const tagCurrentSentryScopeWithCurrentOtelSpan: Effect.Effect<OtelCorrelationContext | null> =
97+
Effect.map(currentOtelContext, (context) => {
98+
tagCurrentSentryScopeWithOtelContext(context);
99+
return context;
100+
});
101+
102+
export const beforeSendWithOtelCorrelation = (
103+
event: ErrorEvent,
104+
options?: { readonly logPayload?: boolean },
105+
): ErrorEvent => {
106+
if (options?.logPayload) {
107+
console.info(
108+
JSON.stringify({
109+
event: "sentry_before_send_otel_correlation",
110+
sentry_event_id: event.event_id ?? "",
111+
otel_trace_id: String(event.tags?.[OTEL_TRACE_ID_TAG] ?? ""),
112+
otel_span_id: String(event.tags?.[OTEL_SPAN_ID_TAG] ?? ""),
113+
}),
114+
);
115+
}
116+
return event;
117+
};
118+
119+
export const addCurrentOtelCorrelationTags = <
120+
T extends { readonly tags?: Record<string, unknown> },
121+
>(
122+
event: T,
123+
): Effect.Effect<T> =>
124+
Effect.map(currentOtelContext, (context) => addOtelCorrelationTags(event, context));
125+
30126
// Sentry's `captureException` can't serialize Effect's `CauseImpl` (it logs
31127
// `'CauseImpl' captured as exception with keys: reasons, ~effect/Cause` and
32128
// drops the real failure). `Cause.squash` isn't enough on its own: when an
@@ -48,21 +144,36 @@ export const sentryPayloadForCause = (
48144
return { primary: input, pretty: null };
49145
};
50146

51-
export const captureCause = (input: unknown): string | undefined => {
147+
export const captureCause = (
148+
input: unknown,
149+
context: OtelCorrelationContext | null = null,
150+
): string | undefined => {
52151
const { primary, pretty } = sentryPayloadForCause(input);
152+
tagCurrentSentryScopeWithOtelContext(context);
53153
return Sentry.captureException(primary, (scope) => {
154+
tagSentryScopeWithOtelContext(scope, context);
54155
if (pretty !== null) scope.setExtra("cause", pretty);
55156
return scope;
56157
});
57158
};
58159

160+
export const captureCauseEffect = (input: unknown): Effect.Effect<string | undefined> =>
161+
Effect.gen(function* () {
162+
const context = yield* tagCurrentSentryScopeWithCurrentOtelSpan;
163+
const eventId = yield* Effect.sync(() => captureCause(input, context));
164+
if (eventId && context) {
165+
yield* Effect.annotateCurrentSpan(SENTRY_EVENT_ID_ATTRIBUTE, eventId);
166+
}
167+
return eventId;
168+
});
169+
59170
export const ErrorCaptureLive: Layer.Layer<ErrorCapture> = Layer.succeed(
60171
ErrorCapture,
61172
ErrorCapture.of({
62173
captureException: (cause) =>
63-
Effect.sync(() => {
174+
Effect.gen(function* () {
64175
console.error("[api] unhandled cause:", truncate(Cause.pretty(cause)));
65-
return captureCause(cause) ?? "";
176+
return (yield* captureCauseEffect(cause)) ?? "";
66177
}),
67178
}),
68179
);

0 commit comments

Comments
 (0)