Skip to content

Commit 4726b96

Browse files
authored
feat(kiloclaw): add Cloudflare Analytics Engine instrumentation (#1311)
## Summary

Add Cloudflare Analytics Engine instrumentation to KiloClaw across HTTP routes, DO lifecycle, and reconciliation paths, then refactor reconcile telemetry to use a unified `ReconcileContext` that dual-writes console reconcile logs and AE events.

- **HTTP telemetry**: global `timingMiddleware`; `instrumented()` wrappers for `/api/admin/*` and `/api/kiloclaw/*`; platform middleware emits request events and now records validated query/body-derived user context.
- **DO lifecycle telemetry**: `emitEvent()` in `KiloClawInstance` records provision/start/stop/destroy lifecycle events with duration/value metrics.
- **Reconcile telemetry**: reconcile call sites now emit consistently via `rctx.log(...)` with `reconcile.{action}` naming and common state-derived dimensions; includes duration/error/value fields where applicable.
- **Hardening**: reconcile analytics error serialization is now guarded so unserializable error payloads cannot break best-effort analytics (`[unserializable error]` fallback).

## Verification

- `pnpm typecheck` (in `kiloclaw`) — pass
- `pnpm test` (in `kiloclaw`) — pass (`42` files / `936` tests)
- Pre-push hook (repo root) — pass:
  - `pnpm format:check`
  - `pnpm lint` (monorepo)
  - `pnpm typecheck` (monorepo)

## Visual Changes

N/A

## Reviewer Notes

- Blob layout is centralized in `src/utils/analytics.ts` and shared by HTTP, DO, and reconcile emitters.
- Platform middleware skips non-error events for routes without user context (for example version metadata endpoints) to reduce low-signal telemetry noise.
2 parents 0ea2905 + b264459 commit 4726b96

13 files changed

Lines changed: 848 additions & 692 deletions

File tree

kiloclaw/src/durable-objects/kiloclaw-instance/index.ts

Lines changed: 89 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ import type { GatewayProcessStatus } from '../gateway-controller-types';
4646
import type { InstanceMutableState, InstanceStatus, DestroyResult } from './types';
4747
import { getFlyConfig } from './types';
4848
import { createMutableState, loadState, storageUpdate } from './state';
49-
import { nextAlarmTime, doLog, doError, doWarn, toLoggable } from './log';
49+
import { nextAlarmTime, doLog, doError, doWarn, toLoggable, createReconcileContext } from './log';
5050
import { attemptMetadataRecovery } from './reconcile';
5151
import { resolveImageTag, getRegistryApp, buildUserEnvVars } from './config';
5252
import * as gateway from './gateway';
@@ -62,6 +62,8 @@ import {
6262
markRestartSuccessful,
6363
} from './reconcile';
6464
import { restoreFromPostgres, markDestroyedInPostgresHelper } from './postgres';
65+
import { writeEvent } from '../../utils/analytics';
66+
import type { KiloClawEventData, KiloClawEventName } from '../../utils/analytics';
6567

6668
// Re-export extracted helpers so existing consumers don't break.
6769
export { parseRegions, shuffleRegions, deprioritizeRegion } from '../regions';
@@ -99,11 +101,45 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
99101
return buildUserEnvVars(this.env, this.ctx, this.s);
100102
}
101103

104+
/**
105+
* Emit an analytics event with common DO dimensions baked in.
106+
* Follows gastown's Omit<> pattern — callers provide only the
107+
* event-specific fields; delivery is fixed to 'do', while userId,
108+
* sandboxId, and machine context are always filled from this.s.
109+
*/
110+
private emitEvent(
111+
data: Omit<
112+
KiloClawEventData,
113+
| 'userId'
114+
| 'sandboxId'
115+
| 'delivery'
116+
| 'flyAppName'
117+
| 'flyMachineId'
118+
| 'openclawVersion'
119+
| 'imageTag'
120+
| 'flyRegion'
121+
> & { event: KiloClawEventName }
122+
): void {
123+
writeEvent(this.env, {
124+
...data,
125+
delivery: 'do',
126+
userId: this.s.userId ?? undefined,
127+
sandboxId: this.s.sandboxId ?? undefined,
128+
flyAppName: this.s.flyAppName ?? undefined,
129+
flyMachineId: this.s.flyMachineId ?? undefined,
130+
openclawVersion: this.s.openclawVersion ?? undefined,
131+
imageTag: this.s.trackedImageTag ?? undefined,
132+
flyRegion: this.s.flyRegion ?? undefined,
133+
status: data.status ?? this.s.status ?? undefined,
134+
});
135+
}
136+
102137
// ========================================================================
103138
// Lifecycle methods (called by platform API routes via RPC)
104139
// ========================================================================
105140

106141
async provision(userId: string, config: InstanceConfig): Promise<{ sandboxId: string }> {
142+
const provisionStart = performance.now();
107143
await this.loadState();
108144

109145
if (this.s.status === 'destroying') {
@@ -295,6 +331,12 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
295331
await this.startAsync(userId);
296332
}
297333

334+
this.emitEvent({
335+
event: 'instance.provisioned',
336+
status: 'provisioned',
337+
durationMs: performance.now() - provisionStart,
338+
});
339+
298340
return { sandboxId };
299341
}
300342

@@ -660,7 +702,7 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
660702
flyConfig,
661703
this.ctx,
662704
this.s,
663-
'start_recovery'
705+
createReconcileContext(this.s, this.env, 'start_recovery')
664706
);
665707
if (!recovered && !this.s.flyMachineId) {
666708
throw new Error(
@@ -699,7 +741,13 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
699741
try {
700742
const machine = await fly.getMachine(flyConfig, this.s.flyMachineId);
701743
if (machine.state === 'started') {
702-
await reconcileMachineMount(flyConfig, this.ctx, this.s, machine, 'start');
744+
await reconcileMachineMount(
745+
flyConfig,
746+
this.ctx,
747+
this.s,
748+
machine,
749+
createReconcileContext(this.s, this.env, 'start')
750+
);
703751
console.log('[DO] Machine already running, mount verified');
704752
await this.scheduleAlarm();
705753
return;
@@ -803,6 +851,7 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
803851
return;
804852
}
805853

854+
const startingAt = this.s.startingAt;
806855
this.s.status = 'running';
807856
this.s.startingAt = null;
808857
this.s.lastStartedAt = Date.now();
@@ -819,6 +868,12 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
819868
lastStartErrorAt: null,
820869
});
821870

871+
this.emitEvent({
872+
event: 'instance.started',
873+
status: 'running',
874+
durationMs: startingAt ? Date.now() - startingAt : undefined,
875+
});
876+
822877
await this.scheduleAlarm();
823878
}
824879

@@ -902,6 +957,8 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
902957
return;
903958
}
904959

960+
const machineUptimeMs = this.s.lastStartedAt ? Date.now() - this.s.lastStartedAt : 0;
961+
905962
if (this.s.flyMachineId) {
906963
const flyConfig = getFlyConfig(this.env, this.s);
907964
try {
@@ -921,6 +978,12 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
921978
lastStoppedAt: this.s.lastStoppedAt,
922979
});
923980

981+
this.emitEvent({
982+
event: 'instance.stopped',
983+
status: 'stopped',
984+
value: machineUptimeMs,
985+
});
986+
924987
await this.scheduleAlarm();
925988
}
926989

@@ -931,6 +994,8 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
931994
throw new Error('Instance not provisioned');
932995
}
933996

997+
const machineUptimeMs = this.s.lastStartedAt ? Date.now() - this.s.lastStartedAt : 0;
998+
934999
this.s.pendingDestroyMachineId = this.s.flyMachineId;
9351000
this.s.pendingDestroyVolumeId = this.s.flyVolumeId;
9361001
this.s.status = 'destroying';
@@ -941,12 +1006,23 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
9411006
pendingDestroyVolumeId: this.s.pendingDestroyVolumeId,
9421007
});
9431008

944-
const flyConfig = getFlyConfig(this.env, this.s);
945-
await tryDeleteMachine(flyConfig, this.ctx, this.s, 'destroy');
946-
await tryDeleteVolume(flyConfig, this.ctx, this.s, 'destroy');
1009+
this.emitEvent({
1010+
event: 'instance.destroy_started',
1011+
status: 'destroying',
1012+
value: machineUptimeMs,
1013+
});
9471014

948-
const finalized = await finalizeDestroyIfComplete(this.ctx, this.s, (userId, sandboxId) =>
949-
markDestroyedInPostgresHelper(this.env, this.ctx, this.s, userId, sandboxId)
1015+
const flyConfig = getFlyConfig(this.env, this.s);
1016+
const destroyRctx = createReconcileContext(this.s, this.env, 'destroy');
1017+
await tryDeleteMachine(flyConfig, this.ctx, this.s, destroyRctx);
1018+
await tryDeleteVolume(flyConfig, this.ctx, this.s, destroyRctx);
1019+
1020+
const finalized = await finalizeDestroyIfComplete(
1021+
this.ctx,
1022+
this.s,
1023+
destroyRctx,
1024+
(userId, sandboxId) =>
1025+
markDestroyedInPostgresHelper(this.env, this.ctx, this.s, userId, sandboxId)
9501026
);
9511027
if (!finalized.finalized) {
9521028
doWarn(this.s, 'Destroy incomplete, alarm will retry', {
@@ -1447,7 +1523,11 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
14471523
const preSuccessStatus = await this.ctx.storage.get('status');
14481524
if (preSuccessStatus !== 'restarting') return;
14491525

1450-
await markRestartSuccessful(this.ctx, this.s);
1526+
await markRestartSuccessful(
1527+
this.ctx,
1528+
this.s,
1529+
createReconcileContext(this.s, this.env, 'restart')
1530+
);
14511531
doLog(this.s, 'restartMachine: background restart completed successfully');
14521532
await this.scheduleAlarm();
14531533
} catch (err) {

kiloclaw/src/durable-objects/kiloclaw-instance/log.ts

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import {
77
ALARM_INTERVAL_IDLE_MS,
88
ALARM_JITTER_MS,
99
} from '../../config';
10+
import { writeEvent, eventContextFromState } from '../../utils/analytics';
1011

1112
/**
1213
* Structured reconciliation logging — emits a JSON line tagged for
@@ -27,6 +28,61 @@ export function reconcileLog(
2728
);
2829
}
2930

31+
// ── ReconcileContext ──────────────────────────────────────────────────
32+
//
33+
// Bundles state + env + reason so every reconcileLog call site
34+
// automatically emits to Cloudflare Analytics Engine without needing
35+
// to thread env/state through every function signature.
36+
37+
export type ReconcileContext = {
38+
readonly state: InstanceMutableState;
39+
readonly env: { KILOCLAW_AE?: AnalyticsEngineDataset };
40+
readonly reason: string;
41+
/** Log a reconcile action to both console and Analytics Engine. */
42+
log: (action: string, details?: Record<string, unknown>) => void;
43+
};
44+
45+
export function createReconcileContext(
46+
state: InstanceMutableState,
47+
env: { KILOCLAW_AE?: AnalyticsEngineDataset },
48+
reason: string
49+
): ReconcileContext {
50+
return {
51+
state,
52+
env,
53+
reason,
54+
log(action: string, details: Record<string, unknown> = {}) {
55+
reconcileLog(reason, action, details);
56+
57+
const rawErr = details.error;
58+
let errorStr: string | undefined;
59+
if (rawErr !== undefined) {
60+
try {
61+
errorStr = (
62+
rawErr instanceof Error
63+
? rawErr.message
64+
: typeof rawErr === 'string'
65+
? rawErr
66+
: JSON.stringify(rawErr)
67+
).slice(0, 200);
68+
} catch {
69+
errorStr = '[unserializable error]';
70+
}
71+
}
72+
73+
writeEvent(env, {
74+
event: `reconcile.${action}`,
75+
delivery: 'reconcile',
76+
label: typeof details.label === 'string' ? details.label : '',
77+
error: errorStr,
78+
durationMs: typeof details.durationMs === 'number' ? details.durationMs : undefined,
79+
value: typeof details.value === 'number' ? details.value : undefined,
80+
...eventContextFromState(state),
81+
});
82+
},
83+
};
84+
}
85+
3086
// ── Structured error/warn logging ────────────────────────────────────
3187

3288
/**

kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ import type { InstanceMutableState } from './types';
66
import { getFlyConfig } from './types';
77
import { storageUpdate } from './state';
88
import { attemptMetadataRecovery } from './reconcile';
9-
import { doError, doWarn, toLoggable } from './log';
9+
import { doError, doWarn, toLoggable, createReconcileContext } from './log';
1010

1111
/**
1212
* Restore DO state from Postgres backup if SQLite was wiped.
@@ -102,7 +102,12 @@ export async function restoreFromPostgres(
102102
// Attempt to recover machine/volume IDs via Fly metadata.
103103
try {
104104
const flyConfig = getFlyConfig(env, state);
105-
await attemptMetadataRecovery(flyConfig, ctx, state, 'postgres_restore');
105+
await attemptMetadataRecovery(
106+
flyConfig,
107+
ctx,
108+
state,
109+
createReconcileContext(state, env, 'postgres_restore')
110+
);
106111
} catch (err) {
107112
doWarn(state, 'Metadata recovery after Postgres restore failed', {
108113
error: toLoggable(err),

0 commit comments

Comments
 (0)