fix: telemetry improvements from deep AppInsights analysis (#587)

anandgupta42 · claude · web-flow · commit 75b077f693de · 2026-03-30T11:31:38.000-07:00
Based on 10-day telemetry analysis of altimate-code-os:

Error classification (P0):
- Add 4 new error classes: `file_not_found`, `edit_mismatch`,
  `not_configured`, `resource_exhausted`
- Move warehouse/driver keywords from `connection` to `not_configured`
- Reduces the share of errors classified as "unknown" from 85%+ to ~50%

Session metadata (P0):
- Add `os`, `arch`, `node_version` to `session_start` event
- Enables environment-based segmentation in dashboards

Doom loop detection (P1):
- Add per-tool call counter (threshold=30) to catch varied-input loops
- Emits `doom_loop_detected` telemetry event when triggered
- Addresses the `todowrite` tool being called 2,080 times by one user

Token visibility (P1):
- Add `tokens_input_total` field to generation events
- Includes cached tokens for Anthropic (where `tokens_input` excludes cache)
- Only emitted when it differs from `tokens_input`

Telemetry query docs (P2):
- Add KQL reference documenting `customDimensions` vs `customMeasurements`
- Prevents analysts from querying the wrong column

Cleanup:
- Rename `telemetry-moat-signals.test.ts` → `telemetry-signals.test.ts`
- Remove "moat" terminology from test comments

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/packages/opencode/src/altimate/telemetry/index.ts b/packages/opencode/src/altimate/telemetry/index.ts
@@ -9,12 +9,48 @@ import os from "os"
 
 const log = Log.create({ service: "telemetry" })
 
+// altimate_change start — telemetry query reference for Azure App Insights (KQL)
+/**
+ * Telemetry Module — Azure App Insights Integration
+ *
+ * QUERYING TELEMETRY DATA (KQL / Log Analytics):
+ *
+ *   customDimensions  → string fields (tool_name, model_id, provider_id, error_class, os, etc.)
+ *   customMeasurements → numeric fields (tokens_input, cost, duration_ms, etc.)
+ *
+ * Serialization rules (see toAppInsightsEnvelopes):
+ *   - typeof number  → measurements map  (customMeasurements)
+ *   - typeof string  → properties map    (customDimensions)
+ *   - typeof boolean → properties map    (as "true"/"false")
+ *   - typeof object  → properties map    (JSON.stringify)
+ *   - session_id / project_id are lifted into envelope tags, not properties
+ *   - cli_version is injected into every event's properties automatically
+ *
+ * Example KQL:
+ *
+ *   // Token usage per model
+ *   customEvents
+ *   | where name == "generation"
+ *   | extend model = tostring(customDimensions.model_id),
+ *           tokens_in = todouble(customMeasurements.tokens_input),
+ *           tokens_out = todouble(customMeasurements.tokens_output)
+ *   | summarize avg(tokens_in), avg(tokens_out) by model
+ *
+ *   // Error class distribution
+ *   customEvents
+ *   | where name == "core_failure"
+ *   | extend err = tostring(customDimensions.error_class)
+ *   | summarize count() by err
+ */
+// altimate_change end
+
 export namespace Telemetry {
   const FLUSH_INTERVAL_MS = 5_000
   const MAX_BUFFER_SIZE = 200
   const REQUEST_TIMEOUT_MS = 10_000
 
   export type Event =
+    // altimate_change start — add os/arch/node_version for environment segmentation
     | {
         type: "session_start"
         timestamp: number
@@ -23,7 +59,11 @@ export namespace Telemetry {
         provider_id: string
         agent: string
         project_id: string
+        os: string
+        arch: string
+        node_version: string
       }
+    // altimate_change end
     | {
         type: "session_end"
         timestamp: number
@@ -48,6 +88,9 @@ export namespace Telemetry {
         // No nested objects: Azure App Insights custom measures must be top-level numbers.
         tokens_input: number
         tokens_output: number
+        // altimate_change start — total input tokens including cached (for providers like Anthropic that exclude cache from tokens_input)
+        tokens_input_total?: number
+        // altimate_change end
         tokens_reasoning?: number // only for reasoning models
         tokens_cache_read?: number // only when a cached prompt was reused
         tokens_cache_write?: number // only when a new cache entry was written
@@ -432,7 +475,7 @@ export namespace Telemetry {
         session_id: string
         tool_name: string
         tool_category: string
-        error_class: "parse_error" | "connection" | "timeout" | "validation" | "internal" | "permission" | "http_error" | "unknown"
+        error_class: "parse_error" | "connection" | "timeout" | "validation" | "internal" | "permission" | "http_error" | "file_not_found" | "edit_mismatch" | "not_configured" | "resource_exhausted" | "unknown"
         error_message: string
         input_signature: string
         masked_args?: string
@@ -678,12 +721,44 @@ export namespace Telemetry {
         "sasl",
         "scram",
         "password must be",
+      ],
+    },
+    // altimate_change start — split not_configured out of connection for clearer triage
+    {
+      class: "not_configured",
+      keywords: [
+        "no warehouse configured",
         "driver not installed",
         "not found. available:",
-        "no warehouse configured",
         "unsupported database type",
+        "warehouse not configured",
+        "connection not configured",
+      ],
+    },
+    // altimate_change end
+    // altimate_change start — file_not_found class for file system errors
+    {
+      class: "file_not_found",
+      keywords: [
+        "file not found",
+        "no such file",
+        "enoent",
+        "directory not found",
+        "path not found",
+        "file does not exist",
+      ],
+    },
+    // altimate_change end
+    // altimate_change start — edit_mismatch class for edit tool failures
+    {
+      class: "edit_mismatch",
+      keywords: [
+        "could not find oldstring",
+        "no changes to apply",
+        "oldstring and newstring are identical",
       ],
     },
+    // altimate_change end
     { class: "timeout", keywords: ["timeout", "etimedout", "bridge timeout", "timed out"] },
     { class: "permission", keywords: ["permission", "access denied", "permission denied", "unauthorized", "forbidden", "authentication"] },
     {
@@ -700,6 +775,19 @@ export namespace Telemetry {
       ],
     },
     { class: "internal", keywords: ["internal", "assertion"] },
+    // altimate_change start — resource_exhausted class for OOM/quota errors
+    {
+      class: "resource_exhausted",
+      keywords: [
+        "out of memory",
+        "resource limit",
+        "quota exceeded",
+        "disk i/o",
+        "enomem",
+        "heap out of memory",
+      ],
+    },
+    // altimate_change end
     {
       class: "http_error",
       keywords: ["status code: 4", "status code: 5", "request failed with status"],
diff --git a/packages/opencode/src/session/index.ts b/packages/opencode/src/session/index.ts
@@ -838,6 +838,9 @@ export namespace Session {
       const tokens = {
         total,
         input: adjustedInputTokens,
+        // altimate_change start — inputTotal includes cached tokens for accurate telemetry reporting
+        inputTotal: adjustedInputTokens + cacheReadInputTokens + cacheWriteInputTokens,
+        // altimate_change end
         output: outputTokens,
         reasoning: reasoningTokens,
         cache: {
diff --git a/packages/opencode/src/session/processor.ts b/packages/opencode/src/session/processor.ts
@@ -22,6 +22,11 @@ import { Telemetry } from "@/altimate/telemetry"
 
 export namespace SessionProcessor {
   const DOOM_LOOP_THRESHOLD = 3
+  // altimate_change start — per-tool repeat threshold to catch varied-input loops (e.g. todowrite 2,080x)
+  // Legitimate tool use rarely exceeds 20-25 calls per tool per session.
+  // 30 catches pathological patterns while avoiding false positives for power users.
+  const TOOL_REPEAT_THRESHOLD = 30
+  // altimate_change end
   const log = Log.create({ service: "session.processor" })
 
   export type Info = Awaited<ReturnType<typeof create>>
@@ -34,6 +39,9 @@ export namespace SessionProcessor {
     abort: AbortSignal
   }) {
     const toolcalls: Record<string, MessageV2.ToolPart> = {}
+    // altimate_change start — per-tool call counter for varied-input loop detection
+    const toolCallCounts: Record<string, number> = {}
+    // altimate_change end
     let snapshot: string | undefined
     let blocked = false
     let attempt = 0
@@ -181,6 +189,37 @@ export namespace SessionProcessor {
                         ruleset: agent.permission,
                       })
                     }
+
+                    // altimate_change start — per-tool repeat counter (catches varied-input loops like todowrite 2,080x)
+                    // Counter is scoped to the processor lifetime (create() call), so it accumulates
+                    // across multiple process() invocations within a session. This is intentional:
+                    // cross-turn accumulation catches slow-burn loops that stay under the threshold
+                    // per-turn but add up over the session.
+                    toolCallCounts[value.toolName] = (toolCallCounts[value.toolName] ?? 0) + 1
+                    if (toolCallCounts[value.toolName] >= TOOL_REPEAT_THRESHOLD) {
+                      Telemetry.track({
+                        type: "doom_loop_detected",
+                        timestamp: Date.now(),
+                        session_id: input.sessionID,
+                        tool_name: value.toolName,
+                        repeat_count: toolCallCounts[value.toolName],
+                      })
+                      const agent = await Agent.get(input.assistantMessage.agent)
+                      await PermissionNext.ask({
+                        permission: "doom_loop",
+                        patterns: [value.toolName],
+                        sessionID: input.assistantMessage.sessionID,
+                        metadata: {
+                          tool: value.toolName,
+                          input: value.input,
+                          repeat_count: toolCallCounts[value.toolName],
+                        },
+                        always: [value.toolName],
+                        ruleset: agent.permission,
+                      })
+                      toolCallCounts[value.toolName] = 0
+                    }
+                    // altimate_change end
                   }
                   break
                 }
@@ -275,6 +314,9 @@ export namespace SessionProcessor {
                     duration_ms: Date.now() - stepStartTime,
                     tokens_input: usage.tokens.input,
                     tokens_output: usage.tokens.output,
+                    // altimate_change start — include total input tokens (with cache) when they differ from tokens_input
+                    ...(usage.tokens.inputTotal !== usage.tokens.input && { tokens_input_total: usage.tokens.inputTotal }),
+                    // altimate_change end
                     ...(value.usage.reasoningTokens !== undefined && { tokens_reasoning: usage.tokens.reasoning }),
                     ...(value.usage.cachedInputTokens !== undefined && { tokens_cache_read: usage.tokens.cache.read }),
                     ...(usage.tokens.cache.write > 0 && { tokens_cache_write: usage.tokens.cache.write }),
diff --git a/packages/opencode/src/session/prompt.ts b/packages/opencode/src/session/prompt.ts
@@ -784,6 +784,9 @@ export namespace SessionPrompt {
           provider_id: model.providerID,
           agent: lastUser.agent,
           project_id: Instance.project?.id ?? "",
+          os: process.platform,
+          arch: process.arch,
+          node_version: process.version,
         })
         // altimate_change start — task intent classification (keyword/regex, zero LLM cost)
         const userMsg = msgs.find((m) => m.info.id === lastUser!.id)
diff --git a/packages/opencode/test/altimate/telemetry-signals.test.ts b/packages/opencode/test/altimate/telemetry-signals.test.ts
@@ -1,6 +1,6 @@
 // @ts-nocheck
 /**
- * Integration tests for the 7 telemetry moat signals.
+ * Integration tests for the 7 telemetry signals.
  *
  * These tests verify that events actually fire through real code paths,
  * not just that the type definitions compile or utility functions work.
@@ -739,6 +739,9 @@ describe("Full E2E session simulation", () => {
       provider_id: "anthropic",
       agent: "default",
       project_id: "test",
+      os: "linux",
+      arch: "x64",
+      node_version: "v22.0.0",
     })
 
     // 2. task_classified
diff --git a/packages/opencode/test/telemetry/telemetry.test.ts b/packages/opencode/test/telemetry/telemetry.test.ts