MagicCube · 1330482928 · Apr 14, 2026 · Apr 14, 2026
diff --git a/src/agent/__tests__/agent-observation.test.ts b/src/agent/__tests__/agent-observation.test.ts
@@ -0,0 +1,149 @@
+import { describe, expect, test } from "bun:test";
+import z from "zod";
+
+import type { AssistantMessage, ModelProvider, ModelProviderInvokeParams } from "@/foundation";
+import { defineTool } from "@/foundation";
+import { Model } from "@/foundation/models";
+
+import { Agent } from "../agent";
+
+class RecordingProvider implements ModelProvider { // Test double: records every stream() call and replays canned responses in order.
+  calls: ModelProviderInvokeParams[] = []; // Params of each stream() call, in call order (stored by reference, not copied).
+  private readonly responses: AssistantMessage[]; // Canned replies; index i answers the (i+1)-th stream() call.
+
+  constructor(responses: AssistantMessage[]) {
+    this.responses = responses;
+  }
+
+  async invoke(): Promise<AssistantMessage> { // Always rejects: this double only supports stream().
+    throw new Error("invoke not implemented in test provider");
+  }
+
+  async *stream(params: ModelProviderInvokeParams): AsyncGenerator<AssistantMessage> {
+    this.calls.push(params); // Record first, so calls.length is the 1-based number of this call.
+    const next = this.responses[this.calls.length - 1];
+    if (!next) { // More stream() calls than canned responses were supplied.
+      throw new Error(`Unexpected model stream call #${this.calls.length}`);
+    }
+    yield next; // Exactly one message is emitted per call.
+  }
+}
+
+function getSystemPromptText(call: ModelProviderInvokeParams) { // Returns the text of the leading system message of one recorded model call.
+  const first = call.messages[0];
+  if (!first || first.role !== "system") { // Throws when there is no first message or it is not a system message.
+    throw new Error("Expected first message to be a system prompt");
+  }
+
+  const textBlock = first.content[0]; // Only the first content block is examined.
+  if (!textBlock || textBlock.type !== "text") { // Throws when that block is missing or is not a text block.
+    throw new Error("Expected first system content block to be text");
+  }
+
+  return textBlock.text;
+}
+
+describe("Agent tool observation injection", () => { // Checks the system prompt text the Agent sends to the model on each step.
+  test("injects tool observation immediately after the first failure and upgrades it after repeated failure", async () => {
+    const provider = new RecordingProvider([ // Scripted model: two grep_search calls with identical input, then a final text reply.
+      {
+        role: "assistant",
+        content: [
+          {
+            type: "tool_use",
+            id: "toolu_1",
+            name: "grep_search",
+            input: { pattern: "foo" },
+          },
+        ],
+      },
+      {
+        role: "assistant",
+        content: [
+          {
+            type: "tool_use",
+            id: "toolu_2",
+            name: "grep_search",
+            input: { pattern: "foo" },
+          },
+        ],
+      },
+      {
+        role: "assistant",
+        content: [{ type: "text", text: "done" }],
+      },
+    ]);
+
+    const model = new Model("test-model", provider);
+    const grepTool = defineTool({ // Stub tool whose invoke always returns the same failure result.
+      name: "grep_search",
+      description: "test grep",
+      parameters: z.object({ pattern: z.string() }),
+      invoke: async () => ({
+        ok: false as const,
+        summary: "Failed to run rg",
+        error: "Failed to run rg",
+        code: "RG_NOT_FOUND",
+      }),
+    });
+
+    const agent = new Agent({
+      model,
+      prompt: "You are a coding agent.",
+      messages: [],
+      tools: [grepTool],
+      maxSteps: 5,
+    });
+
+    for await (const _ of agent.stream({ role: "user", content: [{ type: "text", text: "find foo" }] })) { // Drain the stream; yielded values are not inspected.
+      void _;
+    }
+
+    expect(provider.calls).toHaveLength(3); // One model call per scripted response.
+
+    const firstPromptText = getSystemPromptText(provider.calls[0]!);
+    const secondPromptText = getSystemPromptText(provider.calls[1]!);
+    const thirdPromptText = getSystemPromptText(provider.calls[2]!);
+
+    expect(firstPromptText).toBe("You are a coding agent."); // First call: prompt must be exactly the configured prompt.
+
+    expect(secondPromptText).toContain("You are a coding agent."); // Second call follows the first failed grep_search.
+    expect(secondPromptText).toContain("<tool_observation>");
+    expect(secondPromptText).toContain("tool=grep_search");
+    expect(secondPromptText).toContain("repeated_failures=0"); // Not yet counted as a repeated failure.
+    expect(secondPromptText).not.toContain("repeated_failure=true");
+    expect(secondPromptText).toContain("avoid_immediate_retry_tools=grep_search");
+
+    expect(thirdPromptText).toContain("You are a coding agent."); // Third call follows the second failure with the same input.
+    expect(thirdPromptText).toContain("<tool_observation>");
+    expect(thirdPromptText).toContain("tool=grep_search");
+    expect(thirdPromptText).toContain("repeated_failures=1"); // Observation is now expected to flag the repeat.
+    expect(thirdPromptText).toContain("repeated_failure=true");
+    expect(thirdPromptText).toContain("avoid_immediate_retry_tools=grep_search");
+  });
+
+  test("does not inject tool observation before any tool failure occurs", async () => {
+    const provider = new RecordingProvider([ // Scripted model: a single text reply, no tool use.
+      {
+        role: "assistant",
+        content: [{ type: "text", text: "done" }],
+      },
+    ]);
+
+    const model = new Model("test-model", provider);
+    const agent = new Agent({
+      model,
+      prompt: "You are a coding agent.",
+      messages: [],
+      tools: [],
+      maxSteps: 2,
+    });
+
+    for await (const _ of agent.stream({ role: "user", content: [{ type: "text", text: "hello" }] })) { // Drain the stream; yielded values are not inspected.
+      void _;
+    }
+
+    expect(provider.calls).toHaveLength(1);
+    expect(getSystemPromptText(provider.calls[0]!)).toBe("You are a coding agent."); // Exact match: nothing may be appended to the prompt.
+  });
+});
diff --git a/src/agent/__tests__/tool-compaction.test.ts b/src/agent/__tests__/tool-compaction.test.ts
@@ -0,0 +1,98 @@
+import { describe, expect, test } from "bun:test";
+
+import { compactToolResultData, compactTranscriptPayload } from "../tool-compaction";
+import { getToolResultPolicy } from "../tool-result-policy";
+
+describe("compactToolResultData", () => { // Expectations for how success data is shrunk: long strings and long arrays.
+  test("truncates large strings in success data", () => {
+    const result = compactToolResultData({
+      toolName: "apply_patch",
+      normalized: {
+        ok: true,
+        summary: "Applied patch",
+        data: { patch: "x".repeat(1200) }, // 1200-character string as the oversized input.
+        raw: null,
+      },
+      policy: getToolResultPolicy("apply_patch"),
+    });
+
+    expect(result).toEqual({ // Expected shape lists no `raw` field.
+      ok: true,
+      summary: "Applied patch",
+      data: {
+        patch: expect.stringContaining("[truncated"), // Only the presence of the marker is checked, not the kept length.
+      },
+    });
+  });
+
+  test("samples long arrays", () => {
+    const result = compactToolResultData({
+      toolName: "list_files",
+      normalized: {
+        ok: true,
+        summary: "Listed files",
+        data: { entries: Array.from({ length: 25 }, (_, i) => `file-${i}`) }, // 25 entries: file-0 .. file-24.
+        raw: null,
+      },
+      policy: getToolResultPolicy("list_files"),
+    });
+
+    expect(result).toEqual({
+      ok: true,
+      summary: "Listed files",
+      data: {
+        entries: { // The array is expected to be replaced by a wrapper object.
+          items: Array.from({ length: 10 }, (_, i) => `file-${i}`), // First 10 entries, in original order.
+          truncated: true,
+          originalLength: 25,
+        },
+      },
+    });
+  });
+});
+
+describe("compactTranscriptPayload", () => { // Expectations for the payload shape produced for success and error results.
+  test("drops data for summary-first tools", () => {
+    const result = compactTranscriptPayload({
+      toolName: "grep_search",
+      normalized: {
+        ok: true,
+        summary: "Found 42 matches",
+        data: { matches: Array.from({ length: 15 }, (_, i) => `match-${i}`) }, // Supplied, but absent from the expected payload below.
+        raw: null,
+      },
+      policy: getToolResultPolicy("grep_search"),
+    });
+
+    expect(result).toEqual({ // Only ok and summary are expected.
+      ok: true,
+      summary: "Found 42 matches",
+    });
+  });
+
+  test("preserves compacted details for errors", () => {
+    const result = compactTranscriptPayload({
+      toolName: "grep_search",
+      normalized: {
+        ok: false,
+        summary: "grep failed",
+        error: "x".repeat(1200), // 1200-character error text as the oversized input.
+        code: "GREP_FAILED",
+        details: { stderr: "y".repeat(1200) },
+        errorKind: "execution_failed", // Supplied, but absent from the expected payload below.
+        raw: null,
+      },
+      policy: getToolResultPolicy("grep_search"),
+    });
+
+    expect(result).toEqual({
+      ok: false,
+      summary: "grep failed",
+      error: expect.stringContaining("[truncated"), // Only the presence of the marker is checked.
+      code: "GREP_FAILED",
+      details: {
+        stderr: expect.stringContaining("[truncated"),
+      },
+    });
+  });
+});
diff --git a/src/agent/__tests__/tool-observation.test.ts b/src/agent/__tests__/tool-observation.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, test } from "bun:test";
+
+import { buildRecentToolObservation } from "../tool-observation";
+import type { ToolTraceRecord, ToolTraceState } from "../tool-trace";
+
+function makeState(records: ToolTraceRecord[]): ToolTraceState { // Builds a ToolTraceState fixture from a list of trace records.
+  return {
+    recent: records, // Passed through as-is (same array reference).
+    repeatedFailureCount: records.filter((r) => r.repeatedFailure).length, // Number of records flagged repeatedFailure.
+  };
+}
+
+describe("buildRecentToolObservation", () => { // Unit checks of the observation text built from a ToolTraceState fixture.
+  test("returns null when there are no failures", () => {
+    const state = makeState([ // A single successful record.
+      {
+        step: 1,
+        toolName: "list_files",
+        toolUseId: "t1",
+        inputSignature: "{}",
+        ok: true,
+        summary: "ok",
+        repeatedFailure: false,
+      },
+    ]);
+
+    const observation = buildRecentToolObservation({
+      state,
+      getRecoveryHint: () => null,
+    });
+
+    expect(observation).toBeNull();
+  });
+
+  test("includes a summary line and avoid_immediate_retry_tools for repeated failures", () => {
+    const failure: ToolTraceRecord = {
+      step: 2,
+      toolName: "grep_search",
+      toolUseId: "t2",
+      inputSignature: "{\"pattern\":\"foo\"}",
+      ok: false,
+      summary: "rg not found",
+      code: "RG_NOT_FOUND",
+      errorKind: "environment_missing",
+      repeatedFailure: true,
+    };
+
+    const state = makeState([failure]);
+    const observation = buildRecentToolObservation({
+      state,
+      getRecoveryHint: () => ({ // Same hint for every failure; it asks to suppress an immediate retry.
+        message: "Missing env",
+        shouldSuppressImmediateRetry: true,
+        retryable: false,
+      }),
+    });
+
+    expect(observation).toContain("<tool_observation>");
+    expect(observation).toContain("summary: recent_failures=1");
+    expect(observation).toContain("repeated_failures=1");
+    expect(observation).toContain("avoid_immediate_retry_tools=grep_search");
+    expect(observation).toContain("repeated_failure=true");
+    expect(observation).toContain("avoid_immediate_retry=true");
+    expect(observation).toContain("code=RG_NOT_FOUND"); // code and errorKind of the record are expected in the text.
+    expect(observation).toContain("kind=environment_missing");
+  });
+
+  test("dedupes identical failures and prioritizes repeated failures over newer non-repeated ones", () => {
+    const a1: ToolTraceRecord = {
+      step: 1,
+      toolName: "apply_patch",
+      toolUseId: "t1",
+      inputSignature: "{\"patch\":\"...\"}",
+      ok: false,
+      summary: "Patch failed",
+      code: "PATCH_APPLY_FAILED",
+      errorKind: "execution_failed",
+      repeatedFailure: false,
+    };
+
+    const a2: ToolTraceRecord = { // Same failure as a1; differs only in step, toolUseId and the repeatedFailure flag.
+      ...a1,
+      step: 2,
+      toolUseId: "t2",
+      repeatedFailure: true,
+    };
+
+    const newerNonRepeated: ToolTraceRecord = {
+      step: 3,
+      toolName: "glob_search",
+      toolUseId: "t3",
+      inputSignature: "{\"pattern\":\"bar\"}",
+      ok: false,
+      summary: "glob failed",
+      code: "FILE_NOT_FOUND",
+      errorKind: "not_found",
+      repeatedFailure: false,
+    };
+
+    const state = makeState([a1, a2, newerNonRepeated]);
+    const observation = buildRecentToolObservation({
+      state,
+      getRecoveryHint: () => null,
+      maxFailures: 3, // NOTE(review): only two distinct failures exist and no assertion checks ordering or glob_search — confirm whether a tighter cap was intended to exercise prioritization.
+    });
+
+    expect(observation).toContain("summary: recent_failures=3"); // The summary is expected to count all three records, before dedup.
+    // Dedup should keep one formatted line for these identical failures.
+    expect(observation?.match(/tool=apply_patch/g)?.length).toBe(1);
+    expect(observation).toContain("repeated_failure=true");
+  });
+});
+