fix: detect semantic step failures via STEP_FAILED_MARKERS (C-3) (#120)

Copilot · huberp · web-flow · commit ea8029b169ec · 2026-04-25T20:38:07.000+02:00
* Initial plan

* fix: detect semantic step failures via STEP_FAILED_MARKERS (C-3)

Agent-Logs-Url: https://github.com/huberp/agentloop/sessions/75f1b2a0-7836-4ef6-9ef4-e2b26db63c33
Co-authored-by: huberp <4027454+huberp@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: huberp <4027454+huberp@users.noreply.github.com>
diff --git a/src/__tests__/langgraph.test.ts b/src/__tests__/langgraph.test.ts
@@ -22,7 +22,7 @@ import {
   isDeadlocked,
 } from "../langgraph/scheduler";
 import { buildGraphNodes, invokeGraph } from "../langgraph/graph";
-import { runPlannedStep } from "../langgraph/step-runner";
+import { runPlannedStep, STEP_FAILED_MARKERS } from "../langgraph/step-runner";
 import type {
   BlocksPlan,
   CompiledPlan,
@@ -856,3 +856,81 @@ describe("runPlannedStep — original request grounding", () => {
     expect(stepPrompts[0]).toContain("add Anthropic models to github repo huberp/agentloop");
   }, 30000);
 });
+
+// ─────────────────────────────────────────────────────────────────────────────
+// (12) runPlannedStep — semantic step failure detection
+// ─────────────────────────────────────────────────────────────────────────────
+
+describe("runPlannedStep — semantic failure detection", () => {
+  /**
+   * Builds the plan node under test; `overrides` replace individual default fields.
+   */
+  const makeNode = (overrides: Partial<CompiledPlanNode> = {}): CompiledPlanNode => ({
+    id: "s1",
+    description: "Fork the huberp/agentloop repository",
+    dependsOn: [],
+    toolsNeeded: [],
+    estimatedComplexity: "low",
+    resources: [],
+    ...overrides,
+  });
+
+  /**
+   * Stubs a chat model whose `invoke` (direct or via `bindTools`) resolves to `output` with no tool calls.
+   */
+  const makeLlmWithOutput = (output: string): BaseChatModel => {
+    const invoke = jest.fn().mockResolvedValue({ content: output, tool_calls: [] });
+    const stub = { invoke, bindTools: jest.fn().mockReturnValue({ invoke }) };
+    return stub as unknown as BaseChatModel;
+  };
+
+  /**
+   * Runs the default node with a newly constructed registry and a model stubbed to answer `output`.
+   */
+  const runWithOutput = (output: string) =>
+    runPlannedStep(makeNode(), { registry: new ToolRegistry(), llm: makeLlmWithOutput(output) });
+
+  it("returns status=failed when output contains 'I cannot'", async () => {
+    const result = await runWithOutput(
+      "I cannot directly fork a repository or perform GitHub actions like forking. " +
+        "However, you can manually fork the repository by following these steps.",
+    );
+
+    expect(result.status).toBe("failed");
+    expect(result.error).toContain("I cannot");
+    expect(result.output).toContain("I cannot");
+  });
+
+  it("returns status=failed when output contains 'I am unable'", async () => {
+    const result = await runWithOutput("I am unable to perform this action directly.");
+
+    expect(result.status).toBe("failed");
+    expect(result.error).toContain("I am unable");
+  });
+
+  it("returns status=failed when output contains 'cannot perform'", async () => {
+    // This output has no first-person subject; only the "cannot perform" phrase is present
+    const result = await runWithOutput("This agent cannot perform external API calls.");
+
+    expect(result.status).toBe("failed");
+  });
+
+  it("returns status=failed case-insensitively (e.g. 'I CANNOT')", async () => {
+    // Upper-case spelling of the "I cannot" phrase
+    const result = await runWithOutput("I CANNOT access external services.");
+
+    expect(result.status).toBe("failed");
+  });
+
+  it("returns status=success when output does not contain any failure marker", async () => {
+    const result = await runWithOutput("Repository cloned successfully.");
+
+    expect(result.status).toBe("success");
+    expect(result.output).toBe("Repository cloned successfully.");
+  });
+
+  it("exports STEP_FAILED_MARKERS as a non-empty array", () => {
+    expect(Array.isArray(STEP_FAILED_MARKERS)).toBe(true);
+    expect(STEP_FAILED_MARKERS.length).toBeGreaterThan(0);
+  });
+});
diff --git a/src/langgraph/step-runner.ts b/src/langgraph/step-runner.ts
@@ -35,6 +35,20 @@ const REPLAN_MARKERS = [
   "REPLAN_REQUESTED",
 ];
 
+/**
+ * Phrases by which the LLM signals it could not complete a step (semantic failure).
+ * Matched as case-insensitive substrings; read-only, as lookups are derived at load.
+ */
+export const STEP_FAILED_MARKERS: readonly string[] = [
+  "I cannot",
+  "I am unable",
+  "I don't have the ability",
+  "I do not have the ability",
+  "cannot perform",
+  "unable to perform",
+  "not able to",
+];
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Public API
 // ─────────────────────────────────────────────────────────────────────────────
@@ -127,6 +141,17 @@ export async function runPlannedStep(
       stepLlm,
     );
 
+    // Detect semantic failure — LLM explicitly declined or could not act
+    const stepFailed = detectStepFailure(result.output);
+    if (stepFailed.failed) {
+      logger.warn({ nodeId: node.id, reason: stepFailed.reason }, "Step semantically failed (LLM indicated inability)");
+      return {
+        status: "failed",
+        output: result.output,
+        error: stepFailed.reason ?? "LLM indicated it could not complete the step",
+      };
+    }
+
     // Detect replan request in the output
     const replanRequested = detectReplanRequest(result.output);
 
@@ -164,3 +189,29 @@ function detectReplanRequest(output: string): { requested: boolean; reason?: str
   }
   return { requested: false };
 }
+
+/**
+ * Case-insensitive patterns built once from STEP_FAILED_MARKERS. Matching runs on
+ * the original text because `toLowerCase()` can change length and shift offsets.
+ */
+const STEP_FAILED_PATTERNS = STEP_FAILED_MARKERS.map(
+  (marker) => new RegExp(marker.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i"),
+);
+
+/**
+ * Detect whether the LLM output semantically indicates an inability to complete
+ * the step (e.g. "I cannot fork…", "I am unable to…"). Markers are tried in
+ * declaration order and matched case-insensitively.
+ *
+ * @param output - Full text produced by the step.
+ * @returns `failed: true` plus a trimmed `reason` of up to 200 characters starting at the marker.
+ */
+function detectStepFailure(output: string): { failed: boolean; reason?: string } {
+  for (const pattern of STEP_FAILED_PATTERNS) {
+    const match = pattern.exec(output);
+    if (match !== null) {
+      return { failed: true, reason: output.slice(match.index, match.index + 200).trim() };
+    }
+  }
+  return { failed: false };
+}