Skip to content

Commit ea8029b

Browse files
Copilot and huberp authored
fix: detect semantic step failures via STEP_FAILED_MARKERS (C-3) (#120)
* Initial plan * fix: detect semantic step failures via STEP_FAILED_MARKERS (C-3) Agent-Logs-Url: https://github.com/huberp/agentloop/sessions/75f1b2a0-7836-4ef6-9ef4-e2b26db63c33 Co-authored-by: huberp <4027454+huberp@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: huberp <4027454+huberp@users.noreply.github.com>
1 parent 3afffa5 commit ea8029b

2 files changed

Lines changed: 130 additions & 1 deletion

File tree

src/__tests__/langgraph.test.ts

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import {
2222
isDeadlocked,
2323
} from "../langgraph/scheduler";
2424
import { buildGraphNodes, invokeGraph } from "../langgraph/graph";
25-
import { runPlannedStep } from "../langgraph/step-runner";
25+
import { runPlannedStep, STEP_FAILED_MARKERS } from "../langgraph/step-runner";
2626
import type {
2727
BlocksPlan,
2828
CompiledPlan,
@@ -856,3 +856,81 @@ describe("runPlannedStep — original request grounding", () => {
856856
expect(stepPrompts[0]).toContain("add Anthropic models to github repo huberp/agentloop");
857857
}, 30000);
858858
});
859+
860+
// ─────────────────────────────────────────────────────────────────────────────
861+
// (12) runPlannedStep — semantic step failure detection
862+
// ─────────────────────────────────────────────────────────────────────────────
863+
864+
describe("runPlannedStep — semantic failure detection", () => {
  /** Baseline plan node shared by every case; any field can be overridden. */
  const buildNode = (overrides: Partial<CompiledPlanNode> = {}): CompiledPlanNode => ({
    id: "s1",
    description: "Fork the huberp/agentloop repository",
    dependsOn: [],
    toolsNeeded: [],
    estimatedComplexity: "low",
    resources: [],
    ...overrides,
  });

  /**
   * Stub chat model: `invoke` always resolves to `reply` with no tool calls,
   * and `bindTools` hands back an object exposing that same `invoke` mock.
   */
  const stubLlm = (reply: string): BaseChatModel => {
    const invoke = jest.fn().mockResolvedValue({ content: reply, tool_calls: [] });
    const bindTools = jest.fn().mockReturnValue({ invoke });
    return { invoke, bindTools } as unknown as BaseChatModel;
  };

  /** Runs the baseline node through runPlannedStep with a fresh registry and a stubbed reply. */
  const runWithReply = (reply: string) => {
    const llm = stubLlm(reply);
    const registry = new ToolRegistry();
    return runPlannedStep(buildNode(), { registry, llm });
  };

  it("returns status=failed when output contains 'I cannot'", async () => {
    const reply =
      "I cannot directly fork a repository or perform GitHub actions like forking. " +
      "However, you can manually fork the repository by following these steps.";

    const result = await runWithReply(reply);

    expect(result.status).toBe("failed");
    expect(result.error).toContain("I cannot");
    expect(result.output).toContain("I cannot");
  });

  it("returns status=failed when output contains 'I am unable'", async () => {
    const result = await runWithReply("I am unable to perform this action directly.");

    expect(result.status).toBe("failed");
    expect(result.error).toContain("I am unable");
  });

  it("returns status=failed when output contains 'cannot perform'", async () => {
    const result = await runWithReply("This agent cannot perform external API calls.");

    expect(result.status).toBe("failed");
  });

  it("returns status=failed case-insensitively (e.g. 'I CANNOT')", async () => {
    const result = await runWithReply("I CANNOT access external services.");

    expect(result.status).toBe("failed");
  });

  it("returns status=success when output does not contain any failure marker", async () => {
    const result = await runWithReply("Repository cloned successfully.");

    expect(result.status).toBe("success");
    expect(result.output).toBe("Repository cloned successfully.");
  });

  it("exports STEP_FAILED_MARKERS as a non-empty array", () => {
    expect(Array.isArray(STEP_FAILED_MARKERS)).toBe(true);
    expect(STEP_FAILED_MARKERS.length).toBeGreaterThan(0);
  });
});

src/langgraph/step-runner.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ const REPLAN_MARKERS = [
3535
"REPLAN_REQUESTED",
3636
];
3737

38+
/**
 * Phrases whose presence in a step's output is treated as a semantic failure,
 * i.e. the LLM reported that it could not carry out the step.
 *
 * The phrases are compared case-insensitively against the full step output.
 */
export const STEP_FAILED_MARKERS = [
  // First-person statements of inability
  "I cannot",
  "I am unable",
  "I don't have the ability",
  "I do not have the ability",

  // Subject-agnostic phrasings
  "cannot perform",
  "unable to perform",
  "not able to",
];
51+
3852
// ─────────────────────────────────────────────────────────────────────────────
3953
// Public API
4054
// ─────────────────────────────────────────────────────────────────────────────
@@ -127,6 +141,17 @@ export async function runPlannedStep(
127141
stepLlm,
128142
);
129143

144+
// Detect semantic failure — LLM explicitly declined or could not act
145+
const stepFailed = detectStepFailure(result.output);
146+
if (stepFailed.failed) {
147+
logger.warn({ nodeId: node.id, reason: stepFailed.reason }, "Step semantically failed (LLM indicated inability)");
148+
return {
149+
status: "failed",
150+
output: result.output,
151+
error: stepFailed.reason ?? "LLM indicated it could not complete the step",
152+
};
153+
}
154+
130155
// Detect replan request in the output
131156
const replanRequested = detectReplanRequest(result.output);
132157

@@ -164,3 +189,29 @@ function detectReplanRequest(output: string): { requested: boolean; reason?: str
164189
}
165190
return { requested: false };
166191
}
192+
193+
/**
 * One case-insensitive pattern per entry of STEP_FAILED_MARKERS, compiled once
 * at module load so no per-call normalisation is needed.
 *
 * Each marker is escaped so it is matched literally. Patterns are searched
 * directly against the original output (instead of a lowercased copy) so the
 * match index is always a valid offset into that original string —
 * `toLowerCase()` can change a string's length for some characters.
 */
const STEP_FAILED_MARKER_PATTERNS: readonly RegExp[] = STEP_FAILED_MARKERS.map(
  (marker) => new RegExp(marker.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i"),
);

/**
 * Detect whether the LLM output semantically indicates an inability to complete
 * the step (e.g. "I cannot fork…", "I am unable to…").
 *
 * Markers are tried in the order they appear in STEP_FAILED_MARKERS; the first
 * marker found anywhere in the output decides the result. Matching is
 * case-insensitive so that natural variations are caught.
 *
 * @param output - Full text produced for the step.
 * @returns `{ failed: true, reason }` where `reason` is the trimmed text that
 *   starts at the matched marker and spans at most 200 characters, or
 *   `{ failed: false }` when no marker occurs.
 */
function detectStepFailure(output: string): { failed: boolean; reason?: string } {
  for (const pattern of STEP_FAILED_MARKER_PATTERNS) {
    const idx = output.search(pattern);
    if (idx !== -1) {
      // Snippet begins at the marker and runs forward, giving the error
      // message the refusal sentence itself rather than preceding text.
      const snippet = output.slice(idx, idx + 200).trim();
      return { failed: true, reason: snippet };
    }
  }
  return { failed: false };
}

0 commit comments

Comments
 (0)