Skip to content

Commit db6166a

Browse files
authored
Merge pull request #22 from Deep-CodeAI/feat/962-on-error-callback
feat(#962): onError — infrastructure-error observability hook
2 parents 1906a29 + cbcce10 commit db6166a

4 files changed

Lines changed: 206 additions & 7 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,7 @@ For the full contributor guide — running the live-LLM and MCP integration test
11951195
- [x] `model { }` — Ollama backend; `host`, `port`, `temperature`; injectable `ModelClient` for tests; auto-fallback to inline JSON tool-call format for models without native tool support (#706)
11961196
- [x] Agentic execution loop — multi-turn tool calling with budget controls (`maxTurns`, `maxToolCalls`, `maxDuration`, `perToolTimeout`) + `onToolUse` observability hook (#637)
11971197
- [x] Skill selection — manual `skillSelection {}` + automatic LLM routing when multiple skills match
1198+
- [x] `onError { Throwable -> }` — infrastructure-error observability hook (LLM transport, response parse, budget); pure observability — original exception always rethrows (#962)
11981199
- [ ] `>>` — security/education wrap
11991200

12001201
**Phase 2 — Runtime + Distribution** *(Q2 2026)*

docs/prd.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3927,6 +3927,7 @@ Notation: `[x]` shipped, `[ ]` planned. Mirrors the README's roadmap so contribu
39273927
- [x] `onSkillChosen { name -> }` — fires when an agent selects a skill to execute
39283928
- [x] `onKnowledgeUsed { name, content -> }` — fires when the LLM fetches a knowledge entry (tools model)
39293929
- [x] Tool error recovery — `onToolError { invalidArgs / deserializationError / executionError { ... } }` with `RepairResult.Fixed / Retry / Escalated / Unrecoverable`
3930+
- [x] `onError { Throwable -> }` — infrastructure-error observability hook (LLM transport, response parse, budget); pure observability — original exception always rethrows; listener exceptions are attached as suppressed (#962)
39303931
- [x] MCP client — `mcp { server() }` agent DSL with HTTP / stdio / TCP transports, Bearer auth, namespacing
39313932
- [x] MCP server — `McpServer.from(agent) { expose() }` exposes agent skills as MCP tools; 2025-03-26 spec conformance (ping, capabilities, protocolVersion negotiation, cursor/nextCursor, Content-Type/415, 405 with Allow, Mcp-Session-Id)
39323933
- [x] MCP runner — `McpRunner.serve(agent, args)` picocli-style one-line `main` for standalone agent JARs
@@ -3942,7 +3943,6 @@ Notation: `[x]` shipped, `[ ]` planned. Mirrors the README's roadmap so contribu
39423943
- [ ] MCP client integration — `McpTool` instances consumable alongside local tools
39433944
- [ ] `grants { tools(...) }` — Layer 2 permissions use actual `Tool<*,*>` references
39443945
- [ ] Permission model: 3 states — Granted (auto-runs), Confirmed (user approval), Absent (unavailable)
3945-
- [ ] `onError` callback for infrastructure error handling
39463946
- [ ] KSP annotation processor for compile-time `@Generable` (replaces runtime reflection); constrained decoding (Ollama/vLLM) + guided JSON mode (Anthropic/OpenAI)
39473947
- [ ] Native CLI binary (GraalVM — no JRE required); `brew`, npm, pip, curl, apt
39483948
- [ ] jlink minimal JRE bundle for runtime (~35 MB)

src/main/kotlin/agents_engine/core/Agent.kt

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ class Agent<IN, OUT>(
8686
private set
8787
var routerRationaleListener: ((rationale: String) -> Unit)? = null
8888
private set
89+
/**
90+
* Fires when an infrastructure error is about to propagate out of an agentic
91+
* invocation — LLM transport failures, response parse failures, budget
92+
* exceptions, skill-routing failures, etc. Pure observability: the original
93+
* exception is always rethrown after the listener runs. See #962.
94+
*
95+
* Distinct from [onToolError], which is per-tool *semantic* recovery and
96+
* can substitute a value or repaired arguments for the failure.
97+
*/
98+
var errorListener: ((Throwable) -> Unit)? = null
99+
private set
89100
var skillSelectionConfidenceThreshold: Double = 0.6
90101
private set
91102
private var skillSelector: ((IN) -> String)? = null
@@ -138,6 +149,10 @@ class Agent<IN, OUT>(
138149
skillChosenListener = block
139150
}
140151

152+
fun onError(block: (Throwable) -> Unit) {
153+
errorListener = block
154+
}
155+
141156
fun skillSelection(block: (IN) -> String) {
142157
checkNotFrozen()
143158
skillSelector = block
@@ -210,12 +225,28 @@ class Agent<IN, OUT>(
210225
* the agentic loop. The blocking [invoke] is a thin shim over this.
211226
*/
212227
suspend fun invokeSuspend(input: IN): OUT {
213-
val skill = resolveSkill(input)
214-
skillChosenListener?.invoke(skill.name)
215-
return if (skill.isAgentic) {
216-
castOut(executeAgentic(this, skill, input))
217-
} else {
218-
castOut(executors[skill.name]!!(input))
228+
try {
229+
val skill = resolveSkill(input)
230+
skillChosenListener?.invoke(skill.name)
231+
return if (skill.isAgentic) {
232+
castOut(executeAgentic(this, skill, input))
233+
} else {
234+
castOut(executors[skill.name]!!(input))
235+
}
236+
} catch (t: Throwable) {
237+
// #962: observability hook for infrastructure errors. Fires on
238+
// *anything* that escapes the agentic invocation — LLM transport
239+
// failures, response parse failures, budget exceptions, skill
240+
// routing errors. Listener exceptions are attached as suppressed
241+
// so they can never swallow the original error.
242+
errorListener?.let { listener ->
243+
try {
244+
listener(t)
245+
} catch (callbackError: Throwable) {
246+
t.addSuppressed(callbackError)
247+
}
248+
}
249+
throw t
219250
}
220251
}
221252

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package agents_engine.model
2+
3+
import agents_engine.core.agent
4+
import org.junit.jupiter.api.assertThrows
5+
import kotlin.test.Test
6+
import kotlin.test.assertEquals
7+
import kotlin.test.assertNotNull
8+
import kotlin.test.assertTrue
9+
10+
// Tests for #962 — onError is the infrastructure-error observability hook.
11+
// It MUST fire when an exception is about to propagate out of an agentic
12+
// invocation, and the original exception MUST always rethrow afterward —
13+
// onError is observability, never recovery (that's onToolError's job).
14+
class OnErrorListenerTest {
15+
16+
@Test
17+
fun `onError fires when ModelClient throws`() {
18+
val captured = mutableListOf<Throwable>()
19+
val mock = ModelClient { _ -> throw RuntimeException("transport blew up") }
20+
21+
val a = agent<String, String>("a") {
22+
model { ollama("llama3"); client = mock }
23+
skills { skill<String, String>("s", "s") { tools() } }
24+
onError { captured += it }
25+
}
26+
27+
val thrown = assertThrows<RuntimeException> { a("input") }
28+
// (Coroutines stack-trace recovery clones exceptions across
29+
// dispatcher boundaries, so we assert on logical identity —
30+
// class + message — rather than reference identity. Both the
31+
// listener and the caller see logically the same exception.)
32+
assertEquals("transport blew up", thrown.message)
33+
assertEquals(1, captured.size)
34+
val seen = captured.single()
35+
assertTrue(seen is RuntimeException)
36+
assertEquals("transport blew up", seen.message)
37+
}
38+
39+
@Test
40+
fun `onError fires when LLM output cannot be parsed as agent OUT type`() {
41+
// Agent declares OUT = Int, model returns text that cannot become an Int.
42+
val mock = ModelClient { _ -> LlmResponse.Text("not-a-number") }
43+
44+
val captured = mutableListOf<Throwable>()
45+
val a = agent<String, Int>("a") {
46+
model { ollama("llama3"); client = mock }
47+
skills { skill<String, Int>("s", "s") { tools() } }
48+
onError { captured += it }
49+
}
50+
51+
assertThrows<Throwable> { a("input") }
52+
assertEquals(1, captured.size)
53+
// Sanity: the captured throwable's message should mention the parse failure.
54+
val msg = captured.single().message.orEmpty()
55+
assertTrue(msg.contains("parse", ignoreCase = true) || msg.contains("Int"))
56+
}
57+
58+
@Test
59+
fun `onError fires on BudgetExceededException`() {
60+
// Model never returns Text — every response is a tool call into a no-op
61+
// tool. With maxTurns = 1, the second turn trips the budget.
62+
val responses = ArrayDeque<LlmResponse>()
63+
repeat(8) {
64+
responses.add(LlmResponse.ToolCalls(listOf(ToolCall(name = "noop", arguments = emptyMap()))))
65+
}
66+
val mock = ModelClient { _ -> responses.removeFirst() }
67+
68+
val captured = mutableListOf<Throwable>()
69+
val a = agent<String, String>("a") {
70+
model { ollama("llama3"); client = mock }
71+
budget { maxTurns = 1 }
72+
tools { tool("noop", "") { _ -> "ok" } }
73+
skills { skill<String, String>("s", "s") { tools("noop") } }
74+
onError { captured += it }
75+
}
76+
77+
assertThrows<BudgetExceededException> { a("input") }
78+
assertEquals(1, captured.size)
79+
val captured0 = captured.single()
80+
assertTrue(captured0 is BudgetExceededException)
81+
assertEquals(BudgetReason.TURNS, captured0.reason)
82+
}
83+
84+
@Test
85+
fun `onError absent — no callback, original error still propagates`() {
86+
// Agent declares no onError listener. The original exception must
87+
// still reach the caller unchanged; the absence of a listener must
88+
// not introduce any swallowing.
89+
val boom = IllegalStateException("nope")
90+
val mock = ModelClient { _ -> throw boom }
91+
92+
val a = agent<String, String>("a") {
93+
model { ollama("llama3"); client = mock }
94+
skills { skill<String, String>("s", "s") { tools() } }
95+
}
96+
97+
val thrown = assertThrows<IllegalStateException> { a("input") }
98+
assertEquals("nope", thrown.message)
99+
}
100+
101+
@Test
102+
fun `listener exception does not swallow the original error`() {
103+
val mock = ModelClient { _ -> throw RuntimeException("real failure") }
104+
val listenerError = IllegalStateException("listener itself blew up")
105+
106+
var listenerFired = false
107+
val a = agent<String, String>("a") {
108+
model { ollama("llama3"); client = mock }
109+
skills { skill<String, String>("s", "s") { tools() } }
110+
onError {
111+
listenerFired = true
112+
throw listenerError
113+
}
114+
}
115+
116+
val thrown = assertThrows<RuntimeException> { a("input") }
117+
// The original message — not the listener's — is what surfaces.
118+
// (If the listener's exception had swallowed the original, the
119+
// caller would see "listener itself blew up" instead.)
120+
assertEquals("real failure", thrown.message)
121+
assertTrue(listenerFired)
122+
// Listener's failure is attached to the propagated exception as a
123+
// suppressed entry, so it's never silently lost.
124+
val suppressed = thrown.suppressed.toList()
125+
assertEquals(1, suppressed.size)
126+
val attached = suppressed.single()
127+
assertTrue(attached is IllegalStateException)
128+
assertEquals("listener itself blew up", attached.message)
129+
}
130+
131+
@Test
132+
fun `onError fires only once per invocation`() {
133+
// Sanity: the wrapper is around invokeSuspend, not around inner
134+
// helpers. A single failing chat call → exactly one fire.
135+
val mock = ModelClient { _ -> throw RuntimeException("once") }
136+
137+
var fireCount = 0
138+
val a = agent<String, String>("a") {
139+
model { ollama("llama3"); client = mock }
140+
skills { skill<String, String>("s", "s") { tools() } }
141+
onError { fireCount++ }
142+
}
143+
144+
assertThrows<RuntimeException> { a("input") }
145+
assertEquals(1, fireCount)
146+
}
147+
148+
@Test
149+
fun `onError listener is mutable post-construction (instrumentation use case)`() {
150+
// The other listeners (onToolUse, onSkillChosen, onKnowledgeUsed) are
151+
// intentionally settable post-construction for tracing instrumentation.
152+
// onError follows the same convention — frozen state must not block it.
153+
val mock = ModelClient { _ -> throw RuntimeException("infra") }
154+
155+
val a = agent<String, String>("a") {
156+
model { ollama("llama3"); client = mock }
157+
skills { skill<String, String>("s", "s") { tools() } }
158+
}
159+
// Agent is now validated/frozen.
160+
161+
var captured: Throwable? = null
162+
a.onError { captured = it }
163+
164+
assertThrows<RuntimeException> { a("input") }
165+
assertNotNull(captured)
166+
}
167+
}

0 commit comments

Comments
 (0)