Merge pull request #22 from Deep-CodeAI/feat/962-on-error-callback

Skobeltsyn · web-flow · commit db6166a5def0 · 2026-05-03T22:52:30.000+03:00
feat(#962): onError — infrastructure-error observability hook
diff --git a/README.md b/README.md
@@ -1195,6 +1195,7 @@ For the full contributor guide — running the live-LLM and MCP integration test
 - [x] `model { }` — Ollama backend; `host`, `port`, `temperature`; injectable `ModelClient` for tests; auto-fallback to inline JSON tool-call format for models without native tool support (#706)
 - [x] Agentic execution loop — multi-turn tool calling with budget controls (`maxTurns`, `maxToolCalls`, `maxDuration`, `perToolTimeout`) + `onToolUse` observability hook (#637)
 - [x] Skill selection — manual `skillSelection {}` + automatic LLM routing when multiple skills match
+- [x] `onError { Throwable -> }` — infrastructure-error observability hook (LLM transport, response parse, budget); pure observability — the original exception is always rethrown, and listener exceptions are attached to it as suppressed (#962)
 - [ ] `>>` — security/education wrap
 
 **Phase 2 — Runtime + Distribution** *(Q2 2026)*
diff --git a/docs/prd.md b/docs/prd.md
@@ -3927,6 +3927,7 @@ Notation: `[x]` shipped, `[ ]` planned. Mirrors the README's roadmap so contribu
 - [x] `onSkillChosen { name -> }` — fires when an agent selects a skill to execute
 - [x] `onKnowledgeUsed { name, content -> }` — fires when the LLM fetches a knowledge entry (tools model)
 - [x] Tool error recovery — `onToolError { invalidArgs / deserializationError / executionError { ... } }` with `RepairResult.Fixed / Retry / Escalated / Unrecoverable`
+- [x] `onError { Throwable -> }` — infrastructure-error observability hook (LLM transport, response parse, budget); pure observability — the original exception is always rethrown; listener exceptions are attached as suppressed (#962)
 - [x] MCP client — `mcp { server() }` agent DSL with HTTP / stdio / TCP transports, Bearer auth, namespacing
 - [x] MCP server — `McpServer.from(agent) { expose() }` exposes agent skills as MCP tools; 2025-03-26 spec conformance (ping, capabilities, protocolVersion negotiation, cursor/nextCursor, Content-Type/415, 405 with Allow, Mcp-Session-Id)
 - [x] MCP runner — `McpRunner.serve(agent, args)` picocli-style one-line `main` for standalone agent JARs
@@ -3942,7 +3943,6 @@ Notation: `[x]` shipped, `[ ]` planned. Mirrors the README's roadmap so contribu
 - [ ] MCP client integration — `McpTool` instances consumable alongside local tools
 - [ ] `grants { tools(...) }` — Layer 2 permissions use actual `Tool<*,*>` references
 - [ ] Permission model: 3 states — Granted (auto-runs), Confirmed (user approval), Absent (unavailable)
-- [ ] `onError` callback for infrastructure error handling
 - [ ] KSP annotation processor for compile-time `@Generable` (replaces runtime reflection); constrained decoding (Ollama/vLLM) + guided JSON mode (Anthropic/OpenAI)
 - [ ] Native CLI binary (GraalVM — no JRE required); `brew`, npm, pip, curl, apt
 - [ ] jlink minimal JRE bundle for runtime (~35 MB)
diff --git a/src/main/kotlin/agents_engine/core/Agent.kt b/src/main/kotlin/agents_engine/core/Agent.kt
@@ -86,6 +86,17 @@ class Agent<IN, OUT>(
         private set
     var routerRationaleListener: ((rationale: String) -> Unit)? = null
         private set
+    /**
+     * Observability hook invoked with any [Throwable] that is about to
+     * propagate out of [invokeSuspend] — LLM transport failures, response
+     * parse failures, budget exceptions, skill-routing failures, etc. The
+     * original exception is always rethrown after the listener runs. See #962.
+     *
+     * Distinct from [onToolError], which is per-tool *semantic* recovery and
+     * can substitute a value or repaired arguments for the failure.
+     */
+    var errorListener: ((Throwable) -> Unit)? = null
+        private set
     var skillSelectionConfidenceThreshold: Double = 0.6
         private set
     private var skillSelector: ((IN) -> String)? = null
@@ -138,6 +149,10 @@ class Agent<IN, OUT>(
         skillChosenListener = block
     }
 
+    fun onError(block: (Throwable) -> Unit) {
+        errorListener = block // deliberately no checkNotFrozen(): may be set on a frozen agent
+    }
+
     fun skillSelection(block: (IN) -> String) {
         checkNotFrozen()
         skillSelector = block
@@ -210,12 +225,28 @@ class Agent<IN, OUT>(
      * the agentic loop. The blocking [invoke] is a thin shim over this.
      */
     suspend fun invokeSuspend(input: IN): OUT {
-        val skill = resolveSkill(input)
-        skillChosenListener?.invoke(skill.name)
-        return if (skill.isAgentic) {
-            castOut(executeAgentic(this, skill, input))
-        } else {
-            castOut(executors[skill.name]!!(input))
+        try {
+            val skill = resolveSkill(input)
+            skillChosenListener?.invoke(skill.name)
+            return if (skill.isAgentic) {
+                castOut(executeAgentic(this, skill, input))
+            } else {
+                castOut(executors[skill.name]!!(input))
+            }
+        } catch (t: Throwable) {
+            // #962: observability hook. Fires on *anything* escaping this
+            // invocation — routing, skillChosenListener, agentic loop, plain
+            // executors, and coroutine cancellation — then rethrows it. A
+            // listener failure is attached as suppressed, never substituted.
+            errorListener?.let { listener ->
+                try {
+                    listener(t)
+                } catch (callbackError: Throwable) {
+                    // Skip self: a listener rethrowing `t` would make addSuppressed throw.
+                    if (callbackError !== t) t.addSuppressed(callbackError)
+                }
+            }
+            throw t
         }
     }
 
diff --git a/src/test/kotlin/agents_engine/model/OnErrorListenerTest.kt b/src/test/kotlin/agents_engine/model/OnErrorListenerTest.kt
@@ -0,0 +1,167 @@
+package agents_engine.model
+
+import agents_engine.core.agent
+import org.junit.jupiter.api.assertThrows
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertNotNull
+import kotlin.test.assertTrue
+
+// Tests for #962 — onError is the infrastructure-error observability hook.
+// It MUST fire when an exception is about to propagate out of an agent
+// invocation, and the original exception MUST always be rethrown afterward —
+// onError is observability, never recovery (that's onToolError's job).
+class OnErrorListenerTest {
+
+    @Test
+    fun `onError fires when ModelClient throws`() {
+        val captured = mutableListOf<Throwable>()
+        val mock = ModelClient { _ -> throw RuntimeException("transport blew up") }
+
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, String>("s", "s") { tools() } }
+            onError { captured += it }
+        }
+
+        val thrown = assertThrows<RuntimeException> { a("input") }
+        // Assert logical identity (class + message), not reference identity:
+        // coroutine stack-trace recovery may hand the caller a copy of the
+        // exception, so the instance seen by the listener and the instance
+        // seen by the caller are not guaranteed to be the same object.
+        assertEquals("transport blew up", thrown.message)
+        assertEquals(1, captured.size)
+        val seen = captured.single()
+        assertTrue(seen is RuntimeException)
+        assertEquals("transport blew up", seen.message)
+    }
+
+    @Test
+    fun `onError fires when LLM output cannot be parsed as agent OUT type`() {
+        // Agent declares OUT = Int, model returns text that cannot become an Int.
+        val mock = ModelClient { _ -> LlmResponse.Text("not-a-number") }
+
+        val captured = mutableListOf<Throwable>()
+        val a = agent<String, Int>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, Int>("s", "s") { tools() } }
+            onError { captured += it }
+        }
+
+        assertThrows<Throwable> { a("input") }
+        assertEquals(1, captured.size)
+        // Sanity: the captured message should mention the parse failure or the Int target.
+        val msg = captured.single().message.orEmpty()
+        assertTrue(msg.contains("parse", ignoreCase = true) || msg.contains("Int"))
+    }
+
+    @Test
+    fun `onError fires on BudgetExceededException`() {
+        // Model never returns Text — every response is a tool call into a no-op
+        // tool. With maxTurns = 1, the second turn trips the budget.
+        val responses = ArrayDeque<LlmResponse>()
+        repeat(8) {
+            responses.add(LlmResponse.ToolCalls(listOf(ToolCall(name = "noop", arguments = emptyMap()))))
+        }
+        val mock = ModelClient { _ -> responses.removeFirst() }
+
+        val captured = mutableListOf<Throwable>()
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            budget { maxTurns = 1 }
+            tools { tool("noop", "") { _ -> "ok" } }
+            skills { skill<String, String>("s", "s") { tools("noop") } }
+            onError { captured += it }
+        }
+
+        assertThrows<BudgetExceededException> { a("input") }
+        assertEquals(1, captured.size)
+        val captured0 = captured.single()
+        assertTrue(captured0 is BudgetExceededException)
+        assertEquals(BudgetReason.TURNS, captured0.reason)
+    }
+
+    @Test
+    fun `onError absent — no callback, original error still propagates`() {
+        // Agent declares no onError listener. The original exception must
+        // still reach the caller unchanged; the absence of a listener must
+        // not introduce any swallowing.
+        val boom = IllegalStateException("nope")
+        val mock = ModelClient { _ -> throw boom }
+
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, String>("s", "s") { tools() } }
+        }
+
+        val thrown = assertThrows<IllegalStateException> { a("input") }
+        assertEquals("nope", thrown.message)
+    }
+
+    @Test
+    fun `listener exception does not swallow the original error`() {
+        val mock = ModelClient { _ -> throw RuntimeException("real failure") }
+        val listenerError = IllegalStateException("listener itself blew up")
+
+        var listenerFired = false
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, String>("s", "s") { tools() } }
+            onError {
+                listenerFired = true
+                throw listenerError
+            }
+        }
+
+        val thrown = assertThrows<RuntimeException> { a("input") }
+        // The original message — not the listener's — is what surfaces.
+        // (If the listener's exception had swallowed the original, the
+        // caller would see "listener itself blew up" instead.)
+        assertEquals("real failure", thrown.message)
+        assertTrue(listenerFired)
+        // Listener's failure is attached to the propagated exception as a
+        // suppressed entry, so it's never silently lost.
+        val suppressed = thrown.suppressed.toList()
+        assertEquals(1, suppressed.size)
+        val attached = suppressed.single()
+        assertTrue(attached is IllegalStateException)
+        assertEquals("listener itself blew up", attached.message)
+    }
+
+    @Test
+    fun `onError fires only once per invocation`() {
+        // Sanity: the hook wraps invokeSuspend, not the inner helpers, so a
+        // single failing chat call must produce exactly one callback.
+        val mock = ModelClient { _ -> throw RuntimeException("once") }
+
+        var fireCount = 0
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, String>("s", "s") { tools() } }
+            onError { fireCount++ }
+        }
+
+        assertThrows<RuntimeException> { a("input") }
+        assertEquals(1, fireCount)
+    }
+
+    @Test
+    fun `onError listener is mutable post-construction (instrumentation use case)`() {
+        // The other listeners (onToolUse, onSkillChosen, onKnowledgeUsed) are
+        // intentionally settable post-construction for tracing instrumentation.
+        // onError follows the same convention — frozen state must not block it.
+        val mock = ModelClient { _ -> throw RuntimeException("infra") }
+
+        val a = agent<String, String>("a") {
+            model { ollama("llama3"); client = mock }
+            skills { skill<String, String>("s", "s") { tools() } }
+        }
+        // Agent is now validated/frozen.
+
+        var captured: Throwable? = null
+        a.onError { captured = it }
+
+        assertThrows<RuntimeException> { a("input") }
+        assertNotNull(captured)
+    }
+}