feat(#1739): v0.5.0 step 3 — agentic-loop rewire emits Token + ToolCall* events

Skobeltsyn · claude · Skobeltsyn · commit 449e4651a4c3 · 2026-05-16T00:27:06.000+03:00
Third v0.5.0 streaming step. TDD red-first.

ToolCall gains callId: String? = null. Nullable + default keeps the
existing constructor signature working; non-streaming providers leave
it null. The streaming aggregator stamps it from LlmChunk.ToolCallStarted
so the loop carries the same id through to AgentEvent.ToolCallFinished.

New chatOrStream(client, messages, agentId, skillName, emitter):
- emitter == null  → withContext(IO) { client.chat(messages) } as before
- emitter != null  → collects client.chatStream(messages), emits Token
  for each TextDelta and ToolCallStarted / ToolCallArgumentsDelta for
  the matching LlmChunks. Provider-side LlmChunk.ToolCallFinished is
  bookkeeping; the consumer-facing AgentEvent.ToolCallFinished fires
  later in executeAgentic after the tool executor returns.

executeAgentic gains emitter: AgentEventEmitter? = null. After
executeToolWithBudget returns, emits ToolCallFinished with the
executor's result and isError=false. Try/catch around the executor
emits isError=true and rethrows so the loop's outer error path is
preserved.

Agent.invokeSuspendForSession plumbs the emitter through. Existing
Agent.invokeSuspend keeps the byte-for-byte non-streaming path (emitter
null). Agent.session(input) wires a trySend-based emitter into the
channel that fronts the events Flow.

Default ModelClient.chatStream honors call.callId when provided
(synthesizes UUID only when the non-streaming chat() path returned a
ToolCall without one). Preserves explicit ids end-to-end.

Tests (red-first):
- AgentSessionIntegrationTest's agentic-stub bracketing flipped from
  3 events to 4 — now expects a Token("done") between SkillStarted
  and SkillCompleted. ToolCall* still absent under a no-tool stub.
- New tool-call test exercises a two-turn stub (ToolCalls → Text),
  asserts shared callId between Started and Finished, single Token
  from the final text turn, strict ordering.
- New AgentSessionIncrementalArrivalTest: ModelClient overrides
  chatStream to insert delay(50) between TextDelta chunks. Collects
  events with arrival timestamps. Asserts first Token arrives at
  least 100ms before Completed — proves incremental flow vs.
  batch-at-end.

Verified: full root + KSP + no-reflect green; live π test still hits
full20=true through the new chatOrStream path.

tokensUsed threading on SkillCompleted/Completed deferred (separate
follow-up).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/main/kotlin/agents_engine/core/Agent.kt b/src/main/kotlin/agents_engine/core/Agent.kt
@@ -252,22 +252,30 @@ class Agent<IN, OUT>(
      * which lets parent-scope cancellation and `withTimeout` propagate cleanly into
      * the agentic loop. The blocking [invoke] is a thin shim over this.
      */
-    suspend fun invokeSuspend(input: IN): OUT = invokeSuspendForSession(input) { /* no-op */ }
+    suspend fun invokeSuspend(input: IN): OUT = invokeSuspendForSession(input, emitter = null) { /* no-op */ }
 
     /**
      * #1736 — session-aware sibling of [invokeSuspend]. Same logic, plus an
      * extra [onSkillStarted] callback fired after skill resolution and before
      * execution. Existing `invokeSuspend` delegates with a no-op callback, so
      * backward-compat is byte-for-byte; this entry point is only called by
      * `Agent.session(input)` to surface the skill name into the event flow.
+     *
+     * #1739 — when [emitter] is non-null, the agentic loop streams via
+     * `chatStream` and surfaces `Token` / `ToolCall*` events through it.
+     * Non-agentic skills ignore the emitter (they have no LLM round-trip).
      */
-    internal suspend fun invokeSuspendForSession(input: IN, onSkillStarted: (String) -> Unit): OUT {
+    internal suspend fun invokeSuspendForSession(
+        input: IN,
+        emitter: agents_engine.model.AgentEventEmitter? = null,
+        onSkillStarted: (String) -> Unit,
+    ): OUT {
         try {
             val skill = resolveSkill(input)
             skillChosenListener?.invoke(skill.name)
             onSkillStarted(skill.name)
             return if (skill.isAgentic) {
-                castOut(executeAgentic(this, skill, input))
+                castOut(executeAgentic(this, skill, input, emitter = emitter))
             } else {
                 castOut(executors[skill.name]!!(input))
             }
diff --git a/src/main/kotlin/agents_engine/model/AgenticLoop.kt b/src/main/kotlin/agents_engine/model/AgenticLoop.kt
@@ -28,6 +28,15 @@ suspend fun <IN> executeAgentic(
      * races on concurrent invocation of the same pipeline).
      */
     effectivePrompt: String = agent.prompt,
+    /**
+     * #1739: optional AgentEvent emitter. When non-null, the loop streams
+     * via `client.chatStream(...)`, surfaces `Token` / `ToolCallStarted` /
+     * `ToolCallArgumentsDelta` events from chunks, and emits
+     * `ToolCallFinished` after each tool executor runs. When null, the
+     * loop uses `client.chat(...)` byte-for-byte as before — non-streaming
+     * callers (`Agent.invoke`, `Agent.invokeSuspend`) pay no overhead.
+     */
+    emitter: AgentEventEmitter? = null,
 ): Any {
     val config = requireNotNull(agent.modelConfig) {
         "Agent '${agent.name}' has no model configured. Add a model { } block."
@@ -152,7 +161,7 @@ suspend fun <IN> executeAgentic(
             elapsedNanos.toDouble() / budget.maxDuration.inWholeNanoseconds,
         )
 
-        val response = withContext(Dispatchers.IO) { client.chat(messages) }
+        val response = chatOrStream(client, messages, agent.name, skill.name, emitter)
         turns++
         maybeFireThreshold(BudgetReason.TURNS, turns.toDouble() / budget.maxTurns)
 
@@ -213,9 +222,46 @@ suspend fun <IN> executeAgentic(
                             "Tool '${call.name}' is not allowed for skill '${skill.name}'. " +
                                 "Allowed: ${allowedToolMap.keys}"
                         )
-                    val result = executeToolWithBudget(agent, tool, call, budget)
+                    val result = try {
+                        executeToolWithBudget(agent, tool, call, budget)
+                    } catch (t: Throwable) {
+                        // #1739: tool executor threw and onError didn't recover.
+                        // Surface a ToolCallFinished event with isError=true so
+                        // consumers see the failure, then rethrow — the loop's
+                        // outer error path takes over (session emits Failed).
+                        if (emitter != null && call.callId != null) {
+                            emitter(
+                                agents_engine.runtime.events.AgentEvent.ToolCallFinished(
+                                    agentId = agent.name,
+                                    callId = call.callId,
+                                    toolName = call.name,
+                                    arguments = call.arguments,
+                                    result = t.message,
+                                    isError = true,
+                                )
+                            )
+                        }
+                        throw t
+                    }
                     if (isKnowledge) agent.knowledgeUsedListener?.invoke(call.name, result?.toString() ?: "")
                     else agent.toolUseListener?.invoke(call.name, call.arguments, result)
+                    // #1739: emit ToolCallFinished on the success path with the
+                    // executor's return value. callId is the one the streaming
+                    // aggregator stamped on this ToolCall — null only when the
+                    // emitter is null (no event work needed) or the non-streaming
+                    // path produced a ToolCall without one.
+                    if (emitter != null && call.callId != null) {
+                        emitter(
+                            agents_engine.runtime.events.AgentEvent.ToolCallFinished(
+                                agentId = agent.name,
+                                callId = call.callId,
+                                toolName = call.name,
+                                arguments = call.arguments,
+                                result = result,
+                                isError = false,
+                            )
+                        )
+                    }
                     val toolMessage = if (tool.untrustedOutput) {
                         wrapUntrustedToolResult(tool.name, result)
                     } else {
diff --git a/src/main/kotlin/agents_engine/model/ModelClient.kt b/src/main/kotlin/agents_engine/model/ModelClient.kt
@@ -11,6 +11,16 @@ data class ToolCall(
     val arguments: Map<String, Any?> = emptyMap(),
     val rawArguments: String? = null,
     val invalidArgumentsError: String? = null,
+    /**
+     * #1739 — provider-side call identifier. Set by streaming adapters
+     * (Anthropic SSE `tool_use_id`, OpenAI `tool_call_id`, MCP) so the
+     * agentic loop can correlate the chunks of one tool call back to a
+     * single `AgentEvent.ToolCallStarted` / `ToolCallFinished` pair, even
+     * under interleaved streaming. Nullable with default null — non-
+     * streaming providers that don't surface an explicit id can leave
+     * this empty.
+     */
+    val callId: String? = null,
 )
 
 /**
@@ -64,7 +74,12 @@ fun interface ModelClient {
                 }
                 is LlmResponse.ToolCalls -> {
                     response.calls.forEach { call ->
-                        val callId = java.util.UUID.randomUUID().toString()
+                        // #1739: honor the provider's callId when supplied; synthesize
+                        // only when the non-streaming `chat()` path returned a ToolCall
+                        // without one. This keeps explicit ids stable end-to-end so
+                        // AgentEvent.ToolCallStarted and ToolCallFinished can be
+                        // matched by consumers.
+                        val callId = call.callId ?: java.util.UUID.randomUUID().toString()
                         emit(LlmChunk.ToolCallStarted(callId, call.name))
                         emit(LlmChunk.ToolCallArgumentsDelta(callId, call.rawArguments ?: ""))
                         emit(LlmChunk.ToolCallFinished(callId, call.arguments))
diff --git a/src/main/kotlin/agents_engine/model/StreamingAggregator.kt b/src/main/kotlin/agents_engine/model/StreamingAggregator.kt
@@ -0,0 +1,95 @@
+package agents_engine.model
+
+import agents_engine.runtime.events.AgentEvent
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.flow.collect
+import kotlinx.coroutines.withContext
+
+// #1739 — emitter shape used to plumb AgentEvents out of the agentic
+// loop. Star-projected (`AgentEvent<*>`) because the events sent through
+// it (Token, ToolCall*) do not carry the typed `OUT` payload; only
+// `AgentEvent.Completed<OUT>` carries the typed payload and that's
+// emitted in `Agent.session(input)` after the loop returns.
+internal typealias AgentEventEmitter = suspend (AgentEvent<*>) -> Unit
+
+/**
+ * #1739 — round-trip the model: either via the existing non-streaming
+ * `chat()` path (when [emitter] is null — byte-for-byte the old
+ * behavior) or via `chatStream()` aggregated into the same `LlmResponse`
+ * the agentic loop expects, emitting `AgentEvent` chunks as they arrive.
+ * Both paths run the model call on `Dispatchers.IO`.
+ *
+ * Aggregation strategy:
+ * - `TextDelta` chunks are concatenated into a final `LlmResponse.Text`.
+ *   Each delta also fires an `AgentEvent.Token`. The text is returned
+ *   only when the stream announced no tool call.
+ * - `ToolCallStarted` records `callId` -> `toolName` in first-arrival
+ *   order and fires `AgentEvent.ToolCallStarted`. A repeated start for
+ *   the same `callId` keeps one entry (the latest name wins).
+ * - `ToolCallArgumentsDelta` fires the matching `AgentEvent` with the
+ *   same `callId`; the delta text itself is not accumulated here.
+ * - `ToolCallFinished` (provider-side) records final arguments per
+ *   `callId`. **No `AgentEvent.ToolCallFinished` fires here** — that
+ *   one needs the executor's `result`, which the agentic loop produces
+ *   after this function returns. A call that never gets a finish chunk
+ *   is returned with empty arguments.
+ * - `End` carries optional `tokenUsage` into the returned `LlmResponse`.
+ *
+ * @param emitter receives the events above, invoked on `Dispatchers.IO`;
+ *   pass null to take the plain `chat()` path with no events.
+ * @return `LlmResponse.ToolCalls` (each `ToolCall.callId` set to the
+ *   chunk's id) if any call started, otherwise `LlmResponse.Text`.
+ */
+internal suspend fun chatOrStream(
+    client: ModelClient,
+    messages: List<LlmMessage>,
+    agentId: String,
+    skillName: String,
+    emitter: AgentEventEmitter?,
+): LlmResponse {
+    if (emitter == null) {
+        return withContext(Dispatchers.IO) { client.chat(messages) }
+    }
+    val textBuilder = StringBuilder()
+    // Insertion-ordered map: its key order is the arrival order of the calls, and a
+    // repeated ToolCallStarted for one callId cannot queue the same tool call twice.
+    val pendingNames = linkedMapOf<String, String>()
+    val pendingArgs = mutableMapOf<String, Map<String, Any?>>()
+    var tokenUsage: TokenUsage? = null
+
+    // Collect on Dispatchers.IO like the chat() branch above, so a chatStream
+    // implementation that wraps a blocking call does not stall the caller's thread.
+    withContext(Dispatchers.IO) {
+        client.chatStream(messages).collect { chunk ->
+            when (chunk) {
+                is LlmChunk.TextDelta -> {
+                    textBuilder.append(chunk.text)
+                    emitter(AgentEvent.Token(agentId, skillName, chunk.text))
+                }
+                is LlmChunk.ToolCallStarted -> {
+                    pendingNames[chunk.callId] = chunk.toolName
+                    emitter(AgentEvent.ToolCallStarted(agentId, skillName, chunk.callId, chunk.toolName))
+                }
+                is LlmChunk.ToolCallArgumentsDelta -> {
+                    emitter(AgentEvent.ToolCallArgumentsDelta(agentId, chunk.callId, chunk.deltaJson))
+                }
+                is LlmChunk.ToolCallFinished -> {
+                    // Bookkeeping only — the consumer-facing AgentEvent.ToolCallFinished
+                    // fires AFTER the agentic loop runs the tool executor and has a result.
+                    pendingArgs[chunk.callId] = chunk.arguments
+                }
+                is LlmChunk.End -> {
+                    tokenUsage = chunk.tokenUsage
+                }
+            }
+        }
+    }
+
+    if (pendingNames.isEmpty()) {
+        return LlmResponse.Text(textBuilder.toString(), tokenUsage)
+    }
+    val calls = pendingNames.map { (callId, toolName) ->
+        ToolCall(name = toolName, arguments = pendingArgs[callId] ?: emptyMap(), callId = callId)
+    }
+    return LlmResponse.ToolCalls(calls, tokenUsage)
+}
diff --git a/src/main/kotlin/agents_engine/runtime/events/AgentSessionExtension.kt b/src/main/kotlin/agents_engine/runtime/events/AgentSessionExtension.kt
@@ -40,12 +40,20 @@ fun <IN, OUT> Agent<IN, OUT>.session(input: IN): AgentSession<OUT> {
     val scope = CoroutineScope(SupervisorJob() + Dispatchers.Unconfined)
     scope.launch {
         // Captured-on-the-stack: each session has its own holder, so
-        // concurrent sessions can't race on a shared field. Step 3's
-        // agentic-loop rewire moves skill-name tracking into the
-        // FlowCollector chain proper.
+        // concurrent sessions can't race on a shared field.
         var capturedSkillName: String? = null
+        // #1739: emitter forwards AgentEvents from inside the agentic loop
+        // (Token, ToolCallStarted, ToolCallArgumentsDelta, ToolCallFinished)
+        // into the same channel as the bracket events. trySend is non-
+        // suspending — appropriate for a BUFFERED channel; if the buffer
+        // ever fills (it has high capacity), excess events would be
+        // dropped silently. Step 4 will tighten this for high-throughput
+        // streaming.
+        val streamingEmitter: agents_engine.model.AgentEventEmitter = { event ->
+            channel.trySend(event as AgentEvent<OUT>)
+        }
         try {
-            val output = agent.invokeSuspendForSession(input) { skillName ->
+            val output = agent.invokeSuspendForSession(input, emitter = streamingEmitter) { skillName ->
                 capturedSkillName = skillName
                 channel.trySend(AgentEvent.SkillStarted(agent.name, skillName))
             }
diff --git a/src/test/kotlin/agents_engine/runtime/events/AgentSessionIncrementalArrivalTest.kt b/src/test/kotlin/agents_engine/runtime/events/AgentSessionIncrementalArrivalTest.kt
@@ -0,0 +1,98 @@
+package agents_engine.runtime.events
+
+import agents_engine.core.agent
+import agents_engine.model.LlmChunk
+import agents_engine.model.LlmMessage
+import agents_engine.model.LlmResponse
+import agents_engine.model.ModelClient
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.runBlocking
+import kotlin.test.Test
+import kotlin.test.assertIs
+import kotlin.test.assertTrue
+
+/**
+ * #1739 — guards the load-bearing claim of streaming: AgentEvent.Token
+ * events reach the consumer WHILE the agentic loop is still running,
+ * instead of in a single burst once it has finished.
+ *
+ * Step 2's tests only verified event *ordering* through
+ * `events.toList()`, which buffers the whole flow, so an implementation
+ * that batches everything at the end would have passed them.
+ *
+ * Approach: a custom ModelClient overrides `chatStream` and pauses with
+ * `delay(50)` between chunks. Each event is stamped with its arrival
+ * time, and the first Token must land well before Completed. Should
+ * `chatOrStream` ever aggregate and batch-emit, that gap collapses and
+ * the assertion fails.
+ *
+ * Runs under `runBlocking` (real clock): runTest's virtual time would
+ * defeat the timing-based assertion.
+ */
+class AgentSessionIncrementalArrivalTest {
+
+    /**
+     * Streaming-only stub: four TextDelta chunks separated by three 50ms
+     * pauses (≈ 150ms of wire time at minimum), then End.
+     */
+    private val incrementalStub = object : ModelClient {
+        override fun chat(messages: List<LlmMessage>): LlmResponse =
+            error("incrementalStub forces the streaming path; chat() must not be called")
+
+        override suspend fun chatStream(messages: List<LlmMessage>): Flow<LlmChunk> = flow {
+            listOf("alpha ", "beta ", "gamma ", "delta").forEachIndexed { index, word ->
+                // Pause before every chunk except the first.
+                if (index > 0) delay(50)
+                emit(LlmChunk.TextDelta(word))
+            }
+            emit(LlmChunk.End(tokenUsage = null))
+        }
+    }
+
+    @Test
+    fun `Token events arrive incrementally while the stream produces chunks, not batched at the end`() = runBlocking {
+        val subject = agent<String, String>("inc") {
+            prompt("Incremental stub.")
+            model { ollama("llama3"); client = incrementalStub }
+            skills {
+                skill<String, String>("recite", "Streams four words") { tools() }
+            }
+        }
+        val session = subject.session("kick")
+
+        // Stamp arrivals relative to the moment collection begins.
+        val originNs = System.nanoTime()
+        var firstTokenAtMs: Long? = null
+        var completedAtMs: Long? = null
+        session.events.collect { event ->
+            val nowMs = (System.nanoTime() - originNs) / 1_000_000
+            if (event is AgentEvent.Token) {
+                firstTokenAtMs = firstTokenAtMs ?: nowMs
+            } else if (event is AgentEvent.Completed<*>) {
+                completedAtMs = nowMs
+            }
+        }
+
+        // Both arrival timestamps must have been recorded.
+        val first = firstTokenAtMs ?: error("never observed a Token event")
+        val last = completedAtMs ?: error("never observed a Completed event")
+
+        // The stub pauses three times (~150ms) after the first chunk; requiring
+        // only 100ms (two pauses) leaves slack for CI scheduling noise.
+        val gap = last - first
+        val gapMessage =
+            "expected first Token to arrive at least 100ms before Completed (proof of incremental flow); " +
+                "got first=${first}ms, completed=${last}ms, gap=${gap}ms"
+        assertTrue(gap >= 100, gapMessage)
+
+        // Final assembled output spans all four chunks.
+        val output = session.await()
+        assertIs<String>(output)
+        assertTrue(
+            output.contains("alpha beta gamma delta"),
+            "expected full assembled text; got: \"$output\"",
+        )
+    }
+}
diff --git a/src/test/kotlin/agents_engine/runtime/events/AgentSessionIntegrationTest.kt b/src/test/kotlin/agents_engine/runtime/events/AgentSessionIntegrationTest.kt