feat(#1740): thread cumulative tokensUsed into SkillCompleted + Completed

Skobeltsyn · claude · Skobeltsyn · commit b67abeaf90d4 · 2026-05-16T00:44:16.000+03:00
Step 3.5: expose the token usage that executeAgentic already tracks on the
event surface, which until now carried tokensUsed: TokenUsage? = null placeholders.

executeAgentic now returns AgenticResult(output, tokenUsage) instead of
raw Any. cumulativeUsage builds up by summing promptTokens and
completionTokens across all turns (TokenUsage.total is derived).
executeAgentic became internal because AgenticResult is internal; only
in-package callers (Agent.kt) use it.

Agent.invokeSuspendForSession gains onSkillCompleted: (TokenUsage?) -> Unit
callback with a default no-op. session() captures it into capturedUsage
and threads it into both SkillCompleted and Completed events.
Agent.invokeSuspend and Agent.invokeSuspendWithPromptOverride unwrap
.output — preserves their OUT return contract byte-for-byte. For
implementedBy skills the callback never fires; tokensUsed stays null.

TDD red-first: two new tests in AgentSessionIntegrationTest. The
single-turn case asserts SkillCompleted.tokensUsed equals the stub's
turn-1 TokenUsage. The two-turn case (ToolCalls→Text with distinct
usages per turn) asserts cumulative equals the field-wise sum.

Full suite (root + KSP + no-reflect) green; live π test path unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/main/kotlin/agents_engine/core/Agent.kt b/src/main/kotlin/agents_engine/core/Agent.kt
@@ -268,14 +268,20 @@ class Agent<IN, OUT>(
     internal suspend fun invokeSuspendForSession(
         input: IN,
         emitter: agents_engine.model.AgentEventEmitter? = null,
+        onSkillCompleted: (agents_engine.model.TokenUsage?) -> Unit = { /* no-op */ },
         onSkillStarted: (String) -> Unit,
     ): OUT {
         try {
             val skill = resolveSkill(input)
             skillChosenListener?.invoke(skill.name)
             onSkillStarted(skill.name)
             return if (skill.isAgentic) {
-                castOut(executeAgentic(this, skill, input, emitter = emitter))
+                val result = executeAgentic(this, skill, input, emitter = emitter)
+                // #1740: surface cumulative usage on the way out. Non-agentic
+                // skills don't go through executeAgentic, so onSkillCompleted
+                // is never invoked on the implementedBy path below.
+                onSkillCompleted(result.tokenUsage)
+                castOut(result.output)
             } else {
                 castOut(executors[skill.name]!!(input))
             }
@@ -318,7 +324,7 @@ class Agent<IN, OUT>(
             val skill = resolveSkill(input)
             skillChosenListener?.invoke(skill.name)
             return if (skill.isAgentic) {
-                castOut(executeAgentic(this, skill, input, effectivePrompt = promptOverride))
+                castOut(executeAgentic(this, skill, input, effectivePrompt = promptOverride).output)
             } else {
                 // Non-agentic skills don't read prompt — implementedBy lambdas
                 // ignore the override. Same behavior as the legacy path.
diff --git a/src/main/kotlin/agents_engine/model/AgenticLoop.kt b/src/main/kotlin/agents_engine/model/AgenticLoop.kt
@@ -13,11 +13,20 @@ import kotlinx.coroutines.withContext
 
 private const val MAX_ARGUMENT_REPAIR_STEPS = 8
 
+/**
+ * #1740 — return shape from [executeAgentic]. Carries the parsed output
+ * alongside cumulative [TokenUsage] summed across all LLM turns of the
+ * invocation. [tokenUsage] is null when the provider never reported
+ * usage for any turn.
+ */
+internal data class AgenticResult(val output: Any, val tokenUsage: TokenUsage?)
+
 /**
  * Runs the agentic loop for [skill] on [agent] with [input].
- * Returns the parsed output as [Any]; the caller casts it via the agent's castOut.
+ * Returns the parsed output paired with cumulative token usage;
+ * the caller casts the output via the agent's castOut.
  */
-suspend fun <IN> executeAgentic(
+internal suspend fun <IN> executeAgentic(
     agent: Agent<IN, *>,
     skill: Skill<*, *>,
     input: IN,
@@ -37,7 +46,7 @@ suspend fun <IN> executeAgentic(
      * callers (`Agent.invoke`, `Agent.invokeSuspend`) pay no overhead.
      */
     emitter: AgentEventEmitter? = null,
-): Any {
+): AgenticResult {
     val config = requireNotNull(agent.modelConfig) {
         "Agent '${agent.name}' has no model configured. Add a model { } block."
     }
@@ -123,6 +132,9 @@ suspend fun <IN> executeAgentic(
     var turns = 0
     var toolCalls = 0
     var totalTokens = 0
+    // #1740: cumulative usage across all turns. Provider reports per-turn;
+    // we sum prompt and completion independently (TokenUsage.total is derived).
+    var cumulativeUsage: TokenUsage? = null
     var lastToolName: String? = null
     var consecutiveSameTool = 0
     val invocationStartNanos = System.nanoTime()
@@ -171,6 +183,13 @@ suspend fun <IN> executeAgentic(
         // even if it tips us over: the throw still surfaces the breach.
         response.tokenUsage?.let { usage ->
             totalTokens += usage.total
+            // #1740: build cumulative TokenUsage for the event surface.
+            cumulativeUsage = cumulativeUsage?.let { prev ->
+                TokenUsage(
+                    promptTokens = prev.promptTokens + usage.promptTokens,
+                    completionTokens = prev.completionTokens + usage.completionTokens,
+                )
+            } ?: usage
             val cap = budget.maxTokens
             if (cap != null) {
                 maybeFireThreshold(BudgetReason.TOKENS, totalTokens.toDouble() / cap)
@@ -185,9 +204,10 @@ suspend fun <IN> executeAgentic(
 
         when (response) {
             is LlmResponse.Text -> {
-                return skill.outputTransformer?.invoke(response.content)
+                val parsed = skill.outputTransformer?.invoke(response.content)
                     ?: parseOutput(response.content, agent.outType)
                     ?: error("Could not parse LLM output as ${agent.outType.simpleName}: '${response.content}'")
+                return AgenticResult(parsed, cumulativeUsage)
             }
             is LlmResponse.ToolCalls -> {
                 messages.add(LlmMessage("assistant", "", response.calls))
diff --git a/src/main/kotlin/agents_engine/runtime/events/AgentSessionExtension.kt b/src/main/kotlin/agents_engine/runtime/events/AgentSessionExtension.kt
@@ -42,6 +42,9 @@ fun <IN, OUT> Agent<IN, OUT>.session(input: IN): AgentSession<OUT> {
         // Captured-on-the-stack: each session has its own holder, so
         // concurrent sessions can't race on a shared field.
         var capturedSkillName: String? = null
+        // #1740: per-session usage capture from the agentic loop's cumulative
+        // total. Stays null for implementedBy skills (no LLM round-trip).
+        var capturedUsage: agents_engine.model.TokenUsage? = null
         // #1739: emitter forwards AgentEvents from inside the agentic loop
         // (Token, ToolCallStarted, ToolCallArgumentsDelta, ToolCallFinished)
         // into the same channel as the bracket events. trySend is non-
@@ -53,12 +56,16 @@ fun <IN, OUT> Agent<IN, OUT>.session(input: IN): AgentSession<OUT> {
             channel.trySend(event as AgentEvent<OUT>)
         }
         try {
-            val output = agent.invokeSuspendForSession(input, emitter = streamingEmitter) { skillName ->
+            val output = agent.invokeSuspendForSession(
+                input,
+                emitter = streamingEmitter,
+                onSkillCompleted = { usage -> capturedUsage = usage },
+            ) { skillName ->
                 capturedSkillName = skillName
                 channel.trySend(AgentEvent.SkillStarted(agent.name, skillName))
             }
-            channel.trySend(AgentEvent.SkillCompleted(agent.name, capturedSkillName ?: "?", null))
-            channel.trySend(AgentEvent.Completed(agent.name, output, null))
+            channel.trySend(AgentEvent.SkillCompleted(agent.name, capturedSkillName ?: "?", capturedUsage))
+            channel.trySend(AgentEvent.Completed(agent.name, output, capturedUsage))
             channel.close()
             result.complete(output)
         } catch (t: Throwable) {
diff --git a/src/test/kotlin/agents_engine/runtime/events/AgentSessionIntegrationTest.kt b/src/test/kotlin/agents_engine/runtime/events/AgentSessionIntegrationTest.kt
@@ -202,6 +202,73 @@ class AgentSessionIntegrationTest {
         assertTrue(finishedIdx < tokenIdx, "ToolCallFinished (from turn 1) must precede the final Token (from turn 2)")
     }
 
+    @Test
+    fun `tokensUsed on SkillCompleted and Completed reflects single-turn stub usage`() = runTest {
+        // #1740 — one-turn agentic stub with explicit TokenUsage.
+        // Cumulative usage for a one-turn run equals that turn's usage.
+        val usage = TokenUsage(promptTokens = 12, completionTokens = 5)
+        val stub = ModelClient { _ -> LlmResponse.Text("done", usage) }
+
+        val agentic = agent<String, String>("tu") {
+            prompt("Single-turn stub.")
+            model { ollama("llama3"); client = stub }
+            skills { skill<String, String>("respond", "Echoes via the model") { tools() } }
+        }
+
+        val events = agentic.session("kick").events.toList()
+
+        val skillCompleted = events.filterIsInstance<AgentEvent.SkillCompleted>().single()
+        val completed = events.filterIsInstance<AgentEvent.Completed<String>>().single()
+        assertEquals(usage, skillCompleted.tokensUsed, "SkillCompleted.tokensUsed must reflect the stub's TokenUsage")
+        assertEquals(usage, completed.tokensUsed, "Completed.tokensUsed must reflect the stub's TokenUsage")
+    }
+
+    @Test
+    fun `tokensUsed sums prompt and completion tokens across multiple turns`() = runTest {
+        // #1740 — two-turn stub (ToolCalls then Text). Each turn reports
+        // distinct usage. Cumulative on SkillCompleted/Completed must sum
+        // prompt and completion tokens independently across turns.
+        val turn1Usage = TokenUsage(promptTokens = 100, completionTokens = 20)
+        val turn2Usage = TokenUsage(promptTokens = 150, completionTokens = 35)
+        val turn1 = LlmResponse.ToolCalls(
+            listOf(
+                ToolCall(
+                    name = "ping",
+                    arguments = emptyMap(),
+                    rawArguments = "{}",
+                    callId = "call-multi-turn",
+                ),
+            ),
+            turn1Usage,
+        )
+        val turn2 = LlmResponse.Text("pong", turn2Usage)
+        val responses = ArrayDeque<LlmResponse>().apply { add(turn1); add(turn2) }
+        val stub = ModelClient { _ -> responses.removeFirst() }
+
+        val agentic = agent<String, String>("multi") {
+            prompt("Two-turn stub.")
+            model { ollama("llama3"); client = stub }
+            tools { tool("ping", "Returns pong") { _: Map<String, Any?> -> "pong" } }
+            skills {
+                skill<String, String>("respond", "Two-turn skill") {
+                    @Suppress("DEPRECATION")
+                    tools("ping")
+                }
+            }
+        }
+
+        val events = agentic.session("kick").events.toList()
+
+        val expected = TokenUsage(
+            promptTokens = turn1Usage.promptTokens + turn2Usage.promptTokens,
+            completionTokens = turn1Usage.completionTokens + turn2Usage.completionTokens,
+        )
+        val skillCompleted = events.filterIsInstance<AgentEvent.SkillCompleted>().single()
+        val completed = events.filterIsInstance<AgentEvent.Completed<String>>().single()
+        assertEquals(expected, skillCompleted.tokensUsed, "SkillCompleted.tokensUsed must sum prompt and completion tokens across turns")
+        assertEquals(expected, completed.tokensUsed, "Completed.tokensUsed must sum prompt and completion tokens across turns")
+    }
+
     // Tiny generic 4-tuple — assertable via destructuring in the concurrent test.
     private data class Quad<A, B, C, D>(val a: A, val b: B, val c: C, val d: D)
 }