feat(#1743): OpenAiClient native chatStream — SSE with [DONE] terminator

Skobeltsyn · claude · Skobeltsyn · commit 5e9b4202ec71 · 2026-05-16T00:59:27.000+03:00
Third native streaming adapter. Closes the trio (Ollama / Claude /
OpenAI). OpenAI's SSE is simpler than Anthropic's — all events are
data:-only (no event: names), terminated by the literal data: [DONE].

OpenAI quirks handled:
- Tool call id arrives in the FIRST delta for a tool_calls[].index
  only; subsequent deltas correlate by index. Aggregator caches
  index -> id after first sighting.
- Arguments are concatenated string fragments across deltas; we
  parse the accumulated buffer at finish_reason == "tool_calls".
- Token usage requires opt-in via stream_options.include_usage: true.
  OpenAI then sends a final usage-only delta with choices: [] and
  usage object. The End chunk carries it.

Implementation mirrors ClaudeClient's shape: chatStream override +
sendChatStream test seam + buildRequestJson(stream: Boolean = false)
parameter, flowOn(Dispatchers.IO).

TDD red-first: two non-live tests with hardcoded SSE (text-only,
tool-call with chunked arguments). Plus a live integration test:

  OpenAiClientChatStreamLiveTest: model=gpt-4o-mini chunks=19
  firstMs=3199 lastMs=3401 gapMs=202
  assembled="1 2 3 4 5 6 7 8 9 10"

All three providers now stream natively at the wire level:
- Ollama: 19 chunks / 84ms gap (NDJSON)
- Claude: 2 chunks / 27ms gap (SSE, fastest)
- OpenAI: 19 chunks / 202ms gap (SSE)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/main/kotlin/agents_engine/model/OpenAiClient.kt b/src/main/kotlin/agents_engine/model/OpenAiClient.kt
@@ -2,13 +2,20 @@ package agents_engine.model
 
 import agents_engine.generation.LenientJsonParser
 import agents_engine.generation.jsonSchema
+import java.io.BufferedReader
+import java.io.InputStream
+import java.io.InputStreamReader
 import java.net.URI
 import java.net.http.HttpClient
 import java.net.http.HttpRequest
 import java.net.http.HttpResponse
 import kotlin.time.Duration
 import kotlin.time.Duration.Companion.seconds
 import kotlin.time.toJavaDuration
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.flow.flowOn
 
 /**
  * OpenAI Chat Completions adapter (#1656). Mirrors [OllamaClient] and
@@ -59,6 +66,134 @@ open class OpenAiClient(
         return parseResponse(responseBody)
     }
 
+    /**
+     * Streams a chat completion from OpenAI as a [Flow] of [LlmChunk]s (#1743).
+     *
+     * Wire format: server-sent events made of bare `data:` lines — there are
+     * no `event:` names — and the literal `data: [DONE]` closes the stream.
+     *
+     * Tool calls: only the FIRST delta for a `tool_calls[].index` carries
+     * the `call_*` id, so the parser remembers `index -> id` and uses the
+     * index to attribute every later delta to the right call.
+     *
+     * Arguments: each non-empty string fragment is forwarded as
+     * `LlmChunk.ToolCallArgumentsDelta` and appended to a per-call buffer;
+     * a `finish_reason` of `"tool_calls"` triggers parsing of that buffer
+     * and an `LlmChunk.ToolCallFinished`.
+     *
+     * Usage: the request is built with `buildRequestJson(stream = true)`;
+     * a `usage` object seen on the stream rides on the closing `LlmChunk.End`.
+     */
+    override suspend fun chatStream(messages: List<LlmMessage>): Flow<LlmChunk> {
+        val requestJson = buildRequestJson(messages, stream = true)
+        val requestHeaders = mapOf(
+            "Authorization" to "Bearer $apiKey",
+            "content-type" to "application/json",
+        )
+        // The request is sent inside the flow body, i.e. on collection; flowOn moves that work to Dispatchers.IO.
+        val chunks: Flow<LlmChunk> = flow {
+            sendChatStream(requestJson, requestHeaders).use { sse -> parseSseStream(sse, this) }
+        }
+        return chunks.flowOn(Dispatchers.IO)
+    }
+
+    /** Test seam — subclasses override to stub the stream. Throws IOException on non-2xx, whose JSON error body has no `data:` lines to parse. */
+    internal open fun sendChatStream(body: String, headers: Map<String, String>): InputStream {
+        val builder = HttpRequest.newBuilder().uri(URI.create("$baseUrl/v1/chat/completions"))
+            .timeout(requestTimeout.toJavaDuration()).POST(HttpRequest.BodyPublishers.ofString(body))
+        headers.forEach { (k, v) -> builder.header(k, v) }
+        val response = http.send(builder.build(), HttpResponse.BodyHandlers.ofInputStream())
+        if (response.statusCode() in 200..299) return response.body()
+        val detail = response.body().use { it.readBytes().toString(Charsets.UTF_8) }
+        throw java.io.IOException("OpenAI chat stream failed: HTTP ${response.statusCode()}: $detail")
+    }
+
+    /** Mutable per-tool-call streaming state: `id` and `name` stay null until a delta supplies them; `argsBuilder` accumulates argument fragments. */
+    private data class ToolCallState(
+        var id: String? = null,
+        var name: String? = null,
+        val argsBuilder: StringBuilder = StringBuilder(),
+    )
+
+    /**
+     * Parses an OpenAI SSE body from [stream] line by line, emitting chunks to [collector].
+     * Exactly one `LlmChunk.End` is emitted — at `data: [DONE]`, or at EOF if the terminator
+     * never arrives — carrying the last complete `usage` object seen, or null if there was none.
+     */
+    private suspend fun parseSseStream(stream: InputStream, collector: kotlinx.coroutines.flow.FlowCollector<LlmChunk>) {
+        // Keyed by `tool_calls[].index` within the choice.
+        val toolStates = mutableMapOf<Int, ToolCallState>()
+        var usage: TokenUsage? = null
+
+        BufferedReader(InputStreamReader(stream, Charsets.UTF_8)).useLines { lines ->
+            for (line in lines) {
+                if (line.isBlank() || !line.startsWith("data:")) continue
+                val payload = line.removePrefix("data:").trim()
+                if (payload == "[DONE]") {
+                    collector.emit(LlmChunk.End(usage))
+                    return@useLines
+                }
+                @Suppress("UNCHECKED_CAST")
+                val data = LenientJsonParser.parse(payload) as? Map<String, Any?> ?: continue
+                // Final usage-only delta: choices is empty, usage non-null.
+                (data["usage"] as? Map<*, *>)?.let { u ->
+                    val prompt = (u["prompt_tokens"] as? Number)?.toInt()
+                    val completion = (u["completion_tokens"] as? Number)?.toInt()
+                    if (prompt != null && completion != null) usage = TokenUsage(prompt, completion)
+                }
+                val choices = data["choices"] as? List<*> ?: continue
+                val choice = choices.firstOrNull() as? Map<*, *> ?: continue
+                val delta = choice["delta"] as? Map<*, *>
+                val finishReason = choice["finish_reason"] as? String
+
+                // Text content delta.
+                (delta?.get("content") as? String)?.takeIf { it.isNotEmpty() }?.let {
+                    collector.emit(LlmChunk.TextDelta(it))
+                }
+
+                // Tool-call deltas.
+                val rawToolCalls = delta?.get("tool_calls") as? List<*>
+                rawToolCalls?.forEach { tc ->
+                    val tcMap = tc as? Map<*, *> ?: return@forEach
+                    val tcIndex = (tcMap["index"] as? Number)?.toInt() ?: return@forEach
+                    val state = toolStates.getOrPut(tcIndex) { ToolCallState() }
+                    val newId = tcMap["id"] as? String
+                    val fn = tcMap["function"] as? Map<*, *>
+                    val newName = fn?.get("name") as? String
+                    val argsFragment = fn?.get("arguments") as? String
+
+                    // First sighting: id + name typically present together.
+                    if (state.id == null && newId != null) {
+                        state.id = newId
+                        if (newName != null) state.name = newName
+                        collector.emit(LlmChunk.ToolCallStarted(callId = newId, toolName = newName ?: ""))
+                    } else if (newName != null && state.name == null) {
+                        state.name = newName
+                    }
+
+                    if (!argsFragment.isNullOrEmpty()) {
+                        state.argsBuilder.append(argsFragment)
+                        state.id?.let { collector.emit(LlmChunk.ToolCallArgumentsDelta(callId = it, deltaJson = argsFragment)) }
+                    }
+                }
+
+                // Flush accumulated calls at the end of the turn. Besides "tool_calls", accept "stop": it can
+                // end a turn that made tool calls (e.g. a forced tool_choice), and those calls still need a Finished.
+                if (finishReason == "tool_calls" || finishReason == "stop") {
+                    toolStates.values.forEach { state ->
+                        val callId = state.id ?: return@forEach
+                        val argsString = state.argsBuilder.toString()
+                        val parsed = if (argsString.isBlank()) emptyMap() else parseToolArguments(argsString).arguments
+                        collector.emit(LlmChunk.ToolCallFinished(callId = callId, arguments = parsed))
+                    }
+                    toolStates.clear()
+                }
+            }
+            // EOF without [DONE]: emit End with whatever usage we captured.
+            collector.emit(LlmChunk.End(usage))
+        }
+    }
+
     /** Test seam — subclasses override to stub HTTP without a server. */
     internal open fun sendChat(body: String, headers: Map<String, String>): String {
         val builder = HttpRequest.newBuilder()
@@ -78,7 +213,7 @@ open class OpenAiClient(
         return String(bytes, Charsets.UTF_8)
     }
 
-    internal fun buildRequestJson(messages: List<LlmMessage>): String {
+    internal fun buildRequestJson(messages: List<LlmMessage>, stream: Boolean = false): String {
         val pendingToolCallIds: ArrayDeque<String> = ArrayDeque()
         var toolCallCounter = 0
 
@@ -121,7 +256,10 @@ open class OpenAiClient(
             ""","tools":[$defs]"""
         } else ""
 
-        return """{"model":${model.toJsonString()},"max_tokens":$maxTokens,"temperature":$temperature,"messages":[${messageObjects.joinToString(",")}]$toolsField}"""
+        // #1743: stream_options.include_usage opts into a final usage-only
+        // delta after finish_reason — required to get TokenUsage on stream.
+        val streamField = if (stream) ""","stream":true,"stream_options":{"include_usage":true}""" else ""
+        return """{"model":${model.toJsonString()},"max_tokens":$maxTokens,"temperature":$temperature$streamField,"messages":[${messageObjects.joinToString(",")}]$toolsField}"""
     }
 
     internal fun parseResponse(body: String): LlmResponse {
diff --git a/src/test/kotlin/agents_engine/model/OpenAiClientChatStreamLiveTest.kt b/src/test/kotlin/agents_engine/model/OpenAiClientChatStreamLiveTest.kt
@@ -0,0 +1,79 @@
+package agents_engine.model
+
+import kotlinx.coroutines.flow.collect
+import kotlinx.coroutines.runBlocking
+import org.junit.jupiter.api.Assumptions.assumeTrue
+import org.junit.jupiter.api.Tag
+import org.junit.jupiter.api.Test
+import java.io.File
+import kotlin.test.assertNotNull
+import kotlin.test.assertTrue
+
+/**
+ * #1743 — live integration test for OpenAiClient.chatStream against the
+ * real OpenAI API. Requires an API key at `.secrets/openai-key` or in
+ * `OPENAI_API_KEY`. Tagged `live-llm` — runs via `./gradlew integrationTest`.
+ */
+class OpenAiClientChatStreamLiveTest {
+
+    private val apiKey: String? = loadKey()
+    private val openAiModel: String = System.getenv("OPENAI_TEST_MODEL") ?: "gpt-4o-mini"
+
+    @Tag("live-llm")
+    @Test
+    fun `native chatStream against OpenAI emits multiple TextDelta chunks incrementally with token usage`() = runBlocking {
+        assumeTrue(apiKey != null, "skipping: no OpenAI key at .secrets/openai-key or OPENAI_API_KEY")
+        // assumeTrue is a Java method with no Kotlin contract, so apiKey is still String? to the compiler here.
+        val client = OpenAiClient(apiKey = checkNotNull(apiKey), model = openAiModel, temperature = 0.0)
+        // Stamp every chunk with elapsed milliseconds so the gap assertion below can show incremental delivery.
+        val startNs = System.nanoTime()
+        val arrivals = mutableListOf<Pair<Long, LlmChunk>>()
+        client.chatStream(
+            listOf(
+                LlmMessage(
+                    role = "user",
+                    content = "Count from 1 to 10 separated by spaces. Output ONLY the numbers, nothing else.",
+                ),
+            ),
+        ).collect { chunk ->
+            arrivals += ((System.nanoTime() - startNs) / 1_000_000) to chunk
+        }
+
+        val textDeltas = arrivals.filter { it.second is LlmChunk.TextDelta }
+        val endChunk = arrivals.last().second as? LlmChunk.End
+            ?: error("last chunk must be End; got: ${arrivals.last().second}")
+
+        assertTrue(
+            textDeltas.size > 1,
+            "expected multiple TextDelta chunks (proves wire-level SSE streaming); got ${textDeltas.size}",
+        )
+
+        val firstMs = textDeltas.first().first
+        val lastMs = textDeltas.last().first
+        val gapMs = lastMs - firstMs
+        assertTrue(
+            gapMs >= 20,
+            "expected at least 20ms between first and last TextDelta; first=${firstMs}ms last=${lastMs}ms gap=${gapMs}ms",
+        )
+
+        val usage = assertNotNull(endChunk.tokenUsage, "End chunk must carry TokenUsage (stream_options.include_usage)")
+        assertTrue(usage.completionTokens > 0)
+
+        val assembled = textDeltas.joinToString("") { (it.second as LlmChunk.TextDelta).text }
+        listOf("1", "2", "3").forEach { d ->
+            assertTrue(d in assembled, "assembled output should contain '$d'; got: \"$assembled\"")
+        }
+
+        println(
+            "OpenAiClientChatStreamLiveTest: model=$openAiModel chunks=${textDeltas.size} " +
+                "firstMs=$firstMs lastMs=$lastMs gapMs=$gapMs assembled=\"$assembled\""
+        )
+    }
+    /** Resolves the key from OPENAI_API_KEY first, then `.secrets/openai-key`; null when neither yields a non-blank value. */
+    private fun loadKey(): String? {
+        val envKey = System.getenv("OPENAI_API_KEY")
+        if (!envKey.isNullOrBlank()) return envKey
+        val file = File(".secrets/openai-key")
+        return if (file.exists()) file.readText().trim().ifBlank { null } else null
+    }
+}
diff --git a/src/test/kotlin/agents_engine/model/OpenAiClientChatStreamTest.kt b/src/test/kotlin/agents_engine/model/OpenAiClientChatStreamTest.kt
@@ -0,0 +1,91 @@
+package agents_engine.model
+
+import kotlinx.coroutines.flow.toList
+import kotlinx.coroutines.test.runTest
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertIs
+import kotlin.test.assertTrue
+
+// #1743 — non-live unit coverage for OpenAiClient.chatStream SSE parsing.
+// OpenAI's SSE is `data:`-only (no `event:` names), terminated by the
+// literal `data: [DONE]`. Tool calls correlate across deltas by
+// `tool_calls[].index`; `id` arrives in the FIRST delta only.
+
+class OpenAiClientChatStreamTest {
+
+    @Test
+    fun `text-only SSE stream emits TextDelta chunks plus End with usage from final delta`() = runTest {
+        val sse = buildString {
+            appendLine("""data: {"id":"x","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{"content":" world"},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[],"usage":{"prompt_tokens":11,"completion_tokens":6,"total_tokens":17}}""")
+            appendLine()
+            appendLine("""data: [DONE]""")
+            appendLine()
+        }
+        // The first delta has empty content and the finish/usage deltas carry no text: two TextDeltas, then End.
+        val chunks = stubbedOpenAi(sse).chatStream(listOf(LlmMessage("user", "Hi"))).toList()
+
+        assertEquals(3, chunks.size, "expected 2 TextDelta + End; got: $chunks")
+        val d1 = chunks[0]; assertIs<LlmChunk.TextDelta>(d1); assertEquals("Hello", d1.text)
+        val d2 = chunks[1]; assertIs<LlmChunk.TextDelta>(d2); assertEquals(" world", d2.text)
+        val end = chunks[2]; assertIs<LlmChunk.End>(end)
+        assertEquals(TokenUsage(promptTokens = 11, completionTokens = 6), end.tokenUsage)
+    }
+
+    @Test
+    fun `tool-call SSE stream emits Started with call_id, ArgumentsDelta per chunk, Finished with parsed args`() = runTest {
+        // Only the first delta carries "id":"call_abc"; the two that follow
+        // identify the call solely by tool_calls[].index, so the parser has
+        // to carry the id forward to label their argument deltas.
+        val sse = buildString {
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[{"index":0,"id":"call_abc","type":"function","function":{"name":"get_weather","arguments":""}}]},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\"location"}}]},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":\"SF\"}"}}]},"finish_reason":null}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}]}""")
+            appendLine()
+            appendLine("""data: {"id":"x","choices":[],"usage":{"prompt_tokens":42,"completion_tokens":18,"total_tokens":60}}""")
+            appendLine()
+            appendLine("""data: [DONE]""")
+            appendLine()
+        }
+
+        val chunks = stubbedOpenAi(sse).chatStream(listOf(LlmMessage("user", "weather"))).toList()
+
+        val started = chunks.filterIsInstance<LlmChunk.ToolCallStarted>().single()
+        assertEquals("call_abc", started.callId, "callId must be OpenAI's call_* id from the first delta")
+        assertEquals("get_weather", started.toolName)
+
+        val deltas = chunks.filterIsInstance<LlmChunk.ToolCallArgumentsDelta>().filter { it.callId == "call_abc" }
+        // Three argument-bearing deltas: the initial empty arguments string,
+        // then two non-empty fragments. Aggregator may or may not skip the
+        // empty one; we accept either shape but assert the non-empty deltas
+        // appear with the right content.
+        val deltaJsons = deltas.map { it.deltaJson }
+        assertTrue("""{"location""" in deltaJsons, "expected first non-empty args fragment; got deltas: $deltaJsons")
+        assertTrue("""":"SF"}""" in deltaJsons, "expected second non-empty args fragment; got deltas: $deltaJsons")
+
+        val finished = chunks.filterIsInstance<LlmChunk.ToolCallFinished>().single()
+        assertEquals("call_abc", finished.callId)
+        assertEquals(mapOf("location" to "SF"), finished.arguments)
+
+        val end = chunks.filterIsInstance<LlmChunk.End>().single()
+        assertEquals(TokenUsage(promptTokens = 42, completionTokens = 18), end.tokenUsage)
+    }
+    // Builds a client whose sendChatStream seam returns the canned SSE text instead of sending a request.
+    private fun stubbedOpenAi(sse: String): OpenAiClient =
+        object : OpenAiClient(apiKey = "test-key", model = "test-model") {
+            override fun sendChatStream(body: String, headers: Map<String, String>): java.io.InputStream =
+                sse.byteInputStream(Charsets.UTF_8)
+        }
+}