Tom-Ryder
diff --git a/‎Sources/AgentRunKit/Documentation.docc/Articles/MultimodalAndAudio.md‎
Lines changed: 38 additions & 3 deletions b/‎Sources/AgentRunKit/Documentation.docc/Articles/MultimodalAndAudio.md‎
Lines changed: 38 additions & 3 deletions
diff --git a/‎Sources/AgentRunKit/TTS/MP3Concatenator.swift‎
Lines changed: 10 additions & 2 deletions b/‎Sources/AgentRunKit/TTS/MP3Concatenator.swift‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎Sources/AgentRunKit/TTS/OpenAITTSProvider.swift‎
Lines changed: 12 additions & 0 deletions b/‎Sources/AgentRunKit/TTS/OpenAITTSProvider.swift‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎Sources/AgentRunKit/TTS/TTSClient.swift‎
Lines changed: 64 additions & 18 deletions b/‎Sources/AgentRunKit/TTS/TTSClient.swift‎
Lines changed: 64 additions & 18 deletions
diff --git a/‎Sources/AgentRunKit/TTS/TTSProvider.swift‎
Lines changed: 15 additions & 0 deletions b/‎Sources/AgentRunKit/TTS/TTSProvider.swift‎
Lines changed: 15 additions & 0 deletions
@@ -141,19 +141,37 @@ for entry in result.manifest {
     if let range = entry.timing.byteRangeInConcatenatedAudio {
         print("chunk \(entry.chunk.index): bytes \(range) of result.audio")
     }
+    if let duration = entry.timing.durationSeconds {
+        print("chunk \(entry.chunk.index): \(duration) seconds")
+    }
 }
 
 // Forecast the chunk plan without generating audio
 let plan = tts.chunks(for: longArticle)
 ```
 
-`generateWithManifest` populates ``TTSSegmentTiming/byteRangeInConcatenatedAudio`` for `pcm`
-output today. Other formats leave it `nil` until the framework can compute byte ranges defensibly.
-`generateAll` is implemented on top of the same path and returns `result.audio`.
+`generateAll` is implemented on top of `generateWithManifest` and returns `result.audio`.
 
 `stream` segments always carry ``TTSSegmentTiming/uncomputed`` timing. Per-segment audio is the
 raw chunk bytes, and final container offsets are only meaningful after concatenation.
 
+#### Supported Timing
+
+| Format | Byte range | Duration |
+|---|---:|---:|
+| `pcm`  | yes | yes, when the provider supplies sample rate, channels, and bits per sample |
+| `mp3`  | yes (accounts for ID3v2, Xing/Info, and ID3v1 stripping) | not supported |
+| `wav`, `flac`, `opus`, `aac` | not supported | not supported |
+
+Unsupported values are reported as `nil` rather than guessed. Duration for `pcm` is computed as
+`bytes / (sampleRate * channels * (bitsPerSample / 8))` when the provider's
+``TTSProvider/resolvedEncoding(for:options:)`` returns a fully populated ``TTSAudioEncoding``.
+Built-in providers override the hook only with values backed by published vendor documentation.
+``OpenAITTSProvider`` populates `pcm` `sampleRate` and `bitsPerSample` from OpenAI's
+`/v1/audio/speech` documentation but leaves `channels` `nil` until the speech endpoint's channel
+count is documented separately, so OpenAI `pcm` `durationSeconds` remains `nil`. Custom providers
+using the protocol's default implementation report `nil` PCM fields and therefore `nil` duration.
+
 ### TTSOptions
 
 ``TTSOptions`` controls per-request parameters:
@@ -188,12 +206,29 @@ For MP3 output, the concatenator strips ID3v2 headers, Xing/Info frames, and ID3
 
 ### Custom Providers
 
-Conform to ``TTSProvider`` to use any speech synthesis backend. ``TTSClient`` delivers a ``TTSChunkContext`` carrying the chunk plan and requested encoding alongside each call. Providers should treat `context.encoding` as the authoritative source for the format to produce, and can additionally use it for logging or request correlation:
+Conform to ``TTSProvider`` to use any speech synthesis backend. ``TTSClient`` delivers a
+``TTSChunkContext`` carrying the chunk plan and requested encoding alongside each call. Providers
+should treat `context.encoding` as the authoritative source for the format to produce, and can
+additionally use it for logging or request correlation.
+
+Override ``TTSProvider/resolvedEncoding(for:options:)`` to surface documented `pcm` sample rate,
+channel count, and bit depth so the framework can compute ``TTSSegmentTiming/durationSeconds`` for
+`pcm` segments. The default implementation returns ``TTSAudioEncoding`` with `nil` PCM fields, so
+providers without published encoding values can omit the override.
 
 ```swift
 struct MyTTSProvider: TTSProvider {
     let config: TTSProviderConfig
 
+    func resolvedEncoding(for format: TTSAudioFormat, options: TTSOptions) -> TTSAudioEncoding {
+        switch format {
+        case .pcm:
+            TTSAudioEncoding(format, sampleRate: 24000, channels: 1, bitsPerSample: 16)
+        case .mp3, .opus, .aac, .flac, .wav:
+            TTSAudioEncoding(format)
+        }
+    }
+
     func generate(
         text: String,
         voice: String,
 
@@ -2,9 +2,15 @@ import Foundation
 
 enum MP3Concatenator {
     static func concatenate(_ segments: [Data]) -> Data {
-        guard !segments.isEmpty else { return Data() }
+        concatenateWithRanges(segments).audio
+    }
+
+    static func concatenateWithRanges(_ segments: [Data]) -> (audio: Data, ranges: [Range<Int>]) {
+        guard !segments.isEmpty else { return (Data(), []) }
 
         var result = Data()
+        var ranges: [Range<Int>] = []
+        ranges.reserveCapacity(segments.count)
         for (index, segment) in segments.enumerated() {
             var data = segment
             if index > 0 {
@@ -14,9 +20,11 @@ enum MP3Concatenator {
             if index < segments.count - 1 {
                 data = stripID3v1Tail(data)
             }
+            let lower = result.count
             result.append(data)
+            ranges.append(lower ..< result.count)
         }
-        return result
+        return (result, ranges)
     }
 
     static func stripID3v2Header(_ data: Data) -> Data {
 
@@ -31,6 +31,18 @@ public struct OpenAITTSProvider: TTSProvider, Sendable {
         )
     }
 
+    public func resolvedEncoding( // Encoding metadata for the requested format; only the PCM case carries extra fields.
+        for format: TTSAudioFormat,
+        options _: TTSOptions // not consulted by this implementation
+    ) -> TTSAudioEncoding {
+        switch format {
+        case .pcm:
+            TTSAudioEncoding(format, sampleRate: 24000, bitsPerSample: 16) // channels is not passed, so no PCM duration can be derived from this encoding
+        case .mp3, .opus, .aac, .flac, .wav:
+            TTSAudioEncoding(format) // format only; no sample-level fields for non-PCM output
+        }
+    }
+
     public func generate(
         text: String,
         voice: String,
 
@@ -22,7 +22,8 @@ public struct TTSClient<P: TTSProvider>: Sendable {
         guard !trimmed.isEmpty else {
             throw TTSError.emptyText
         }
-        let encoding = TTSAudioEncoding(options.responseFormat ?? provider.config.defaultFormat)
+        let format = options.responseFormat ?? provider.config.defaultFormat
+        let encoding = provider.resolvedEncoding(for: format, options: options)
         let leadingShift = SentenceChunker.trimByteOffset(in: text)
         let chunk = TTSChunk(
             index: 0,
@@ -64,7 +65,8 @@ public struct TTSClient<P: TTSProvider>: Sendable {
         }
 
         let publicChunks = Self.makePublicChunks(internalChunks)
-        let encoding = TTSAudioEncoding(options.responseFormat ?? provider.config.defaultFormat)
+        let format = options.responseFormat ?? provider.config.defaultFormat
+        let encoding = provider.resolvedEncoding(for: format, options: options)
         let provider = provider
         let maxConcurrent = maxConcurrent
 
@@ -108,35 +110,79 @@ public struct TTSClient<P: TTSProvider>: Sendable {
             segments.append(segment)
         }
 
-        let effectiveFormat = options.responseFormat ?? provider.config.defaultFormat
-        let audio: Data = if effectiveFormat == .mp3 {
-            MP3Concatenator.concatenate(segments.map(\.audio))
-        } else {
-            Self.appendingConcatenation(segments.map(\.audio))
+        guard let firstFormat = segments.first?.encoding.format else {
+            return TTSConcatenationResult(audio: Data(), manifest: [])
         }
+        precondition(
+            segments.allSatisfy { $0.encoding.format == firstFormat },
+            "TTSClient stream must yield segments with a single encoding format"
+        )
+
+        let audioSegments = segments.map(\.audio)
+        let (audio, byteRanges) = Self.concatenate(audioSegments, format: firstFormat)
+        precondition(
+            byteRanges.count == segments.count,
+            "Concatenation must produce one byte range per input segment"
+        )
 
         var manifest: [TTSManifestEntry] = []
         manifest.reserveCapacity(segments.count)
-        var pcmCursor = 0
-        for segment in segments {
-            let timing: TTSSegmentTiming
-            if effectiveFormat == .pcm {
-                let lower = pcmCursor
-                pcmCursor += segment.audio.count
-                timing = TTSSegmentTiming(byteRangeInConcatenatedAudio: lower ..< pcmCursor)
-            } else {
-                timing = .uncomputed
-            }
+        for (segment, range) in zip(segments, byteRanges) {
+            let duration = Self.durationSeconds(forSegment: segment)
             manifest.append(TTSManifestEntry(
                 chunk: segment.chunk,
                 encoding: segment.encoding,
-                timing: timing
+                timing: TTSSegmentTiming(
+                    byteRangeInConcatenatedAudio: range,
+                    durationSeconds: duration
+                )
             ))
         }
 
         return TTSConcatenationResult(audio: audio, manifest: manifest)
     }
 
+    private static func durationSeconds(forSegment segment: TTSSegment) -> Double? { // PCM only; nil unless sample rate, channels, and bit depth are all present and valid.
+        guard segment.encoding.format == .pcm,
+              let sampleRate = segment.encoding.sampleRate,
+              let channels = segment.encoding.channels,
+              let bitsPerSample = segment.encoding.bitsPerSample,
+              sampleRate > 0, channels > 0,
+              bitsPerSample > 0, bitsPerSample.isMultiple(of: 8) // only whole-byte sample sizes are accepted
+        else { return nil }
+        let bytesPerSample = bitsPerSample / 8
+        let (channelBytes, channelOverflow) = sampleRate.multipliedReportingOverflow(by: channels)
+        guard !channelOverflow else { return nil } // report nil rather than trap on overflow
+        let (bytesPerSecond, totalOverflow) = channelBytes.multipliedReportingOverflow(by: bytesPerSample)
+        guard !totalOverflow, bytesPerSecond > 0 else { return nil }
+        return Double(segment.audio.count) / Double(bytesPerSecond) // seconds = segment bytes / bytes per second
+    }
+
+    private static func concatenate( // Joins segment audio for `format`; returns one byte range (or nil) per input segment.
+        _ audioSegments: [Data],
+        format: TTSAudioFormat
+    ) -> (audio: Data, byteRanges: [Range<Int>?]) {
+        switch format {
+        case .mp3:
+            let result = MP3Concatenator.concatenateWithRanges(audioSegments) // ranges index the MP3 output after per-segment stripping
+            return (result.audio, result.ranges as [Range<Int>?]) // widen to optional ranges to match the return type
+        case .pcm:
+            let audio = appendingConcatenation(audioSegments) // NOTE(review): ranges below assume segments are joined back-to-back unmodified — confirm
+            var ranges: [Range<Int>?] = []
+            ranges.reserveCapacity(audioSegments.count)
+            var cursor = 0 // running total of segment sizes
+            for segment in audioSegments {
+                let lower = cursor
+                cursor += segment.count
+                ranges.append(lower ..< cursor)
+            }
+            return (audio, ranges)
+        case .opus, .aac, .flac, .wav: // no byte ranges are reported for these formats
+            let audio = appendingConcatenation(audioSegments)
+            return (audio, Array(repeating: nil, count: audioSegments.count)) // one nil per segment keeps the array aligned with the input
+        }
+    }
+
     private static func appendingConcatenation(_ audioSegments: [Data]) -> Data {
         var result = Data()
         result.reserveCapacity(audioSegments.reduce(0) { $0 + $1.count })
 
@@ -3,6 +3,12 @@ import Foundation
 /// A speech synthesis backend that converts text to audio.
 public protocol TTSProvider: Sendable {
     var config: TTSProviderConfig { get }
+
+    func resolvedEncoding(
+        for format: TTSAudioFormat,
+        options: TTSOptions
+    ) -> TTSAudioEncoding
+
     func generate(
         text: String,
         voice: String,
@@ -11,6 +17,15 @@ public protocol TTSProvider: Sendable {
     ) async throws -> Data
 }
 
+public extension TTSProvider { // supplies a default so conformers may omit resolvedEncoding(for:options:)
+    func resolvedEncoding(
+        for format: TTSAudioFormat,
+        options _: TTSOptions // not consulted by the default
+    ) -> TTSAudioEncoding {
+        TTSAudioEncoding(format) // format only; no PCM fields are passed
+    }
+}
+
 /// Configuration for a TTSProvider's chunking and default settings.
 public struct TTSProviderConfig: Sendable, Equatable {
     public let maxChunkCharacters: Int