Skip to content

Commit a37440c

Browse files
committed
add(tts): compute manifest timing from resolved audio encoding
1 parent e2ea254 commit a37440c

10 files changed

Lines changed: 754 additions & 28 deletions

File tree

Sources/AgentRunKit/Documentation.docc/Articles/MultimodalAndAudio.md

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,19 +141,37 @@ for entry in result.manifest {
141141
if let range = entry.timing.byteRangeInConcatenatedAudio {
142142
print("chunk \(entry.chunk.index): bytes \(range) of result.audio")
143143
}
144+
if let duration = entry.timing.durationSeconds {
145+
print("chunk \(entry.chunk.index): \(duration) seconds")
146+
}
144147
}
145148

146149
// Forecast the chunk plan without generating audio
147150
let plan = tts.chunks(for: longArticle)
148151
```
149152

150-
`generateWithManifest` populates ``TTSSegmentTiming/byteRangeInConcatenatedAudio`` for `pcm`
151-
output today. Other formats leave it `nil` until the framework can compute byte ranges defensibly.
152153
`generateAll` is implemented on top of the same path and returns `result.audio`.
153154

154155
`stream` segments always carry ``TTSSegmentTiming/uncomputed`` timing. Per-segment audio is the
155156
raw chunk bytes, and final container offsets are only meaningful after concatenation.
156157

158+
#### Supported Timing
159+
160+
| Format | Byte range | Duration |
161+
|---|---:|---:|
162+
| `pcm` | yes | yes, when the provider supplies sample rate, channels, and bits per sample |
163+
| `mp3` | yes (accounts for ID3v2, Xing/Info, and ID3v1 stripping) | not supported |
164+
| `wav`, `flac`, `opus`, `aac` | not supported | not supported |
165+
166+
Unsupported values are reported as `nil` rather than guessed. Duration for `pcm` is computed as
167+
`bytes / (sampleRate * channels * (bitsPerSample / 8))` when the provider's
168+
``TTSProvider/resolvedEncoding(for:options:)`` returns a fully populated ``TTSAudioEncoding``.
169+
Built-in providers override the hook only with values that are backed by published documentation.
170+
``OpenAITTSProvider`` populates `pcm` `sampleRate` and `bitsPerSample` from OpenAI's
171+
`/v1/audio/speech` documentation but leaves `channels` `nil` until the speech endpoint's channel
172+
count is documented separately, so OpenAI `pcm` `durationSeconds` remains `nil`. Custom providers
173+
using the protocol's default implementation report `nil` PCM fields and therefore `nil` duration.
174+
157175
### TTSOptions
158176

159177
``TTSOptions`` controls per-request parameters:
@@ -188,12 +206,29 @@ For MP3 output, the concatenator strips ID3v2 headers, Xing/Info frames, and ID3
188206

189207
### Custom Providers
190208

191-
Conform to ``TTSProvider`` to use any speech synthesis backend. ``TTSClient`` delivers a ``TTSChunkContext`` carrying the chunk plan and requested encoding alongside each call. Providers should treat `context.encoding` as the authoritative source for the format to produce, and can additionally use it for logging or request correlation:
209+
Conform to ``TTSProvider`` to use any speech synthesis backend. ``TTSClient`` delivers a
210+
``TTSChunkContext`` carrying the chunk plan and requested encoding alongside each call. Providers
211+
should treat `context.encoding` as the authoritative source for the format to produce, and can
212+
additionally use it for logging or request correlation.
213+
214+
Override ``TTSProvider/resolvedEncoding(for:options:)`` to surface documented `pcm` sample rate,
215+
channel count, and bit depth so the framework can compute ``TTSSegmentTiming/durationSeconds`` for
216+
`pcm` segments. The default implementation returns ``TTSAudioEncoding`` with `nil` PCM fields, so
217+
providers without published encoding values can omit the override.
192218

193219
```swift
194220
struct MyTTSProvider: TTSProvider {
195221
let config: TTSProviderConfig
196222

223+
func resolvedEncoding(for format: TTSAudioFormat, options: TTSOptions) -> TTSAudioEncoding {
224+
switch format {
225+
case .pcm:
226+
TTSAudioEncoding(format, sampleRate: 24000, channels: 1, bitsPerSample: 16)
227+
case .mp3, .opus, .aac, .flac, .wav:
228+
TTSAudioEncoding(format)
229+
}
230+
}
231+
197232
func generate(
198233
text: String,
199234
voice: String,

Sources/AgentRunKit/TTS/MP3Concatenator.swift

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,15 @@ import Foundation
22

33
enum MP3Concatenator {
44
static func concatenate(_ segments: [Data]) -> Data {
5-
guard !segments.isEmpty else { return Data() }
5+
concatenateWithRanges(segments).audio
6+
}
7+
8+
static func concatenateWithRanges(_ segments: [Data]) -> (audio: Data, ranges: [Range<Int>]) {
9+
guard !segments.isEmpty else { return (Data(), []) }
610

711
var result = Data()
12+
var ranges: [Range<Int>] = []
13+
ranges.reserveCapacity(segments.count)
814
for (index, segment) in segments.enumerated() {
915
var data = segment
1016
if index > 0 {
@@ -14,9 +20,11 @@ enum MP3Concatenator {
1420
if index < segments.count - 1 {
1521
data = stripID3v1Tail(data)
1622
}
23+
let lower = result.count
1724
result.append(data)
25+
ranges.append(lower ..< result.count)
1826
}
19-
return result
27+
return (result, ranges)
2028
}
2129

2230
static func stripID3v2Header(_ data: Data) -> Data {

Sources/AgentRunKit/TTS/OpenAITTSProvider.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ public struct OpenAITTSProvider: TTSProvider, Sendable {
3131
)
3232
}
3333

34+
/// Returns the encoding this provider reports for `format`.
///
/// - Parameters:
///   - format: The audio format being requested.
///   - options: Ignored by this implementation.
/// - Returns: For `.pcm`, an encoding with a 24000 Hz sample rate and 16 bits per sample; no
///   `channels` value is passed. For every other format, an encoding built from the format alone.
public func resolvedEncoding(
35+
for format: TTSAudioFormat,
36+
options _: TTSOptions
37+
) -> TTSAudioEncoding {
38+
switch format {
39+
case .pcm:
40+
// `channels` is deliberately not supplied: the accompanying article states it stays `nil`
// until the speech endpoint's channel count is documented, so PCM duration remains `nil`.
TTSAudioEncoding(format, sampleRate: 24000, bitsPerSample: 16)
41+
case .mp3, .opus, .aac, .flac, .wav:
42+
TTSAudioEncoding(format)
43+
}
44+
}
45+
3446
public func generate(
3547
text: String,
3648
voice: String,

Sources/AgentRunKit/TTS/TTSClient.swift

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ public struct TTSClient<P: TTSProvider>: Sendable {
2222
guard !trimmed.isEmpty else {
2323
throw TTSError.emptyText
2424
}
25-
let encoding = TTSAudioEncoding(options.responseFormat ?? provider.config.defaultFormat)
25+
let format = options.responseFormat ?? provider.config.defaultFormat
26+
let encoding = provider.resolvedEncoding(for: format, options: options)
2627
let leadingShift = SentenceChunker.trimByteOffset(in: text)
2728
let chunk = TTSChunk(
2829
index: 0,
@@ -64,7 +65,8 @@ public struct TTSClient<P: TTSProvider>: Sendable {
6465
}
6566

6667
let publicChunks = Self.makePublicChunks(internalChunks)
67-
let encoding = TTSAudioEncoding(options.responseFormat ?? provider.config.defaultFormat)
68+
let format = options.responseFormat ?? provider.config.defaultFormat
69+
let encoding = provider.resolvedEncoding(for: format, options: options)
6870
let provider = provider
6971
let maxConcurrent = maxConcurrent
7072

@@ -108,35 +110,79 @@ public struct TTSClient<P: TTSProvider>: Sendable {
108110
segments.append(segment)
109111
}
110112

111-
let effectiveFormat = options.responseFormat ?? provider.config.defaultFormat
112-
let audio: Data = if effectiveFormat == .mp3 {
113-
MP3Concatenator.concatenate(segments.map(\.audio))
114-
} else {
115-
Self.appendingConcatenation(segments.map(\.audio))
113+
guard let firstFormat = segments.first?.encoding.format else {
114+
return TTSConcatenationResult(audio: Data(), manifest: [])
116115
}
116+
precondition(
117+
segments.allSatisfy { $0.encoding.format == firstFormat },
118+
"TTSClient stream must yield segments with a single encoding format"
119+
)
120+
121+
let audioSegments = segments.map(\.audio)
122+
let (audio, byteRanges) = Self.concatenate(audioSegments, format: firstFormat)
123+
precondition(
124+
byteRanges.count == segments.count,
125+
"Concatenation must produce one byte range per input segment"
126+
)
117127

118128
var manifest: [TTSManifestEntry] = []
119129
manifest.reserveCapacity(segments.count)
120-
var pcmCursor = 0
121-
for segment in segments {
122-
let timing: TTSSegmentTiming
123-
if effectiveFormat == .pcm {
124-
let lower = pcmCursor
125-
pcmCursor += segment.audio.count
126-
timing = TTSSegmentTiming(byteRangeInConcatenatedAudio: lower ..< pcmCursor)
127-
} else {
128-
timing = .uncomputed
129-
}
130+
for (segment, range) in zip(segments, byteRanges) {
131+
let duration = Self.durationSeconds(forSegment: segment)
130132
manifest.append(TTSManifestEntry(
131133
chunk: segment.chunk,
132134
encoding: segment.encoding,
133-
timing: timing
135+
timing: TTSSegmentTiming(
136+
byteRangeInConcatenatedAudio: range,
137+
durationSeconds: duration
138+
)
134139
))
135140
}
136141

137142
return TTSConcatenationResult(audio: audio, manifest: manifest)
138143
}
139144

145+
/// Computes the duration of a PCM segment from its byte count and resolved encoding.
///
/// - Parameter segment: The segment whose `audio` byte count and `encoding` fields are read.
/// - Returns: `audio.count / (sampleRate * channels * (bitsPerSample / 8))` in seconds, or `nil`
///   when the format is not `.pcm`, any of the three PCM fields is missing or non-positive,
///   `bitsPerSample` is not a multiple of 8, or the byte-rate multiplication overflows.
private static func durationSeconds(forSegment segment: TTSSegment) -> Double? {
146+
// Duration is derived only for PCM with a fully populated, positive, byte-aligned encoding;
// anything else yields `nil` instead of an estimate.
guard segment.encoding.format == .pcm,
147+
let sampleRate = segment.encoding.sampleRate,
148+
let channels = segment.encoding.channels,
149+
let bitsPerSample = segment.encoding.bitsPerSample,
150+
sampleRate > 0, channels > 0,
151+
bitsPerSample > 0, bitsPerSample.isMultiple(of: 8)
152+
else { return nil }
153+
let bytesPerSample = bitsPerSample / 8
154+
// Despite its name, `channelBytes` holds sampleRate * channels (samples per second across all
// channels), not a byte count. Overflow is reported rather than trapping.
let (channelBytes, channelOverflow) = sampleRate.multipliedReportingOverflow(by: channels)
155+
guard !channelOverflow else { return nil }
156+
let (bytesPerSecond, totalOverflow) = channelBytes.multipliedReportingOverflow(by: bytesPerSample)
157+
// The `> 0` check keeps the division below from dividing by zero.
guard !totalOverflow, bytesPerSecond > 0 else { return nil }
158+
return Double(segment.audio.count) / Double(bytesPerSecond)
159+
}
160+
161+
/// Joins per-chunk audio into one buffer and reports where each chunk landed in it.
///
/// - Parameters:
///   - audioSegments: The audio bytes of each segment, in output order.
///   - format: The single audio format shared by all segments; selects the joining strategy.
/// - Returns: The concatenated audio and one `byteRanges` entry per input segment. An entry is
///   `nil` for formats where no byte range is computed (`.opus`, `.aac`, `.flac`, `.wav`).
private static func concatenate(
162+
_ audioSegments: [Data],
163+
format: TTSAudioFormat
164+
) -> (audio: Data, byteRanges: [Range<Int>?]) {
165+
switch format {
166+
case .mp3:
167+
// The MP3 concatenator returns its own ranges; they are only widened to optional here.
let result = MP3Concatenator.concatenateWithRanges(audioSegments)
168+
return (result.audio, result.ranges as [Range<Int>?])
169+
case .pcm:
170+
let audio = appendingConcatenation(audioSegments)
171+
var ranges: [Range<Int>?] = []
172+
ranges.reserveCapacity(audioSegments.count)
173+
// Ranges are running offsets over the input sizes.
// NOTE(review): this assumes appendingConcatenation appends each segment unmodified and in
// order — confirm against its body.
var cursor = 0
174+
for segment in audioSegments {
175+
let lower = cursor
176+
cursor += segment.count
177+
ranges.append(lower ..< cursor)
178+
}
179+
return (audio, ranges)
180+
case .opus, .aac, .flac, .wav:
181+
let audio = appendingConcatenation(audioSegments)
182+
// No ranges are computed for these formats; one `nil` per segment keeps the array the same
// length as the input.
return (audio, Array(repeating: nil, count: audioSegments.count))
183+
}
184+
}
185+
140186
private static func appendingConcatenation(_ audioSegments: [Data]) -> Data {
141187
var result = Data()
142188
result.reserveCapacity(audioSegments.reduce(0) { $0 + $1.count })

Sources/AgentRunKit/TTS/TTSProvider.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ import Foundation
33
/// A speech synthesis backend that converts text to audio.
44
public protocol TTSProvider: Sendable {
55
var config: TTSProviderConfig { get }
6+
7+
func resolvedEncoding(
8+
for format: TTSAudioFormat,
9+
options: TTSOptions
10+
) -> TTSAudioEncoding
11+
612
func generate(
713
text: String,
814
voice: String,
@@ -11,6 +17,15 @@ public protocol TTSProvider: Sendable {
1117
) async throws -> Data
1218
}
1319

20+
// Default implementation of the encoding hook for providers that do not supply their own.
public extension TTSProvider {
21+
/// Returns an encoding built from `format` alone, with no PCM parameters supplied.
///
/// - Parameters:
///   - format: The audio format being requested.
///   - options: Ignored by the default implementation.
/// - Returns: `TTSAudioEncoding(format)`.
func resolvedEncoding(
22+
for format: TTSAudioFormat,
23+
options _: TTSOptions
24+
) -> TTSAudioEncoding {
25+
TTSAudioEncoding(format)
26+
}
27+
}
28+
1429
/// Configuration for a TTSProvider's chunking and default settings.
1530
public struct TTSProviderConfig: Sendable, Equatable {
1631
public let maxChunkCharacters: Int

0 commit comments

Comments
 (0)