Skip to content

Commit 0efdc27

Browse files
committed
Request-AudioTranscription: Support new model gpt-4o-transcribe-diarize
1 parent 9bfc2b7 commit 0efdc27

3 files changed

Lines changed: 156 additions & 10 deletions

File tree

Public/Audio/Request-AudioTranscription.ps1

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,17 @@ function Request-AudioTranscription {
77
[string]$File,
88

99
[Parameter()]
10-
[Completions('whisper-1', 'gpt-4o-transcribe', 'gpt-4o-mini-transcribe')]
10+
[Completions('whisper-1', 'gpt-4o-transcribe', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe-diarize')]
1111
[string]$Model = 'whisper-1',
1212

1313
[Parameter()]
1414
[string]$Prompt,
1515

1616
[Parameter()]
1717
[Alias('response_format')]
18-
[ValidateSet('json', 'text', 'srt', 'verbose_json', 'vtt')]
19-
[string]$Format = 'text',
18+
[Alias('Format')] # for backward compatibility
19+
[ValidateSet('json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json')]
20+
[string]$ResponseFormat = 'text',
2021

2122
[Parameter()]
2223
[ValidateRange(0.0, 1.0)]
@@ -26,6 +27,29 @@ function Request-AudioTranscription {
2627
[Completions('logprobs')]
2728
[string[]]$Include,
2829

30+
[Parameter()]
31+
[Alias('known_speaker_names')]
32+
[string[]]$KnownSpeakerNames,
33+
34+
[Parameter()]
35+
[Alias('known_speaker_references')]
36+
[string[]]$KnownSpeakerReferences,
37+
38+
[Parameter()]
39+
[ValidateSet('auto', 'server_vad')]
40+
[Alias('chunking_strategy')]
41+
[string]$ChunkingStrategy = 'auto',
42+
43+
[Parameter()]
44+
[ValidateRange(0.0, 1.0)]
45+
[float]$ChunkingStrategyThreshold,
46+
47+
[Parameter()]
48+
[uint16]$ChunkingStrategyPrefixPadding,
49+
50+
[Parameter()]
51+
[uint16]$ChunkingStrategySilenceDuration,
52+
2953
[Parameter()]
3054
[ValidateSet('word', 'segment')]
3155
[Alias('timestamp_granularities')]
@@ -108,8 +132,8 @@ function Request-AudioTranscription {
108132
$PostBody.model = $Model
109133
}
110134
$PostBody.file = $FileInfo
111-
if ($Format) {
112-
$PostBody.response_format = $Format
135+
if ($ResponseFormat) {
136+
$PostBody.response_format = $ResponseFormat
113137
}
114138
if ($PSBoundParameters.ContainsKey('Prompt')) {
115139
$PostBody.prompt = $Prompt
@@ -123,6 +147,51 @@ function Request-AudioTranscription {
123147
if ($PSBoundParameters.ContainsKey('Include')) {
124148
$PostBody.'include[]' = $Include
125149
}
150+
151+
if ($PSBoundParameters.ContainsKey('KnownSpeakerNames')) {
152+
$PostBody.'known_speaker_names[]' = $KnownSpeakerNames
153+
}
154+
if ($PSBoundParameters.ContainsKey('KnownSpeakerReferences')) {
155+
$KnownSpeakerReferencesFileInfoList = @()
156+
foreach ($ref in $KnownSpeakerReferences) {
157+
$KnownSpeakerReferencesFileInfoList += Resolve-FileInfo $ref
158+
}
159+
if ($KnownSpeakerReferencesFileInfoList.Count -gt 0) {
160+
$PostBody.'known_speaker_references[]' = $KnownSpeakerReferencesFileInfoList
161+
}
162+
}
163+
164+
#region Chunking Strategy
# Builds the `chunking_strategy` field of the request body.
#   - -ChunkingStrategy 'auto' (explicit)      : sends the literal string 'auto'.
#   - -ChunkingStrategy 'server_vad' and/or any
#     of the VAD tuning parameters             : sends a compact JSON object whose
#                                                type is 'server_vad'.
#   - nothing specified                        : sends 'auto' only when the model name
#                                                contains 'diarize'; otherwise the
#                                                field is left unset.
$ChunkingStrategyOptions = @{}
$ChunkingStrategyIsBound = $PSBoundParameters.ContainsKey('ChunkingStrategy')

if ($ChunkingStrategyIsBound -and ($ChunkingStrategy -eq 'auto')) {
    # Explicit 'auto': the VAD tuning parameters do not apply and are ignored.
    $PostBody.chunking_strategy = 'auto'
}
else {
    if ($ChunkingStrategyIsBound) {
        $ChunkingStrategyOptions.type = 'server_vad'
    }

    # Collect the VAD tuning values regardless of whether -ChunkingStrategy itself
    # was passed. Previously these were only read when -ChunkingStrategy was omitted,
    # so "-ChunkingStrategy server_vad -ChunkingStrategyThreshold 0.5" lost the threshold.
    if ($PSBoundParameters.ContainsKey('ChunkingStrategyThreshold')) {
        $ChunkingStrategyOptions.threshold = $ChunkingStrategyThreshold
    }
    if ($PSBoundParameters.ContainsKey('ChunkingStrategyPrefixPadding')) {
        $ChunkingStrategyOptions.prefix_padding_ms = $ChunkingStrategyPrefixPadding
    }
    if ($PSBoundParameters.ContainsKey('ChunkingStrategySilenceDuration')) {
        $ChunkingStrategyOptions.silence_duration_ms = $ChunkingStrategySilenceDuration
    }
}

if ($ChunkingStrategyOptions.Keys.Count -gt 0) {
    # Any tuning value implies server-side VAD, even when -ChunkingStrategy was omitted.
    $ChunkingStrategyOptions.type = 'server_vad'
    $PostBody.chunking_strategy = ConvertTo-Json $ChunkingStrategyOptions -Compress
}
elseif ($Model -like '*diarize*') {
    # chunking_strategy parameter is required when using diarization models
    $PostBody.chunking_strategy = 'auto'
}
#endregion Chunking Strategy
194+
126195
if ($Language) {
127196
$PostBody.language = $Language
128197
}

Tests/Audio/Request-AudioTranscription.tests.ps1

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,39 @@ Describe 'Request-AudioTranscription' {
2626
$Text | Should -Be 'MOCKED'
2727
}
2828

29+
It 'Audio transcription (format: verbose_json)' {
30+
Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { 'MOCKED' }
31+
{ $script:Text = Request-AudioTranscription -File ($script:TestData + '/voice_japanese.mp3') -Format 'verbose_json' -ea Stop } | Should -Not -Throw
32+
Should -InvokeVerifiable
33+
$Text | Should -Be 'MOCKED'
34+
}
35+
36+
It 'Audio transcription (full parameters)' {
37+
Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { $PesterBoundParameters }
38+
{
39+
$params = @{
40+
File = ($script:TestData + '/voice_japanese.mp3')
41+
Model = 'gpt-4o-transcribe-diarize'
42+
Prompt = 'This is a test.'
43+
ResponseFormat = 'diarized_json'
44+
Temperature = 0.7
45+
Include = @('logprobs')
46+
KnownSpeakerNames = @('Alice', 'Bob')
47+
KnownSpeakerReferences = @(($script:TestData + '/voice_japanese.mp3'), ($script:TestData + '/voice_japanese.mp3'))
48+
ChunkingStrategy = 'server_vad'
49+
ChunkingStrategyThreshold = 0.5
50+
ChunkingStrategyPrefixPadding = 200
51+
ChunkingStrategySilenceDuration = 900
52+
TimestampGranularities = @('word', 'segment')
53+
Language = 'Japanese'
54+
TimeoutSec = 60
55+
MaxRetryCount = 2
56+
}
57+
$script:Result = Request-AudioTranscription @params -ea Stop
58+
} | Should -Not -Throw
59+
Should -InvokeVerifiable
60+
}
61+
2962
It 'Audio transcription (Stream text)' {
3063
Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequestSSE {
3164
'{"type":"transcript.text.delta","delta":"ECHO","logprobs":[{"token":"ECHO","logprob":-0.0024760163,"bytes":[228,189,149]}]}'
@@ -82,6 +115,34 @@ Describe 'Request-AudioTranscription' {
82115
$Result = Request-AudioTranscription @params
83116
$Result.Body.language | Should -BeExactly 'English'
84117
}
118+
119+
It 'Diarization model requires chunking_strategy parameter' {
120+
Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { $PesterBoundParameters }
121+
122+
# Test that chunking_strategy is set to 'auto' when using a diarization model
123+
{
124+
$params = @{
125+
File = ($script:TestData + '/meeting_sample.m4a')
126+
Model = 'model-transcribe-diarize'
127+
# ChunkingStrategy = 'auto' # intentionally omitted, should be set implicitly
128+
}
129+
$script:Result = Request-AudioTranscription @params -ea Stop
130+
} | Should -Not -Throw
131+
Should -InvokeVerifiable
132+
$script:Result.Body.chunking_strategy | Should -BeExactly 'auto'
133+
134+
# Test that chunking_strategy is NOT set when using a non-diarization model
135+
{
136+
$params = @{
137+
File = ($script:TestData + '/meeting_sample.m4a')
138+
Model = 'model-transcribe' # model without diarization
139+
# ChunkingStrategy = 'auto' # intentionally omitted, should not be set implicitly
140+
}
141+
$script:Result2 = Request-AudioTranscription @params -ea Stop
142+
} | Should -Not -Throw
143+
Should -InvokeVerifiable
144+
$script:Result2.Body.chunking_strategy | Should -BeNullOrEmpty
145+
}
85146
}
86147

87148
Context 'Integration tests (online)' -Tag 'Online' {
@@ -98,11 +159,11 @@ Describe 'Request-AudioTranscription' {
98159

99160
It 'Audio transcription (format: verbose_json)' {
100161
{ $params = @{
101-
File = ($script:TestData + '/voice_japanese.mp3')
102-
Model = 'whisper-1'
103-
Format = 'verbose_json'
104-
TimeoutSec = 30
105-
ErrorAction = 'Stop'
162+
File = ($script:TestData + '/voice_japanese.mp3')
163+
Model = 'whisper-1'
164+
ResponseFormat = 'verbose_json'
165+
TimeoutSec = 30
166+
ErrorAction = 'Stop'
106167
}
107168

108169
$script:Text = Request-AudioTranscription @params
@@ -112,6 +173,22 @@ Describe 'Request-AudioTranscription' {
112173
$ret.task | Should -Be 'transcribe'
113174
}
114175

176+
It 'Audio transcription (format: diarized_json)' {
177+
{ $params = @{
178+
File = ($script:TestData + '/meeting_sample.m4a')
179+
Model = 'gpt-4o-transcribe-diarize'
180+
ResponseFormat = 'diarized_json'
181+
TimeoutSec = 90
182+
ErrorAction = 'Stop'
183+
}
184+
185+
$script:Text = Request-AudioTranscription @params
186+
} | Should -Not -Throw
187+
$ret = ($Text | ConvertFrom-Json)
188+
$ret.text.Length | Should -BeGreaterThan 1
189+
$ret.segments | Should -Not -BeNullOrEmpty
190+
}
191+
115192
It 'Audio transcription (Stream)' {
116193
$params = @{
117194
File = ($script:TestData + '/voice_japanese.mp3')

Tests/TestData/meeting_sample.m4a

541 KB
Binary file not shown.

0 commit comments

Comments
 (0)