Request-AudioTranscription: Support new model gpt-4o-transcribe-diarize

mkht · mkht · commit 0efdc278efe5 · 2025-10-18T02:30:30.000+09:00
diff --git a/Public/Audio/Request-AudioTranscription.ps1 b/Public/Audio/Request-AudioTranscription.ps1
@@ -7,16 +7,17 @@ function Request-AudioTranscription {
         [string]$File,
 
         [Parameter()]
-        [Completions('whisper-1', 'gpt-4o-transcribe', 'gpt-4o-mini-transcribe')]
+        [Completions('whisper-1', 'gpt-4o-transcribe', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe-diarize')]
         [string]$Model = 'whisper-1',
 
         [Parameter()]
         [string]$Prompt,
 
         [Parameter()]
         [Alias('response_format')]
-        [ValidateSet('json', 'text', 'srt', 'verbose_json', 'vtt')]
-        [string]$Format = 'text',
+        [Alias('Format')]  # for backward compatibility
+        [ValidateSet('json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json')]
+        [string]$ResponseFormat = 'text',
 
         [Parameter()]
         [ValidateRange(0.0, 1.0)]
@@ -26,6 +27,29 @@ function Request-AudioTranscription {
         [Completions('logprobs')]
         [string[]]$Include,
 
+        [Parameter()]
+        [Alias('known_speaker_names')]
+        [string[]]$KnownSpeakerNames,
+
+        [Parameter()]
+        [Alias('known_speaker_references')]
+        [string[]]$KnownSpeakerReferences,
+
+        [Parameter()]
+        [ValidateSet('auto', 'server_vad')]
+        [Alias('chunking_strategy')]
+        [string]$ChunkingStrategy = 'auto',
+
+        [Parameter()]
+        [ValidateRange(0.0, 1.0)]
+        [float]$ChunkingStrategyThreshold,
+
+        [Parameter()]
+        [uint16]$ChunkingStrategyPrefixPadding,
+
+        [Parameter()]
+        [uint16]$ChunkingStrategySilenceDuration,
+
         [Parameter()]
         [ValidateSet('word', 'segment')]
         [Alias('timestamp_granularities')]
@@ -108,8 +132,8 @@ function Request-AudioTranscription {
             $PostBody.model = $Model
         }
         $PostBody.file = $FileInfo
-        if ($Format) {
-            $PostBody.response_format = $Format
+        if ($ResponseFormat) {
+            $PostBody.response_format = $ResponseFormat
         }
         if ($PSBoundParameters.ContainsKey('Prompt')) {
             $PostBody.prompt = $Prompt
@@ -123,6 +147,51 @@ function Request-AudioTranscription {
         if ($PSBoundParameters.ContainsKey('Include')) {
             $PostBody.'include[]' = $Include
         }
+
+        if ($PSBoundParameters.ContainsKey('KnownSpeakerNames')) {  # forwarded only when the caller supplied the parameter
+            $PostBody.'known_speaker_names[]' = $KnownSpeakerNames
+        }
+        if ($PSBoundParameters.ContainsKey('KnownSpeakerReferences')) {
+            $KnownSpeakerReferencesFileInfoList = @()
+            foreach ($ref in $KnownSpeakerReferences) {
+                $KnownSpeakerReferencesFileInfoList += Resolve-FileInfo $ref  # NOTE(review): Resolve-FileInfo is defined elsewhere; presumably maps each reference to a file object - confirm
+            }
+            if ($KnownSpeakerReferencesFileInfoList.Count -gt 0) {  # the field is left out of the request when the list ends up empty
+                $PostBody.'known_speaker_references[]' = $KnownSpeakerReferencesFileInfoList
+            }
+        }
+
+        #region Chunking Strategy
+        # 'auto' is sent as a plain string; 'server_vad' is sent as a compact JSON object.
+        $ChunkingStrategyOptions = @{}
+        if ($PSBoundParameters.ContainsKey('ChunkingStrategy') -and ($ChunkingStrategy -eq 'auto')) {
+            $PostBody.chunking_strategy = 'auto'
+        }
+        else {
+            # The VAD options apply whether 'server_vad' was requested explicitly or is only implied.
+            if ($PSBoundParameters.ContainsKey('ChunkingStrategy')) {
+                $ChunkingStrategyOptions.type = 'server_vad'
+            }
+            if ($PSBoundParameters.ContainsKey('ChunkingStrategyThreshold')) {
+                $ChunkingStrategyOptions.threshold = $ChunkingStrategyThreshold
+            }
+            if ($PSBoundParameters.ContainsKey('ChunkingStrategyPrefixPadding')) {
+                $ChunkingStrategyOptions.prefix_padding_ms = $ChunkingStrategyPrefixPadding
+            }
+            if ($PSBoundParameters.ContainsKey('ChunkingStrategySilenceDuration')) {
+                $ChunkingStrategyOptions.silence_duration_ms = $ChunkingStrategySilenceDuration
+            }
+        }
+        if ($ChunkingStrategyOptions.Keys.Count -gt 0) {
+            $ChunkingStrategyOptions.type = 'server_vad'
+            $PostBody.chunking_strategy = ConvertTo-Json $ChunkingStrategyOptions -Compress
+        }
+        elseif ($Model -like '*diarize*') {
+            # chunking_strategy parameter is required when using diarization models
+            $PostBody.chunking_strategy = 'auto'
+        }
+        #endregion Chunking Strategy
+
         if ($Language) {
             $PostBody.language = $Language
         }
diff --git a/Tests/Audio/Request-AudioTranscription.tests.ps1 b/Tests/Audio/Request-AudioTranscription.tests.ps1
@@ -26,6 +26,39 @@ Describe 'Request-AudioTranscription' {
             $Text | Should -Be 'MOCKED'
         }
 
+        It 'Audio transcription (format: verbose_json)' {  # calls the cmdlet through the legacy -Format alias of -ResponseFormat
+            Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { 'MOCKED' }  # stub the request function so it just returns 'MOCKED'
+            { $script:Text = Request-AudioTranscription -File ($script:TestData + '/voice_japanese.mp3') -Format 'verbose_json' -ea Stop } | Should -Not -Throw
+            Should -InvokeVerifiable  # fails unless the verifiable mock was actually called
+            $Text | Should -Be 'MOCKED'  # the stubbed response is expected to come back unchanged
+        }
+
+        It 'Audio transcription (full parameters)' {  # passes the diarization options together with the common ones; only checks that the call does not throw
+            Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { $PesterBoundParameters }  # the mock echoes back the parameters it was called with
+            {
+                $params = @{
+                    File                            = ($script:TestData + '/voice_japanese.mp3')
+                    Model                           = 'gpt-4o-transcribe-diarize'
+                    Prompt                          = 'This is a test.'
+                    ResponseFormat                  = 'diarized_json'
+                    Temperature                     = 0.7
+                    Include                         = @('logprobs')
+                    KnownSpeakerNames               = @('Alice', 'Bob')
+                    KnownSpeakerReferences          = @(($script:TestData + '/voice_japanese.mp3'), ($script:TestData + '/voice_japanese.mp3'))  # the same sample file is reused for both speakers
+                    ChunkingStrategy                = 'server_vad'
+                    ChunkingStrategyThreshold       = 0.5
+                    ChunkingStrategyPrefixPadding   = 200
+                    ChunkingStrategySilenceDuration = 900
+                    TimestampGranularities          = @('word', 'segment')
+                    Language                        = 'Japanese'
+                    TimeoutSec                      = 60
+                    MaxRetryCount                   = 2
+                }
+                $script:Result = Request-AudioTranscription @params -ea Stop  # NOTE(review): $script:Result is stored but never inspected in this test
+            } | Should -Not -Throw
+            Should -InvokeVerifiable  # fails unless the verifiable mock was actually called
+        }
+
         It 'Audio transcription (Stream text)' {
             Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequestSSE {
                 '{"type":"transcript.text.delta","delta":"ECHO","logprobs":[{"token":"ECHO","logprob":-0.0024760163,"bytes":[228,189,149]}]}'
@@ -82,6 +115,34 @@ Describe 'Request-AudioTranscription' {
             $Result = Request-AudioTranscription @params
             $Result.Body.language | Should -BeExactly 'English'
         }
+
+        It 'Diarization model requires chunking_strategy parameter' {
+            Mock -Verifiable -ModuleName $script:ModuleName Invoke-OpenAIAPIRequest { $PesterBoundParameters }  # echo the call parameters so the request body can be inspected
+
+            # Test that chunking_strategy is set to 'auto' when using a diarization model
+            {
+                $params = @{
+                    File  = ($script:TestData + '/meeting_sample.m4a')
+                    Model = 'model-transcribe-diarize'
+                    # ChunkingStrategy = 'auto'  # intentionally omitted, should be set implicitly
+                }
+                $script:Result = Request-AudioTranscription @params -ea Stop
+            } | Should -Not -Throw
+            Should -InvokeVerifiable
+            $script:Result.Body.chunking_strategy | Should -BeExactly 'auto'
+
+            # Test that chunking_strategy is NOT set when using a non-diarization model
+            {
+                $params = @{
+                    File  = ($script:TestData + '/meeting_sample.m4a')
+                    Model = 'model-transcribe'  # model without diarization
+                    # ChunkingStrategy = 'auto'  # intentionally omitted, should not be set implicitly
+                }
+                $script:Result2 = Request-AudioTranscription @params -ea Stop
+            } | Should -Not -Throw
+            Should -InvokeVerifiable
+            $script:Result2.Body.chunking_strategy | Should -BeNullOrEmpty
+        }
     }
 
     Context 'Integration tests (online)' -Tag 'Online' {
@@ -98,11 +159,11 @@ Describe 'Request-AudioTranscription' {
 
         It 'Audio transcription (format: verbose_json)' {
             { $params = @{
-                    File        = ($script:TestData + '/voice_japanese.mp3')
-                    Model       = 'whisper-1'
-                    Format      = 'verbose_json'
-                    TimeoutSec  = 30
-                    ErrorAction = 'Stop'
+                    File           = ($script:TestData + '/voice_japanese.mp3')
+                    Model          = 'whisper-1'
+                    ResponseFormat = 'verbose_json'
+                    TimeoutSec     = 30
+                    ErrorAction    = 'Stop'
                 }
 
                 $script:Text = Request-AudioTranscription @params
@@ -112,6 +173,22 @@ Describe 'Request-AudioTranscription' {
             $ret.task | Should -Be 'transcribe'
         }
 
+        It 'Audio transcription (format: diarized_json)' {  # requests diarized_json output from the diarization model for the meeting sample
+            { $params = @{
+                    File           = ($script:TestData + '/meeting_sample.m4a')
+                    Model          = 'gpt-4o-transcribe-diarize'
+                    ResponseFormat = 'diarized_json'
+                    TimeoutSec     = 90
+                    ErrorAction    = 'Stop'
+                }
+
+                $script:Text = Request-AudioTranscription @params  # stored in script scope so the assertions below can read it
+            } | Should -Not -Throw
+            $ret = ($Text | ConvertFrom-Json)  # the result is expected to be a JSON string
+            $ret.text.Length | Should -BeGreaterThan 1
+            $ret.segments | Should -Not -BeNullOrEmpty
+        }
+
         It 'Audio transcription (Stream)' {
             $params = @{
                 File        = ($script:TestData + '/voice_japanese.mp3')
diff --git a/Tests/TestData/meeting_sample.m4a b/Tests/TestData/meeting_sample.m4a