@@ -41,32 +41,18 @@ export class ClientSecrets extends APIResource {
4141}
4242
4343/**
44- * Ephemeral key returned by the API .
44+ * A Realtime session configuration object.
4545 */
46- export interface RealtimeSessionClientSecret {
47- /**
48- * Timestamp for when the token expires. Currently, all tokens expire after one
49- * minute.
50- */
51- expires_at : number ;
52-
46+ export interface RealtimeSessionCreateResponse {
5347 /**
54- * Ephemeral key usable in client environments to authenticate connections to the
55- * Realtime API. Use this in client-side environments rather than a standard API
56- * token, which should only be used server-side.
48+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
5749 */
58- value : string ;
59- }
50+ id : string ;
6051
61- /**
62- * A new Realtime session configuration, with an ephemeral key. Default TTL for
63- * keys is one minute.
64- */
65- export interface RealtimeSessionCreateResponse {
6652 /**
67- * Ephemeral key returned by the API .
53+ * The object type. Always `realtime.session`.
6854 */
69- client_secret : RealtimeSessionClientSecret ;
55+ object : 'realtime.session' ;
7056
7157 /**
7258 * The type of session to create. Always `realtime` for the Realtime API.
@@ -78,6 +64,11 @@ export interface RealtimeSessionCreateResponse {
7864 */
7965 audio ?: RealtimeSessionCreateResponse . Audio ;
8066
67+ /**
68+ * Expiration timestamp for the session, in seconds since epoch.
69+ */
70+ expires_at ?: number ;
71+
8172 /**
8273 * Additional fields to include in server outputs.
8374 *
@@ -115,6 +106,7 @@ export interface RealtimeSessionCreateResponse {
115106 | ( string & { } )
116107 | 'gpt-realtime'
117108 | 'gpt-realtime-1.5'
109+ | 'gpt-realtime-2'
118110 | 'gpt-realtime-2025-08-28'
119111 | 'gpt-4o-realtime-preview'
120112 | 'gpt-4o-realtime-preview-2024-10-01'
@@ -144,6 +136,11 @@ export interface RealtimeSessionCreateResponse {
144136 */
145137 prompt ?: ResponsesAPI . ResponsePrompt | null ;
146138
139+ /**
140+ * Configuration for reasoning-capable Realtime models such as `gpt-realtime-2`.
141+ */
142+ reasoning ?: RealtimeAPI . RealtimeReasoning ;
143+
147144 /**
148145 * How the model chooses tools. Provide one of the string modes or force a specific
149146 * function/MCP tool.
@@ -215,16 +212,6 @@ export namespace RealtimeSessionCreateResponse {
215212 */
216213 noise_reduction ?: Input . NoiseReduction ;
217214
218- /**
219- * Configuration for input audio transcription, defaults to off and can be set to
220- * `null` to turn off once on. Input audio transcription is not native to the
221- * model, since the model consumes audio directly. Transcription runs
222- * asynchronously through
223- * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
224- * and should be treated as guidance of input audio content rather than precisely
225- * what the model heard. The client can optionally set the language and prompt for
226- * transcription, these offer additional guidance to the transcription service.
227- */
228215 transcription ?: RealtimeAPI . AudioTranscription ;
229216
230217 /**
@@ -241,6 +228,9 @@ export namespace RealtimeSessionCreateResponse {
241228 * trails off with "uhhm", the model will score a low probability of turn end and
242229 * wait longer for the user to continue speaking. This can be useful for more
243230 * natural conversations, but may have a higher latency.
231+ *
232+ * For `gpt-realtime-whisper` transcription sessions, turn detection must be set to
233+ * `null`; VAD is not supported.
244234 */
245235 turn_detection ?: Input . ServerVad | Input . SemanticVad | null ;
246236 }
@@ -640,17 +630,15 @@ export namespace RealtimeTranscriptionSessionCreateResponse {
640630 */
641631 noise_reduction ?: Input . NoiseReduction ;
642632
643- /**
644- * Configuration of the transcription model.
645- */
646633 transcription ?: RealtimeAPI . AudioTranscription ;
647634
648635 /**
649636 * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
650637 * means that the model will detect the start and end of speech based on audio
651- * volume and respond at the end of user speech.
638+ * volume and respond at the end of user speech. For `gpt-realtime-whisper`, this
639+ * must be `null`; VAD is not supported.
652640 */
653- turn_detection ?: ClientSecretsAPI . RealtimeTranscriptionSessionTurnDetection ;
641+ turn_detection ?: ClientSecretsAPI . RealtimeTranscriptionSessionTurnDetection | null ;
654642 }
655643
656644 export namespace Input {
@@ -672,7 +660,8 @@ export namespace RealtimeTranscriptionSessionCreateResponse {
672660/**
673661 * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
674662 * means that the model will detect the start and end of speech based on audio
675- * volume and respond at the end of user speech.
663+ * volume and respond at the end of user speech. For `gpt-realtime-whisper`, this
664+ * must be `null`; VAD is not supported.
676665 */
677666export interface RealtimeTranscriptionSessionTurnDetection {
678667 /**
@@ -763,7 +752,6 @@ export namespace ClientSecretCreateParams {
763752
764753export declare namespace ClientSecrets {
765754 export {
766- type RealtimeSessionClientSecret as RealtimeSessionClientSecret ,
767755 type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse ,
768756 type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse ,
769757 type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection ,
0 commit comments