From 60f844d5ebb5413acae6eff7eeac12180926958d Mon Sep 17 00:00:00 2001 From: "fern-api[bot]" <115122769+fern-api[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:01:53 +0000 Subject: [PATCH 1/4] SDK regeneration --- README.md | 73 ++++++++++++++++++- pyproject.toml | 2 +- .../types/start_agents_request_properties.py | 6 +- .../start_agents_request_properties_mllm.py | 2 +- .../start_agents_request_properties_sal.py | 2 +- src/agora_agent/core/client_wrapper.py | 4 +- 6 files changed, 78 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 3b661dd..44310b3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ -# Agora Agent Server SDK for Python +# Agoraio Python Library [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) [![pypi](https://img.shields.io/pypi/v/agora-agent-server-sdk)](https://pypi.python.org/pypi/agora-agent-server-sdk) -The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, -enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) +The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, +enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) and multimodal flows (MLLM) for real-time audio processing. + ## Table of Contents - [Installation](#installation) @@ -14,6 +15,7 @@ and multimodal flows (MLLM) for real-time audio processing. 
- [Documentation](#documentation) - [Reference](#reference) - [Mllm Flow Multimodal](#mllm-flow-multimodal) +- [Mllm Flow Multimodal](#mllm-flow-multimodal) - [Usage](#usage) - [Async Client](#async-client) - [Exception Handling](#exception-handling) @@ -212,6 +214,71 @@ client.agents.start( ) ``` +## MLLM Flow (Multimodal) + +For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. + +```python +from agora_agent import Agora +from agora_agent.agents import ( + StartAgentsRequestProperties, + StartAgentsRequestPropertiesAdvancedFeatures, + StartAgentsRequestPropertiesMllm, + StartAgentsRequestPropertiesMllmVendor, + StartAgentsRequestPropertiesTts, + StartAgentsRequestPropertiesTtsVendor, + StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionType, +) + +client = Agora( + customer_id="YOUR_CUSTOMER_ID", + customer_secret="YOUR_CUSTOMER_SECRET", +) + +client.agents.start( + appid="your_app_id", + name="mllm_agent", + properties=StartAgentsRequestProperties( + channel="channel_name", + token="your_token", + agent_rtc_uid="1001", + remote_rtc_uids=["1002"], + idle_timeout=120, + advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( + enable_mllm=True, + ), + mllm=StartAgentsRequestPropertiesMllm( + url="wss://api.openai.com/v1/realtime", + api_key="YOUR_OPENAI_API_KEY", + vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + params={ + "model": "gpt-4o-realtime-preview", + "voice": "alloy", + }, + input_modalities=["audio"], + output_modalities=["text", "audio"], + greeting_message="Hello! 
I'm ready to chat in real-time.", + ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, + threshold=0.5, + silence_duration_ms=500, + ), + # TTS and LLM are optional and are not used when MLLM is enabled; shown here only for completeness + tts=StartAgentsRequestPropertiesTts( + vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, + params={}, + ), + llm=StartAgentsRequestPropertiesLlm( + url="https://api.openai.com/v1/chat/completions", + ), + ), +) +``` + + ## Usage Instantiate and use the client with the following: diff --git a/pyproject.toml b/pyproject.toml index 8f51a0e..8b2dcf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "1.1.0" +version = "1.1.1" description = "" readme = "README.md" authors = [] diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index 9f8b762..538a2b0 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -71,19 +71,19 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Automatic Speech Recognition (ASR) configuration. """ - tts: Tts = pydantic.Field() + tts: typing.Optional[Tts] = pydantic.Field(default=None) """ Text-to-speech (TTS) module configuration. """ - llm: StartAgentsRequestPropertiesLlm = pydantic.Field() + llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. 
MLLM is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ avatar: typing.Optional[StartAgentsRequestPropertiesAvatar] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py index 881c155..c0b9f61 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py @@ -10,7 +10,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. MLLM is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ url: typing.Optional[str] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_sal.py b/src/agora_agent/agents/types/start_agents_request_properties_sal.py index c39b299..1d8b0b5 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_sal.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_sal.py @@ -29,7 +29,7 @@ class StartAgentsRequestPropertiesSal(UncheckedBaseModel): > - For a registered voiceprint, ensure that: > - Size: A single voiceprint file must not exceed 2 MB. > - Duration: 10 to 15 seconds, with at least 8 seconds of effective audio without silent segments. - > - Format: 16kHz sampling rate, 16-bit depth, mono PCM audio file. The file name extension must be ".pcm". + > - Format: 16kHz sampling rate, 16-bit depth, mono PCM audio file. The file name extension must be ".pcm". 
""" if IS_PYDANTIC_V2: diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index 9bd9ac7..d28bbbf 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/1.1.0", + "User-Agent": "agora-agent-server-sdk/1.1.1", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agent-server-sdk", - "X-Fern-SDK-Version": "1.1.0", + "X-Fern-SDK-Version": "1.1.1", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header From ea2dd59e481a6b6e7dfb973f55c62d692d4df302 Mon Sep 17 00:00:00 2001 From: Hermes Date: Wed, 11 Mar 2026 10:24:17 -0400 Subject: [PATCH 2/4] Update project title in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eccf502..e01e338 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Agoraio Python Library +# Agora Agent Server SDK for Python [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) [![pypi](https://img.shields.io/pypi/v/agent-server-sdk-python)](https://pypi.python.org/pypi/agent-server-sdk-python) From 848bcaa75f6dfe32706121f11a8a4613f99eda93 Mon Sep 17 00:00:00 2001 From: Hermes Date: Wed, 11 Mar 2026 10:27:02 -0400 Subject: [PATCH 3/4] Rename project to 'agent-server-sdk-python' --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4e3c1f3..1747f0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "agent-server-sdk-python" [tool.poetry] -name = 
"agora-agent-server-sdk" +name = "agent-server-sdk-python" version = "1.1.1" description = "" readme = "README.md" From 5fdac4e29edec995b608101c91615ff8ec789451 Mon Sep 17 00:00:00 2001 From: Hermes Date: Wed, 11 Mar 2026 10:40:51 -0400 Subject: [PATCH 4/4] Rename project to agora-agent-server-sdk --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1747f0f..8b2dcf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [project] -name = "agent-server-sdk-python" +name = "agora-agent-server-sdk" [tool.poetry] -name = "agent-server-sdk-python" +name = "agora-agent-server-sdk" version = "1.1.1" description = "" readme = "README.md"