AgoraIO · digitallysavvy · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/README.md b/README.md
@@ -3,17 +3,19 @@
 [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python)
 [![pypi](https://img.shields.io/pypi/v/agent-server-sdk-python)](https://pypi.python.org/pypi/agent-server-sdk-python)
 
-The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs,
-enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS)
+The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, 
+enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) 
 and multimodal flows (MLLM) for real-time audio processing.
 
+
 ## Table of Contents
 
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Documentation](#documentation)
 - [Reference](#reference)
 - [Mllm Flow Multimodal](#mllm-flow-multimodal)
+- [MLLM Flow (Multimodal)](#mllm-flow-multimodal)
 - [Usage](#usage)
 - [Async Client](#async-client)
 - [Exception Handling](#exception-handling)
@@ -213,6 +215,71 @@ client.agents.start(
 )
 ```
 
+## MLLM Flow (Multimodal)
+
+For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details.
+
+```python
+from agora_agent import Agora
+from agora_agent.agents import (
+    StartAgentsRequestProperties,
+    StartAgentsRequestPropertiesAdvancedFeatures,
+    StartAgentsRequestPropertiesMllm,
+    StartAgentsRequestPropertiesMllmVendor,
+    StartAgentsRequestPropertiesTts,
+    StartAgentsRequestPropertiesTtsVendor,
+    StartAgentsRequestPropertiesLlm,
+    StartAgentsRequestPropertiesTurnDetection,
+    StartAgentsRequestPropertiesTurnDetectionType,
+)
+
+client = Agora(
+    customer_id="YOUR_CUSTOMER_ID",
+    customer_secret="YOUR_CUSTOMER_SECRET",
+)
+
+client.agents.start(
+    appid="your_app_id",
+    name="mllm_agent",
+    properties=StartAgentsRequestProperties(
+        channel="channel_name",
+        token="your_token",
+        agent_rtc_uid="1001",
+        remote_rtc_uids=["1002"],
+        idle_timeout=120,
+        advanced_features=StartAgentsRequestPropertiesAdvancedFeatures(
+            enable_mllm=True,
+        ),
+        mllm=StartAgentsRequestPropertiesMllm(
+            url="wss://api.openai.com/v1/realtime",
+            api_key="<your_openai_api_key>",
+            vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI,
+            params={
+                "model": "gpt-4o-realtime-preview",
+                "voice": "alloy",
+            },
+            input_modalities=["audio"],
+            output_modalities=["text", "audio"],
+            greeting_message="Hello! I'm ready to chat in real-time.",
+        ),
+        turn_detection=StartAgentsRequestPropertiesTurnDetection(
+            type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD,
+            threshold=0.5,
+            silence_duration_ms=500,
+        ),
+        # `tts` and `llm` are optional in the SDK model and are not used when MLLM is enabled; shown here for completeness
+        tts=StartAgentsRequestPropertiesTts(
+            vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT,
+            params={},
+        ),
+        llm=StartAgentsRequestPropertiesLlm(
+            url="https://api.openai.com/v1/chat/completions",
+        ),
+    ),
+)
+```
+
+
 ## Usage
 
 Instantiate and use the client with the following:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,9 +1,9 @@
 [project]
-name = "agent-server-sdk-python"
+name = "agora-agent-server-sdk"
 
 [tool.poetry]
-name = "agent-server-sdk-python"
-version = "1.1.0"
+name = "agora-agent-server-sdk"
+version = "1.1.1"
 description = ""
 readme = "README.md"
 authors = []

diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py
@@ -71,19 +71,19 @@ class StartAgentsRequestProperties(UncheckedBaseModel):
     Automatic Speech Recognition (ASR) configuration.
     """
 
-    tts: Tts = pydantic.Field()
+    tts: typing.Optional[Tts] = pydantic.Field(default=None)
     """
     Text-to-speech (TTS) module configuration.
     """
 
-    llm: StartAgentsRequestPropertiesLlm = pydantic.Field()
+    llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None)
     """
     Large language model (LLM) configuration.
     """
 
     mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None)
     """
-    Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing.
+    Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. MLLM is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline.
     """
 
     avatar: typing.Optional[StartAgentsRequestPropertiesAvatar] = pydantic.Field(default=None)

diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py
@@ -10,7 +10,7 @@
 
 class StartAgentsRequestPropertiesMllm(UncheckedBaseModel):
     """
-    Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing.
+    Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. MLLM is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline.
     """
 
     url: typing.Optional[str] = pydantic.Field(default=None)

diff --git a/src/agora_agent/agents/types/start_agents_request_properties_sal.py b/src/agora_agent/agents/types/start_agents_request_properties_sal.py
@@ -29,7 +29,7 @@ class StartAgentsRequestPropertiesSal(UncheckedBaseModel):
     > - For a registered voiceprint, ensure that:
     >   - Size: A single voiceprint file must not exceed 2 MB.
     >   - Duration: 10 to 15 seconds, with at least 8 seconds of effective audio without silent segments.
-    >   - Format: 16kHz sampling rate, 16-bit depth, mono PCM audio file. The file name extension must be ".pcm".      
+    >   - Format: 16kHz sampling rate, 16-bit depth, mono PCM audio file. The file name extension must be ".pcm".
     """
 
     if IS_PYDANTIC_V2:

diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py
@@ -26,10 +26,10 @@ def __init__(
 
     def get_headers(self) -> typing.Dict[str, str]:
         headers: typing.Dict[str, str] = {
-            "User-Agent": "agora-agent-server-sdk/1.1.0",
+            "User-Agent": "agora-agent-server-sdk/1.1.1",
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "agora-agent-server-sdk",
-            "X-Fern-SDK-Version": "1.1.0",
+            "X-Fern-SDK-Version": "1.1.1",
             **(self.get_custom_headers() or {}),
         }
         headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header