
Commit ca47a7d

Nash0x7E2 and dangusev authored
[AI-367] Add support for Qwen 3 Realtime/OMNI (#255)
* Add support for qwen omni
* Hacky - support for LLM audio output
* Add Qwen 3 Flash Realtime plugin
* Update example to realtime
* Update readme to focus on realtime
* Fix frame forwarder fps
* Add reconnection to Qwen3 realtime
* Add tests for Qwen 3 realtime
* Remove Qwen3 omni
* Update uv.lock and qwen/pyproject.toml
* qwen3 realtime: Fix fps param handling
* fix mypy
* update uv.lock
* Update example readme
* Update docstring
* fix ruff
* Resolve ruff + mypy warnings
* remove unused agent modification

---------

Co-authored-by: Daniil Gusev <dangusev92@gmail.com>
1 parent 1cf84f4 commit ca47a7d

17 files changed

Lines changed: 866 additions & 51 deletions

conftest.py

Lines changed: 41 additions & 51 deletions
```diff
@@ -139,28 +139,11 @@ def get_assets_dir():
     return os.path.join(os.path.dirname(__file__), "tests", "test_assets")
 
 
-@pytest.fixture(scope="session")
-def assets_dir():
-    """Fixture providing the test assets directory path."""
-    return get_assets_dir()
-
-
-@pytest.fixture
-def participant():
-    """Create a test participant for STT testing."""
-    return Participant({}, user_id="test-user")
-
-
-@pytest.fixture
-def mia_audio_16khz():
-    """Load mia.mp3 and convert to 16kHz PCM data."""
-    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
-
+def _mp3_to_pcm(path: str, target_rate: int) -> PcmData:
     # Load audio file using PyAV
-    container = av.open(audio_file_path)
+    container = av.open(path)
     audio_stream = container.streams.audio[0]
     original_sample_rate = audio_stream.sample_rate
-    target_rate = 16000
 
     # Create resampler if needed
     resampler = None
@@ -194,46 +177,39 @@ def mia_audio_16khz():
     return pcm
 
 
-@pytest.fixture
-def mia_audio_48khz():
-    """Load mia.mp3 and convert to 48kHz PCM data."""
-    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+@pytest.fixture(scope="session")
+def assets_dir():
+    """Fixture providing the test assets directory path."""
+    return get_assets_dir()
 
-    # Load audio file using PyAV
-    container = av.open(audio_file_path)
-    audio_stream = container.streams.audio[0]
-    original_sample_rate = audio_stream.sample_rate
-    target_rate = 48000
 
-    # Create resampler if needed
-    resampler = None
-    if original_sample_rate != target_rate:
-        resampler = av.AudioResampler(format="s16", layout="mono", rate=target_rate)
+@pytest.fixture
+def participant():
+    """Create a test participant for STT testing."""
+    return Participant({}, user_id="test-user")
 
-    # Read all audio frames
-    samples = []
-    for frame in container.decode(audio_stream):
-        # Resample if needed
-        if resampler:
-            frame = resampler.resample(frame)[0]
 
-        # Convert to numpy array
-        frame_array = frame.to_ndarray()
-        if len(frame_array.shape) > 1:
-            # Convert stereo to mono
-            frame_array = np.mean(frame_array, axis=0)
-        samples.append(frame_array)
+@pytest.fixture
+def mia_audio_16khz():
+    """Load mia.mp3 and convert to 16kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 16000)
+    return pcm
 
-    # Concatenate all samples
-    samples = np.concatenate(samples)
 
-    # Convert to int16
-    samples = samples.astype(np.int16)
-    container.close()
+@pytest.fixture
+def describe_what_you_see_audio_16khz():
+    """Load describe_what_you_see.mp3 and convert to 16kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "describe_what_you_see.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 16000)
+    return pcm
 
-    # Create PCM data
-    pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
 
+@pytest.fixture
+def mia_audio_48khz():
+    """Load mia.mp3 and convert to 48kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 48000)
     return pcm
 
 
@@ -255,6 +231,20 @@ def silence_2s_48khz():
     return pcm
 
 
+@pytest.fixture
+def silence_1s_16khz():
+    """Generate 1 second of silence at 16kHz PCM data."""
+    sample_rate = 16000
+
+    # Create silence (zeros) as int16
+    samples = np.zeros(sample_rate, dtype=np.int16)
+
+    # Create PCM data
+    pcm = PcmData(samples=samples, sample_rate=sample_rate, format=AudioFormat.S16)
+
+    return pcm
+
+
 @pytest.fixture
 def mia_audio_48khz_chunked():
     """Load mia.mp3 and yield 48kHz PCM data in 20ms chunks."""
```

plugins/qwen/README.md

Lines changed: 69 additions & 0 deletions
# Qwen Realtime Plugin for Vision Agents

Qwen3 Realtime LLM integration for the Vision Agents framework, providing native audio output and built-in speech recognition over WebSocket-based realtime communication.

## Features

- **Native audio output**: No TTS service needed - audio comes directly from the model
- **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
- **Server-side VAD**: Automatic turn detection with configurable silence thresholds
- **Video understanding**: Optional video frame support for multimodal interactions
- **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
- **Interruption handling**: Automatic cancellation when the user starts speaking

## Installation

```bash
uv add vision-agents[qwen]
```

## Usage

```python
from vision_agents.core import User, Agent
from vision_agents.plugins import getstream, qwen

agent = Agent(
    edge=getstream.Edge(),
    agent_user=User(name="Qwen Assistant"),
    instructions="Be helpful and friendly",
    llm=qwen.Realtime(
        model="qwen3-omni-flash-realtime",
        voice="Cherry",
        fps=1,
    ),
    # No STT or TTS needed - Qwen Realtime provides both
)
```

## Configuration

| Parameter | Description | Default | Accepted Values |
|-----------|-------------|---------|----------------|
| `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
| `api_key` | DashScope API key | `None` (from env) | String or `None` |
| `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
| `voice` | Voice for audio output | `"Cherry"` | Voice name string |
| `fps` | Video frames per second | `1` | Integer |
| `include_video` | Include video frames in requests | `False` | Boolean |
| `video_width` | Video frame width | `1280` | Integer |
| `video_height` | Video frame height | `720` | Integer |
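As an illustrative sketch of the video options, a video-enabled configuration might look like the following; the parameter names are taken from the table above, while the specific values are hypothetical:

```python
from vision_agents.plugins import qwen

# Hypothetical video-enabled configuration. Parameter names come from the
# table above; the values here are illustrative, not recommendations.
llm = qwen.Realtime(
    model="qwen3-omni-flash-realtime",
    voice="Cherry",
    include_video=True,  # send video frames alongside audio
    fps=2,               # capture two frames per second
    video_width=1280,
    video_height=720,
)
```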
## Environment Variables

Set `DASHSCOPE_API_KEY` in your environment or `.env` file:

```bash
DASHSCOPE_API_KEY=your_dashscope_api_key_here
```
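If you prefer not to rely on the environment, the `api_key` parameter from the configuration table can be passed explicitly; a minimal sketch:

```python
import os

from vision_agents.plugins import qwen

# Explicit key passing. With the default api_key=None, the plugin reads
# DASHSCOPE_API_KEY from the environment instead (per the table above).
llm = qwen.Realtime(api_key=os.environ["DASHSCOPE_API_KEY"])
```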
## Example

See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.

## Dependencies

- vision-agents
- websockets
- aiortc
- av

plugins/qwen/example/README.md

Lines changed: 94 additions & 0 deletions
# Qwen Realtime Example

This example demonstrates how to use Qwen Realtime with Vision Agents for real-time conversations.

## Features

- **Real-time streaming**: Direct audio streaming from the Qwen Realtime API
- **No text input**: The model does not support text input, so start speaking once you join the call
- **Video support**: Configure frames per second for video processing

## Installation

```bash
uv add vision-agents[qwen]
```

## Quick Start

1. Set your API key in your environment:

   ```bash
   export DASHSCOPE_API_KEY=your_dashscope_api_key_here
   ```

   Or create a `.env` file:

   ```
   DASHSCOPE_API_KEY=your_dashscope_api_key_here
   ```

2. Run the example:

   ```bash
   uv run python qwen_realtime_example.py
   ```

## Code Example

```python
from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import getstream, qwen

load_dotenv()


async def create_agent(**kwargs) -> Agent:
    llm = qwen.Realtime(fps=1)

    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=llm,
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    with await agent.join(call):
        await agent.edge.open_demo(call)
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

## Configuration

### Environment Variables

- **`DASHSCOPE_API_KEY`**: Your DashScope/Alibaba Cloud API key (required)

### Realtime Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `fps` | Video frames per second | `1` |
| `api_key` | DashScope API key | `None` (from env) |

## Requirements

- Python 3.10+
- DashScope API key
- Stream API credentials (configured via `getstream.Edge()`)
- `vision-agents` framework

## Notes

- The model is hosted in Singapore, so latency may vary depending on your location
- The model does not support text input - once you join the call, simply start speaking to the agent
- This example uses the CLI interface for easy interaction

plugins/qwen/example/__init__.py

Whitespace-only changes.

plugins/qwen/example/env.example

Lines changed: 6 additions & 0 deletions
```
# DashScope API key for Qwen Omni
DASHSCOPE_API_KEY=your_dashscope_api_key_here

# Stream API credentials
STREAM_API_KEY=your_stream_api_key
STREAM_API_SECRET=your_stream_api_secret
```
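Since the example depends on `python-dotenv`, these values are typically loaded into the process environment at startup. A minimal sketch, using the variable names from the file above (the sanity check is illustrative, not part of the committed example):

```python
import os

from dotenv import load_dotenv

# Loads DASHSCOPE_API_KEY, STREAM_API_KEY, and STREAM_API_SECRET
# from a .env file into the process environment.
load_dotenv()

# Illustrative fail-fast check for the required credentials.
required = ("DASHSCOPE_API_KEY", "STREAM_API_KEY", "STREAM_API_SECRET")
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing required env vars: {', '.join(missing)}")
```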
plugins/qwen/example/pyproject.toml

Lines changed: 16 additions & 0 deletions
```toml
[project]
name = "qwen-omni-example"
version = "0.1.0"
description = "Example using Qwen Omni with Vision Agents"
requires-python = ">=3.10"
dependencies = [
    "vision-agents",
    "vision-agents-plugins-qwen",
    "vision-agents-plugins-getstream",
    "python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-qwen = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
```
plugins/qwen/example/qwen_realtime_example.py

Lines changed: 36 additions & 0 deletions
```python
# This is a basic example using Qwen Realtime with Vision Agents
# To run this example, you must have DASHSCOPE_API_KEY set in your env.
# Do note that the model is hosted in Singapore so depending on your location, the latency may vary.
# This model also does not support text input so once you join the call, simply start speaking to the agent.

from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import getstream, qwen

load_dotenv()


async def create_agent(**kwargs) -> Agent:
    llm = qwen.Realtime(fps=1)

    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=llm,
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    with await agent.join(call):
        await agent.edge.open_demo(call)
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

plugins/qwen/py.typed

Whitespace-only changes.

plugins/qwen/pyproject.toml

Lines changed: 37 additions & 0 deletions
```toml
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-qwen"
dynamic = ["version"]
description = "Qwen Omni plugin for vision agents"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
dependencies = [
    "vision-agents",
    "numpy",
    "websockets>=15.0.1",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
]
```
