
Commit ca47a7d

Nash0x7E2 and dangusev authored
[AI-367] Add support for Qwen 3 Realtime/OMNI (#255)
* Add support for qwen omni
* Hacky - support for LLM audio output
* Add Qwen 3 Flash Realtime plugin
* Update example to realtime
* Update readme to focus on realtime
* Fix frame forwarder fps
* Add reconnection to Qwen3 realtime
* Add tests for Qwen 3 realtime
* Remove Qwen3 omni
* Update uv.lock and qwen/pyproject.toml
* qwen3 realtime: Fix fps param handling
* fix mypy
* update uv.lock
* Update example readme
* Update docstring
* fix ruff
* Resolve ruff + mypy warnings
* remove unused agent modification

---------

Co-authored-by: Daniil Gusev <dangusev92@gmail.com>
1 parent 1cf84f4 commit ca47a7d

17 files changed

Lines changed: 866 additions & 51 deletions

conftest.py

Lines changed: 41 additions & 51 deletions
```diff
@@ -139,28 +139,11 @@ def get_assets_dir():
     return os.path.join(os.path.dirname(__file__), "tests", "test_assets")
 
 
-@pytest.fixture(scope="session")
-def assets_dir():
-    """Fixture providing the test assets directory path."""
-    return get_assets_dir()
-
-
-@pytest.fixture
-def participant():
-    """Create a test participant for STT testing."""
-    return Participant({}, user_id="test-user")
-
-
-@pytest.fixture
-def mia_audio_16khz():
-    """Load mia.mp3 and convert to 16kHz PCM data."""
-    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
-
+def _mp3_to_pcm(path: str, target_rate: int) -> PcmData:
     # Load audio file using PyAV
-    container = av.open(audio_file_path)
+    container = av.open(path)
     audio_stream = container.streams.audio[0]
     original_sample_rate = audio_stream.sample_rate
-    target_rate = 16000
 
     # Create resampler if needed
     resampler = None
@@ -194,46 +177,39 @@ def mia_audio_16khz():
     return pcm
 
 
-@pytest.fixture
-def mia_audio_48khz():
-    """Load mia.mp3 and convert to 48kHz PCM data."""
-    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+@pytest.fixture(scope="session")
+def assets_dir():
+    """Fixture providing the test assets directory path."""
+    return get_assets_dir()
 
-    # Load audio file using PyAV
-    container = av.open(audio_file_path)
-    audio_stream = container.streams.audio[0]
-    original_sample_rate = audio_stream.sample_rate
-    target_rate = 48000
 
-    # Create resampler if needed
-    resampler = None
-    if original_sample_rate != target_rate:
-        resampler = av.AudioResampler(format="s16", layout="mono", rate=target_rate)
+@pytest.fixture
+def participant():
+    """Create a test participant for STT testing."""
+    return Participant({}, user_id="test-user")
 
-    # Read all audio frames
-    samples = []
-    for frame in container.decode(audio_stream):
-        # Resample if needed
-        if resampler:
-            frame = resampler.resample(frame)[0]
 
-        # Convert to numpy array
-        frame_array = frame.to_ndarray()
-        if len(frame_array.shape) > 1:
-            # Convert stereo to mono
-            frame_array = np.mean(frame_array, axis=0)
-        samples.append(frame_array)
+@pytest.fixture
+def mia_audio_16khz():
+    """Load mia.mp3 and convert to 16kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 16000)
+    return pcm
 
-    # Concatenate all samples
-    samples = np.concatenate(samples)
 
-    # Convert to int16
-    samples = samples.astype(np.int16)
-    container.close()
+@pytest.fixture
+def describe_what_you_see_audio_16khz():
+    """Load describe_what_you_see.mp3 and convert to 16kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "describe_what_you_see.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 16000)
+    return pcm
 
-    # Create PCM data
-    pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
 
+@pytest.fixture
+def mia_audio_48khz():
+    """Load mia.mp3 and convert to 48kHz PCM data."""
+    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+    pcm = _mp3_to_pcm(audio_file_path, 48000)
     return pcm
 
 
@@ -255,6 +231,20 @@ def silence_2s_48khz():
     return pcm
 
 
+@pytest.fixture
+def silence_1s_16khz():
+    """Generate 1 second of silence at 16kHz PCM data."""
+    sample_rate = 16000
+
+    # Create silence (zeros) as int16
+    samples = np.zeros(sample_rate, dtype=np.int16)
+
+    # Create PCM data
+    pcm = PcmData(samples=samples, sample_rate=sample_rate, format=AudioFormat.S16)
+
+    return pcm
+
+
 @pytest.fixture
 def mia_audio_48khz_chunked():
     """Load mia.mp3 and yield 48kHz PCM data in 20ms chunks."""
```

plugins/qwen/README.md

Lines changed: 69 additions & 0 deletions
# Qwen Realtime Plugin for Vision Agents

Qwen3 Realtime LLM integration for the Vision Agents framework, providing native audio output and built-in speech recognition over WebSocket-based realtime communication.

## Features

- **Native audio output**: No TTS service needed - audio comes directly from the model
- **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
- **Server-side VAD**: Automatic turn detection with configurable silence thresholds
- **Video understanding**: Optional video frame support for multimodal interactions
- **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
- **Interruption handling**: Automatic cancellation when the user starts speaking

## Installation

```bash
uv add vision-agents[qwen]
```

## Usage

```python
from vision_agents.core import User, Agent
from vision_agents.plugins import getstream, qwen

agent = Agent(
    edge=getstream.Edge(),
    agent_user=User(name="Qwen Assistant"),
    instructions="Be helpful and friendly",
    llm=qwen.Realtime(
        model="qwen3-omni-flash-realtime",
        voice="Cherry",
        fps=1,
    ),
    # No STT or TTS needed - Qwen Realtime provides both
)
```

## Configuration

| Parameter | Description | Default | Accepted Values |
|-----------|-------------|---------|----------------|
| `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
| `api_key` | DashScope API key | `None` (from env) | String or `None` |
| `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
| `voice` | Voice for audio output | `"Cherry"` | Voice name string |
| `fps` | Video frames per second | `1` | Integer |
| `include_video` | Include video frames in requests | `False` | Boolean |
| `video_width` | Video frame width | `1280` | Integer |
| `video_height` | Video frame height | `720` | Integer |
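As an illustrative sketch of the video options, a video-enabled configuration might look like the following; the parameter names are taken from the table above, while the specific values are hypothetical:

```python
from vision_agents.plugins import qwen

# Hypothetical video-enabled configuration. Parameter names come from the
# table above; the values here are illustrative, not recommendations.
llm = qwen.Realtime(
    model="qwen3-omni-flash-realtime",
    voice="Cherry",
    include_video=True,  # send video frames alongside audio
    fps=2,               # capture two frames per second
    video_width=1280,
    video_height=720,
)
```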
## Environment Variables

Set `DASHSCOPE_API_KEY` in your environment or `.env` file:

```bash
DASHSCOPE_API_KEY=your_dashscope_api_key_here
```
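If you prefer not to rely on the environment, the `api_key` parameter from the configuration table can be passed explicitly; a minimal sketch:

```python
import os

from vision_agents.plugins import qwen

# Explicit key passing. With the default api_key=None, the plugin reads
# DASHSCOPE_API_KEY from the environment instead (per the table above).
llm = qwen.Realtime(api_key=os.environ["DASHSCOPE_API_KEY"])
```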
## Example

See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.

## Dependencies

- vision-agents
- websockets
- aiortc
- av

plugins/qwen/example/README.md

Lines changed: 94 additions & 0 deletions
# Qwen Realtime Example

This example demonstrates how to use Qwen Realtime with Vision Agents for real-time conversations.

## Features

- **Real-time streaming**: Direct audio streaming from the Qwen Realtime API
- **No text input**: The model does not support text input, so start speaking once you join the call
- **Video support**: Configure frames per second for video processing

## Installation

```bash
uv add vision-agents[qwen]
```

## Quick Start

1. Set your API key in your environment:

   ```bash
   export DASHSCOPE_API_KEY=your_dashscope_api_key_here
   ```

   Or create a `.env` file:

   ```
   DASHSCOPE_API_KEY=your_dashscope_api_key_here
   ```

2. Run the example:

   ```bash
   uv run python qwen_realtime_example.py
   ```

## Code Example

```python
from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import getstream, qwen

load_dotenv()


async def create_agent(**kwargs) -> Agent:
    llm = qwen.Realtime(fps=1)

    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=llm,
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    with await agent.join(call):
        await agent.edge.open_demo(call)
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

## Configuration

### Environment Variables

- **`DASHSCOPE_API_KEY`**: Your DashScope/Alibaba Cloud API key (required)

### Realtime Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `fps` | Video frames per second | `1` |
| `api_key` | DashScope API key | `None` (from env) |

## Requirements

- Python 3.10+
- DashScope API key
- Stream API credentials (configured via `getstream.Edge()`)
- `vision-agents` framework

## Notes

- The model is hosted in Singapore, so latency may vary depending on your location
- The model does not support text input - once you join the call, simply start speaking to the agent
- This example uses the CLI interface for easy interaction

plugins/qwen/example/__init__.py

Whitespace-only changes.

plugins/qwen/example/env.example

Lines changed: 6 additions & 0 deletions
```
# DashScope API key for Qwen Omni
DASHSCOPE_API_KEY=your_dashscope_api_key_here

# Stream API credentials
STREAM_API_KEY=your_stream_api_key
STREAM_API_SECRET=your_stream_api_secret
```
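Since the example depends on `python-dotenv`, these values are typically loaded into the process environment at startup. A minimal sketch, using the variable names from the file above (the sanity check is illustrative, not part of the committed example):

```python
import os

from dotenv import load_dotenv

# Loads DASHSCOPE_API_KEY, STREAM_API_KEY, and STREAM_API_SECRET
# from a .env file into the process environment.
load_dotenv()

# Illustrative fail-fast check for the required credentials.
required = ("DASHSCOPE_API_KEY", "STREAM_API_KEY", "STREAM_API_SECRET")
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing required env vars: {', '.join(missing)}")
```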
plugins/qwen/example/pyproject.toml

Lines changed: 16 additions & 0 deletions
```toml
[project]
name = "qwen-omni-example"
version = "0.1.0"
description = "Example using Qwen Omni with Vision Agents"
requires-python = ">=3.10"
dependencies = [
    "vision-agents",
    "vision-agents-plugins-qwen",
    "vision-agents-plugins-getstream",
    "python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-qwen = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
```
plugins/qwen/example/qwen_realtime_example.py

Lines changed: 36 additions & 0 deletions
```python
# This is a basic example using Qwen Realtime with Vision Agents
# To run this example, you must have DASHSCOPE_API_KEY set in your env.
# Do note that the model is hosted in Singapore so depending on your location, the latency may vary.
# This model also does not support text input so once you join the call, simply start speaking to the agent.

from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import getstream, qwen

load_dotenv()


async def create_agent(**kwargs) -> Agent:
    llm = qwen.Realtime(fps=1)

    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=llm,
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    with await agent.join(call):
        await agent.edge.open_demo(call)
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

plugins/qwen/py.typed

Whitespace-only changes.

plugins/qwen/pyproject.toml

Lines changed: 37 additions & 0 deletions
```toml
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-qwen"
dynamic = ["version"]
description = "Qwen Omni plugin for vision agents"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
dependencies = [
    "vision-agents",
    "numpy",
    "websockets>=15.0.1",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
]
```
