Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
38f8ba5
add resale advisor example
maxkahan Apr 3, 2026
36a3160
Upgrade transformers library version
Nash0x7E2 Apr 6, 2026
e4d7035
Change example to voice only for local testing
Nash0x7E2 Apr 6, 2026
ec6719e
Add resolve_device for mapping DeviceType
Nash0x7E2 Apr 6, 2026
047d7d6
Extractr common logic to _local_inference
Nash0x7E2 Apr 7, 2026
17df094
Add API for MLX models
Nash0x7E2 Apr 7, 2026
6dafa40
Convert example to use Gemma MLX Quant
Nash0x7E2 Apr 7, 2026
cad81d7
Add huggingface to root toml
Nash0x7E2 Apr 7, 2026
61861f1
Fixes for formatting and mypy
Nash0x7E2 Apr 7, 2026
61674a6
Fast-path tool-call extraction when output has no JSON braces
Nash0x7E2 Apr 21, 2026
8e31104
Share _extract_last_user_text helper between local VLMs
Nash0x7E2 Apr 21, 2026
09f4f56
Pass PIL images directly to mlx-vlm generate, drop temp-file PNGs
Nash0x7E2 Apr 21, 2026
57430fe
Fix mlx_lm.generate string return; warn on hung generation threads
Nash0x7E2 Apr 21, 2026
c098cfd
Gate MLX dev dep on Apple Silicon so Linux CI skips install
Nash0x7E2 Apr 21, 2026
95e0785
Add requires_mlx pytest skip markers for Apple-only tests
Nash0x7E2 Apr 21, 2026
9ad044c
Gate huggingface [mlx] and [mlx-vlm] extras on Apple Silicon
Nash0x7E2 Apr 21, 2026
abf231b
Harden MLX import-failure detection for missing shared libs
Nash0x7E2 Apr 21, 2026
12c321b
Format __init__.py per ruff
Nash0x7E2 Apr 21, 2026
6471a4d
fix(huggingface): preserve local inference followup behavior
Nash0x7E2 Apr 22, 2026
9649575
fix(huggingface): align mlx and transformers local settings
Nash0x7E2 Apr 22, 2026
b0726e6
chore(examples): tighten resale example dependencies
Nash0x7E2 Apr 22, 2026
2af000d
fix(huggingface): lazy-load mlx plugins
Nash0x7E2 Apr 22, 2026
4cfc1f5
fix(roboflow): handle filtered detections safely
Nash0x7E2 Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
"""

import asyncio
import importlib.util
import logging
import os
import platform
import sys
from typing import Iterator

import av
Expand All @@ -26,6 +29,24 @@
load_dotenv()


# Both markers need an Apple Silicon Mac; each additionally verifies the
# relevant MLX package is importable. The platform check is evaluated first
# so find_spec() is never consulted on non-Apple hosts.
_NOT_APPLE_SILICON = sys.platform != "darwin" or platform.machine() != "arm64"

requires_mlx = pytest.mark.skipif(
    _NOT_APPLE_SILICON or importlib.util.find_spec("mlx_lm") is None,
    reason="MLX tests require Apple Silicon with mlx-lm installed",
)
"""Skip marker for tests that require MLX (Apple Silicon only)."""


requires_mlx_vlm = pytest.mark.skipif(
    _NOT_APPLE_SILICON or importlib.util.find_spec("mlx_vlm") is None,
    reason="MLX-VLM tests require Apple Silicon with mlx-vlm installed",
)
"""Skip marker for tests that require MLX-VLM (Apple Silicon only)."""


def skip_blockbuster(func_or_class):
"""Decorator to skip blockbuster checks for a test function or class.

Expand Down
18 changes: 18 additions & 0 deletions examples/12_resale_advisor_example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Resale-advisor example: a voice agent that inspects items on camera using a
# local MLX vision-language model. See resale_advisor_example.py alongside.
[project]
name = "resale-advisor-example"
version = "0.0.0"
requires-python = ">=3.10"

dependencies = [
"python-dotenv>=1.0",
# The [mlx-vlm] extra pulls the MLX vision stack (Apple Silicon only).
"vision-agents-plugins-huggingface[mlx-vlm]",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents",
]

# Resolve the framework packages from this monorepo checkout rather than an
# index, so the example always runs against the local source tree.
[tool.uv.sources]
"vision-agents-plugins-huggingface" = {path = "../../plugins/huggingface", editable=true}
"vision-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
"vision-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
"vision-agents" = {path = "../../agents-core", editable=true}
77 changes: 77 additions & 0 deletions examples/12_resale_advisor_example/resale_advisor_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Resale Advisor with Gemma 4 - Local VLM Agent (MLX)

A real-time resale advisor powered by Gemma 4 E4B running on Apple Silicon via
MLX. Demonstrates how to build a multimodal AI agent that can see an item on
camera, discuss its condition, and provide resale-oriented guidance with voice:

- Gemma 4 E4B (8-bit quantized) via mlx-vlm for vision-language inference
- Deepgram for speech-to-text and text-to-speech
- GetStream for real-time communication

The user speaks naturally and the agent responds with voice, describing the
item, asking clarifying questions when needed, and giving a rough resale view.

Requirements:
- STREAM_API_KEY and STREAM_API_SECRET environment variables
- DEEPGRAM_API_KEY environment variable
- Apple Silicon Mac with 16GB+ unified memory

First run will download the MLX model (~8GB).
"""
Comment thread
coderabbitai[bot] marked this conversation as resolved.

import asyncio
import logging

from dotenv import load_dotenv
from vision_agents.core import Agent, Runner, User
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, huggingface

logger = logging.getLogger(__name__)

load_dotenv()

SYSTEM_PROMPT = (
"You are a resale advisor running on a local Gemma 4 model. "
"You can see the user's camera feed. Identify the item, comment on visible "
"condition, ask for age or brand details when needed, and give a cautious "
"resale estimate or range when the user asks. Speak naturally, with no "
"lists or formatting. Never use emojis or special characters. Keep "
"responses under 60 words and be explicit when you are uncertain."
)


async def create_agent(**kwargs) -> Agent:
    """Build the resale-advisor agent.

    Wires a local Gemma VLM (served through mlx-vlm) to the GetStream edge
    transport and Deepgram speech-to-text / text-to-speech. Extra keyword
    arguments from the launcher are accepted and ignored.
    """
    local_vlm = huggingface.MlxVLM(
        model="mlx-community/gemma-4-e4b-it-8bit",
        max_new_tokens=150,
    )
    return Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Resale Advisor", id="agent"),
        instructions=SYSTEM_PROMPT,
        llm=local_vlm,
        tts=deepgram.TTS(),
        stt=deepgram.STT(),
    )


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Create the call, join it, greet the user, and run until finished."""
    call = await agent.create_call(call_type, call_id)

    logger.info("Starting Resale Advisor...")

    greeting_prompt = (
        "Greet the user briefly. Tell them you can inspect items on camera "
        "and help with resale guidance."
    )
    async with agent.join(call):
        # NOTE(review): the 2s pause presumably lets the media pipeline come
        # up before the greeting is generated — confirm against framework docs.
        await asyncio.sleep(2)
        await agent.llm.simple_response(text=greeting_prompt)
        await agent.finish()


if __name__ == "__main__":
    # Entry point: hand both factories to the framework's CLI runner.
    launcher = AgentLauncher(create_agent=create_agent, join_call=join_call)
    Runner(launcher).cli()
14 changes: 12 additions & 2 deletions plugins/huggingface/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"huggingface_hub<1.0",
"huggingface_hub>=0.20.0,<2",
]

[project.optional-dependencies]
transformers = [
"transformers>=4.45.0,<5",
"transformers>=5.3.0,<6",
"torch>=2.0.0,<3",
"accelerate>=0.25.0,<2",
"supervision>=0.21.0,<1",
Expand All @@ -27,6 +27,16 @@ transformers-quantized = [
"vision-agents-plugins-huggingface[transformers]",
"bitsandbytes>=0.41.0",
]
# The MLX extras are Apple-Silicon-only: the environment markers turn them
# into no-ops elsewhere, so Linux installs/CI skip the wheels entirely.
mlx = [
"mlx>=0.22.0 ; sys_platform == 'darwin' and platform_machine == 'arm64'",
"mlx-lm>=0.22.0 ; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
mlx-vlm = [
"mlx>=0.22.0 ; sys_platform == 'darwin' and platform_machine == 'arm64'",
"mlx-vlm>=0.4.0 ; sys_platform == 'darwin' and platform_machine == 'arm64'",
# av/aiortc are cross-platform, so they carry no environment marker.
"av",
"aiortc",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Expand Down
6 changes: 3 additions & 3 deletions plugins/huggingface/tests/test_transformers_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ async def test_processor_fallback(self, vlm):
}

messages = [{"role": "user", "content": "describe this"}]
result = vlm._build_processor_inputs(messages, [])
result = vlm._build_processor_inputs(processor, messages, [], None)
assert "input_ids" in result

call_kwargs = processor.call_args.kwargs
Expand All @@ -174,7 +174,7 @@ async def test_build_processor_inputs_passes_tools(self, vlm):
}
]
messages = [{"role": "user", "content": "hi"}]
vlm._build_processor_inputs(messages, [], tools)
vlm._build_processor_inputs(vlm._resources.processor, messages, [], tools)

call_kwargs = vlm._resources.processor.apply_chat_template.call_args.kwargs
assert call_kwargs["tools"] is tools
Expand Down Expand Up @@ -205,7 +205,7 @@ def _side_effect(*args, **kwargs):
}
]
result = vlm._build_processor_inputs(
[{"role": "user", "content": "hi"}], [], tools
vlm._resources.processor, [{"role": "user", "content": "hi"}], [], tools
)
assert "input_ids" in result
assert call_count == 2
Expand Down
43 changes: 41 additions & 2 deletions plugins/huggingface/vision_agents/plugins/huggingface/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from importlib import import_module

import warnings

from .events import DetectionCompletedEvent
from .huggingface_llm import HuggingFaceLLM as LLM
from .huggingface_vlm import HuggingFaceVLM as VLM
Expand All @@ -11,8 +15,6 @@

__all__ += ["TransformersDetectionProcessor", "TransformersLLM", "TransformersVLM"]
except ImportError as e:
import warnings

optional = {"torch", "transformers", "av", "aiortc", "jinja2", "supervision", "cv2"}
if e.name in optional:
warnings.warn(
Expand All @@ -22,3 +24,40 @@
)
else:
raise


def _is_mlx_import_error(exc: ImportError) -> bool:
if exc.name in {"mlx", "mlx_lm", "mlx_vlm", "mlx.core"}:
return True
return exc.name is None and "mlx" in str(exc).lower()


__all__ += ["MlxLLM", "MlxVLM"]


def __getattr__(name: str):
    """Lazily resolve the MLX-backed classes on first attribute access.

    MLX wheels exist only for Apple Silicon, so ``MlxLLM`` and ``MlxVLM``
    are imported on demand rather than at package import time. If the
    underlying import fails for an MLX-related reason (or, for the VLM,
    because av/aiortc are missing), a hint is warned before the original
    ImportError propagates.
    """
    # name -> (relative module, extra module names that explain a failure,
    #          warning text shown when the import cannot be satisfied)
    lazy_exports = {
        "MlxLLM": (
            ".mlx_llm",
            frozenset(),
            "MLX is not available on this platform. "
            "Install the [mlx] extra on Apple Silicon to enable MLX plugins.",
        ),
        "MlxVLM": (
            ".mlx_vlm",
            frozenset({"av", "aiortc"}),
            "MLX-VLM is not available on this platform. "
            "Install the [mlx-vlm] extra on Apple Silicon to enable MLX VLM plugins.",
        ),
    }

    entry = lazy_exports.get(name)
    if entry is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, extra_names, message = entry
    try:
        return getattr(import_module(module_path, __name__), name)
    except ImportError as e:
        if _is_mlx_import_error(e) or e.name in extra_names:
            warnings.warn(message, stacklevel=2)
        raise
Loading
Loading