feat(plugins): add FunASR self-hosted STT plugin

LauraGPT · LauraGPT · commit b68cd989d2a9 · 2026-06-16T19:06:27.000Z
Adds `livekit-plugins-funasr`: a non-streaming STT plugin backed by [FunASR](https://github.com/modelscope/FunASR) (SenseVoice / Paraformer / Fun-ASR-Nano), running fully locally with no cloud API. Strong on Chinese and 50+ languages; SenseVoice also returns language/emotion/event tags. Implements `STT._recognize_impl` (combine frames -> FunASR -> SpeechEvent) and declares `STTCapabilities(streaming=False)`, so LiveKit wraps it with a VAD StreamAdapter for real-time agents. Tested: transcribes a Chinese clip via the STT interface and returns a FINAL_TRANSCRIPT event. Resolves #5897.
diff --git a/livekit-plugins/livekit-plugins-funasr/README.md b/livekit-plugins/livekit-plugins-funasr/README.md
@@ -0,0 +1,21 @@
+# LiveKit Plugins FunASR
+
+Self-hosted speech-to-text for LiveKit Agents using [FunASR](https://github.com/modelscope/FunASR) — SenseVoice, Paraformer, Fun-ASR-Nano. Runs **locally, no cloud API**, strong on Chinese and 50+ languages.
+
+## Install
+```bash
+pip install livekit-plugins-funasr
+```
+
+## Usage
+```python
+from livekit.plugins import funasr
+
+# ModelScope (default hub="ms")
+stt = funasr.STT(model="iic/SenseVoiceSmall", device="cuda")
+
+# HuggingFace
+stt = funasr.STT(model="FunAudioLLM/SenseVoiceSmall", hub="hf", device="cuda")
+```
+
+Non-streaming STT; LiveKit wraps it with a VAD `StreamAdapter` for real-time agents.
diff --git a/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/__init__.py b/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/__init__.py
@@ -0,0 +1,15 @@
+"""FunASR plugin for LiveKit Agents — self-hosted speech-to-text (SenseVoice / Paraformer / Fun-ASR-Nano)."""
+from livekit.agents import Plugin
+from .log import logger
+from .stt import STT
+from .version import __version__
+
+__all__ = ["STT", "__version__"]
+
+
+class FunASRPlugin(Plugin):  # plugin descriptor for this package; an instance is registered below
+    def __init__(self) -> None:
+        super().__init__(__name__, __version__, __package__, logger)  # module name, version, package, plugin logger
+
+
+Plugin.register_plugin(FunASRPlugin())  # module-level side effect: executes when this package is imported
diff --git a/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/log.py b/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/log.py
@@ -0,0 +1,2 @@
+import logging
+logger = logging.getLogger("livekit.plugins.funasr")  # logger shared by the plugin's modules (imported by __init__ and stt)
diff --git a/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/py.typed b/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/py.typed
diff --git a/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py b/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import asyncio
+import io
+from dataclasses import dataclass
+
+import numpy as np
+
+from livekit import rtc
+from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    APIConnectionError,
+    APIConnectOptions,
+    stt,
+)
+from livekit.agents.stt import SpeechEventType, STTCapabilities
+from livekit.agents.types import NOT_GIVEN, NotGivenOr
+from livekit.agents.utils import AudioBuffer, is_given
+
+from .log import logger
+
+_DEFAULT_MODEL = "iic/SenseVoiceSmall"  # default value of the `model` option
+_TARGET_SR = 16000  # Hz; decoded audio at any other rate is resampled to this before model.generate()
+
+
+@dataclass
+class _STTOptions:  # recognition settings captured once by STT.__init__
+    model: str = _DEFAULT_MODEL  # passed to AutoModel(model=...)
+    language: str = "auto"  # fallback used when _recognize_impl receives no language
+    device: str = "cpu"  # passed to AutoModel(device=...)
+    hub: str = "ms"  # passed to AutoModel(hub=...); per the README, "ms" is ModelScope and "hf" is HuggingFace
+    use_itn: bool = True  # passed to model.generate(use_itn=...)
+
+
+class STT(stt.STT):
+    """FunASR self-hosted speech-to-text.
+
+    Runs FunASR models (SenseVoice, Paraformer, Fun-ASR-Nano) locally — no cloud
+    API. Non-streaming; LiveKit wraps it with a VAD StreamAdapter for agents.
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str = _DEFAULT_MODEL,
+        language: str = "auto",
+        device: str = "cpu",
+        hub: str = "ms",
+        use_itn: bool = True,
+        vad_model: str | None = "fsmn-vad",
+    ) -> None:
+        super().__init__(capabilities=STTCapabilities(streaming=False, interim_results=False))
+        self._opts = _STTOptions(model=model, language=language, device=device, hub=hub, use_itn=use_itn)
+        self._vad_model = vad_model
+        self._model = None
+
+    def _ensure_model(self):
+        if self._model is None:
+            from funasr import AutoModel
+
+            kwargs = dict(model=self._opts.model, device=self._opts.device, hub=self._opts.hub, disable_update=True)
+            if self._vad_model:
+                kwargs.update(vad_model=self._vad_model, vad_kwargs={"max_single_segment_time": 30000})
+            logger.info("loading FunASR model %s on %s", self._opts.model, self._opts.device)
+            self._model = AutoModel(**kwargs)
+        return self._model
+
+    async def _recognize_impl(
+        self,
+        buffer: AudioBuffer,
+        *,
+        language: NotGivenOr[str] = NOT_GIVEN,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> stt.SpeechEvent:
+        lang = language if is_given(language) else self._opts.language
+        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()
+
+        def _run() -> str:
+            import soundfile as sf
+            from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+            model = self._ensure_model()
+            audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
+            if audio.ndim > 1:
+                audio = audio.mean(axis=1)
+            if sr != _TARGET_SR:
+                import librosa
+
+                audio = librosa.resample(audio, orig_sr=sr, target_sr=_TARGET_SR)
+            gen_kwargs = dict(input=audio, cache={}, use_itn=self._opts.use_itn, batch_size_s=300)
+            if "SenseVoice" in self._opts.model or (lang and lang != "auto"):
+                gen_kwargs["language"] = lang
+            res = model.generate(**gen_kwargs)
+            text = res[0]["text"] if res else ""
+            return rich_transcription_postprocess(text)
+
+        try:
+            text = await asyncio.get_event_loop().run_in_executor(None, _run)
+        except Exception as e:  # noqa: BLE001
+            raise APIConnectionError() from e
+
+        return stt.SpeechEvent(
+            type=SpeechEventType.FINAL_TRANSCRIPT,
+            alternatives=[stt.SpeechData(text=text, language=str(lang))],
+        )
diff --git a/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/version.py b/livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"  # package version; pyproject.toml's [tool.hatch.version] reads it from this file
diff --git a/livekit-plugins/livekit-plugins-funasr/pyproject.toml b/livekit-plugins/livekit-plugins-funasr/pyproject.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "livekit-plugins-funasr"
+dynamic = ["version"]
+description = "FunASR (SenseVoice / Paraformer / Fun-ASR-Nano) self-hosted STT plugin for LiveKit Agents"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.10.0"
+authors = [{ name = "LiveKit", email = "hello@livekit.io" }]
+keywords = ["voice", "ai", "realtime", "audio", "livekit", "funasr", "speech-to-text", "asr"]
+classifiers = [
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Topic :: Multimedia :: Sound/Audio",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+]
+dependencies = ["livekit-agents>=1.6.0", "funasr>=1.1.0", "soundfile", "librosa"]
+
+[project.urls]
+Documentation = "https://docs.livekit.io"
+Website = "https://livekit.io/"
+Source = "https://github.com/livekit/agents"
+
+[tool.hatch.version]
+path = "livekit/plugins/funasr/version.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["livekit"]
+
+[tool.hatch.build.targets.sdist]
+include = ["/livekit"]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+import logging`
	`2`	`+logger = logging.getLogger("livekit.plugins.funasr")`