Skip to content

Commit b68cd98

Browse files
committed
feat(plugins): add FunASR self-hosted STT plugin
Adds `livekit-plugins-funasr`: a non-streaming STT plugin backed by [FunASR](https://github.com/modelscope/FunASR) (SenseVoice / Paraformer / Fun-ASR-Nano), running fully locally with no cloud API. Strong on Chinese and 50+ languages; SenseVoice also returns language/emotion/event tags. Implements `STT._recognize_impl` (combine frames -> FunASR -> SpeechEvent) and declares `STTCapabilities(streaming=False)`, so LiveKit wraps it with a VAD StreamAdapter for real-time agents. Tested: transcribes a Chinese clip via the STT interface and returns a FINAL_TRANSCRIPT event. Resolves #5897.
1 parent 61b14fd commit b68cd98

7 files changed

Lines changed: 180 additions & 0 deletions

File tree

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# LiveKit Plugins FunASR
2+
3+
Self-hosted speech-to-text for LiveKit Agents using [FunASR](https://github.com/modelscope/FunASR) — SenseVoice, Paraformer, Fun-ASR-Nano. Runs **locally, no cloud API**, strong on Chinese and 50+ languages.
4+
5+
## Install
6+
```bash
7+
pip install livekit-plugins-funasr
8+
```
9+
10+
## Usage
11+
```python
12+
from livekit.plugins import funasr
13+
14+
# ModelScope (default hub="ms")
15+
stt = funasr.STT(model="iic/SenseVoiceSmall", device="cuda")
16+
17+
# HuggingFace
18+
stt = funasr.STT(model="FunAudioLLM/SenseVoiceSmall", hub="hf", device="cuda")
19+
```
20+
21+
This is a non-streaming STT; for real-time agents, LiveKit wraps it in a VAD-based `StreamAdapter`.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""FunASR plugin for LiveKit Agents — self-hosted speech-to-text (SenseVoice / Paraformer / Fun-ASR-Nano)."""
2+
from livekit.agents import Plugin
3+
from .log import logger
4+
from .stt import STT
5+
from .version import __version__
6+
7+
__all__ = ["STT", "__version__"]
8+
9+
10+
class FunASRPlugin(Plugin):
    """Plugin descriptor for the FunASR integration."""

    def __init__(self) -> None:
        # Describe this plugin by its module name, version, and package, and
        # hand over the package logger.
        super().__init__(__name__, __version__, __package__, logger)
13+
14+
15+
# Module-level call: the plugin is registered as soon as this package is imported.
Plugin.register_plugin(FunASRPlugin())
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import logging
2+
logger = logging.getLogger("livekit.plugins.funasr")

livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/py.typed

Whitespace-only changes.
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import io
5+
from dataclasses import dataclass
6+
7+
import numpy as np
8+
9+
from livekit import rtc
10+
from livekit.agents import (
11+
DEFAULT_API_CONNECT_OPTIONS,
12+
APIConnectionError,
13+
APIConnectOptions,
14+
stt,
15+
)
16+
from livekit.agents.stt import SpeechEventType, STTCapabilities
17+
from livekit.agents.types import NOT_GIVEN, NotGivenOr
18+
from livekit.agents.utils import AudioBuffer, is_given
19+
20+
from .log import logger
21+
22+
# Model identifier used when the caller does not pass `model`.
_DEFAULT_MODEL = "iic/SenseVoiceSmall"
23+
# Sample rate, in Hz, that decoded audio is resampled to before recognition.
_TARGET_SR = 16000
24+
25+
26+
@dataclass
class _STTOptions:
    """Recognition settings captured when the STT is constructed."""

    # Model identifier handed to ``funasr.AutoModel``.
    model: str = _DEFAULT_MODEL
    # Language used when a recognize call does not specify one.
    language: str = "auto"
    # Device string handed to ``funasr.AutoModel``.
    device: str = "cpu"
    # Model hub handed to ``funasr.AutoModel``.
    hub: str = "ms"
    # Forwarded as ``use_itn`` to the model's ``generate`` call.
    use_itn: bool = True
33+
34+
35+
class STT(stt.STT):
    """FunASR self-hosted speech-to-text.

    Runs FunASR models (SenseVoice, Paraformer, Fun-ASR-Nano) locally — no cloud
    API. Non-streaming; LiveKit wraps it with a VAD StreamAdapter for agents.

    The model is loaded lazily on the first recognition request, so constructing
    an instance does not import ``funasr`` or load any weights.
    """

    def __init__(
        self,
        *,
        model: str = _DEFAULT_MODEL,
        language: str = "auto",
        device: str = "cpu",
        hub: str = "ms",
        use_itn: bool = True,
        vad_model: str | None = "fsmn-vad",
    ) -> None:
        """Store the configuration; the model itself is loaded on first use.

        Args:
            model: Model identifier passed to ``funasr.AutoModel``.
            language: Language used when a recognize call does not give one.
            device: Device string passed to ``funasr.AutoModel``.
            hub: Model hub passed to ``funasr.AutoModel``.
            use_itn: Forwarded as ``use_itn`` to the model's ``generate`` call.
            vad_model: VAD model passed to ``funasr.AutoModel``; a falsy value
                (``None`` or ``""``) loads the model without a VAD.
        """
        # Imported locally so this class does not depend on a module-level import.
        import threading

        super().__init__(capabilities=STTCapabilities(streaming=False, interim_results=False))
        self._opts = _STTOptions(
            model=model,
            language=language,
            device=device,
            hub=hub,
            use_itn=use_itn,
        )
        self._vad_model = vad_model
        self._model = None
        # Guards the lazy load in `_ensure_model`, which runs in worker threads.
        self._model_lock = threading.Lock()

    def _ensure_model(self):
        """Return the FunASR model, constructing it on the first call.

        This is invoked from executor worker threads. The lock serializes the
        load so that overlapping first requests share one ``AutoModel`` instead
        of each constructing their own.
        """
        with self._model_lock:
            if self._model is None:
                from funasr import AutoModel

                kwargs = dict(
                    model=self._opts.model,
                    device=self._opts.device,
                    hub=self._opts.hub,
                    disable_update=True,
                )
                if self._vad_model:
                    # NOTE(review): 30000 is presumably milliseconds (30 s per
                    # VAD segment) — confirm against the FunASR documentation.
                    kwargs.update(
                        vad_model=self._vad_model,
                        vad_kwargs={"max_single_segment_time": 30000},
                    )
                logger.info("loading FunASR model %s on %s", self._opts.model, self._opts.device)
                self._model = AutoModel(**kwargs)
            return self._model

    def _transcribe(self, wav_bytes: bytes, lang: str) -> str:
        """Decode WAV bytes, run the model, and return the post-processed text.

        Blocking; intended to run in an executor thread rather than on the
        event loop.
        """
        import soundfile as sf
        from funasr.utils.postprocess_utils import rich_transcription_postprocess

        model = self._ensure_model()
        audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
        if audio.ndim > 1:
            # Collapse multi-channel audio to one channel by averaging axis 1.
            audio = audio.mean(axis=1)
        if sr != _TARGET_SR:
            import librosa

            audio = librosa.resample(audio, orig_sr=sr, target_sr=_TARGET_SR)

        gen_kwargs = dict(input=audio, cache={}, use_itn=self._opts.use_itn, batch_size_s=300)
        # SenseVoice models always receive the language (including "auto");
        # other models only receive it when a specific language was requested.
        if "SenseVoice" in self._opts.model or (lang and lang != "auto"):
            gen_kwargs["language"] = lang

        res = model.generate(**gen_kwargs)
        text = res[0]["text"] if res else ""
        return rich_transcription_postprocess(text)

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Transcribe ``buffer`` and return a single final-transcript event.

        Args:
            buffer: Audio frames to transcribe; they are combined into one clip.
            language: Per-call language override; falls back to the language
                given at construction when not provided.
            conn_options: Not referenced by this implementation.

        Returns:
            A ``FINAL_TRANSCRIPT`` event with one alternative holding the text
            and the language that was requested.

        Raises:
            APIConnectionError: If decoding, model loading, or inference fails;
                the original exception is attached as the cause.
        """
        lang = language if is_given(language) else self._opts.language
        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

        try:
            # We are inside a coroutine, so ask for the loop that is running it.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(None, self._transcribe, wav_bytes, lang)
        except Exception as e:  # noqa: BLE001
            raise APIConnectionError() from e

        return stt.SpeechEvent(
            type=SpeechEventType.FINAL_TRANSCRIPT,
            alternatives=[stt.SpeechData(text=text, language=str(lang))],
        )
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = "0.1.0"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
[build-system]
2+
requires = ["hatchling"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "livekit-plugins-funasr"
7+
dynamic = ["version"]
8+
description = "FunASR (SenseVoice / Paraformer / Fun-ASR-Nano) self-hosted STT plugin for LiveKit Agents"
9+
readme = "README.md"
10+
license = "Apache-2.0"
11+
requires-python = ">=3.10.0"
12+
authors = [{ name = "LiveKit", email = "hello@livekit.io" }]
13+
keywords = ["voice", "ai", "realtime", "audio", "livekit", "funasr", "speech-to-text", "asr"]
14+
classifiers = [
15+
"Intended Audience :: Developers",
16+
"License :: OSI Approved :: Apache Software License",
17+
"Topic :: Multimedia :: Sound/Audio",
18+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
19+
"Programming Language :: Python :: 3",
20+
"Programming Language :: Python :: 3 :: Only",
21+
]
22+
dependencies = ["livekit-agents>=1.6.0", "funasr>=1.1.0", "soundfile", "librosa"]
23+
24+
[project.urls]
25+
Documentation = "https://docs.livekit.io"
26+
Website = "https://livekit.io/"
27+
Source = "https://github.com/livekit/agents"
28+
29+
[tool.hatch.version]
30+
path = "livekit/plugins/funasr/version.py"
31+
32+
[tool.hatch.build.targets.wheel]
33+
packages = ["livekit"]
34+
35+
[tool.hatch.build.targets.sdist]
36+
include = ["/livekit"]

0 commit comments

Comments
 (0)