WEIFENG2333 · kaidegit · Feb 19, 2026
diff --git a/app/common/config.py b/app/common/config.py
@@ -172,6 +172,27 @@ class Config(QConfig):
         OptionsValidator(WhisperModelEnum),
         EnumSerializer(WhisperModelEnum),
     )
+    whisper_cpp_enable_vad = ConfigItem(  # on/off switch for whisper.cpp VAD; enabled by default
+        "Whisper", "EnableVad", True, BoolValidator()
+    )
+    whisper_cpp_vad_model = ConfigItem(  # VAD model name; no validator is passed here
+        "Whisper", "VadModel", "silero-v6.2.0"
+    )
+    whisper_cpp_vad_threshold = RangeConfigItem(  # validated against the range 0..1
+        "Whisper", "VadThreshold", 0.3, RangeValidator(0, 1)
+    )
+    whisper_cpp_vad_min_speech_duration_ms = ConfigItem(  # milliseconds (per the name); no validator is passed here
+        "Whisper", "VadMinSpeechDurationMs", 150
+    )
+    whisper_cpp_vad_min_silence_duration_ms = ConfigItem(  # milliseconds (per the name); no validator is passed here
+        "Whisper", "VadMinSilenceDurationMs", 200
+    )
+    whisper_cpp_vad_max_speech_duration_s = ConfigItem(  # seconds (per the name); no validator is passed here
+        "Whisper", "VadMaxSpeechDurationS", 30
+    )
+    whisper_cpp_vad_speech_pad_ms = ConfigItem(  # milliseconds (per the name); no validator is passed here
+        "Whisper", "VadSpeechPadMs", 50
+    )
 
     # ------------------- Faster Whisper 配置 -------------------
     faster_whisper_program = ConfigItem(

diff --git a/app/components/WhisperCppSettingWidget.py b/app/components/WhisperCppSettingWidget.py
@@ -107,7 +107,15 @@
     #     "downloadLink": "https://huggingface.co/distil-whisper/distil-large-v3-ggml/resolve/main/ggml-distil-large-v3.bin?download=true",
     #     "mirrorLink": "https://www.modelscope.cn/models/cjc1887415157/whisper.cpp/resolve/master/ggml-distil-large-v3.bin",
     #     "sha": "5e61e98bdcf3b9a78516c59bf7d1a10d64cae67a"
-    # }
+    # },
+    {  # Silero VAD model entry; its file name contains the default VadModel value "silero-v6.2.0"
+        "label": "Silero V6.2.0(VAD model)",
+        "value": "ggml-silero-v6.2.0.bin",
+        "size": "889 KB",
+        "downloadLink": "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin?download=true",
+        "mirrorLink": "https://www.modelscope.cn/models/ggml-org/whisper-vad/resolve/master/ggml-silero-v6.2.0.bin",
+        "sha": "470e5d9d094ddba2f0a512cecc3732a252188abd",
+    }
 ]
 
 

diff --git a/app/core/asr/transcribe.py b/app/core/asr/transcribe.py
@@ -100,6 +100,13 @@ def _create_whisper_cpp_asr(audio_path: str, config: TranscribeConfig) -> Chunke
         "need_word_time_stamp": config.need_word_time_stamp,
         "language": config.transcribe_language,
         "whisper_model": config.whisper_model.value if config.whisper_model else None,
+        "enable_vad": config.whisper_cpp_enable_vad,  # VAD keys below mirror WhisperCppASR.__init__ parameter names; presumably forwarded as its kwargs — confirm
+        "vad_model": config.whisper_cpp_vad_model,
+        "vad_threshold": config.whisper_cpp_vad_threshold,
+        "vad_min_speech_duration_ms": config.whisper_cpp_vad_min_speech_duration_ms,
+        "vad_min_silence_duration_ms": config.whisper_cpp_vad_min_silence_duration_ms,
+        "vad_max_speech_duration_s": config.whisper_cpp_vad_max_speech_duration_s,
+        "vad_speech_pad_ms": config.whisper_cpp_vad_speech_pad_ms,
     }
     return ChunkedASR(
         asr_class=WhisperCppASR,

diff --git a/app/core/asr/whisper_cpp.py b/app/core/asr/whisper_cpp.py
@@ -32,6 +32,13 @@ def __init__(
         whisper_model=None,
         use_cache: bool = False,
         need_word_time_stamp: bool = False,
+        enable_vad: bool = False,  # NOTE(review): defaults to False here, while Config and TranscribeConfig default to True
+        vad_model: Optional[str] = None,  # name fragment globbed in the models dir, or a filesystem path
+        vad_threshold: float = 0.3,  # emitted as the "-vt" argument
+        vad_min_speech_duration_ms: int = 150,  # emitted as the "-vspd" argument
+        vad_min_silence_duration_ms: int = 200,  # emitted as the "-vsd" argument
+        vad_max_speech_duration_s: int = 30,  # emitted as the "-vmsd" argument
+        vad_speech_pad_ms: int = 50,  # emitted as the "-vp" argument
     ):
         super().__init__(audio_input, use_cache)
 
@@ -46,8 +53,8 @@ def __init__(
             whisper_cpp_path = detect_whisper_executable()
 
         # Find model file in models directory
+        models_dir = Path(MODEL_PATH)  # set unconditionally so the VAD model lookup can use it even when whisper_model is unset
         if whisper_model:
-            models_dir = Path(MODEL_PATH)
             model_files = list(models_dir.glob(f"*ggml*{whisper_model}*.bin"))
             if not model_files:
                 raise ValueError(
@@ -62,6 +69,29 @@ def __init__(
         self.whisper_cpp_path = Path(whisper_cpp_path)
         self.need_word_time_stamp = need_word_time_stamp
         self.language = language
+        self.enable_vad = enable_vad  # may be reset to False below when no model file can be located
+        self.vad_model_path = None  # stays None unless a VAD model file is found
+        self.vad_threshold = vad_threshold
+        self.vad_min_speech_duration_ms = vad_min_speech_duration_ms
+        self.vad_min_silence_duration_ms = vad_min_silence_duration_ms
+        self.vad_max_speech_duration_s = vad_max_speech_duration_s
+        self.vad_speech_pad_ms = vad_speech_pad_ms
+
+        if self.enable_vad:
+            if vad_model:
+                vad_model_files = sorted(models_dir.glob(f"*{vad_model}*.bin"))  # sorted: deterministic pick if several files match
+                if vad_model_files:
+                    self.vad_model_path = str(vad_model_files[0])
+                    logger.info(f"VAD Model found: {self.vad_model_path}")
+                elif Path(vad_model).is_file():  # otherwise treat the value as a path; must be a regular file, not a directory
+                    self.vad_model_path = vad_model
+                    logger.info(f"VAD Model found at path: {self.vad_model_path}")
+                else:
+                    logger.warning(f"VAD Model not found: {vad_model}, VAD will be disabled")
+                    self.enable_vad = False
+            else:
+                logger.warning("VAD enabled but no model provided. VAD will be disabled unless a default is found (not implemented).")
+                self.enable_vad = False
 
         self.process = None
 
@@ -109,6 +139,15 @@ def _build_command(
                 ["--prompt", "你好，我们需要使用简体中文，以下是普通话的句子。"]
             )
 
+        if self.enable_vad and self.vad_model_path:  # both are settled in __init__; no VAD flags are emitted when no model file was found
+            whisper_params.append("--vad")
+            whisper_params.extend(["-vm", str(self.vad_model_path)])  # VAD model file path
+            whisper_params.extend(["-vt", str(self.vad_threshold)])
+            whisper_params.extend(["-vspd", str(self.vad_min_speech_duration_ms)])  # min speech duration, ms
+            whisper_params.extend(["-vsd", str(self.vad_min_silence_duration_ms)])  # min silence duration, ms
+            whisper_params.extend(["-vmsd", str(self.vad_max_speech_duration_s)])  # max speech duration, s
+            whisper_params.extend(["-vp", str(self.vad_speech_pad_ms)])  # speech padding, ms
+
         return whisper_params
 
     def _run(
@@ -235,7 +274,7 @@ def _default_callback(_progress: int, _message: str) -> None:
                 raise RuntimeError(f"SRT generation failed: {str(e)}")
 
     def _get_key(self):
-        return f"{self.crc32_hex}-{self.need_word_time_stamp}-{self.model_path}-{self.language}"
+        return f"{self.crc32_hex}-{self.need_word_time_stamp}-{self.model_path}-{self.language}-{self.enable_vad}-{self.vad_model_path}-{self.vad_threshold}-{self.vad_min_speech_duration_ms}-{self.vad_min_silence_duration_ms}-{self.vad_max_speech_duration_s}-{self.vad_speech_pad_ms}"  # every VAD setting is part of the key, so changing one yields a different key
 
     def get_audio_duration(self, filepath: str) -> int:
         """Get audio file duration in seconds using ffmpeg."""

diff --git a/app/core/entities.py b/app/core/entities.py
@@ -557,6 +557,13 @@ class TranscribeConfig:
     output_format: Optional[TranscribeOutputFormatEnum] = None
     # Whisper Cpp 配置
     whisper_model: Optional[WhisperModelEnum] = None
+    whisper_cpp_enable_vad: bool = True  # whether whisper.cpp is asked to run with VAD
+    whisper_cpp_vad_model: str = "silero-v6.2.0"  # model name; WhisperCppASR globs "*NAME*.bin" in its models dir
+    whisper_cpp_vad_threshold: float = 0.3
+    whisper_cpp_vad_min_speech_duration_ms: int = 150  # milliseconds
+    whisper_cpp_vad_min_silence_duration_ms: int = 200  # milliseconds
+    whisper_cpp_vad_max_speech_duration_s: int = 30  # seconds
+    whisper_cpp_vad_speech_pad_ms: int = 50  # milliseconds
     # Whisper API 配置
     whisper_api_key: Optional[str] = None
     whisper_api_base: Optional[str] = None
@@ -616,6 +623,14 @@ def print_config(self) -> str:
             lines.append(
                 f"Model: {self.whisper_model.value if self.whisper_model else 'None'}"
             )
+            lines.append(f"VAD Enabled: {self.whisper_cpp_enable_vad}")  # always printed
+            if self.whisper_cpp_enable_vad:  # the individual VAD settings are listed only when VAD is on
+                lines.append(f"VAD Model: {self.whisper_cpp_vad_model}")
+                lines.append(f"VAD Threshold: {self.whisper_cpp_vad_threshold}")
+                lines.append(f"VAD Min Speech: {self.whisper_cpp_vad_min_speech_duration_ms}ms")
+                lines.append(f"VAD Min Silence: {self.whisper_cpp_vad_min_silence_duration_ms}ms")
+                lines.append(f"VAD Max Speech: {self.whisper_cpp_vad_max_speech_duration_s}s")
+                lines.append(f"VAD Speech Pad: {self.whisper_cpp_vad_speech_pad_ms}ms")
 
         lines.append("=" * 42)
         return "\n".join(lines)

diff --git a/app/core/task_factory.py b/app/core/task_factory.py
@@ -75,6 +75,13 @@ def create_transcribe_task(
             output_format=cfg.transcribe_output_format.value,
             # Whisper Cpp 配置
             whisper_model=cfg.whisper_model.value,
+            whisper_cpp_enable_vad=cfg.whisper_cpp_enable_vad.value,  # VAD settings are read from the cfg items' current .value
+            whisper_cpp_vad_model=cfg.whisper_cpp_vad_model.value,
+            whisper_cpp_vad_threshold=cfg.whisper_cpp_vad_threshold.value,
+            whisper_cpp_vad_min_speech_duration_ms=cfg.whisper_cpp_vad_min_speech_duration_ms.value,
+            whisper_cpp_vad_min_silence_duration_ms=cfg.whisper_cpp_vad_min_silence_duration_ms.value,
+            whisper_cpp_vad_max_speech_duration_s=cfg.whisper_cpp_vad_max_speech_duration_s.value,
+            whisper_cpp_vad_speech_pad_ms=cfg.whisper_cpp_vad_speech_pad_ms.value,
             # Whisper API 配置
             whisper_api_key=cfg.whisper_api_key.value,
             whisper_api_base=cfg.whisper_api_base.value,