diff --git a/app/common/config.py b/app/common/config.py index 9ce62067..612dd104 100644 --- a/app/common/config.py +++ b/app/common/config.py @@ -172,6 +172,27 @@ class Config(QConfig): OptionsValidator(WhisperModelEnum), EnumSerializer(WhisperModelEnum), ) + whisper_cpp_enable_vad = ConfigItem( + "Whisper", "EnableVad", True, BoolValidator() + ) + whisper_cpp_vad_model = ConfigItem( + "Whisper", "VadModel", "silero-v6.2.0" + ) + whisper_cpp_vad_threshold = RangeConfigItem( + "Whisper", "VadThreshold", 0.3, RangeValidator(0, 1) + ) + whisper_cpp_vad_min_speech_duration_ms = ConfigItem( + "Whisper", "VadMinSpeechDurationMs", 150 + ) + whisper_cpp_vad_min_silence_duration_ms = ConfigItem( + "Whisper", "VadMinSilenceDurationMs", 200 + ) + whisper_cpp_vad_max_speech_duration_s = ConfigItem( + "Whisper", "VadMaxSpeechDurationS", 30 + ) + whisper_cpp_vad_speech_pad_ms = ConfigItem( + "Whisper", "VadSpeechPadMs", 50 + ) # ------------------- Faster Whisper 配置 ------------------- faster_whisper_program = ConfigItem( diff --git a/app/components/WhisperCppSettingWidget.py b/app/components/WhisperCppSettingWidget.py index 6b3c472e..0e5efcf0 100644 --- a/app/components/WhisperCppSettingWidget.py +++ b/app/components/WhisperCppSettingWidget.py @@ -107,7 +107,15 @@ # "downloadLink": "https://huggingface.co/distil-whisper/distil-large-v3-ggml/resolve/main/ggml-distil-large-v3.bin?download=true", # "mirrorLink": "https://www.modelscope.cn/models/cjc1887415157/whisper.cpp/resolve/master/ggml-distil-large-v3.bin", # "sha": "5e61e98bdcf3b9a78516c59bf7d1a10d64cae67a" - # } + # }, + { + "label": "Silero V6.2.0(VAD model)", + "value": "ggml-silero-v6.2.0.bin", + "size": "889 KB", + "downloadLink": "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin?download=true", + "mirrorLink": "https://www.modelscope.cn/models/ggml-org/whisper-vad/resolve/master/ggml-silero-v6.2.0.bin", + "sha": "470e5d9d094ddba2f0a512cecc3732a252188abd", + } ] diff --git a/app/core/asr/transcribe.py b/app/core/asr/transcribe.py index 02e8944f..a328be98 100644 --- a/app/core/asr/transcribe.py +++ b/app/core/asr/transcribe.py @@ -100,6 +100,13 @@ def _create_whisper_cpp_asr(audio_path: str, config: TranscribeConfig) -> Chunke "need_word_time_stamp": config.need_word_time_stamp, "language": config.transcribe_language, "whisper_model": config.whisper_model.value if config.whisper_model else None, + "enable_vad": config.whisper_cpp_enable_vad, + "vad_model": config.whisper_cpp_vad_model, + "vad_threshold": config.whisper_cpp_vad_threshold, + "vad_min_speech_duration_ms": config.whisper_cpp_vad_min_speech_duration_ms, + "vad_min_silence_duration_ms": config.whisper_cpp_vad_min_silence_duration_ms, + "vad_max_speech_duration_s": config.whisper_cpp_vad_max_speech_duration_s, + "vad_speech_pad_ms": config.whisper_cpp_vad_speech_pad_ms, } return ChunkedASR( asr_class=WhisperCppASR, diff --git a/app/core/asr/whisper_cpp.py b/app/core/asr/whisper_cpp.py index bd8ba3af..40f44e22 100644 --- a/app/core/asr/whisper_cpp.py +++ b/app/core/asr/whisper_cpp.py @@ -32,6 +32,13 @@ def __init__( whisper_model=None, use_cache: bool = False, need_word_time_stamp: bool = False, + enable_vad: bool = False, + vad_model: Optional[str] = None, + vad_threshold: float = 0.3, + vad_min_speech_duration_ms: int = 150, + vad_min_silence_duration_ms: int = 200, + vad_max_speech_duration_s: int = 30, + vad_speech_pad_ms: int = 50, ): super().__init__(audio_input, use_cache) @@ -46,8 +53,8 @@ def __init__( whisper_cpp_path = detect_whisper_executable() # Find model file in models directory + models_dir = Path(MODEL_PATH) if whisper_model: - models_dir = Path(MODEL_PATH) model_files = list(models_dir.glob(f"*ggml*{whisper_model}*.bin")) if not model_files: raise ValueError( @@ -62,6 +69,29 @@ def __init__( self.whisper_cpp_path = Path(whisper_cpp_path) self.need_word_time_stamp = need_word_time_stamp self.language = language + self.enable_vad = enable_vad + self.vad_model_path = None + self.vad_threshold = vad_threshold + self.vad_min_speech_duration_ms = vad_min_speech_duration_ms + self.vad_min_silence_duration_ms = vad_min_silence_duration_ms + self.vad_max_speech_duration_s = vad_max_speech_duration_s + self.vad_speech_pad_ms = vad_speech_pad_ms + + if self.enable_vad: + if vad_model: + vad_model_files = list(models_dir.glob(f"*{vad_model}*.bin")) + if vad_model_files: + self.vad_model_path = str(vad_model_files[0]) + logger.info(f"VAD Model found: {self.vad_model_path}") + elif os.path.exists(vad_model): + self.vad_model_path = vad_model + logger.info(f"VAD Model found at path: {self.vad_model_path}") + else: + logger.warning(f"VAD Model not found: {vad_model}, VAD will be disabled") + self.enable_vad = False + else: + logger.warning("VAD enabled but no model provided. VAD will be disabled unless a default is found (not implemented).") + self.enable_vad = False self.process = None @@ -109,6 +139,15 @@ def _build_command( ["--prompt", "你好,我们需要使用简体中文,以下是普通话的句子。"] ) + if self.enable_vad and self.vad_model_path: + whisper_params.append("--vad") + whisper_params.extend(["-vm", str(self.vad_model_path)]) + whisper_params.extend(["-vt", str(self.vad_threshold)]) + whisper_params.extend(["-vspd", str(self.vad_min_speech_duration_ms)]) + whisper_params.extend(["-vsd", str(self.vad_min_silence_duration_ms)]) + whisper_params.extend(["-vmsd", str(self.vad_max_speech_duration_s)]) + whisper_params.extend(["-vp", str(self.vad_speech_pad_ms)]) + return whisper_params def _run( @@ -235,7 +274,7 @@ def _default_callback(_progress: int, _message: str) -> None: raise RuntimeError(f"SRT generation failed: {str(e)}") def _get_key(self): - return f"{self.crc32_hex}-{self.need_word_time_stamp}-{self.model_path}-{self.language}" + return f"{self.crc32_hex}-{self.need_word_time_stamp}-{self.model_path}-{self.language}-{self.enable_vad}-{self.vad_model_path}-{self.vad_threshold}-{self.vad_min_speech_duration_ms}-{self.vad_min_silence_duration_ms}-{self.vad_max_speech_duration_s}-{self.vad_speech_pad_ms}" def get_audio_duration(self, filepath: str) -> int: """Get audio file duration in seconds using ffmpeg.""" diff --git a/app/core/entities.py b/app/core/entities.py index 5eccab8e..9069d846 100644 --- a/app/core/entities.py +++ b/app/core/entities.py @@ -557,6 +557,13 @@ class TranscribeConfig: output_format: Optional[TranscribeOutputFormatEnum] = None # Whisper Cpp 配置 whisper_model: Optional[WhisperModelEnum] = None + whisper_cpp_enable_vad: bool = True + whisper_cpp_vad_model: str = "silero-v6.2.0" + whisper_cpp_vad_threshold: float = 0.3 + whisper_cpp_vad_min_speech_duration_ms: int = 150 + whisper_cpp_vad_min_silence_duration_ms: int = 200 + whisper_cpp_vad_max_speech_duration_s: int = 30 + whisper_cpp_vad_speech_pad_ms: int = 50 # Whisper API 配置 whisper_api_key: Optional[str] = None whisper_api_base: Optional[str] = None @@ -616,6 +623,14 @@ def print_config(self) -> str: lines.append( f"Model: {self.whisper_model.value if self.whisper_model else 'None'}" ) + lines.append(f"VAD Enabled: {self.whisper_cpp_enable_vad}") + if self.whisper_cpp_enable_vad: + lines.append(f"VAD Model: {self.whisper_cpp_vad_model}") + lines.append(f"VAD Threshold: {self.whisper_cpp_vad_threshold}") + lines.append(f"VAD Min Speech: {self.whisper_cpp_vad_min_speech_duration_ms}ms") + lines.append(f"VAD Min Silence: {self.whisper_cpp_vad_min_silence_duration_ms}ms") + lines.append(f"VAD Max Speech: {self.whisper_cpp_vad_max_speech_duration_s}s") + lines.append(f"VAD Speech Pad: {self.whisper_cpp_vad_speech_pad_ms}ms") lines.append("=" * 42) return "\n".join(lines) diff --git a/app/core/task_factory.py b/app/core/task_factory.py index 76100a57..088e4603 100644 --- a/app/core/task_factory.py +++ b/app/core/task_factory.py @@ -75,6 +75,13 @@ def create_transcribe_task( output_format=cfg.transcribe_output_format.value, # Whisper Cpp 配置 whisper_model=cfg.whisper_model.value, + whisper_cpp_enable_vad=cfg.whisper_cpp_enable_vad.value, + whisper_cpp_vad_model=cfg.whisper_cpp_vad_model.value, + whisper_cpp_vad_threshold=cfg.whisper_cpp_vad_threshold.value, + whisper_cpp_vad_min_speech_duration_ms=cfg.whisper_cpp_vad_min_speech_duration_ms.value, + whisper_cpp_vad_min_silence_duration_ms=cfg.whisper_cpp_vad_min_silence_duration_ms.value, + whisper_cpp_vad_max_speech_duration_s=cfg.whisper_cpp_vad_max_speech_duration_s.value, + whisper_cpp_vad_speech_pad_ms=cfg.whisper_cpp_vad_speech_pad_ms.value, # Whisper API 配置 whisper_api_key=cfg.whisper_api_key.value, whisper_api_base=cfg.whisper_api_base.value,