NVIDIA-NeMo
diff --git a/‎examples/audio/qwen_omni_inprocess/run_pipeline.py‎
Lines changed: 212 additions & 205 deletions b/‎examples/audio/qwen_omni_inprocess/run_pipeline.py‎
Lines changed: 212 additions & 205 deletions
diff --git a/‎nemo_curator/models/qwen_asr.py‎
Lines changed: 174 additions & 0 deletions b/‎nemo_curator/models/qwen_asr.py‎
Lines changed: 174 additions & 0 deletions
diff --git a/‎nemo_curator/stages/audio/inference/qwen_asr.py‎
Lines changed: 170 additions & 0 deletions b/‎nemo_curator/stages/audio/inference/qwen_asr.py‎
Lines changed: 170 additions & 0 deletions
diff --git a/‎nemo_curator/stages/audio/inference/qwen_omni.py‎
Lines changed: 3 additions & 1 deletion b/‎nemo_curator/stages/audio/inference/qwen_omni.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎nemo_curator/stages/audio/text_filtering/fasttext_lid.py‎
Lines changed: 1 addition & 1 deletion b/‎nemo_curator/stages/audio/text_filtering/fasttext_lid.py‎
Lines changed: 1 addition & 1 deletion
@@ -0,0 +1,174 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Qwen3-ASR model wrapper for in-process vLLM inference.
+
+Uses the ``qwen_asr`` library which wraps vLLM internally and exposes a
+high-level ``transcribe()`` API that accepts in-memory numpy waveforms.
+"""
+
+from __future__ import annotations
+
+import gc
+from typing import TYPE_CHECKING, Any
+
+from loguru import logger
+
+from nemo_curator.models.base import ModelInterface
+
+if TYPE_CHECKING:
+    import numpy as np
+
+# Default model identifier; used as the ``model_id`` default of ``QwenASR.__init__``.
+_QWEN3_ASR_MODEL_ID = "Qwen/Qwen3-ASR-0.6B"
+
+
+class QwenASR(ModelInterface):
+    """Qwen3-ASR model via the ``qwen_asr`` library with vLLM backend.
+
+    Audio is accepted as in-memory numpy arrays (mono, any sample rate).
+    The ``qwen_asr`` library handles resampling to 16 kHz, chunking long
+    audio, and batched vLLM inference internally.
+
+    Args:
+        model_id: Model identifier handed to ``Qwen3ASRModel.LLM`` as ``model``.
+        language: Optional language hint forwarded to ``transcribe()``; also
+            the fallback reported when a result carries no language.
+        gpu_memory_utilization: Forwarded unchanged to ``Qwen3ASRModel.LLM``.
+        max_new_tokens: Forwarded unchanged to ``Qwen3ASRModel.LLM``.
+        max_inference_batch_size: Forwarded unchanged to ``Qwen3ASRModel.LLM``.
+    """
+
+    def __init__(
+        self,
+        model_id: str = _QWEN3_ASR_MODEL_ID,
+        language: str | None = None,
+        gpu_memory_utilization: float = 0.7,
+        max_new_tokens: int = 4096,
+        max_inference_batch_size: int = 128,
+    ):
+        self.model_id = model_id
+        self.language = language
+        self.gpu_memory_utilization = gpu_memory_utilization
+        self.max_new_tokens = max_new_tokens
+        self.max_inference_batch_size = max_inference_batch_size
+
+        # Populated by setup(); None means "not loaded".
+        self._model: Any = None
+
+    @property
+    def model_id_names(self) -> list[str]:
+        """Return the configured model identifier as a one-element list."""
+        return [self.model_id]
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _patch_transformers_compat() -> None:
+        """Patch transformers.check_model_inputs for qwen-asr compatibility.
+
+        Newer transformers changed check_model_inputs from a decorator factory
+        (called with parentheses) to a plain decorator. The qwen-asr package
+        uses the old ``@check_model_inputs()`` syntax which breaks on newer
+        versions. This wraps it to accept both styles.
+
+        The patch is best effort and never raises: a failure is logged at
+        debug level and the function is left unpatched.
+        """
+        try:
+            import inspect
+
+            import transformers
+
+            original = getattr(transformers, "check_model_inputs", None)
+            if original is None:
+                return
+            params = list(inspect.signature(original).parameters.values())
+            # Only the plain-decorator form (first parameter named ``func``)
+            # needs wrapping.  The wrapper installed below exposes ``*args``
+            # first, so calling this method again does not stack wrappers.
+            if not params or params[0].name != "func":
+                return
+
+            def compat_check_model_inputs(*args, **kwargs):
+                # ``@check_model_inputs``: decorate the function directly.
+                if args and callable(args[0]):
+                    return original(args[0])
+                # ``@check_model_inputs()``: hand back the plain decorator.
+                return original
+
+            transformers.check_model_inputs = compat_check_model_inputs
+        except Exception as exc:  # noqa: BLE001
+            # Deliberately non-fatal, but leave a trace so a later qwen_asr
+            # import failure can be correlated with the skipped patch.
+            logger.debug("Skipped transformers.check_model_inputs compat patch: {!r}", exc)
+
+    def setup(self) -> None:
+        """Import ``qwen_asr`` and build the model with the configured options.
+
+        Raises:
+            ImportError: If the ``qwen_asr`` package cannot be imported.
+        """
+        # Must run before qwen_asr is imported (see the helper's docstring).
+        self._patch_transformers_compat()
+
+        try:
+            from qwen_asr import Qwen3ASRModel
+        except ImportError:
+            msg = "qwen_asr is required for QwenASR. Install it: pip install qwen-asr[vllm]"
+            raise ImportError(msg) from None
+
+        logger.info(
+            f"Loading QwenASR model={self.model_id}  "
+            f"gpu_mem={self.gpu_memory_utilization}  "
+            f"max_new_tokens={self.max_new_tokens}  "
+            f"max_batch={self.max_inference_batch_size}"
+        )
+
+        self._model = Qwen3ASRModel.LLM(
+            model=self.model_id,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+            max_inference_batch_size=self.max_inference_batch_size,
+            max_new_tokens=self.max_new_tokens,
+            trust_remote_code=True,
+            enforce_eager=True,
+        )
+
+        logger.info("QwenASR model loaded")
+
+    def teardown(self) -> None:
+        """Drop the model reference and ask torch to empty its CUDA cache.
+
+        Safe to call more than once; the cache cleanup is best effort.
+        """
+        # Rebinding releases the reference held by this wrapper; the explicit
+        # collection runs before the CUDA cache is emptied.
+        self._model = None
+        gc.collect()
+        try:
+            import torch
+
+            torch.cuda.empty_cache()
+        except Exception as exc:  # noqa: BLE001
+            # Cleanup must not fail teardown, but do not hide the reason.
+            logger.debug("torch.cuda.empty_cache() skipped: {!r}", exc)
+
+    # ------------------------------------------------------------------
+    # Generation
+    # ------------------------------------------------------------------
+
+    def generate(
+        self,
+        waveforms: list[np.ndarray],
+        sample_rates: list[int],
+        contexts: list[str] | None = None,
+    ) -> tuple[list[str], list[str]]:
+        """Run batched ASR inference on in-memory audio waveforms.
+
+        Args:
+            waveforms: List of 1-D mono numpy float32 arrays.
+            sample_rates: Corresponding sample rates for each waveform.
+            contexts: Optional per-sample instruction strings for
+                ``with_instruction`` mode; when given, must contain one
+                entry per waveform.
+
+        Returns:
+            ``(texts, languages)`` -- transcribed text and detected
+            language for each input. A result without a language falls
+            back to ``self.language`` (or ``""``). An empty batch returns
+            ``([], [])`` without calling the model.
+
+        Raises:
+            RuntimeError: If ``setup()`` has not been called.
+            ValueError: If ``waveforms``, ``sample_rates`` and (when given)
+                ``contexts`` do not all have the same length.
+        """
+        if self._model is None:
+            msg = "Model not initialized. Call setup() first."
+            raise RuntimeError(msg)
+
+        # strict=True raises ValueError when the two lists differ in length.
+        audio_inputs: list[tuple[np.ndarray, int]] = list(
+            zip(waveforms, sample_rates, strict=True)
+        )
+        # Reject misaligned contexts up front instead of passing them on.
+        if contexts is not None and len(contexts) != len(audio_inputs):
+            msg = f"contexts has {len(contexts)} entries but {len(audio_inputs)} waveforms were given"
+            raise ValueError(msg)
+        if not audio_inputs:
+            # Nothing to transcribe; skip the engine call entirely.
+            return [], []
+
+        kwargs: dict[str, Any] = {
+            "audio": audio_inputs,
+            "language": self.language,
+        }
+        if contexts is not None:
+            kwargs["context"] = contexts
+
+        results = self._model.transcribe(**kwargs)
+
+        fallback_language = self.language or ""
+        texts = [getattr(r, "text", str(r)) for r in results]
+        languages = [getattr(r, "language", "") or fallback_language for r in results]
+
+        return texts, languages
@@ -0,0 +1,170 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from loguru import logger
+
+from nemo_curator.models.qwen_asr import QwenASR
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+from nemo_curator.tasks import AudioTask
+
+if TYPE_CHECKING:
+    from nemo_curator.backends.base import NodeInfo, WorkerMetadata
+
+
+@dataclass
+class InferenceQwenASRStage(ProcessingStage[AudioTask, AudioTask]):
+    """Batch speech-recognition stage backed by the ``QwenASR`` model wrapper.
+
+    For every task selected for inference the stage reads
+    ``task.data[waveform_key]`` and ``task.data[sample_rate_key]``, passes
+    them to ``QwenASR.generate`` and stores the returned text and language
+    under ``pred_text_key`` and ``language_key``.
+
+    Selection: with ``run_only_if_key`` unset every task is transcribed.
+    Otherwise only tasks whose ``str(task.data.get(run_only_if_key, ""))``
+    starts with ``run_only_if_prefix`` are transcribed.  Tasks that are not
+    selected still get ``""`` for any output key they do not already have.
+
+    The ``waveform_key`` entry is removed from every task of a processed
+    batch, whether or not the task was selected.
+
+    Args:
+        name: Stage name.
+        model_id: HuggingFace model identifier or local path.
+        language: Language hint (e.g. ``"English"``).
+        waveform_key: Key holding the audio waveform of a task.
+        sample_rate_key: Key holding the sample rate of a task.
+        pred_text_key: Key where the predicted text is stored.
+        language_key: Key where the detected language is stored.
+        context_key: If set, ``task.data.get(context_key, "")`` is passed as
+            the per-sample context of each selected task.
+        run_only_if_key: If set, only run inference on tasks where
+            ``task.data[run_only_if_key]`` starts with ``run_only_if_prefix``.
+        run_only_if_prefix: Prefix tested against the ``run_only_if_key`` value.
+        gpu_memory_utilization: Fraction of GPU memory vLLM may use.
+        max_new_tokens: Maximum tokens to generate per sample.
+        max_inference_batch_size: Batch size for internal vLLM batching.
+        resources: Resource request of the stage; defaults to one GPU.
+        batch_size: Stage batch size (presumably the number of tasks handed
+            to ``process_batch`` -- confirm against ``ProcessingStage``).
+    """
+
+    name: str = "QwenASR_inference"
+    model_id: str = "Qwen/Qwen3-ASR-0.6B"
+    language: str | None = None
+    waveform_key: str = "waveform"
+    sample_rate_key: str = "sample_rate"
+    pred_text_key: str = "qwen3_asr_prediction"
+    language_key: str = "qwen3_asr_language"
+    context_key: str | None = None
+    run_only_if_key: str | None = None
+    run_only_if_prefix: str = "Hallucination"
+    gpu_memory_utilization: float = 0.7
+    max_new_tokens: int = 4096
+    max_inference_batch_size: int = 128
+    resources: Resources = field(default_factory=lambda: Resources(gpus=1.0))
+    batch_size: int = 128
+
+    def __post_init__(self) -> None:
+        # The model wrapper is created lazily by setup_on_node() / setup().
+        self._model: QwenASR | None = None
+
+    def _create_model(self) -> QwenASR:
+        """Build an unloaded ``QwenASR`` from this stage's configuration."""
+        model_options = {
+            "model_id": self.model_id,
+            "language": self.language,
+            "gpu_memory_utilization": self.gpu_memory_utilization,
+            "max_new_tokens": self.max_new_tokens,
+            "max_inference_batch_size": self.max_inference_batch_size,
+        }
+        return QwenASR(**model_options)
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def setup_on_node(
+        self,
+        _node_info: NodeInfo | None = None,
+        _worker_metadata: WorkerMetadata | None = None,
+    ) -> None:
+        """Create a fresh model wrapper and load it; both arguments are unused."""
+        wrapper = self._create_model()
+        # Stored before loading, so the attribute is set even if setup() raises.
+        self._model = wrapper
+        wrapper.setup()
+        logger.info("QwenASR model ready on node")
+
+    def setup(self, _worker_metadata: WorkerMetadata | None = None) -> None:
+        """Create and load the model unless one is already attached."""
+        if self._model is not None:
+            return
+        wrapper = self._create_model()
+        self._model = wrapper
+        wrapper.setup()
+
+    def teardown(self) -> None:
+        """Tear down the attached model, if any, and forget it."""
+        wrapper = self._model
+        if wrapper is None:
+            return
+        wrapper.teardown()
+        self._model = None
+
+    # ------------------------------------------------------------------
+    # I/O contract
+    # ------------------------------------------------------------------
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        """Return ``([], [waveform_key, sample_rate_key])``."""
+        consumed_keys = [self.waveform_key, self.sample_rate_key]
+        return [], consumed_keys
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        """Return ``([], [pred_text_key, language_key])``."""
+        produced_keys = [self.pred_text_key, self.language_key]
+        return [], produced_keys
+
+    # ------------------------------------------------------------------
+    # Processing
+    # ------------------------------------------------------------------
+
+    def process(self, task: AudioTask) -> AudioTask:
+        """Unsupported single-task entry point; always raises ``NotImplementedError``."""
+        raise NotImplementedError("InferenceQwenASRStage only supports process_batch")
+
+    def process_batch(self, tasks: list[AudioTask]) -> list[AudioTask]:
+        """Transcribe the selected tasks of a batch in place and return the batch.
+
+        Raises:
+            RuntimeError: If no model is attached (``setup()`` was not called).
+        """
+        if not tasks:
+            return []
+
+        model = self._model
+        if model is None:
+            raise RuntimeError("Model not initialized — setup() was not called")
+
+        # Every task ends up with both output keys, selected or not.
+        for item in tasks:
+            for out_key in (self.pred_text_key, self.language_key):
+                item.data.setdefault(out_key, "")
+
+        gate_key = self.run_only_if_key
+        if gate_key:
+            selected = []
+            for pos, item in enumerate(tasks):
+                marker = str(item.data.get(gate_key, ""))
+                if marker.startswith(self.run_only_if_prefix):
+                    selected.append(pos)
+        else:
+            selected = list(range(len(tasks)))
+
+        if selected:
+            chosen = [tasks[pos] for pos in selected]
+            waveforms = [item.data[self.waveform_key] for item in chosen]
+            sample_rates = [item.data[self.sample_rate_key] for item in chosen]
+            contexts = None
+            if self.context_key:
+                contexts = [item.data.get(self.context_key, "") for item in chosen]
+
+            pred_texts, languages = model.generate(waveforms, sample_rates, contexts)
+
+            # strict=True: a result count that differs from the request raises.
+            for item, pred, lang in zip(chosen, pred_texts, languages, strict=True):
+                item.data[self.pred_text_key] = pred
+                item.data[self.language_key] = lang
+
+            summary = f"QwenASR: generated {len(selected)} predictions, skipped {len(tasks) - len(selected)}"
+        else:
+            summary = f"QwenASR: skipped entire batch of {len(tasks)} (none matched run_only_if_key)"
+
+        # The waveform is dropped from every task before the batch moves on.
+        for item in tasks:
+            item.data.pop(self.waveform_key, None)
+
+        logger.info(summary)
+        return tasks
@@ -81,6 +81,7 @@ class InferenceQwenOmniStage(ProcessingStage[AudioTask, AudioTask]):
     temperature: float = 0.0
     top_k: int = 1
     prep_workers: int = 8
+    keep_waveform: bool = False
     resources: Resources = field(default_factory=lambda: Resources(gpus=1.0))
     batch_size: int = 32
 
@@ -172,7 +173,8 @@ def process_batch(self, tasks: list[AudioTask]) -> list[AudioTask]:
             task.data[self.pred_text_key] = pred
             if self.followup_prompt:
                 task.data[self.disfluency_text_key] = disfl
-            task.data.pop(self.waveform_key, None)
+            if not self.keep_waveform:
+                task.data.pop(self.waveform_key, None)
 
         logger.info(f"QwenOmni: generated {len(pred_texts)} predictions (turn2={bool(self.followup_prompt)})")
         return tasks
@@ -105,7 +105,7 @@ def _process_single(self, task: AudioTask) -> AudioTask:
         text = text.strip().replace("\n", " ")
         if not text:
             if not task.data[self.skip_me_key]:
-                task.data[self.skip_me_key] = "Empty text"
+                task.data[self.skip_me_key] = f"Empty text:{self.name}"
             return task
         result_str = self._lid.score_document(text)
         score_list = eval(result_str)  # noqa: S307  — output of our own FastText model