Skip to content

Commit b60dc95

Browse files
committed
PR_#integration-test from nithinraok/Curator nkoluguri/integration-test, validated improvements on top of the 4 PRs
Squash cherry-pick of integration-test's unique commits on top of #1853 + #1 + #3 + #1839: - 633acc7 FastText and Hallucination update → SelectBestPredictionStage: cross-model WER agreement. If both omni and ASR are flagged hallucinated but agree (WER ≤ 100 - min_agreement_pct, default 80%), keep omni and mark recovered — two independent models producing near-identical text is strong evidence the text is correct. → FastTextLIDStage: HuggingFace-format model loader, proper _predict() abstraction, source-tracked _skip_me ("Wrong language:{name}"). - 5fdfa0a additional notes key + skip writing keys after skip_me + pnc prompt + prefill caching → Models (qwen_omni, qwen_asr, qwen_text_llm): notes_key field for diagnostic info, vLLM enable_prefix_caching=True with xxhash. → text_filtering stages: skip writing output keys when skip_me is set. → New file: prompts/pnc_prompt.md. - 15424e3 updated prompt for ITN → Sharper ITN prompt (handles more conversion edge cases). - 0cf8e6c match max model len for ITN and PnC → Aligned ITN/PnC max_model_len (4096), max_num_seqs (16), gpu_memory_utilization (0.95). Wired ITN args through run_pipeline. - 7e32df1 add Qwen3ASR for all → Apply QwenASR recovery to all hallucination flags, not just specific patterns. WhisperHallucinationStage tweaks. - caccd37 Add min word count for FastText → Re-adds min_word_count=2 (FastText is unreliable on single-word inputs). Conflict resolution: - run_pipeline.py: kept multi-line argparse style (ours), kept --source_lang_key, adopted their ITN stage construction (with new max_model_len/num_seqs/gpu_mem args). - fasttext_lid.py: took their richer process logic (min_word_count check, per-sample expected language via source_lang_key, source-tracked _skip_me values). #NO_PR Signed-off-by: George Zelenfroynd <gzelenfroind@nvidia.com>
1 parent bddf57d commit b60dc95

15 files changed

Lines changed: 217 additions & 65 deletions

examples/audio/qwen_omni_inprocess/run_pipeline.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
from nemo_curator.stages.resources import Resources
8383

8484

85-
def _build_arg_parser() -> argparse.ArgumentParser:
85+
def _build_arg_parser() -> argparse.ArgumentParser: # noqa: PLR0915
8686
ap = argparse.ArgumentParser(description="QwenOmni in-process vLLM pipeline")
8787
ap.add_argument("--data_config", type=str, required=True, help="Granary YAML data config.")
8888
ap.add_argument("--corpus", type=str, nargs="*", default=None, help="Process only these corpora.")
@@ -108,7 +108,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
108108
tf.add_argument("--hall_phrases", type=str, required=True,
109109
help="Path to hallucination phrases text file.")
110110
tf.add_argument("--fasttext_model", type=str, default="lid.176.ftz",
111-
help="FastText LID model: local path or known name (lid.176.bin / lid.176.ftz).")
111+
help="FastText LID model: HuggingFace repo ID, local path, or known name (lid.176.bin / lid.176.ftz).")
112112
tf.add_argument("--regex_yaml", type=str, required=True,
113113
help="Path to regex substitution rules YAML.")
114114
tf.add_argument("--target_lang", type=str, default="en",
@@ -170,11 +170,17 @@ def _build_arg_parser() -> argparse.ArgumentParser:
170170
help="TP size for ITN model (None = auto-detect).")
171171
itn.add_argument("--itn_max_output_tokens", type=int, default=4096,
172172
help="Max tokens to generate per ITN sample.")
173+
itn.add_argument("--itn_max_model_len", type=int, default=4096,
174+
help="Max context length for ITN vLLM engine.")
175+
itn.add_argument("--itn_max_num_seqs", type=int, default=16,
176+
help="Max concurrent sequences for ITN vLLM engine.")
177+
itn.add_argument("--itn_gpu_memory_utilization", type=float, default=0.95,
178+
help="Fraction of GPU memory for ITN vLLM engine.")
173179
itn.add_argument("--itn_no_validation", action="store_true", help="Disable ITN output validation.")
174180
return ap
175181

176182

177-
def main() -> None:
183+
def main() -> None: # noqa: C901
178184
args = _build_arg_parser().parse_args()
179185

180186
prompt = args.prompt
@@ -259,7 +265,6 @@ def main() -> None:
259265
batch_size=args.asr_batch_size,
260266
gpu_memory_utilization=args.asr_gpu_memory_utilization,
261267
max_new_tokens=args.asr_max_new_tokens,
262-
run_only_if_key="_skip_me",
263268
),
264269
WhisperHallucinationStage(
265270
name="WhisperHallucination_asr",
@@ -320,18 +325,19 @@ def main() -> None:
320325
])
321326

322327
if args.enable_itn:
323-
stages.append(
324-
ITNRestorationStage(
325-
model_id=args.itn_model_id,
326-
prompt_text=itn_prompt_text,
327-
text_key=args.itn_text_key or ("pnc_text" if not args.skip_pnc else "abbreviated_text"),
328-
output_text_key=args.itn_output_key,
329-
tensor_parallel_size=args.itn_tensor_parallel_size,
330-
max_output_tokens=args.itn_max_output_tokens,
331-
batch_size=args.itn_batch_size,
332-
enable_validation=not args.itn_no_validation,
333-
)
334-
)
328+
stages.append(ITNRestorationStage(
329+
model_id=args.itn_model_id,
330+
prompt_text=itn_prompt_text,
331+
text_key=args.itn_text_key or ("pnc_text" if not args.skip_pnc else "abbreviated_text"),
332+
output_text_key=args.itn_output_key,
333+
tensor_parallel_size=args.itn_tensor_parallel_size,
334+
max_output_tokens=args.itn_max_output_tokens,
335+
max_model_len=args.itn_max_model_len,
336+
max_num_seqs=args.itn_max_num_seqs,
337+
gpu_memory_utilization=args.itn_gpu_memory_utilization,
338+
batch_size=args.itn_batch_size,
339+
enable_validation=not args.itn_no_validation,
340+
))
335341

336342
stages.append(ShardedManifestWriterStage(output_dir=args.output_dir))
337343

nemo_curator/models/qwen_asr.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,12 @@ def _patch_transformers_compat() -> None:
8383
sig = inspect.signature(original)
8484
params = list(sig.parameters.values())
8585
if params and params[0].name == "func":
86-
def compat_check_model_inputs(*args, **kwargs):
86+
def compat_check_model_inputs(*args): # noqa: ANN202
8787
if args and callable(args[0]):
8888
return original(args[0])
8989
return original
9090
transformers.check_model_inputs = compat_check_model_inputs
91-
except Exception: # noqa: BLE001
91+
except Exception: # noqa: BLE001, S110
9292
pass
9393

9494
def setup(self) -> None:
@@ -114,6 +114,8 @@ def setup(self) -> None:
114114
max_new_tokens=self.max_new_tokens,
115115
trust_remote_code=True,
116116
enforce_eager=True,
117+
enable_prefix_caching=True,
118+
prefix_caching_hash_algo="xxhash",
117119
)
118120

119121
logger.info("QwenASR model loaded")

nemo_curator/models/qwen_omni.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ def setup(self) -> None:
116116
max_num_seqs=self.max_num_seqs,
117117
max_model_len=self.max_model_len,
118118
seed=1234,
119+
enable_prefix_caching=True,
120+
prefix_caching_hash_algo="xxhash",
119121
)
120122

121123
from transformers import Qwen3OmniMoeProcessor

nemo_curator/models/qwen_text_llm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ def setup(self) -> None:
122122
max_num_seqs=self.max_num_seqs,
123123
max_model_len=self.max_model_len,
124124
seed=1234,
125+
enable_prefix_caching=True,
126+
prefix_caching_hash_algo="xxhash",
125127
)
126128

127129
self._sampling_params = SamplingParams(

nemo_curator/stages/audio/text_filtering/abbreviation_concat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ def outputs(self) -> tuple[list[str], list[str]]:
237237
def _process_single(self, task: AudioTask) -> AudioTask:
238238
skip = task.data.get(self.skip_me_key, "")
239239
if skip:
240-
task.data.setdefault(self.output_text_key, task.data.get(self.text_key, ""))
240+
task.data.setdefault(self.output_text_key, "")
241241
task.data.setdefault(self.abbreviations_key, [])
242242
return task
243243

nemo_curator/stages/audio/text_filtering/fasttext_lid.py

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,26 @@
2929
}
3030
_DEFAULT_CACHE_DIR = os.path.expanduser("~/.cache/nemo_curator/fasttext")
3131

32+
_ISO639_3_TO_1: dict[str, str] = {
33+
"afr": "af", "amh": "am", "ara": "ar", "asm": "as", "aze": "az",
34+
"bel": "be", "ben": "bn", "bos": "bs", "bul": "bg", "cat": "ca",
35+
"ces": "cs", "cym": "cy", "dan": "da", "deu": "de", "ell": "el",
36+
"eng": "en", "est": "et", "eus": "eu", "fas": "fa", "fin": "fi",
37+
"fra": "fr", "gle": "ga", "glg": "gl", "guj": "gu", "hau": "ha",
38+
"heb": "he", "hin": "hi", "hrv": "hr", "hun": "hu", "hye": "hy",
39+
"ibo": "ig", "ind": "id", "isl": "is", "ita": "it", "jav": "jv",
40+
"jpn": "ja", "kan": "kn", "kat": "ka", "khm": "km", "kor": "ko",
41+
"lao": "lo", "lav": "lv", "lit": "lt", "mal": "ml", "mar": "mr",
42+
"mkd": "mk", "mon": "mn", "msa": "ms", "mya": "my", "nep": "ne",
43+
"nld": "nl", "nob": "nb", "nor": "no", "ori": "or", "pan": "pa",
44+
"pol": "pl", "por": "pt", "ron": "ro", "rus": "ru", "sin": "si",
45+
"slk": "sk", "slv": "sl", "som": "so", "spa": "es", "sqi": "sq",
46+
"srp": "sr", "sun": "su", "swa": "sw", "swe": "sv", "tam": "ta",
47+
"tel": "te", "tgl": "tl", "tha": "th", "tur": "tr", "ukr": "uk",
48+
"urd": "ur", "vie": "vi", "xho": "xh", "yor": "yo", "zho": "zh",
49+
"zul": "zu",
50+
}
51+
3252

3353
@dataclass
3454
class FastTextLIDStage(ProcessingStage[AudioTask, AudioTask]):
@@ -43,21 +63,29 @@ class FastTextLIDStage(ProcessingStage[AudioTask, AudioTask]):
4363
4464
An already non-empty ``skip_me`` value is never overwritten.
4565
66+
Texts with fewer than ``min_word_count`` words are passed through
67+
without LID filtering because FastText confidence is unreliable on
68+
very short inputs (especially single words).
69+
4670
``model_path`` can be:
71+
- A HuggingFace Hub repo ID (e.g.
72+
``facebook/fasttext-language-identification``), which is downloaded
73+
via ``huggingface_hub``.
4774
- An absolute path to a local ``.bin`` or ``.ftz`` file.
48-
- A known model name (``lid.176.bin`` or ``lid.176.ftz``), which is
75+
- A legacy model name (``lid.176.bin`` or ``lid.176.ftz``), which is
4976
downloaded to ``~/.cache/nemo_curator/fasttext/`` on first use.
5077
"""
5178

5279
model_path: str = ""
5380
target_lang: str = "en"
5481
min_lang_prob: float = 0.8
82+
min_word_count: int = 2
5583
text_key: str = "pred_text"
5684
skip_me_key: str = "_skip_me"
5785
name: str = "FastTextLID"
5886
resources: Resources = field(default_factory=lambda: Resources(cpus=1.0))
5987

60-
_lid: Any = field(default=None, init=False, repr=False)
88+
_model: Any = field(default=None, init=False, repr=False)
6189

6290
def __post_init__(self) -> None:
6391
if not self.model_path:
@@ -76,18 +104,39 @@ def _resolve_model_path(self) -> str:
76104
logger.info(f"FastTextLIDStage: downloading {self.model_path} from {url}")
77105
urllib.request.urlretrieve(url, cache_path) # noqa: S310
78106
return cache_path
107+
if "/" in self.model_path:
108+
try:
109+
from huggingface_hub import hf_hub_download
110+
111+
return hf_hub_download(repo_id=self.model_path, filename="model.bin")
112+
except Exception as exc:
113+
msg = f"Failed to download '{self.model_path}' from HuggingFace Hub: {exc}"
114+
raise ValueError(msg) from exc
79115
msg = (
80-
f"model_path '{self.model_path}' is not a valid file path and not a known model name. "
81-
f"Known names: {list(_FASTTEXT_MODEL_URLS)}"
116+
f"model_path '{self.model_path}' is not a valid file path, a known model name, "
117+
f"or a HuggingFace repo ID. Known names: {list(_FASTTEXT_MODEL_URLS)}"
82118
)
83119
raise ValueError(msg)
84120

121+
@staticmethod
122+
def _parse_label(raw_label: str) -> str:
123+
"""Extract a 2-letter ISO 639-1 language code from a fasttext label.
124+
125+
Handles both the legacy format (``__label__en``) and the HuggingFace
126+
``facebook/fasttext-language-identification`` format
127+
(``__label__eng_Latn``).
128+
"""
129+
lang_part = raw_label.replace("__label__", "")
130+
if "_" in lang_part:
131+
iso3 = lang_part.split("_", 1)[0]
132+
return _ISO639_3_TO_1.get(iso3, iso3).lower()
133+
return lang_part.lower()
134+
85135
def setup(self, _worker_metadata: object | None = None) -> None:
86-
from nemo_curator.stages.text.filters.fasttext.fasttext_filters import FastTextLangId
136+
import fasttext
87137

88138
resolved = self._resolve_model_path()
89-
self._lid = FastTextLangId(model_path=resolved, min_langid_score=0.0)
90-
self._lid.load_model()
139+
self._model = fasttext.load_model(resolved)
91140
logger.info(f"FastTextLIDStage: loaded model from {resolved}")
92141

93142
def inputs(self) -> tuple[list[str], list[str]]:
@@ -96,6 +145,10 @@ def inputs(self) -> tuple[list[str], list[str]]:
96145
def outputs(self) -> tuple[list[str], list[str]]:
97146
return [], [self.skip_me_key]
98147

148+
def _predict(self, text: str) -> tuple[str, float]:
149+
labels, scores = self._model.predict([text], k=1)
150+
return self._parse_label(labels[0][0]), scores[0][0].item()
151+
99152
def _process_single(self, task: AudioTask) -> AudioTask:
100153
if task.data.get(self.skip_me_key, ""):
101154
return task
@@ -107,19 +160,21 @@ def _process_single(self, task: AudioTask) -> AudioTask:
107160
if not task.data[self.skip_me_key]:
108161
task.data[self.skip_me_key] = f"Empty text:{self.name}"
109162
return task
110-
result_str = self._lid.score_document(text)
111-
score_list = eval(result_str) # noqa: S307 — output of our own FastText model
112-
prob = float(score_list[0])
113-
lang = str(score_list[1]).lower()
163+
if len(text.split()) < self.min_word_count:
164+
return task
165+
lang, prob = self._predict(text)
166+
expected = self.target_lang
167+
if self.source_lang_key and self.source_lang_key in task.data:
168+
expected = task.data[self.source_lang_key]
114169
if not task.data[self.skip_me_key]:
115-
if lang != self.target_lang.lower():
116-
task.data[self.skip_me_key] = "Wrong language"
170+
if lang != expected.lower():
171+
task.data[self.skip_me_key] = f"Wrong language:{self.name}"
117172
elif prob < self.min_lang_prob:
118-
task.data[self.skip_me_key] = "Low probability of language"
173+
task.data[self.skip_me_key] = f"Low probability of language:{self.name}"
119174
return task
120175

121176
def process(self, task: AudioTask) -> AudioTask:
122-
if self._lid is None:
177+
if self._model is None:
123178
logger.warning(
124179
f"FastTextLIDStage ({self.name}): setup() was not called before process(). "
125180
"Calling setup() now — check that your executor invokes setup() on each worker."
@@ -128,7 +183,7 @@ def process(self, task: AudioTask) -> AudioTask:
128183
return self._process_single(task)
129184

130185
def process_batch(self, tasks: list[AudioTask]) -> list[AudioTask]:
131-
if self._lid is None:
186+
if self._model is None:
132187
logger.warning(
133188
f"FastTextLIDStage ({self.name}): setup() was not called before process_batch(). "
134189
"Calling setup() now — check that your executor invokes setup() on each worker."

nemo_curator/stages/audio/text_filtering/initialize_fields.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from dataclasses import dataclass, field # noqa: I001
15+
from dataclasses import dataclass, field
1616

1717
from nemo_curator.stages.base import ProcessingStage
1818
from nemo_curator.stages.resources import Resources

nemo_curator/stages/audio/text_filtering/itn_restoration.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,9 @@ class ITNRestorationStage(ProcessingStage[AudioTask, AudioTask]):
196196
itn_filtered_key: str = "itn_filtered"
197197
enable_validation: bool = True
198198
tensor_parallel_size: int | None = None
199-
max_output_tokens: int = 4096
200-
max_model_len: int = 32768
201-
max_num_seqs: int = 256
199+
max_output_tokens: int = 512
200+
max_model_len: int = 4096
201+
max_num_seqs: int = 16
202202
gpu_memory_utilization: float = 0.95
203203
kv_cache_dtype: str = "fp8"
204204
resources: Resources = field(default_factory=lambda: Resources(gpus=1.0))
@@ -354,7 +354,10 @@ def process_batch(self, tasks: list[AudioTask]) -> list[AudioTask]:
354354
for i, task in enumerate(tasks):
355355
text = task.data.get(self.text_key, "")
356356
skip = task.data.get(self.skip_me_key, "")
357-
if not text or not text.strip() or skip:
357+
if skip:
358+
task.data[self.output_text_key] = ""
359+
continue
360+
if not text or not text.strip():
358361
task.data[self.output_text_key] = text
359362
continue
360363
valid_indices.append(i)

nemo_curator/stages/audio/text_filtering/pnc_content_guard.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def outputs(self) -> tuple[list[str], list[str]]:
6161

6262
def _process_single(self, task: AudioTask) -> AudioTask:
6363
if task.data.get(self.skip_me_key, ""):
64+
task.data.setdefault(self.pnc_text_key, "")
6465
task.data.setdefault(self.rejected_text_key, "")
6566
return task
6667
original = task.data.get(self.text_key, "")

nemo_curator/stages/audio/text_filtering/pnc_restoration.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
from dataclasses import dataclass, field
18+
from pathlib import Path
1819
from typing import TYPE_CHECKING
1920

2021
from loguru import logger
@@ -27,6 +28,8 @@
2728
from nemo_curator.stages.resources import Resources
2829
from nemo_curator.tasks import AudioTask
2930

31+
_DEFAULT_PNC_PROMPT_PATH = Path(__file__).resolve().parent / "prompts" / "pnc_prompt.md"
32+
3033

3134
@dataclass
3235
class PnCRestorationStage(ProcessingStage[AudioTask, AudioTask]):
@@ -82,13 +85,11 @@ class PnCRestorationStage(ProcessingStage[AudioTask, AudioTask]):
8285
'Answer only "yes" or "no".\n\n'
8386
"Text: {text}"
8487
)
85-
pnc_prompt: str = (
86-
"Restore proper punctuation and capitalization to the following text. "
87-
"Output only the corrected text, nothing else.\n\nText: {text}"
88-
)
88+
pnc_prompt: str | None = None
89+
pnc_prompt_file: str | None = None
8990
system_prompt: str | None = None
90-
max_model_len: int = 8192
91-
max_num_seqs: int = 64
91+
max_model_len: int = 4096
92+
max_num_seqs: int = 16
9293
gpu_memory_utilization: float = 0.95
9394
tensor_parallel_size: int | None = None
9495
max_output_tokens: int = 512
@@ -98,17 +99,25 @@ class PnCRestorationStage(ProcessingStage[AudioTask, AudioTask]):
9899
batch_size: int = 64
99100
resources: Resources = field(default_factory=lambda: Resources(gpus=1.0))
100101

102+
def _resolve_pnc_prompt(self) -> str:
103+
if self.pnc_prompt:
104+
return self.pnc_prompt
105+
path = Path(self.pnc_prompt_file) if self.pnc_prompt_file else _DEFAULT_PNC_PROMPT_PATH
106+
logger.info("PnCRestoration: loading prompt from {}", path)
107+
return path.read_text(encoding="utf-8").strip()
108+
101109
def __post_init__(self) -> None:
102110
self._model: QwenTextLLM | None = None
103111
tp = self.tensor_parallel_size
104112
if tp and tp > 0:
105113
self.resources = Resources(gpus=float(tp))
106114

107115
def _create_model(self) -> QwenTextLLM:
116+
pnc_prompt = self._resolve_pnc_prompt()
108117
return QwenTextLLM(
109118
model_id=self.model_id,
110119
completeness_prompt=self.completeness_prompt,
111-
pnc_prompt=self.pnc_prompt,
120+
pnc_prompt=pnc_prompt,
112121
system_prompt=self.system_prompt,
113122
max_model_len=self.max_model_len,
114123
max_num_seqs=self.max_num_seqs,
@@ -175,7 +184,9 @@ def process_batch(self, tasks: list[AudioTask]) -> list[AudioTask]:
175184
for i, task in enumerate(tasks):
176185
skip = task.data.get(self.skip_me_key, "")
177186
text = task.data.get(self.text_key, "")
178-
if skip or not text.strip():
187+
if skip:
188+
task.data[self.output_text_key] = ""
189+
elif not text.strip():
179190
task.data[self.output_text_key] = text
180191
else:
181192
eligible_indices.append(i)

0 commit comments

Comments
 (0)