NVIDIA
diff --git a/‎examples/megatron_bridge/README.md‎
Lines changed: 6 additions & 4 deletions b/‎examples/megatron_bridge/README.md‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎examples/megatron_bridge/distill.py‎
Lines changed: 2 additions & 1 deletion b/‎examples/megatron_bridge/distill.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎modelopt/onnx/quantization/autotune/__init__.py‎
Lines changed: 57 additions & 0 deletions b/‎modelopt/onnx/quantization/autotune/__init__.py‎
Lines changed: 57 additions & 0 deletions
@@ -105,11 +105,13 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
     --jsonl_paths /path/to/data1.jsonl /path/to/data2.jsonl ... \
     --json_keys text \
     --tokenizer Qwen/Qwen3-0.6B \
-    --output_dir /path/to/tokenized/data/qwen3 \
+    --output_dir tokenized_qwen3 \
     --workers 32 \
     --max_sequence_length 256_000
 ```
 
+This will create `tokenized_qwen3/data1_text_document.{bin,idx}` and `tokenized_qwen3/data2_text_document.{bin,idx}` files. We can use these files in the distillation script by passing `--data_paths 1.0 tokenized_qwen3/data1_text_document 1.0 tokenized_qwen3/data2_text_document` (equal weight for both datasets).
+
 Instead of `--jsonl_paths`, you can also pass a directory path to the `--input_dir` argument to tokenize all JSONL files in the directory.
 We are setting a maximum sequence length of 256k to avoid rare OOM errors in tokenization if text is too long.
 
@@ -123,12 +125,12 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
     --hf_max_samples_per_split 10_000_000 \
     --json_keys text \
     --tokenizer Qwen/Qwen3-0.6B \
-    --output_dir /path/to/tokenized/data/qwen3 \
+    --output_dir tokenized_qwen3 \
     --workers 32 \
     --max_sequence_length 256_000
 ```
 
-The [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) dataset is huge, so it will take a while to download and tokenize. You can also split the large `.jsonl` into multiple files (e.g. 10M samples per file using `split -l 10000000 -d --additional-suffix=.jsonl <file>.jsonl <file>_part`) and tokenize them parallelly.
+The [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) dataset is huge, so it will take a few hours to download and tokenize. You can also split the large `.jsonl` into multiple files (e.g. 10M samples per file using `split -l 10000000 -d --additional-suffix=.jsonl <file>.jsonl <file>_part`) and tokenize them parallelly via the `--jsonl_paths` argument.
 To quickly test the script, you can try the [nvidia/Nemotron-Pretraining-Dataset-sample](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-Dataset-sample) dataset.
 
 If you skip `--hf_name`, it will download and tokenize all subsets for the dataset.
@@ -144,7 +146,7 @@ torchrun --nnodes 1 --nproc_per_node 8 distill.py \
     --tp_size 8 \
     --teacher_hf_path Qwen/Qwen3-8B \
     --student_hf_path Qwen/Qwen3-4B \
-    --data_paths 1.0 /path/to/tokenized/data/qwen3 \
+    --data_paths 1.0 tokenized_qwen3/data1_text_document 1.0 tokenized_qwen3/data2_text_document \
     --data_path_to_cache /path/to/cache/dataset_indices_qwen3 \
     --seq_length 8192 \
     --mbs 1 \
 
@@ -66,6 +66,7 @@ def get_args():
         required=True,
         help="HuggingFace model name or path for the teacher (e.g. Qwen/Qwen3-8B)",
     )
+    parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code")
     # Parallelism arguments
     parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size")
     parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size")
@@ -135,7 +136,7 @@ def main(args: argparse.Namespace):
 
     # Build student and teacher model providers
     def _build_model_provider(hf_path):
-        bridge = AutoBridge.from_hf_pretrained(hf_path)
+        bridge = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=args.trust_remote_code)
         provider = bridge.to_megatron_provider(load_weights=True)
 
         # Override parallelism / training settings
 
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Pattern-Based Q/DQ Autotuning for ONNX Models.
+
+This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
+in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
+region analysis to efficiently explore and optimize Q/DQ insertion strategies.
+"""
+
+# Core data structures
+from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
+from .common import (
+    AutotunerError,
+    AutotunerNotInitializedError,
+    InsertionScheme,
+    InvalidSchemeError,
+    Region,
+    RegionType,
+)
+from .insertion_points import (
+    ChildRegionInputInsertionPoint,
+    ChildRegionOutputInsertionPoint,
+    NodeInputInsertionPoint,
+    ResolvedInsertionPoint,
+)
+from .region_pattern import RegionPattern
+from .region_search import CombinedRegionSearch
+
+__all__ = [
+    "AutotunerError",
+    "AutotunerNotInitializedError",
+    "ChildRegionInputInsertionPoint",
+    "ChildRegionOutputInsertionPoint",
+    "CombinedRegionSearch",
+    "InsertionScheme",
+    "InvalidSchemeError",
+    "NodeInputInsertionPoint",
+    "Region",
+    "RegionPattern",
+    "RegionType",
+    "ResolvedInsertionPoint",
+    "TensorRTPyBenchmark",
+    "TrtExecBenchmark",
+]