Skip to content

Commit aba42e7

Browse files
committed
Merge branch 'main' into jingyux/jingyux-bug-fixed-5924267
2 parents 4ba3ac1 + 4eacb0d commit aba42e7

6 files changed

Lines changed: 1057 additions & 5 deletions

File tree

examples/megatron_bridge/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,13 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
105105
--jsonl_paths /path/to/data1.jsonl /path/to/data2.jsonl ... \
106106
--json_keys text \
107107
--tokenizer Qwen/Qwen3-0.6B \
108-
--output_dir /path/to/tokenized/data/qwen3 \
108+
--output_dir tokenized_qwen3 \
109109
--workers 32 \
110110
--max_sequence_length 256_000
111111
```
112112

113+
This will create `tokenized_qwen3/data1_text_document.{bin,idx}` and `tokenized_qwen3/data2_text_document.{bin,idx}` files. We can use these files in the distillation script by passing `--data_paths 1.0 tokenized_qwen3/data1_text_document 1.0 tokenized_qwen3/data2_text_document` (equal weight for both datasets).
114+
113115
Instead of `--jsonl_paths`, you can also pass a directory path to the `--input_dir` argument to tokenize all JSONL files in the directory.
114116
We set a maximum sequence length of 256k to avoid rare out-of-memory (OOM) errors during tokenization when a text is too long.
115117

@@ -123,12 +125,12 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
123125
--hf_max_samples_per_split 10_000_000 \
124126
--json_keys text \
125127
--tokenizer Qwen/Qwen3-0.6B \
126-
--output_dir /path/to/tokenized/data/qwen3 \
128+
--output_dir tokenized_qwen3 \
127129
--workers 32 \
128130
--max_sequence_length 256_000
129131
```
130132

131-
The [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) dataset is huge, so it will take a while to download and tokenize. You can also split the large `.jsonl` into multiple files (e.g. 10M samples per file using `split -l 10000000 -d --additional-suffix=.jsonl <file>.jsonl <file>_part`) and tokenize them parallelly.
133+
The [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) dataset is huge, so it will take a few hours to download and tokenize. You can also split the large `.jsonl` into multiple files (e.g. 10M samples per file using `split -l 10000000 -d --additional-suffix=.jsonl <file>.jsonl <file>_part`) and tokenize them in parallel via the `--jsonl_paths` argument.
132134
To quickly test the script, you can try the [nvidia/Nemotron-Pretraining-Dataset-sample](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-Dataset-sample) dataset.
133135

134136
If you omit `--hf_name`, the script will download and tokenize all subsets of the dataset.
@@ -144,7 +146,7 @@ torchrun --nnodes 1 --nproc_per_node 8 distill.py \
144146
--tp_size 8 \
145147
--teacher_hf_path Qwen/Qwen3-8B \
146148
--student_hf_path Qwen/Qwen3-4B \
147-
--data_paths 1.0 /path/to/tokenized/data/qwen3 \
149+
--data_paths 1.0 tokenized_qwen3/data1_text_document 1.0 tokenized_qwen3/data2_text_document \
148150
--data_path_to_cache /path/to/cache/dataset_indices_qwen3 \
149151
--seq_length 8192 \
150152
--mbs 1 \

examples/megatron_bridge/distill.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def get_args():
6666
required=True,
6767
help="HuggingFace model name or path for the teacher (e.g. Qwen/Qwen3-8B)",
6868
)
69+
parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code")
6970
# Parallelism arguments
7071
parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size")
7172
parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size")
@@ -135,7 +136,7 @@ def main(args: argparse.Namespace):
135136

136137
# Build student and teacher model providers
137138
def _build_model_provider(hf_path):
138-
bridge = AutoBridge.from_hf_pretrained(hf_path)
139+
bridge = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=args.trust_remote_code)
139140
provider = bridge.to_megatron_provider(load_weights=True)
140141

141142
# Override parallelism / training settings
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Pattern-Based Q/DQ Autotuning for ONNX Models.
17+
18+
This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
19+
in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
20+
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
21+
"""
22+
23+
# Core data structures
24+
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
25+
from .common import (
26+
AutotunerError,
27+
AutotunerNotInitializedError,
28+
InsertionScheme,
29+
InvalidSchemeError,
30+
Region,
31+
RegionType,
32+
)
33+
from .insertion_points import (
34+
ChildRegionInputInsertionPoint,
35+
ChildRegionOutputInsertionPoint,
36+
NodeInputInsertionPoint,
37+
ResolvedInsertionPoint,
38+
)
39+
from .region_pattern import RegionPattern
40+
from .region_search import CombinedRegionSearch
41+
42+
__all__ = [
43+
"AutotunerError",
44+
"AutotunerNotInitializedError",
45+
"ChildRegionInputInsertionPoint",
46+
"ChildRegionOutputInsertionPoint",
47+
"CombinedRegionSearch",
48+
"InsertionScheme",
49+
"InvalidSchemeError",
50+
"NodeInputInsertionPoint",
51+
"Region",
52+
"RegionPattern",
53+
"RegionType",
54+
"ResolvedInsertionPoint",
55+
"TensorRTPyBenchmark",
56+
"TrtExecBenchmark",
57+
]

0 commit comments

Comments
 (0)