
Commit d4c4043

Merge branch 'main' into jingyux/ltx-2-update

2 parents 0d749bc + 03a1899

38 files changed

Lines changed: 2392 additions & 148 deletions

.pre-commit-config.yaml

Lines changed: 3 additions & 0 deletions
```diff
@@ -24,7 +24,9 @@ repos:
     hooks:
       - id: ruff-check
        args: [--fix, --exit-non-zero-on-fix]
+        exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
       - id: ruff-format
+        exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$

   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.17.1
@@ -93,6 +95,7 @@ repos:
           examples/llm_eval/modeling.py|
           examples/llm_qat/main.py|
           examples/llm_sparsity/weight_sparsity/finetune.py|
+          examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
           examples/speculative_decoding/main.py|
           examples/speculative_decoding/medusa_utils.py|
           examples/speculative_decoding/server_generate.py|
```
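To verify the new exclusions locally, the two ruff hooks can be run directly (standard pre-commit usage, not part of this diff); the newly excluded `speed.py` should be skipped by both:

```bash
# Run only the ruff hooks across the repo; specdec_bench/datasets/speed.py
# should no longer be picked up by either hook after this change.
pre-commit run ruff-check --all-files
pre-commit run ruff-format --all-files
```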

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -8,6 +8,7 @@ NVIDIA Model Optimizer Changelog (Linux)

 - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
 - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
+- Add ``--moe_calib_experts_ratio`` flag to ``hf_ptq.py`` to specify the ratio of experts to calibrate during the forward pass, improving expert coverage during calibration. Defaults to all experts.
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 - Add support for rotating the input before quantization for RHT.
```

examples/llm_ptq/example_utils.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -201,6 +201,7 @@ def build_quant_cfg(
     model_type,
     quant_cfg_choices,
     kv_quant_cfg_choices,
+    moe_calib_experts_ratio: float | None = None,
 ) -> dict[str, Any]:
     quant_cfg = {}
     assert qformat in quant_cfg_choices, (
@@ -232,6 +233,20 @@ def build_quant_cfg(
             getattr(mtq, kv_quant_cfg_choices[kv_cache_qformat])["quant_cfg"],
         )

+    if moe_calib_experts_ratio:
+        assert 0 < moe_calib_experts_ratio <= 1, "moe_calib_experts_ratio must be between 0 and 1"
+        if isinstance(quant_cfg["algorithm"], str):
+            quant_cfg["algorithm"] = {
+                "method": quant_cfg["algorithm"],
+                "moe_calib_experts_ratio": moe_calib_experts_ratio,
+            }
+        elif isinstance(quant_cfg["algorithm"], dict):
+            quant_cfg["algorithm"]["moe_calib_experts_ratio"] = moe_calib_experts_ratio
+        else:
+            warnings.warn(
+                f"Quantization algorithm: {quant_cfg['algorithm']} does not support setting moe_calib_experts_ratio"
+            )
+
     # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead.
     if model_type == "gemma" and "int8_sq" in qformat:
         quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
```

examples/llm_ptq/hf_ptq.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -906,6 +906,7 @@ def quantize_main(
         model_type,
         QUANT_CFG_CHOICES,
         KV_QUANT_CFG_CHOICES,
+        args.moe_calib_experts_ratio,
     )

     # Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92)
@@ -1126,8 +1127,21 @@ def parse_args() -> argparse.Namespace:
             "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
         ),
     )
+    parser.add_argument(
+        "--moe_calib_experts_ratio",
+        type=float,
+        default=1.0,
+        help=(
+            "Fraction of experts to calibrate during the forward pass (ratio in (0.0, 1.0]). "
+            "Only used for MOE models, to reduce the number of experts calibrated during the forward pass. "
+            "Does not impact non-MOE models."
+        ),
+    )

-    return parser.parse_args()
+    args = parser.parse_args()
+    if not (0.0 < args.moe_calib_experts_ratio <= 1.0):
+        parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
+    return args


 def main(args: argparse.Namespace):
```
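A hypothetical invocation showing the new flag in context. Only `--moe_calib_experts_ratio` comes from this commit; the other arguments stand in for the usual `hf_ptq.py` options and may differ in your checkout:

```bash
# Calibrate only half of the experts of an MOE model during PTQ calibration.
# The checkpoint path and qformat below are placeholders.
python hf_ptq.py \
    --pyt_ckpt_path /path/to/moe-model \
    --qformat nvfp4 \
    --moe_calib_experts_ratio 0.5
```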

examples/megatron_bridge/README.md

Lines changed: 3 additions & 0 deletions
````diff
@@ -128,6 +128,9 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
     --max_sequence_length 256_000
 ```

+The [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) dataset is huge, so it will take a while to download and tokenize. You can also split the large `.jsonl` file into multiple files (e.g., 10M samples per file using `split -l 10000000 -d --additional-suffix=.jsonl <file>.jsonl <file>_part`) and tokenize them in parallel.
+To quickly test the script, you can try the [nvidia/Nemotron-Pretraining-Dataset-sample](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-Dataset-sample) dataset.
+
 If you skip `--hf_name`, it will download and tokenize all subsets for the dataset.
 If you skip `--hf_split`, it will download and tokenize all splits for the subset.
 If you skip `--hf_max_samples_per_split`, it will download and tokenize all samples for the split.
````
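A minimal sketch of the split-then-tokenize-in-parallel workflow described above. File names are placeholders, and the tokenization step is deliberately left as a stub to be replaced with the full `megatron_preprocess_data` invocation shown earlier in the README:

```bash
# Split the downloaded .jsonl into ~10M-sample shards.
split -l 10000000 -d --additional-suffix=.jsonl dataset.jsonl dataset_part

# Stub: wrap the real megatron_preprocess_data command here, pointing it at "$1".
tokenize_shard() {
    echo "tokenizing $1"  # replace with the actual preprocessing command
}

# Launch one background job per shard and wait for all of them to finish.
for shard in dataset_part*.jsonl; do
    tokenize_shard "$shard" &
done
wait
```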

examples/specdec_bench/README.md

Lines changed: 107 additions & 3 deletions
````diff
@@ -28,17 +28,121 @@ MTBench is available [here](https://huggingface.co/datasets/HuggingFaceH4/mt_ben
 Download `nvidia/gpt-oss-120b-Eagle3` to a local directory `/path/to/eagle`.

 ```bash
-python3 run.py --model_dir openai/gpt-oss-120b --tokenizer openai/gpt-oss-120b --draft_model_dir /path/to/eagle --mtbench question.jsonl --tp_size 1 --ep_size 1 --draft_length 3 --output_length 4096 --num_requests 80 --engine TRTLLM --concurrency 1 --postprocess gptoss
-
+python3 run.py \
+    --model_dir openai/gpt-oss-120b \
+    --tokenizer openai/gpt-oss-120b \
+    --draft_model_dir /path/to/eagle \
+    --mtbench question.jsonl \
+    --tp_size 1 \
+    --ep_size 1 \
+    --draft_length 3 \
+    --output_length 4096 \
+    --num_requests 80 \
+    --engine TRTLLM \
+    --concurrency 1 \
+    --postprocess gptoss
 ```

 ### Running Random ids on GPT OSS + Eagle3

 Download `nvidia/gpt-oss-120b-Eagle3` to a local directory `/path/to/eagle`.

 ```bash
-python3 run.py --model_dir openai/gpt-oss-120b --tokenizer openai/gpt-oss-120b --draft_model_dir /path/to/eagle --random_isl 1024 --tp_size 1 --ep_size 1 --draft_length 3 --output_length 4096 --num_requests 40 --engine TRTLLM --concurrency 1
+python3 run.py \
+    --model_dir openai/gpt-oss-120b \
+    --tokenizer openai/gpt-oss-120b \
+    --draft_model_dir /path/to/eagle \
+    --random_isl 1024 \
+    --tp_size 1 \
+    --ep_size 1 \
+    --draft_length 3 \
+    --output_length 4096 \
+    --num_requests 40 \
+    --engine TRTLLM \
+    --concurrency 1
+```
+
+### Running [SPEED-Bench](https://huggingface.co/datasets/nvidia/SPEED-Bench) on Llama 3.3 70B + Eagle 3
+
+1. Install the requirements with `pip install -r requirements_speed.txt`
+
+2. Prepare the data using the provided script:
+
+```bash
+python3 prepare_data.py --dataset speed --config all
+```
+
+The data will be saved to the `data/` directory, with each config type (qualitative, throughput_1k, ...) in its own subdirectory.
+
+#### License
+
+GOVERNING TERMS: This dataset is governed by the NVIDIA Evaluation Dataset License Agreement.
+
+ADDITIONAL INFORMATION: MIT for bigcode/humanevalpack, RUCAIBox/MMATH, RUCAIBox/BAMBOO and EQ-Bench. Apache 2.0 for Writing Bench and Spec-Bench. CC BY 4.0 for FBK-MT/MCIF. MIT and Apache 2.0 for tianyang/repobench_python_v1.1, JetBrains-Research/lca-project-level-code-completion and tianyang/repobench_java_v1.1.
+
+NOTICE: For each dataset a user elects to use, the user is responsible for checking if the dataset license is fit for the intended purpose. The `prepare_data.py` script automatically fetches data from all the source datasets.
+
+Additional details are in the [HuggingFace dataset repository](https://huggingface.co/datasets/nvidia/SPEED-Bench).
+
+#### Qualitative split
+
+```bash
+python3 run.py \
+    --model_dir meta-llama/Llama-3.3-70B-Instruct \
+    --tokenizer meta-llama/Llama-3.3-70B-Instruct \
+    --draft_model_dir yuhuili/EAGLE3-LLaMA3.3-Instruct-70B \
+    --dataset speed \
+    --dataset_path data/speed/qualitative \
+    --tp_size 8 \
+    --ep_size 1 \
+    --draft_length 3 \
+    --output_length 4096 \
+    --engine TRTLLM \
+    --concurrency 32 \
+    --show_progress
+```
+
+#### Throughput split

+```bash
+python3 run.py \
+    --model_dir meta-llama/Llama-3.3-70B-Instruct \
+    --tokenizer meta-llama/Llama-3.3-70B-Instruct \
+    --draft_model_dir yuhuili/EAGLE3-LLaMA3.3-Instruct-70B \
+    --dataset speed \
+    --dataset_path data/speed/throughput_1k \
+    --tp_size 8 \
+    --ep_size 1 \
+    --draft_length 3 \
+    --output_length 4096 \
+    --engine TRTLLM \
+    --concurrency 32 \
+    --show_progress
+```
+
+For longer context (>8192 tokens), please use the following configuration when using TRTLLM:
+
+```yaml
+engine_args:
+  max_seq_len: 131072 # Model max context length (for Llama 3.3 70B)
+  enable_chunked_prefill: true
+```
+
+```bash
+python3 run.py \
+    --model_dir meta-llama/Llama-3.3-70B-Instruct \
+    --tokenizer meta-llama/Llama-3.3-70B-Instruct \
+    --draft_model_dir yuhuili/EAGLE3-LLaMA3.3-Instruct-70B \
+    --dataset speed \
+    --dataset_path data/speed/throughput_16k \
+    --tp_size 8 \
+    --ep_size 1 \
+    --draft_length 3 \
+    --output_length 4096 \
+    --engine TRTLLM \
+    --concurrency 32 \
+    --show_progress \
+    --runtime_params runtime_args_long_context.yaml
 ```

 ## Notes
````
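One reading of the long-context example above: the `engine_args` YAML snippet is presumably the content of the `runtime_args_long_context.yaml` file passed via `--runtime_params`. A minimal sketch under that assumption (the pairing is not stated explicitly in the diff):

```bash
# Assumed content of the file referenced by --runtime_params above.
cat > runtime_args_long_context.yaml <<'EOF'
engine_args:
  max_seq_len: 131072 # Model max context length (for Llama 3.3 70B)
  enable_chunked_prefill: true
EOF
```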
