Skip to content

Commit 2c9186f

Browse files
committed
merge main
Signed-off-by: Hung-Yueh Chiang <hungyuehc@nvidia.com>
2 parents 08cd44f + 6a3b6b8 commit 2c9186f

14 files changed

Lines changed: 332 additions & 49 deletions

File tree

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Make sure you read and follow the [Security Best Practices](https://github.com/N
2323
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: ✅ / ❌ / N/A <!--- Mandatory -->
2424
- Did you write any new necessary tests?: ✅ / ❌ / N/A <!--- Mandatory for new features or examples. -->
2525
- Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A <!--- Only for new features, API changes, critical bug fixes or backward incompatible changes. -->
26+
- Did you get Claude approval on this PR?: ✅ / ❌ / N/A <!--- Run `/claude review`. NVIDIA org members can self-trigger for complex changes; orthogonal to CodeRabbit. -->
2627

2728
### Additional Information
2829
<!-- E.g. related issue. -->

.github/workflows/claude.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ jobs:
5252
)
5353
)
5454
runs-on: ubuntu-latest
55+
timeout-minutes: 10
5556
permissions:
5657
contents: read
5758
pull-requests: write
@@ -67,6 +68,13 @@ jobs:
6768
uses: anthropics/claude-code-action@v1
6869
env:
6970
ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
71+
# NVIDIA inference proxy (LiteLLM-based) rejects two fields
72+
# the Claude Code SDK sends by default. The two settings below follow
73+
# NVIDIA/OSMO's workflow, which hit and resolved both issues:
74+
# - `context_management` → disable experimental betas
75+
# - `cache_control.ephemeral.scope` → disable prompt caching
76+
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1"
77+
DISABLE_PROMPT_CACHING: "1"
7078
with:
7179
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
7280
claude_args: |

.github/workflows/claude_review.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ jobs:
2323
contains(github.event.comment.body, '/claude review') &&
2424
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
2525
runs-on: ubuntu-latest
26+
timeout-minutes: 10
2627
permissions:
2728
contents: read
2829
pull-requests: write
@@ -59,6 +60,13 @@ jobs:
5960
uses: anthropics/claude-code-action@v1
6061
env:
6162
ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
63+
# NVIDIA inference proxy (LiteLLM-based) rejects two fields
64+
# the Claude Code SDK sends by default. The two settings below follow
65+
# NVIDIA/OSMO's workflow, which hit and resolved both issues:
66+
# - `context_management` → disable experimental betas
67+
# - `cache_control.ephemeral.scope` → disable prompt caching
68+
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1"
69+
DISABLE_PROMPT_CACHING: "1"
6270
with:
6371
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
6472
trigger_phrase: "/claude review"

CLAUDE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ Primarily Python codebase with optional C++/CUDA extensions supporting PyTorch,
1818
sign-off line
1919
- `pre-commit` hooks run on commit — if files are modified by hooks, re-stage and commit again
2020
- PRs require CODEOWNERS review (auto-assigned based on `.github/CODEOWNERS`)
21+
- When creating PRs (`gh pr create`), fill in `.github/PULL_REQUEST_TEMPLATE.md` verbatim — do NOT substitute the harness's default `## Summary` / `## Test plan` format
22+
- For non-trivial PRs, run `/claude review` to get Claude approval before merging (NVIDIA org members can self-trigger; orthogonal to CodeRabbit)
2123
- After rebasing, always re-run tests locally before pushing
2224
- All code must follow the security guidelines in `SECURITY.md` — violations are blocked as pre-merge errors
2325
- For contribution guidelines, commit conventions, and PR requirements, see `CONTRIBUTING.md`

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,7 @@ dense | sparsegpt) ;;
4949
;;
5050
esac
5151

52-
#Iterate over list of qformats provided and check if they are valid
53-
IFS=","
54-
for qformat in $QFORMAT; do
55-
case $qformat in
56-
fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | w4a16_nvfp4) ;;
57-
*)
58-
echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, w4a16_nvfp4]" >&2
59-
exit 1
60-
;;
61-
esac
62-
done
63-
IFS=" "
64-
65-
if [ -n "$RECIPE" ] && [ -n "$QFORMAT" ]; then
66-
echo "Error: --recipe and --quant are mutually exclusive." >&2
67-
exit 1
68-
fi
52+
# Quant format / recipe validation is delegated to hf_ptq.py.
6953

7054
script_dir="$(dirname "$(readlink -f "$0")")"
7155

@@ -77,11 +61,13 @@ fi
7761

7862
QFORMAT_MODIFIED="${QFORMAT//,/_}"
7963

64+
# When using --recipe, build the model name from the recipe basename (without
65+
# directory or .yaml suffix) so each recipe gets its own SAVE_PATH.
8066
if [ -n "$RECIPE" ]; then
81-
RECIPE_LABEL=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
82-
MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${RECIPE_LABEL}
67+
RECIPE_TAG=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
68+
MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
8369
else
84-
MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
70+
MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
8571
fi
8672

8773
SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
@@ -137,10 +123,6 @@ if $TRUST_REMOTE_CODE; then
137123
PTQ_ARGS+=" --trust_remote_code "
138124
fi
139125

140-
if [ -n "${EXCLUDE_MODULES:-}" ]; then
141-
PTQ_ARGS+=" --exclude_modules ${EXCLUDE_MODULES} "
142-
fi
143-
144126
if $USE_SEQ_DEVICE_MAP; then
145127
PTQ_ARGS+=" --use_seq_device_map "
146128
fi
@@ -178,29 +160,18 @@ fi
178160

179161
if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then
180162

181-
if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then
182-
if [ -d "$MODEL_PATH" ]; then
183-
MODEL_CONFIG_EXIST=true
184-
MODEL_CONFIG=$MODEL_PATH/config.json
185-
for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done
186-
else
187-
echo "Please use the model directory where the config.json file is present."
188-
exit 1
189-
fi
190-
fi
191-
192163
if [[ "$MODEL_CONFIG_EXIST" == false ]]; then
193164
echo "Quantizing original model..."
194165
if [ -n "$RECIPE" ]; then
195-
QUANT_ARG="--recipe=$RECIPE"
166+
QUANT_SPEC_ARGS="--recipe=$RECIPE"
196167
else
197-
QUANT_ARG="--qformat=${QFORMAT// /,}"
168+
QUANT_SPEC_ARGS="--qformat=${QFORMAT// /,}"
198169
fi
199170
python hf_ptq.py \
200171
--pyt_ckpt_path=$MODEL_PATH \
201172
--export_path=$SAVE_PATH \
202173
--sparsity_fmt=$SPARSITY_FMT \
203-
$QUANT_ARG \
174+
$QUANT_SPEC_ARGS \
204175
--calib_size=$CALIB_SIZE \
205176
--batch_size=$CALIB_BATCH_SIZE \
206177
--inference_tensor_parallel=$TP \
@@ -222,13 +193,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
222193
exit 0
223194
fi
224195

225-
if [ "$QFORMAT" = "w4a16_nvfp4" ]; then
226-
echo "w4a16_nvfp4 checkpoint exported to $SAVE_PATH"
227-
echo "To serve on vLLM, convert to compressed-tensors"
228-
exit 0
229-
fi
230-
231-
if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then
196+
if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]] || [[ "$RECIPE" == *"nvfp4"* ]]; then
232197
cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
233198

234199
if [ "$cuda_major" -lt 10 ]; then
@@ -237,6 +202,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
237202
fi
238203
fi
239204

205+
if [ -n "$RECIPE" ]; then
206+
echo "Recipe $RECIPE used. Please deploy with TensorRT-LLM directly. Checkpoint export_path: $SAVE_PATH"
207+
exit 0
208+
fi
209+
240210
if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
241211
echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
242212
exit 0

examples/llm_ptq/scripts/parser.sh

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ parse_options() {
2020
# Default values
2121
MODEL_PATH=""
2222
QFORMAT=""
23+
RECIPE=""
2324
KV_CACHE_QUANT=""
2425
TP=1
2526
PP=1
@@ -37,13 +38,14 @@ parse_options() {
3738
CAST_MXFP4_TO_NVFP4=false
3839

3940
# Parse command-line options
40-
ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
41+
ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
4142

4243
eval set -- "$ARGS"
4344
while true; do
4445
case "$1" in
4546
--model ) MODEL_PATH="$2"; shift 2;;
4647
--quant ) QFORMAT="$2"; shift 2;;
48+
--recipe ) RECIPE="$2"; shift 2;;
4749
--kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
4850
--tp ) TP="$2"; shift 2;;
4951
--pp ) PP="$2"; shift 2;;
@@ -99,12 +101,19 @@ parse_options() {
99101
fi
100102

101103
# Verify required options are provided
102-
if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || { [ -z "$QFORMAT" ] && [ -z "$RECIPE" ]; }; then
103-
echo "Usage: $0 --model=<MODEL_PATH> (--quant=<QFORMAT> | RECIPE=<recipe>) --tasks=<TASK,...>"
104+
if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || ([ -z "$QFORMAT" ] && [ -z "$RECIPE" ]); then
105+
echo "Usage: $0 --model=<MODEL_PATH> (--quant=<QFORMAT> | --recipe=<RECIPE>) --tasks=<TASK,...>"
104106
echo "Optional args: --sparsity=<SPARSITY_FMT> --awq_block_size=<AWQ_BLOCK_SIZE> --calib=<CALIB_SIZE>"
105107
exit 1
106108
fi
107109

110+
# --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec, while
111+
# --quant selects a built-in qformat preset. Pick exactly one.
112+
if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ]; then
113+
echo "Cannot specify both --quant and --recipe; pick one." >&2
114+
exit 1
115+
fi
116+
108117
VALID_TASKS=("quant" "mmlu" "lm_eval" "livecodebench" "simple_eval")
109118

110119
for task in $(echo "$TASKS" | tr ',' ' '); do
@@ -135,6 +144,7 @@ parse_options() {
135144
echo "================="
136145
echo "model: $MODEL_PATH"
137146
echo "quant: $QFORMAT"
147+
echo "recipe: $RECIPE"
138148
echo "tp (TensorRT-LLM Checkpoint only): $TP"
139149
echo "pp (TensorRT-LLM Checkpoint only): $PP"
140150
echo "sparsity: $SPARSITY_FMT"

modelopt/torch/export/moe_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,29 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
6262
for idx in range(n):
6363
expert = nn.Module()
6464

65+
# If the gate_up source quantizer was never calibrated (rare expert
66+
# that received no calibration tokens), derive its amax once from the
67+
# FUSED tensor so gate and up share the same weight_scale_2 below.
68+
# Why: vLLM fuses W1 (gate) and W3 (up) at load time and asserts a
69+
# single per-tensor scale across the fusion. The per-projection
70+
# fallback further down would otherwise compute amax independently from
71+
# each half — gate's max and up's max generally differ — producing
72+
# mismatched weight_scale_2 and garbled MoE output at inference.
73+
gate_up_q = module.gate_up_proj_weight_quantizers[idx]
74+
if getattr(gate_up_q, "is_enabled", False) and (
75+
not hasattr(gate_up_q, "_amax")
76+
or gate_up_q._amax is None
77+
or torch.all(gate_up_q._amax == 0)
78+
):
79+
gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32)
80+
warnings.warn(
81+
f"Expert {idx} gate_up_proj weight quantizer was not calibrated "
82+
f"(amax missing or zero). Using fused-tensor amax as fallback "
83+
f"(shared by gate and up so weight_scale_2 stays consistent). "
84+
f"Consider increasing calibration size to activate all experts.",
85+
stacklevel=2,
86+
)
87+
6588
projections = [
6689
("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True),
6790
("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True),

modelopt/torch/export/unified_export_hf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
5656
from modelopt.torch.quantization.qtensor import MXFP8QTensor, NVFP4QTensor
5757
from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names
58+
from modelopt.torch.utils.dataset_utils import _disable_use_cache
5859

5960
try:
6061
from modelopt.torch.sparsity.attention_sparsity.conversion import export_sparse_attention_config
@@ -214,11 +215,14 @@ def _output_hook(module, input, output):
214215
if not handles:
215216
return input_to_linear, output_to_layernorm
216217

217-
# Run dummy forward pass to collect modules sharing same input
218+
# Run dummy forward pass to collect modules sharing same input.
219+
# `_disable_use_cache` keeps the probe forward working on configs that don't
220+
# set `use_cache` (e.g., stepfun-ai/Step-3.5-Flash's Step3p5Config).
218221
try:
219222
with (
220223
torch.no_grad(),
221224
set_quantizer_by_cfg_context(model, [{"quantizer_name": "*", "enable": False}]),
225+
_disable_use_cache(model),
222226
):
223227
dummy_forward_fn()
224228
finally:
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
imports:
17+
base_disable_all: configs/ptq/units/base_disable_all
18+
default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
19+
nvfp4: configs/numerics/nvfp4
20+
nvfp4_static: configs/numerics/nvfp4_static
21+
kv_fp8_cast: configs/ptq/units/kv_fp8_cast
22+
23+
metadata:
24+
recipe_type: ptq
25+
description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax.
26+
quantize:
27+
algorithm:
28+
method: mse
29+
fp8_scale_sweep: true
30+
# layerwise=false required for VLMs where the decoder layers are nested under
31+
# `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
32+
layerwise: false
33+
quant_cfg:
34+
- $import: base_disable_all
35+
- quantizer_name: '*mlp.experts*weight_quantizer'
36+
cfg:
37+
$import: nvfp4_static
38+
- quantizer_name: '*mlp.experts*input_quantizer'
39+
cfg:
40+
$import: nvfp4
41+
- quantizer_name: '*block_sparse_moe*weight_quantizer'
42+
cfg:
43+
$import: nvfp4_static
44+
- quantizer_name: '*block_sparse_moe*input_quantizer'
45+
cfg:
46+
$import: nvfp4
47+
- $import: kv_fp8_cast
48+
- $import: default_disabled_quantizers
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
imports:
17+
base_disable_all: configs/ptq/units/base_disable_all
18+
default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
19+
nvfp4: configs/numerics/nvfp4
20+
nvfp4_static: configs/numerics/nvfp4_static
21+
kv_fp8_cast: configs/ptq/units/kv_fp8_cast
22+
23+
metadata:
24+
recipe_type: ptq
25+
description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax.
26+
quantize:
27+
algorithm:
28+
method: mse
29+
fp8_scale_sweep: true
30+
# layerwise=false required for VLMs where the decoder layers are nested under
31+
# `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
32+
layerwise: false
33+
quant_cfg:
34+
- $import: base_disable_all
35+
- quantizer_name: '*mlp*weight_quantizer'
36+
cfg:
37+
$import: nvfp4_static
38+
- quantizer_name: '*mlp*input_quantizer'
39+
cfg:
40+
$import: nvfp4
41+
- quantizer_name: '*block_sparse_moe*weight_quantizer'
42+
cfg:
43+
$import: nvfp4_static
44+
- quantizer_name: '*block_sparse_moe*input_quantizer'
45+
cfg:
46+
$import: nvfp4
47+
- quantizer_name: '*.experts.*weight_quantizer'
48+
cfg:
49+
$import: nvfp4_static
50+
- quantizer_name: '*.experts.*input_quantizer'
51+
cfg:
52+
$import: nvfp4
53+
- $import: kv_fp8_cast
54+
- $import: default_disabled_quantizers

0 commit comments

Comments
 (0)