[None] [refactor] Unify compressed-tensors quant config parsing (#14468)

DomBrown · web-flow · commit c7e7fc5cdc35 · 2026-05-27T07:25:27.000+08:00
Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
@@ -23,6 +23,8 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
+from tensorrt_llm.models.quant_config_utils import \
+    update_quant_config_from_compressed_tensors
 from tensorrt_llm.quantization.mode import QuantAlgo
 from tensorrt_llm.quantization.modelopt_config import (
     is_modelopt_quant_config, read_modelopt_quant_config,
@@ -477,78 +479,8 @@ def load_hf_quant_config(hf_quant_config, moe_backend, checkpoint_dir=None):
 
         # NOTE: This is for llm-compressor's quantized checkpoints.
         elif hf_quant_config.get("quant_method") == "compressed-tensors":
-            config_groups = hf_quant_config.get("config_groups")
-            if config_groups is None:
-                raise ValueError(
-                    f"config_groups is not set in {hf_quant_config}.")
-
-            weights_quant_config = config_groups["group_0"]["weights"]
-            inputs_quant_config = config_groups["group_0"]["input_activations"]
-            weights_quant_strategy = weights_quant_config["strategy"]
-            inputs_quant_strategy = inputs_quant_config["strategy"]
-
-            if weights_quant_config["num_bits"] == 8:
-                if weights_quant_strategy == "channel":
-                    if inputs_quant_strategy != "token":
-                        raise ValueError(
-                            f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}."
-                        )
-                    quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
-                elif weights_quant_strategy == "block":
-                    if inputs_quant_strategy != "group":
-                        raise ValueError(
-                            f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}."
-                        )
-                    quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-                    group_size = inputs_quant_config["group_size"]
-
-                    # NOTE: TRT-LLM only supports group_size=128 for FP8_BLOCK_SCALES.
-                    if group_size != 128:
-                        raise ValueError(
-                            f"Unsupported group_size: {group_size}. Supported: 128."
-                        )
-                    quant_config.group_size = group_size
-
-                else:
-                    raise ValueError(
-                        f"Unsupported weights_quant_strategy: {weights_quant_strategy}. "
-                        "Supported strategies: 'channel', 'block'.")
-            elif (weights_quant_config["num_bits"] == 4
-                  and weights_quant_config.get("type") == "float"
-                  and weights_quant_strategy == "tensor_group"):
-                # llm-compressor NVFP4: weights FP4 with FP8 per-group scales
-                # (group_size=16), scaled by an FP32 global scale.
-                if inputs_quant_strategy != "tensor_group":
-                    raise ValueError(
-                        f"Unsupported inputs_quant_strategy for NVFP4: {inputs_quant_strategy}."
-                    )
-                group_size = weights_quant_config["group_size"]
-                if group_size != 16:
-                    raise ValueError(
-                        f"Unsupported group_size: {group_size}. Supported: 16 for NVFP4."
-                    )
-                quant_config.quant_algo = QuantAlgo.NVFP4
-                quant_config.group_size = group_size
-            else:
-                raise ValueError(
-                    f"Unsupported quant_bits: {weights_quant_config['num_bits']}. "
-                    "Supported: 8 (FP8) or 4 (NVFP4).")
-
-            # kv_cache_scheme (llm-compressor): FP8 per-tensor KV cache.
-            kv_cache_scheme = hf_quant_config.get("kv_cache_scheme")
-            if kv_cache_scheme is not None:
-                if (kv_cache_scheme.get("num_bits") == 8
-                        and kv_cache_scheme.get("type") == "float"):
-                    quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-                else:
-                    raise ValueError(
-                        f"Unsupported kv_cache_scheme: {kv_cache_scheme}.")
-
-            if hf_exclude_modules is not None:
-                quant_config.exclude_modules = list(
-                    set(hf_exclude_modules + hf_quant_config.get("ignore", [])))
-            else:
-                quant_config.exclude_modules = hf_quant_config.get("ignore", [])
+            update_quant_config_from_compressed_tensors(quant_config,
+                                                        hf_quant_config)
         elif hf_quant_config.get("quant_method") == "nvfp4":
             quant_config.quant_algo = QuantAlgo.NVFP4
             group_size = hf_quant_config.get("group_size", 16)
diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
@@ -26,6 +26,8 @@
 from ..mapping import Mapping
 from ..models.automodel import MODEL_MAP, AutoConfig, AutoModelForCausalLM
 from ..models.modeling_utils import PretrainedConfig, QuantAlgo, QuantConfig
+from ..models.quant_config_utils import \
+    update_quant_config_from_compressed_tensors
 from ..module import Module
 from ..quantization.modelopt_config import (is_modelopt_quant_config,
                                             read_modelopt_quant_config,
@@ -470,90 +472,8 @@ def _update_from_hf_quant_config(self) -> bool:
                 ]
             # NOTE: This is for llm-compressor's quantized checkpoints.
             elif hf_quant_config.get("quant_method") == "compressed-tensors":
-                config_groups = hf_quant_config.get("config_groups")
-                if config_groups is None:
-                    raise ValueError(
-                        f"config_groups is not set in {hf_quant_config}.")
-
-                weights_quant_config = config_groups["group_0"]["weights"]
-                inputs_quant_config = config_groups["group_0"][
-                    "input_activations"]
-                weights_quant_strategy = weights_quant_config["strategy"]
-                inputs_quant_strategy = inputs_quant_config["strategy"]
-
-                if weights_quant_config["num_bits"] == 8:
-                    if weights_quant_strategy == "channel":
-                        if inputs_quant_strategy != "token":
-                            raise ValueError(
-                                f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}."
-                            )
-                        quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
-                    elif weights_quant_strategy == "block":
-                        if inputs_quant_strategy != "group":
-                            raise ValueError(
-                                f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}."
-                            )
-                        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-                        group_size = inputs_quant_config["group_size"]
-
-                        # NOTE: TRT-LLM only supports group_size=128 for FP8_BLOCK_SCALES.
-                        if group_size != 128:
-                            raise ValueError(
-                                f"Unsupported group_size: {group_size}. Supported: 128."
-                            )
-                        quant_config.group_size = group_size
-
-                    else:
-                        raise ValueError(
-                            f"Unsupported weights_quant_strategy: {weights_quant_strategy}. "
-                            "Supported strategies: 'channel', 'block'.")
-                elif (weights_quant_config["num_bits"] == 4
-                      and weights_quant_config.get("type") == "float"
-                      and weights_quant_strategy == "tensor_group"):
-                    # llm-compressor NVFP4: weights FP4 with FP8 per-group
-                    # scales (group_size=16), scaled by an FP32 global scale.
-                    if inputs_quant_strategy != "tensor_group":
-                        raise ValueError(
-                            f"Unsupported inputs_quant_strategy for NVFP4: {inputs_quant_strategy}."
-                        )
-                    group_size = weights_quant_config["group_size"]
-                    if group_size != 16:
-                        raise ValueError(
-                            f"Unsupported group_size: {group_size}. Supported: 16 for NVFP4."
-                        )
-                    quant_config.quant_algo = QuantAlgo.NVFP4
-                    quant_config.group_size = group_size
-                else:
-                    raise ValueError(
-                        f"Unsupported quant_bits: {weights_quant_config['num_bits']}. "
-                        "Supported: 8 (FP8) or 4 (NVFP4).")
-
-                # kv_cache_scheme (llm-compressor): FP8 per-tensor KV cache.
-                kv_cache_scheme = hf_quant_config.get("kv_cache_scheme")
-                if kv_cache_scheme is not None:
-                    if (kv_cache_scheme.get("num_bits") == 8
-                            and kv_cache_scheme.get("type") == "float"):
-                        if quant_config.kv_cache_quant_algo in (None,
-                                                                QuantAlgo.FP8):
-                            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-                        else:
-                            raise ValueError(
-                                f"Specified kv_cache_quant_algo={quant_config.kv_cache_quant_algo}, "
-                                f"conflicting with FP8 KV cache from HF quant config."
-                            )
-                    else:
-                        raise ValueError(
-                            f"Unsupported kv_cache_scheme: {kv_cache_scheme}.")
-
-                hf_exclude_modules = hf_quant_config.get(
-                    "modules_to_not_convert", None)
-                if hf_exclude_modules is not None:
-                    quant_config.exclude_modules = list(
-                        set(hf_exclude_modules +
-                            hf_quant_config.get("ignore", [])))
-                else:
-                    quant_config.exclude_modules = hf_quant_config.get(
-                        "ignore", [])
+                update_quant_config_from_compressed_tensors(
+                    quant_config, hf_quant_config)
             elif hf_quant_config.get("quant_method") == "nvfp4":
                 quant_config.quant_algo = QuantAlgo.NVFP4
                 group_size = hf_quant_config.get("group_size", 16)
diff --git a/tensorrt_llm/models/quant_config_utils.py b/tensorrt_llm/models/quant_config_utils.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Mapping
+
+from tensorrt_llm.models.modeling_utils import QuantConfig
+from tensorrt_llm.quantization.mode import QuantAlgo
+
+
+def update_quant_config_from_compressed_tensors(
+    quant_config: QuantConfig, hf_quant_config: Mapping[str, Any]
+) -> None:
+    """Update ``quant_config`` in place from a compressed-tensors HF quantization config.
+
+    Only ``config_groups["group_0"]`` is inspected. Supported weight schemes:
+
+    * 8-bit, "channel" weights with "token" inputs -> FP8_PER_CHANNEL_PER_TOKEN
+    * 8-bit, "block" weights with "group" inputs of group_size 128 -> FP8_BLOCK_SCALES
+    * 4-bit float "tensor_group" weights and inputs, group_size 16 -> NVFP4
+
+    An optional ``kv_cache_scheme`` must be 8-bit float and selects an FP8 KV cache.
+    ``exclude_modules`` is set to ``modules_to_not_convert`` followed by ``ignore``,
+    de-duplicated in first-seen order.
+
+    The whole config is validated before anything is assigned, so a rejected
+    config leaves ``quant_config`` untouched.
+
+    Args:
+        quant_config: Config object to update.
+        hf_quant_config: Mapping holding the checkpoint's quantization settings.
+
+    Raises:
+        ValueError: If a required section is missing, the scheme is unsupported,
+            or the KV cache scheme conflicts with ``quant_config.kv_cache_quant_algo``.
+    """
+    config_groups = hf_quant_config.get("config_groups")
+    if config_groups is None:
+        raise ValueError(f"config_groups is not set in {hf_quant_config}.")
+
+    group_0 = config_groups.get("group_0")
+    if group_0 is None:
+        raise ValueError(f"config_groups['group_0'] is not set in {hf_quant_config}.")
+
+    # A section may be absent or null (e.g. no activation quantization). Treat it
+    # as empty so the checks below raise ValueError instead of KeyError/TypeError.
+    weights_quant_config = group_0.get("weights") or {}
+    inputs_quant_config = group_0.get("input_activations") or {}
+    num_bits = weights_quant_config.get("num_bits")
+    weights_quant_strategy = weights_quant_config.get("strategy")
+    inputs_quant_strategy = inputs_quant_config.get("strategy")
+
+    # Left as None for schemes without a group size; group_size is then not assigned.
+    group_size = None
+    if num_bits == 8:
+        if weights_quant_strategy == "channel":
+            if inputs_quant_strategy != "token":
+                raise ValueError(f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}.")
+            quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
+        elif weights_quant_strategy == "block":
+            if inputs_quant_strategy != "group":
+                raise ValueError(f"Unsupported inputs_quant_strategy: {inputs_quant_strategy}.")
+            # TRT-LLM only supports group_size=128 for FP8_BLOCK_SCALES.
+            group_size = inputs_quant_config.get("group_size")
+            if group_size != 128:
+                raise ValueError(f"Unsupported group_size: {group_size}. Supported: 128.")
+            quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        else:
+            raise ValueError(
+                f"Unsupported weights_quant_strategy: {weights_quant_strategy}. "
+                "Supported strategies: 'channel', 'block'."
+            )
+    elif (
+        num_bits == 4
+        and weights_quant_config.get("type") == "float"
+        and weights_quant_strategy == "tensor_group"
+    ):
+        # llm-compressor NVFP4: weights FP4 with FP8 per-group scales
+        # (group_size=16), scaled by an FP32 global scale.
+        if inputs_quant_strategy != "tensor_group":
+            raise ValueError(
+                f"Unsupported inputs_quant_strategy for NVFP4: {inputs_quant_strategy}."
+            )
+        group_size = weights_quant_config.get("group_size")
+        if group_size != 16:
+            raise ValueError(f"Unsupported group_size: {group_size}. Supported: 16 for NVFP4.")
+        quant_algo = QuantAlgo.NVFP4
+    else:
+        raise ValueError(
+            f"Unsupported quant_bits: {num_bits}. "
+            "Supported: 8 (FP8) or 4 (NVFP4)."
+        )
+
+    # kv_cache_scheme (llm-compressor): FP8 per-tensor KV cache.
+    kv_cache_scheme = hf_quant_config.get("kv_cache_scheme")
+    if kv_cache_scheme is not None:
+        if kv_cache_scheme.get("num_bits") != 8 or kv_cache_scheme.get("type") != "float":
+            raise ValueError(f"Unsupported kv_cache_scheme: {kv_cache_scheme}.")
+        if quant_config.kv_cache_quant_algo not in (None, QuantAlgo.FP8):
+            raise ValueError(
+                f"Specified kv_cache_quant_algo={quant_config.kv_cache_quant_algo}, "
+                "conflicting with FP8 KV cache from HF quant config."
+            )
+
+    # Build a fresh list so quant_config never aliases the caller's config, and
+    # de-duplicate via dict.fromkeys to keep a deterministic, first-seen order.
+    hf_exclude_modules = hf_quant_config.get("modules_to_not_convert") or []
+    ignored_modules = hf_quant_config.get("ignore") or []
+    exclude_modules = list(dict.fromkeys([*hf_exclude_modules, *ignored_modules]))
+
+    # Validation passed: commit the results.
+    quant_config.quant_algo = quant_algo
+    if group_size is not None:
+        quant_config.group_size = group_size
+    if kv_cache_scheme is not None:
+        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+    quant_config.exclude_modules = exclude_modules
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -19,6 +19,7 @@ l0_a10:
   - unittest/utils/test_util.py
   - unittest/utils/test_logger.py
   - unittest/_torch/test_model_config.py
+  - unittest/models/test_quant_config_utils.py
   - unittest/_torch/modeling/test_modeling_mistral.py
   - unittest/_torch/modeling/test_modeling_pixtral.py
   - unittest/_torch/modeling/test_modeling_cohere2.py
diff --git a/tests/unittest/llmapi/test_kv_cache_dtype_override.py b/tests/unittest/llmapi/test_kv_cache_dtype_override.py
@@ -22,6 +22,27 @@ def _write_hf_quant_config(model_dir, kv_cache_quant_algo: str = "FP8"):
         )
 
 
+def _compressed_tensors_nvfp4_config(**overrides):
+    """Return a minimal compressed-tensors NVFP4 config; kwargs override top-level keys."""
+    return {
+        "quant_method": "compressed-tensors",
+        "config_groups": {
+            "group_0": {
+                "weights": {
+                    "num_bits": 4,
+                    "type": "float",
+                    "strategy": "tensor_group",
+                    "group_size": 16,
+                },
+                "input_activations": {
+                    "strategy": "tensor_group",
+                },
+            },
+        },
+        **overrides,
+    }
+
+
 def test_get_llm_args_plumbs_kv_cache_dtype():
     llm_args, _ = get_llm_args(model="dummy", kv_cache_dtype="nvfp4")
     assert llm_args["kv_cache_config"].dtype == "nvfp4"
@@ -65,3 +86,42 @@ def test_update_from_hf_quant_config_explicit_dtype_overrides(tmp_path):
 
     assert model_loader._update_from_hf_quant_config() is True
     assert llm_args.quant_config.kv_cache_quant_algo == QuantAlgo.NVFP4
+
+
+def test_update_from_hf_quant_config_parses_compressed_tensors_model_kwargs(tmp_path):
+    """A compressed-tensors NVFP4 config given via model_kwargs populates quant_config."""
+    fp8_kv_cache_scheme = {"num_bits": 8, "type": "float"}
+    hf_quant_config = _compressed_tensors_nvfp4_config(kv_cache_scheme=fp8_kv_cache_scheme)
+    llm_args = TorchLlmArgs(
+        model=str(tmp_path),
+        model_kwargs={"quantization_config": hf_quant_config},
+    )
+
+    # The call is expected to return True and fill in llm_args.quant_config.
+    loader = ModelLoader(llm_args)
+    updated = loader._update_from_hf_quant_config()
+
+    assert updated is True
+    parsed = llm_args.quant_config
+    assert parsed.quant_algo == QuantAlgo.NVFP4
+    assert parsed.group_size == 16
+    assert parsed.kv_cache_quant_algo == QuantAlgo.FP8
+
+
+def test_update_from_hf_quant_config_rejects_compressed_tensors_kv_conflict(tmp_path):
+    """A pre-set NVFP4 KV cache algo conflicts with the config's FP8 kv_cache_scheme."""
+    fp8_kv_cache_scheme = {"num_bits": 8, "type": "float"}
+    hf_quant_config = _compressed_tensors_nvfp4_config(kv_cache_scheme=fp8_kv_cache_scheme)
+    llm_args = TorchLlmArgs(
+        model=str(tmp_path),
+        model_kwargs={"quantization_config": hf_quant_config},
+    )
+
+    # Pre-select a KV cache algorithm other than FP8 before loading.
+    llm_args.quant_config = QuantConfig(kv_cache_quant_algo=QuantAlgo.NVFP4)
+    loader = ModelLoader(llm_args)
+
+    # The FP8 scheme from the HF config must not silently replace it.
+    expected_error = "conflicting with FP8 KV cache"
+    with pytest.raises(ValueError, match=expected_error):
+        loader._update_from_hf_quant_config()
diff --git a/tests/unittest/models/test_quant_config_utils.py b/tests/unittest/models/test_quant_config_utils.py