Add NPU quant method coverage (#2845)

Qubitium · web-flow · commit 9697f8751eea · 2026-04-29T23:15:22.000+08:00
* Add NPU quant method coverage

* Simplify NPU device ordering

* Resolve PR code quality comments
diff --git a/gptqmodel/looper/awq_processor.py b/gptqmodel/looper/awq_processor.py
@@ -18,12 +18,13 @@
 from ..looper.loop_processor import DTYPE_SIZE_COLUMN, ExecutionConfig, MODULE_FEATURE_COLUMN, LoopProcessor
 from ..looper.named_module import NamedModule
 from ..models import BaseQModel
-from ..models._const import SUPPORTS_MODULE_TYPES
+from ..models._const import DEVICE, SUPPORTS_MODULE_TYPES
 from ..models.writer import (PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, PROCESS_LOG_NAME,
                              PROCESS_LOG_TIME, PROCESS_USED_MEMORY, QUANT_LOG_LOSS, QUANT_LOG_NSAMPLES)
 from ..nn_modules.qlinear.gemm_awq import AwqGEMMLinear
 from ..nn_modules.qlinear.gemv_awq import AwqGEMVLinear
 from ..nn_modules.qlinear.gemv_fast_awq import AwqGEMVFastLinear, LLMAwqLinear
+from ..nn_modules.qlinear.torch_awq import AwqTorchLinear
 from ..quantization.awq.quantize.scale import apply_clip, apply_scale
 from ..quantization.awq.utils.module import append_str_prefix, get_op_name, get_op_by_name
 from ..quantization.awq.utils.utils import get_best_device
@@ -272,12 +273,31 @@ def set_calibration_dataset(self, calibration_dataset):
 
         raise NotImplementedError("AWQProcessor's calibration_dataset cannot be modified")
 
+    def _quant_device_is_npu(self) -> bool:
+        """Tell whether the configured quantization device is an Ascend NPU.
+
+        Reads ``self.qcfg.device`` (a missing attribute counts as ``None``) and
+        accepts a ``DEVICE`` member, a ``torch.device``, or a device string such
+        as ``"npu:0"``. Any other value yields ``False``.
+        """
+
+        target = getattr(self.qcfg, "device", None)
+        # DEVICE members are handled before the torch.device and str cases.
+        if isinstance(target, DEVICE):
+            is_npu = target == DEVICE.NPU
+        elif isinstance(target, torch.device):
+            is_npu = target.type == "npu"
+        elif isinstance(target, str):
+            # Only the part before the first ":" names the backend ("npu:3" -> "npu").
+            backend, _, _ = target.partition(":")
+            is_npu = backend.lower() == "npu"
+        else:
+            is_npu = False
+        return is_npu
+
     def _select_qlinear_kernel_for_format(self, format_value: FORMAT):
         """Maps the resolved AWQ format to its concrete quantized linear kernel."""
 
         fmt = FORMAT(format_value) if not isinstance(format_value, FORMAT) else format_value
         if fmt == FORMAT.GEMM:
+            if self._quant_device_is_npu():
+                return AwqTorchLinear
             return AwqGEMMLinear
+        if self._quant_device_is_npu():
+            raise ValueError(
+                "NPU AWQ quantization requires FORMAT.GEMM so the AwqTorchLinear runtime can run on NPU; "
+                f"actual format is `{fmt}`."
+            )
         if fmt == FORMAT.GEMV:
             return AwqGEMVLinear
         if fmt == FORMAT.GEMV_FAST:
@@ -1302,7 +1322,7 @@ def pseudo_quantize_tensor(self, w: torch.Tensor):
 
         scales = scales.view(org_w_shape[0], -1)
 
-        # Symmetric quantization produces signed int values (e.g. int4 ∈ [-8, 7]),
+        # Symmetric quantization produces signed int values (e.g. int4 in [-8, 7]),
         # which cannot be packed directly. To make it packable, we shift the signed
         # representation to unsigned by adding 2^(bits-1), i.e. q_u = q_s + 2^(bits-1).
         # This is equivalent to using an affine form with zero_point = 2^(bits-1),
@@ -1820,11 +1840,7 @@ def preprocess(self, module: NamedModule, fallback=None, **kwargs):
     def is_skipped(self, module: NamedModule) -> bool:
         """Reports whether preprocessing excluded this module from AWQ work."""
 
-        t = self.tasks.get(module.name, False)
-        if t == False:
-            return True
-        else:
-            return False
+        # A module with no entry in self.tasks (default False) or a falsy entry is skipped.
+        # NOTE(review): the removed `t == False` check only matched values equal to False;
+        # `not` also treats None and empty containers as skipped -- confirm self.tasks never
+        # stores such values for modules that should still be processed.
+        return not self.tasks.get(module.name, False)
 
     def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tensor, ...], torch.Tensor], None]:
         """Returns the forward hook that caches module input activations for AWQ."""
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
@@ -781,6 +781,11 @@ def quantize(
                 quant_device_type = str(quant_device).split(":")[0].lower()
 
             if export_quant_method == METHOD.AWQ:
+                if quant_device_type == "npu" and format_code != FORMAT.GEMM:
+                    raise ValueError(
+                        "NPU AWQ quantization requires FORMAT.GEMM so the AwqTorchLinear runtime can run on NPU; "
+                        f"actual format is `{format_code}`."
+                    )
                 if format_code == FORMAT.GEMM:
                     # Weight-only RTN->AWQ export should stay on the portable torch kernel.
                     preferred_backend = (
@@ -2938,8 +2943,8 @@ def _linear_names(module):
         def _find_parents(module, possible_names):
+            # Gather the names of module's direct children whose lowercased name
+            # contains any entry of possible_names as a substring.
+            # NOTE(review): only the child name is lowercased, so possible_names
+            # presumably holds lowercase fragments -- confirm at the call site.
             found = set()
             for n, _ in module.named_children():
-                l = n.lower()
-                if any(k in l for k in possible_names):
+                lowered_name = n.lower()
+                if any(k in lowered_name for k in possible_names):
                     found.add(n)
             return found
 
diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py
@@ -417,6 +417,42 @@ def torch_devices() -> List[torch.device]:
     else:
         return [CPU]
 
+
+def npu_devices_by_pci_bus_order() -> List[torch.device]:
+    """List the visible NPU devices, ordered by torch logical index.
+
+    Despite the name, no PCI bus lookup happens here: devices are enumerated
+    as ``npu:0`` .. ``npu:<count-1>`` from ``torch.npu.device_count()``.
+
+    Ascend exposes process-level NPU visibility through ASCEND_RT_VISIBLE_DEVICES.
+    torch-npu remaps that visible set to logical indices, so callers should set
+    the env var before process start and then use the resulting torch order.
+
+    Returns an empty list when HAS_NPU is false, when the device count cannot
+    be read or is not positive, or when a ``torch.device`` cannot be built.
+    """
+
+    if not HAS_NPU:
+        return []
+
+    try:
+        visible = int(torch.npu.device_count())
+    except Exception:
+        # Best effort: any failure while querying the runtime means "no NPUs".
+        return []
+
+    try:
+        # A non-positive count gives an empty range and therefore an empty list.
+        return [torch.device("npu", logical_index) for logical_index in range(visible)]
+    except (RuntimeError, ValueError):
+        # One failed construction discards the whole result.
+        return []
+
+
+def last_npu_device_by_pci_bus_order() -> Optional[torch.device]:
+    """Pick the final entry of npu_devices_by_pci_bus_order(), or None if that list is empty."""
+
+    visible = npu_devices_by_pci_bus_order()
+    return visible[-1] if visible else None
+
 ALL_DEVICES = torch_devices()
 
 if HAS_CUDA:
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,7 +45,7 @@ test = [
     "parameterized",
 ]
 quality = [
-    "ruff==0.13.0",
+    "ruff==0.14.2",
     # "isort==6.0.1",
 ]
 vllm = [
diff --git a/tests/test_npu_support.py b/tests/test_npu_support.py
@@ -1,11 +1,17 @@
 import copy
 import os
+import sys
 import warnings
 
 import pytest
 import torch
 import torch.nn as nn
 
+from gptqmodel.looper.awq_processor import AWQProcessor
+from gptqmodel.looper.gptq_processor import GPTQProcessor
+from gptqmodel.looper.paroquant_processor import ParoQuantProcessor
+from gptqmodel.looper.qqq_processor import QQQProcessor
+from gptqmodel.looper.weight_only_processor import WeightOnlyProcessor
 from gptqmodel.models._const import DEVICE, normalize_device
 from gptqmodel.nn_modules.exllamav3_torch import ExllamaV3TorchLinear
 from gptqmodel.nn_modules.qlinear.fp8 import TorchFP8Linear
@@ -16,21 +22,30 @@
 from gptqmodel.nn_modules.qlinear.torch_awq import AwqTorchLinear
 from gptqmodel.quantization import FORMAT, METHOD
 from gptqmodel.quantization.awq.utils.packing_utils import unpack_awq
+from gptqmodel.quantization.config import AWQConfig, GGUFConfig, ParoConfig, QQQConfig, QuantizeConfig
 from gptqmodel.utils import importer
 from gptqmodel.utils.backend import BACKEND
 from gptqmodel.utils.importer import auto_select_device, get_kernel_for_backend, select_quant_linear
-from gptqmodel.utils.torch import HAS_NPU
+from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
 
 
-NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", "npu:0")
+def _default_npu_test_device() -> str:
+    """Return the device string of the last visible NPU, or "npu:0" when none is reported."""
+    candidate = last_npu_device_by_pci_bus_order()
+    if candidate is None:
+        return "npu:0"
+    return str(candidate)
+
+
+# Device string used by the NPU tests. GPTQMODEL_TEST_NPU_DEVICE overrides the default, but
+# _default_npu_test_device() is still evaluated at import time because it is passed as an argument.
+NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
+# NOTE(review): presumably substrings of warnings that signal an op fell back to CPU;
+# their consumer is not visible in this hunk -- confirm.
 NPU_CPU_FALLBACK_MARKERS = (
     "not currently supported on the NPU backend",
     "fall back to run on the CPU",
 )
 
 
 def _test_npu_device() -> torch.device:
+    # Builds the torch.device for NPU_TEST_DEVICE and, when HAS_NPU is true,
+    # passes it to torch.npu.set_device before returning it.
-    return torch.device(NPU_TEST_DEVICE)
+    device = torch.device(NPU_TEST_DEVICE)
+    if HAS_NPU:
+        torch.npu.set_device(device)
+    return device
 
 
 def _assert_no_npu_cpu_fallback(caught: list[warnings.WarningMessage]) -> None:
@@ -253,6 +268,58 @@ def _make_exllamav3_torch_module(*, device: torch.device | str = "cpu") -> Exlla
     ).eval()
 
 
+class _NpuProcessorModelStub:
+    """Minimal stand-in for the model object handed to processors in these tests.
+
+    Exposes only ``qlinear_kernel``, ``rotary_embedding``, ``lm_head`` and an
+    empty ``nn.Sequential`` as ``model``.
+    """
+
+    def __init__(self, qlinear_kernel=None):
+        self.model = nn.Sequential()
+        self.lm_head = "lm_head"
+        self.rotary_embedding = None
+        self.qlinear_kernel = qlinear_kernel
+
+
+def _processor_common_kwargs(qcfg):
+    """Build the keyword arguments shared by the processor constructors in these tests.
+
+    Everything except ``qcfg`` and ``batch_size`` (fixed at 1) is ``None``.
+    A fresh dict is returned on every call.
+    """
+    return dict(
+        tokenizer=None,
+        qcfg=qcfg,
+        calibration=None,
+        prepare_dataset_func=None,
+        calibration_concat_size=None,
+        calibration_sort=None,
+        batch_size=1,
+    )
+
+
+def _npu_select_quant_linear(qcfg, *, method: METHOD, fmt: FORMAT):
+    """Return what select_quant_linear picks for qcfg on DEVICE.NPU with BACKEND.AUTO."""
+    selection_args = {
+        "bits": qcfg.runtime_bits,
+        "group_size": qcfg.group_size,
+        "desc_act": qcfg.desc_act,
+        "sym": qcfg.sym,
+        "device": DEVICE.NPU,
+        "backend": BACKEND.AUTO,
+        "format": fmt,
+        "quant_method": method,
+        "pack_dtype": qcfg.pack_dtype,
+    }
+    return select_quant_linear(**selection_args)
+
+
+def test_last_npu_device_by_pci_bus_order_uses_visible_logical_order(monkeypatch):
+    """With three fake visible NPUs, the helper must report the highest logical index."""
+
+    # Probe first: the test is skipped when torch cannot build an "npu" device at all.
+    try:
+        torch.device("npu:0")
+    except (RuntimeError, ValueError):
+        pytest.skip("This PyTorch build does not register the npu device type")
+
+    class _ThreeDeviceNpu:
+        @staticmethod
+        def device_count():
+            return 3
+
+    # Patch the module that actually defines the helper, looked up via __module__.
+    helper_module = sys.modules[last_npu_device_by_pci_bus_order.__module__]
+    monkeypatch.setattr(helper_module, "HAS_NPU", True)
+    monkeypatch.setattr(helper_module.torch, "npu", _ThreeDeviceNpu())
+
+    selected = last_npu_device_by_pci_bus_order()
+    assert str(selected) == "npu:2"
+
+
 def test_npu_device_normalization():
     assert normalize_device("npu") is DEVICE.NPU
     assert normalize_device("npu:3") is DEVICE.NPU
@@ -358,13 +425,111 @@ def test_qqq_torch_backend_selects_torch_kernel():
     assert get_kernel_for_backend(BACKEND.QQQ_TORCH, METHOD.QQQ, FORMAT.QQQ) is QQQTorchLinear
 
 
+def test_npu_gptq_processor_has_torch_runtime_kernel():
+    """GPTQ on DEVICE.NPU: processor builds, needs forward passes, and resolves to TorchLinear."""
+    npu_cfg = QuantizeConfig(bits=4, group_size=128, device=DEVICE.NPU, offload_to_disk=False)
+    gptq = GPTQProcessor(**_processor_common_kwargs(npu_cfg))
+
+    assert gptq.name() == "gptq"
+    assert gptq.execution_config.require_fwd is True
+
+    selected = _npu_select_quant_linear(npu_cfg, method=METHOD.GPTQ, fmt=FORMAT.GPTQ)
+    assert selected is TorchLinear
+
+
+def test_npu_awq_processor_selects_torch_runtime_kernel():
+    """AWQ on DEVICE.NPU: both the processor and the kernel selector must pick AwqTorchLinear."""
+    npu_cfg = AWQConfig(bits=4, group_size=128, device=DEVICE.NPU, offload_to_disk=False)
+    stub = _NpuProcessorModelStub()
+    shared_kwargs = _processor_common_kwargs(npu_cfg)
+    awq = AWQProcessor(gptq_model=stub, model=stub.model, **shared_kwargs)
+
+    assert awq.name() == "awq"
+    assert awq.execution_config.enable_activation_capture is True
+    assert awq.qlinear_kernel is AwqTorchLinear
+
+    selected = _npu_select_quant_linear(npu_cfg, method=METHOD.AWQ, fmt=FORMAT.GEMM)
+    assert selected is AwqTorchLinear
+
+
+def test_npu_paroquant_processor_has_torch_runtime_kernel():
+    """ParoQuant on DEVICE.NPU: both the processor and the kernel selector must pick ParoLinear."""
+    paro_settings = {
+        "bits": 4,
+        "group_size": 128,
+        "device": DEVICE.NPU,
+        "opt_rotation_epochs": 1,
+        "opt_finetune_epochs": 1,
+        "offload_to_disk": False,
+    }
+    npu_cfg = ParoConfig(**paro_settings)
+    stub = _NpuProcessorModelStub()
+    shared_kwargs = _processor_common_kwargs(npu_cfg)
+    paro = ParoQuantProcessor(gptq_model=stub, model=stub.model, **shared_kwargs)
+
+    assert paro.name() == "paroquant"
+    assert paro.execution_config.enable_activation_capture is True
+    assert paro.qlinear_kernel is ParoLinear
+
+    selected = _npu_select_quant_linear(npu_cfg, method=METHOD.PARO, fmt=FORMAT.PAROQUANT)
+    assert selected is ParoLinear
+
+
+def test_npu_qqq_processor_selects_torch_runtime_kernel():
+    """QQQ on DEVICE.NPU: the processor must choose QQQTorchLinear with BACKEND.QQQ_TORCH."""
+    npu_cfg = QQQConfig(bits=4, group_size=128, device=DEVICE.NPU, offload_to_disk=False)
+    qqq = QQQProcessor(**_processor_common_kwargs(npu_cfg))
+    kernel_choice = qqq._quant_linear_kernel()
+    kernel_cls, kernel_backend = kernel_choice
+
+    assert qqq.name() == "qqq"
+    assert kernel_cls is QQQTorchLinear
+    assert kernel_backend is BACKEND.QQQ_TORCH
+
+    selected = _npu_select_quant_linear(npu_cfg, method=METHOD.QQQ, fmt=FORMAT.QQQ)
+    assert selected is QQQTorchLinear
+
+
+def test_npu_gguf_weight_only_processor_has_torch_runtime_kernel():
+    """GGUF q4_0 on DEVICE.NPU: weight-only processor needs no forward pass and maps to GGUFTorchLinear."""
+    npu_cfg = GGUFConfig(bits="q4_0", device=DEVICE.NPU, offload_to_disk=False)
+    weight_only = WeightOnlyProcessor(tokenizer=None, qcfg=npu_cfg)
+
+    assert weight_only.name() == "weight_only_gguf"
+    assert weight_only.execution_config.require_fwd is False
+
+    selected = _npu_select_quant_linear(npu_cfg, method=METHOD.GGUF, fmt=FORMAT.GGUF)
+    assert selected is GGUFTorchLinear
+
+
+def test_npu_supported_quant_methods_have_torch_runnable_kernel():
+    """Each listed method/format pair must resolve on DEVICE.NPU to a kernel that supports NPU or ALL."""
+    # Columns: quant method, format, bits, group size, expected kernel class.
+    expectations = (
+        (METHOD.GPTQ, FORMAT.GPTQ, 4, 128, TorchLinear),
+        (METHOD.AWQ, FORMAT.GEMM, 4, 128, AwqTorchLinear),
+        (METHOD.PARO, FORMAT.PAROQUANT, 4, 128, ParoLinear),
+        (METHOD.GGUF, FORMAT.GGUF, "q4_0", -1, GGUFTorchLinear),
+        (METHOD.QQQ, FORMAT.QQQ, 4, 128, QQQTorchLinear),
+    )
+    for quant_method, fmt, bits, group_size, expected_kernel in expectations:
+        resolved = select_quant_linear(
+            bits=bits,
+            group_size=group_size,
+            desc_act=False,
+            sym=True,
+            device=DEVICE.NPU,
+            backend=BACKEND.AUTO,
+            format=fmt,
+            quant_method=quant_method,
+            pack_dtype=torch.int32,
+        )
+        assert resolved is expected_kernel
+        supported = resolved.SUPPORTS_DEVICES
+        assert DEVICE.ALL in supported or DEVICE.NPU in supported
+
+
+def test_npu_exl3_has_torch_runtime_kernel():
+    """The EXL3 helper must build an ExllamaV3TorchLinear whose QUANT_TYPE is "exl3"."""
+    exl3_module = _make_exllamav3_torch_module()
+
+    assert isinstance(exl3_module, ExllamaV3TorchLinear)
+    assert exl3_module.QUANT_TYPE == "exl3"
+
+
 def test_npu_does_not_advertise_fp8_torch_until_cann_supports_float8():
+    # TorchFP8Linear must list neither DEVICE.ALL nor DEVICE.NPU in SUPPORTS_DEVICES.
     assert DEVICE.ALL not in TorchFP8Linear.SUPPORTS_DEVICES
     assert DEVICE.NPU not in TorchFP8Linear.SUPPORTS_DEVICES
 
 
 @pytest.mark.skipif(not HAS_NPU, reason="NPU is not available")
 def test_npu_awq_unpack_preserves_pack_dimension():
+    device = _test_npu_device()
     qweight_cpu = torch.tensor(
         [[0, 1, -1], [-2147483648, 2147483647, -123456789]],
         dtype=torch.int32,
@@ -373,8 +538,8 @@ def test_npu_awq_unpack_preserves_pack_dimension():
         [[-1, 0, 123456789], [2147483647, -2147483648, 7]],
         dtype=torch.int32,
     )
-    qweight = qweight_cpu.to("npu:0")
-    qzeros = qzeros_cpu.to("npu:0")
+    qweight = qweight_cpu.to(device)
+    qzeros = qzeros_cpu.to(device)
 
     iweight, izeros = unpack_awq(qweight, qzeros, bits=4)
     shifts = torch.arange(0, 32, 4, dtype=torch.int32)
@@ -391,6 +556,7 @@ def test_npu_awq_unpack_preserves_pack_dimension():
 
 @pytest.mark.skipif(not HAS_NPU, reason="NPU is not available")
 def test_npu_torch_gptq_unpack_preserves_pack_dimension():
+    device = _test_npu_device()
     qweight_cpu = torch.tensor(
         [
             [0, 1, -1],
@@ -400,8 +566,8 @@ def test_npu_torch_gptq_unpack_preserves_pack_dimension():
         ],
         dtype=torch.int32,
     )
-    qweight = qweight_cpu.to("npu:0")
-    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device="npu:0").view(1, 8, 1)
+    qweight = qweight_cpu.to(device)
+    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=device).view(1, 8, 1)
 
     unpacked = _right_shift_unpack(
         qweight.unsqueeze(1).expand(-1, 8, -1),

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -45,7 +45,7 @@ test = [` |
| `45` | `45` | `"parameterized",` |
| `46` | `46` | `]` |
| `47` | `47` | `quality = [` |
| `48` |  | `- "ruff==0.13.0",` |
|  | `48` | `+ "ruff==0.14.2",` |
| `49` | `49` | `# "isort==6.0.1",` |
| `50` | `50` | `]` |
| `51` | `51` | `vllm = [` |