Add NPU quant method coverage

Qubitium · Qubitium · commit 44b02d960cc7 · 2026-04-29T11:46:35.000Z
diff --git a/gptqmodel/looper/awq_processor.py b/gptqmodel/looper/awq_processor.py
@@ -18,12 +18,13 @@
 from ..looper.loop_processor import DTYPE_SIZE_COLUMN, ExecutionConfig, MODULE_FEATURE_COLUMN, LoopProcessor
 from ..looper.named_module import NamedModule
 from ..models import BaseQModel
-from ..models._const import SUPPORTS_MODULE_TYPES
+from ..models._const import DEVICE, SUPPORTS_MODULE_TYPES
 from ..models.writer import (PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, PROCESS_LOG_NAME,
                              PROCESS_LOG_TIME, PROCESS_USED_MEMORY, QUANT_LOG_LOSS, QUANT_LOG_NSAMPLES)
 from ..nn_modules.qlinear.gemm_awq import AwqGEMMLinear
 from ..nn_modules.qlinear.gemv_awq import AwqGEMVLinear
 from ..nn_modules.qlinear.gemv_fast_awq import AwqGEMVFastLinear, LLMAwqLinear
+from ..nn_modules.qlinear.torch_awq import AwqTorchLinear
 from ..quantization.awq.quantize.scale import apply_clip, apply_scale
 from ..quantization.awq.utils.module import append_str_prefix, get_op_name, get_op_by_name
 from ..quantization.awq.utils.utils import get_best_device
@@ -272,12 +273,31 @@ def set_calibration_dataset(self, calibration_dataset):
 
         raise NotImplementedError("AWQProcessor's calibration_dataset cannot be modified")
 
+    def _quant_device_is_npu(self) -> bool:
+        """Return whether this processor is quantizing for an Ascend NPU runtime."""
+
+        device = getattr(self.qcfg, "device", None)  # a qcfg without `device` yields None -> False below
+        if isinstance(device, DEVICE):  # DEVICE enum member: compare against DEVICE.NPU directly
+            return device == DEVICE.NPU
+        if isinstance(device, torch.device):  # torch.device: decide on its type string
+            return device.type == "npu"
+        if isinstance(device, str):  # plain string such as "npu" or "NPU:0"
+            return device.split(":", 1)[0].lower() == "npu"  # ignore any ":<index>" suffix and letter case
+        return False  # None or any other type is treated as "not NPU"
+
     def _select_qlinear_kernel_for_format(self, format_value: FORMAT):
         """Maps the resolved AWQ format to its concrete quantized linear kernel."""
 
         fmt = FORMAT(format_value) if not isinstance(format_value, FORMAT) else format_value
         if fmt == FORMAT.GEMM:
+            if self._quant_device_is_npu():
+                return AwqTorchLinear
             return AwqGEMMLinear
+        if self._quant_device_is_npu():
+            raise ValueError(
+                "NPU AWQ quantization requires FORMAT.GEMM so the AwqTorchLinear runtime can run on NPU; "
+                f"actual format is `{fmt}`."
+            )
         if fmt == FORMAT.GEMV:
             return AwqGEMVLinear
         if fmt == FORMAT.GEMV_FAST:
@@ -1820,11 +1840,7 @@ def preprocess(self, module: NamedModule, fallback=None, **kwargs):
     def is_skipped(self, module: NamedModule) -> bool:
         """Reports whether preprocessing excluded this module from AWQ work."""
 
-        t = self.tasks.get(module.name, False)
-        if t == False:
-            return True
-        else:
-            return False
+        return not self.tasks.get(module.name, False)  # NOTE(review): broader than the removed `t == False` — any falsy entry (None, {}, ...) now counts as skipped; confirm intended
 
     def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tensor, ...], torch.Tensor], None]:
         """Returns the forward hook that caches module input activations for AWQ."""
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
@@ -781,6 +781,11 @@ def quantize(
                 quant_device_type = str(quant_device).split(":")[0].lower()
 
             if export_quant_method == METHOD.AWQ:
+                if quant_device_type == "npu" and format_code != FORMAT.GEMM:
+                    raise ValueError(
+                        "NPU AWQ quantization requires FORMAT.GEMM so the AwqTorchLinear runtime can run on NPU; "
+                        f"actual format is `{format_code}`."
+                    )
                 if format_code == FORMAT.GEMM:
                     # Weight-only RTN->AWQ export should stay on the portable torch kernel.
                     preferred_backend = (
@@ -2938,8 +2943,8 @@ def _linear_names(module):
         def _find_parents(module, possible_names):
             found = set()
             for n, _ in module.named_children():
-                l = n.lower()
-                if any(k in l for k in possible_names):
+                lowered_name = n.lower()  # lower-cased child name; renamed from the ambiguous single letter `l`
+                if any(k in lowered_name for k in possible_names):  # substring match; presumably possible_names entries are lower-case — verify
                     found.add(n)
             return found
 
diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py
@@ -5,6 +5,7 @@
 
 import contextlib
 import importlib
+import os
 import time
 from contextlib import contextmanager
 from enum import Enum
@@ -417,6 +418,119 @@ def torch_devices() -> List[torch.device]:
     else:
         return [CPU]
 
+
+def _npu_device_pci_bus_id(index: int) -> Optional[str]:
+    """Best-effort PCI bus identifier for a visible NPU logical index."""
+
+    if not HAS_NPU:
+        return None
+
+    npu = getattr(torch, "npu", None)  # looked up defensively instead of assuming torch.npu exists
+    get_props = getattr(npu, "get_device_properties", None)
+    if not callable(get_props):
+        return None
+
+    try:
+        props = get_props(index)
+    except Exception:  # best-effort: any failure to query properties means "no bus id"
+        return None
+
+    attr_names = (  # candidate attribute/key spellings, tried in this order; first non-empty value wins
+        "pci_bus_id",
+        "pci_busid",
+        "bus_id",
+        "busid",
+        "pcie_bus_id",
+        "pcie_id",
+    )
+    for attr_name in attr_names:
+        if isinstance(props, dict):  # properties may be a mapping or an attribute-style object
+            value = props.get(attr_name)
+        else:
+            value = getattr(props, attr_name, None)
+        if callable(value):  # the value may be exposed as a zero-argument callable
+            try:
+                value = value()
+            except Exception:
+                value = None  # a failing accessor counts as "not provided"
+        if value not in (None, ""):
+            return str(value).strip().lower()  # normalised to a trimmed, lower-case string
+    return None  # no candidate produced a non-empty value
+
+
+def _parse_ascend_visible_devices() -> List[int]:
+    """Parse ASCEND_RT_VISIBLE_DEVICES without depending on torch-npu internals."""
+
+    visible = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
+    if visible is None:  # variable is not set
+        return []
+    result: List[int] = []
+    for item in visible.split(","):
+        item = item.strip()
+        if not item:
+            return []  # an empty entry (including an empty variable) discards the whole list
+        try:
+            result.append(int(item))
+        except ValueError:
+            return []  # a non-integer entry also discards the whole list rather than being skipped
+    return result
+
+
+def npu_devices_by_pci_bus_order() -> List[torch.device]:
+    """Return visible NPU devices ordered by PCI bus id when the runtime exposes it.
+
+    If torch-npu does not expose a bus id, the visible logical order is used.
+    With ASCEND_RT_VISIBLE_DEVICES set in PCI order, this keeps the requested
+    bus ordering while still returning logical torch device indices.
+    """
+
+    if not HAS_NPU:
+        return []
+
+    try:
+        count = int(torch.npu.device_count())
+    except Exception:  # a failing device query is reported as "no devices"
+        return []
+    if count <= 0:
+        return []
+
+    bus_entries = []  # (bus_id, logical_index) pairs for devices that reported a bus id
+    for logical_index in range(count):
+        bus_id = _npu_device_pci_bus_id(logical_index)
+        if bus_id:
+            bus_entries.append((bus_id, logical_index))
+
+    if len(bus_entries) == count:  # sort by bus id only when every device reported one
+        entries = sorted(bus_entries, key=lambda item: (item[0], item[1]))  # bus ids compare as strings; logical index breaks ties
+    else:
+        visible_physical = _parse_ascend_visible_devices()
+        entries = [  # NOTE(review): never sorted, and the loop below ignores the first field, so this is plain logical order
+            (
+                visible_physical[logical_index]
+                if logical_index < len(visible_physical)
+                else logical_index,
+                logical_index,
+            )
+            for logical_index in range(count)
+        ]
+
+    devices: List[torch.device] = []
+    for _, logical_index in entries:
+        try:
+            devices.append(torch.device("npu", logical_index))
+        except (RuntimeError, ValueError):
+            return []  # one device that cannot be constructed discards the whole result
+    return devices
+
+
+def last_npu_device_by_pci_bus_order() -> Optional[torch.device]:
+    """Return the last visible NPU in PCI bus order, or None when unavailable."""
+
+    devices = npu_devices_by_pci_bus_order()
+    if not devices:  # empty list: the helper found no usable NPU devices
+        return None
+    return devices[-1]
+
 ALL_DEVICES = torch_devices()
 
 if HAS_CUDA:
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,7 +45,7 @@ test = [
     "parameterized",
 ]
 quality = [
-    "ruff==0.13.0",
+    "ruff==0.14.2",
     # "isort==6.0.1",
 ]
 vllm = [
diff --git a/tests/test_npu_support.py b/tests/test_npu_support.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -45,7 +45,7 @@ test = [` |
| `45` | `45` | `"parameterized",` |
| `46` | `46` | `]` |
| `47` | `47` | `quality = [` |
| `48` | | `- "ruff==0.13.0",` |
| | `48` | `+ "ruff==0.14.2",` |
| `49` | `49` | `# "isort==6.0.1",` |
| `50` | `50` | `]` |
| `51` | `51` | `vllm = [` |