ascend kernel compat update: cann 9.1beta1 (#2906)

Qubitium · web-flow · commit 3ff03716b6ec · 2026-05-22T14:09:04.000+08:00
* ascend kernel compat update: cann 9.1beta1

* document ascend shift compatibility helpers
diff --git a/gptqmodel/nn_modules/exllamav3_torch.py b/gptqmodel/nn_modules/exllamav3_torch.py
@@ -70,6 +70,40 @@ def _half_scalar_from_bits(bits: int) -> float:
     return float(struct.unpack("<e", packed)[0])
 
 
+# ExLlamaV3's torch fallback decodes packed codebook values with shift
+# operations. Keep those shifts behind helpers so Ascend 910B/CANN 9.1 beta can
+# use arithmetic equivalents for operators that torch-npu does not expose as
+# native device kernels, while every non-NPU device keeps the standard bitwise ops.
+def _torch_shift_factor(shifts: int | torch.Tensor, device: torch.device) -> int | torch.Tensor:
+    if torch.is_tensor(shifts):
+        # Tensor shifts must be materialized on the target device; otherwise the
+        # NPU arithmetic fallback would introduce cross-device operands.
+        shifts_i64 = shifts.to(device=device, dtype=torch.int64)
+        return torch.pow(torch.full_like(shifts_i64, 2), shifts_i64)
+    return 1 << int(shifts)
+
+
+def _torch_right_shift(values: torch.Tensor, shifts: int | torch.Tensor) -> torch.Tensor:
+    if values.device.type != "npu":
+        return torch.bitwise_right_shift(values, shifts)
+
+    # CANN 9.1 beta on Ascend 910B may not provide bitwise_right_shift kernels
+    # for these tensor paths. floor_divide by 2**shift rounds toward -inf, which
+    # matches arithmetic right shift on signed packed ints, and stays on-device.
+    shifted = torch.floor_divide(values.to(torch.int64), _torch_shift_factor(shifts, values.device))
+    return shifted.to(values.dtype)
+
+
+def _torch_left_shift(values: torch.Tensor, shifts: int | torch.Tensor) -> torch.Tensor:
+    if values.device.type != "npu":
+        return torch.bitwise_left_shift(values, shifts)
+
+    # Mirror left shift as multiplication by powers of two on NPU to avoid
+    # missing-kernel or CPU-fallback paths in torch-npu.
+    shifted = values.to(torch.int64) * _torch_shift_factor(shifts, values.device)
+    return shifted.to(values.dtype)
+
+
 _EXL3_MUL1_INV = _half_scalar_from_bits(0x1EEE)
 _EXL3_MUL1_BIAS = _half_scalar_from_bits(0xC931)
 
@@ -117,7 +151,7 @@ def _codebook_lut(
         halves = torch.stack(
             (
                 (raw & 0xFFFF).to(torch.uint16),
-                ((raw >> 16) & 0xFFFF).to(torch.uint16),
+                (_torch_right_shift(raw, 16) & 0xFFFF).to(torch.uint16),
             ),
             dim=-1,
         ).contiguous()
@@ -130,7 +164,7 @@ def _codebook_lut(
         halves = torch.stack(
             (
                 (raw & 0xFFFF).to(torch.uint16),
-                ((raw >> 16) & 0xFFFF).to(torch.uint16),
+                (_torch_right_shift(raw, 16) & 0xFFFF).to(torch.uint16),
             ),
             dim=-1,
         ).contiguous()
@@ -141,9 +175,9 @@ def _codebook_lut(
         raw = (values * _EXL3_MUL1_MULT) & 0xFFFFFFFF
         byte_sum = (
             (raw & 0xFF)
-            + ((raw >> 8) & 0xFF)
-            + ((raw >> 16) & 0xFF)
-            + ((raw >> 24) & 0xFF)
+            + (_torch_right_shift(raw, 8) & 0xFF)
+            + (_torch_right_shift(raw, 16) & 0xFF)
+            + (_torch_right_shift(raw, 24) & 0xFF)
         )
         accum = (byte_sum + _EXL3_MUL1_ACC).to(torch.uint16).contiguous()
         floats = accum.view(torch.float16).to(torch.float32)
@@ -297,23 +331,23 @@ def _unpack_indices(self) -> torch.Tensor:
             bit_in_word = bit_offset % 16
             if bit_in_word + bits <= 16:
                 shift = 16 - bit_in_word - bits
-                value = (words[..., word_idx] >> shift) & mask
+                value = _torch_right_shift(words[..., word_idx], shift) & mask
             else:
                 bits_first = 16 - bit_in_word
                 bits_second = bits - bits_first
-                high = (words[..., word_idx] & ((1 << bits_first) - 1)) << bits_second
-                low = words[..., word_idx + 1] >> (16 - bits_second)
+                high = _torch_left_shift(words[..., word_idx] & ((1 << bits_first) - 1), bits_second)
+                low = _torch_right_shift(words[..., word_idx + 1], 16 - bits_second)
                 value = (high | low) & mask
             symbols[..., pos::16] = value.to(torch.long)
 
         warmup = (16 + bits - 1) // bits - 1
         state = torch.zeros_like(symbols[..., 0], dtype=torch.long)
         for idx in range(256 - warmup, 256):
-            state = ((state << bits) | symbols[..., idx]) & 0xFFFF
+            state = (_torch_left_shift(state, bits) | symbols[..., idx]) & 0xFFFF
 
         encoded = torch.empty_like(symbols)
         for idx in range(256):
-            state = ((state << bits) | symbols[..., idx]) & 0xFFFF
+            state = (_torch_left_shift(state, bits) | symbols[..., idx]) & 0xFFFF
             encoded[..., idx] = state
 
         return encoded
diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py
@@ -27,6 +27,41 @@
 
 log = setup_logger()
 
+
+# Packed quantized weights are unpacked through shift operations in several
+# kernels. Keep those shifts behind helpers so Ascend 910B/CANN 9.1 beta can use
+# arithmetic equivalents for operators that torch-npu does not expose as native
+# device kernels, while every non-NPU device keeps the standard bitwise ops.
+def _torch_shift_factor(shifts: int | t.Tensor, device: t.device) -> int | t.Tensor:
+    if t.is_tensor(shifts):
+        # Tensor shifts must be materialized on the target device; otherwise the
+        # NPU arithmetic fallback would introduce cross-device operands.
+        shifts_i64 = shifts.to(device=device, dtype=t.int64)
+        return t.pow(t.full_like(shifts_i64, 2), shifts_i64)
+    return 1 << int(shifts)
+
+
+def _torch_right_shift(values: t.Tensor, shifts: int | t.Tensor) -> t.Tensor:
+    if values.device.type != "npu":
+        return t.bitwise_right_shift(values, shifts)
+
+    # CANN 9.1 beta on Ascend 910B may not provide bitwise_right_shift kernels
+    # for these tensor paths. floor_divide by 2**shift rounds toward -inf, which
+    # matches arithmetic right shift on signed packed ints, and stays on-device.
+    shifted = t.floor_divide(values.to(t.int64), _torch_shift_factor(shifts, values.device))
+    return shifted.to(values.dtype)
+
+
+def _torch_left_shift(values: t.Tensor, shifts: int | t.Tensor) -> t.Tensor:
+    if values.device.type != "npu":
+        return t.bitwise_left_shift(values, shifts)
+
+    # Mirror left shift as multiplication by powers of two on NPU to avoid
+    # missing-kernel or CPU-fallback paths in torch-npu.
+    shifted = values.to(t.int64) * _torch_shift_factor(shifts, values.device)
+    return shifted.to(values.dtype)
+
+
 class BaseQuantLinear(nn.Module):
     SUPPORTS_BACKENDS: List[BACKEND] = None
     SUPPORTS_METHODS: List[METHOD] = None
@@ -806,14 +841,14 @@ def dequantize_weight(self, num_itr: int = 1):
             )
 
         if self.bits in [2, 4, 8]:
-            zeros = t.bitwise_right_shift(
+            zeros = _torch_right_shift(
                 t.unsqueeze(self.qzeros, 2).expand(-1, -1, self.pack_factor),
                 self.wf_unsqueeze_zero  # self.wf.unsqueeze(0),
             ).to(self.dequant_dtype)
             zeros = t.bitwise_and(zeros, self.maxq).reshape(self.scales.shape)
 
             weight = t.bitwise_and(
-                t.bitwise_right_shift(
+                _torch_right_shift(
                     t.unsqueeze(self.qweight, 1).expand(-1, self.pack_factor, -1),
                     self.wf_unsqueeze_neg_one  # self.wf.unsqueeze(-1)
                 ).to(self.dequant_dtype),
@@ -823,9 +858,9 @@ def dequantize_weight(self, num_itr: int = 1):
             zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
                 -1, -1, -1, 12
             )
-            zeros = zeros >> self.wf_unsqueeze_zero  # self.wf.unsqueeze(0)
-            zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
-            zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
+            zeros = _torch_right_shift(zeros, self.wf_unsqueeze_zero)  # self.wf.unsqueeze(0)
+            zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | (_torch_left_shift(zeros[:, :, 1, 0], 2) & 0x4)
+            zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | (_torch_left_shift(zeros[:, :, 2, 0], 1) & 0x6)
             zeros = zeros & 0x7
             zeros = t.cat(
                 [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
@@ -835,9 +870,9 @@ def dequantize_weight(self, num_itr: int = 1):
             weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
                 -1, -1, 12, -1
             )
-            weight = (weight >> self.wf_unsqueeze_neg_one) & 0x7  # self.wf.unsqueeze(-1)
-            weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
-            weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
+            weight = _torch_right_shift(weight, self.wf_unsqueeze_neg_one) & 0x7  # self.wf.unsqueeze(-1)
+            weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | (_torch_left_shift(weight[:, 1, 0], 2) & 0x4)
+            weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | (_torch_left_shift(weight[:, 2, 0], 1) & 0x6)
             weight = weight & 0x7
             weight = t.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
         weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py
@@ -5,6 +5,7 @@
 
 import contextlib
 import importlib
+import os
 import time
 from contextlib import contextmanager
 from enum import Enum
@@ -84,9 +85,21 @@ def timed_gc_collect() -> int:
 except Exception:
     HAS_MPS = False
 
+
+def _ascend_runtime_env_ready() -> bool:
+    # torch_npu may report available before the CANN environment is fully
+    # sourced. Requiring at least one standard Ascend path variable to be set
+    # avoids import-time lazy initialization failures when the Python package
+    # is installed but the runtime is not usable in this shell.
+    return any(
+        os.environ.get(name)
+        for name in ("ASCEND_HOME_PATH", "ASCEND_TOOLKIT_HOME", "ASCEND_OPP_PATH")
+    )
+
+
 try:
     importlib.import_module("torch_npu")
-    HAS_NPU = torch.npu.is_available()
+    HAS_NPU = _ascend_runtime_env_ready() and torch.npu.is_available()
 except Exception:
     HAS_NPU = False
 
@@ -455,20 +468,62 @@ def last_npu_device_by_pci_bus_order() -> Optional[torch.device]:
 
 ALL_DEVICES = torch_devices()
 
-if HAS_CUDA:
-    ALL_STREAMS = [torch.cuda.Stream(device=device) for device in ALL_DEVICES]
-elif HAS_XPU:
-    ALL_STREAMS = [torch.xpu.Stream(device=device) for device in ALL_DEVICES]
-elif HAS_NPU:
-    ALL_STREAMS = [torch.npu.Stream(device=device) for device in ALL_DEVICES]
-else:
-    ALL_STREAMS = [contextlib.nullcontext()]
+
+class _LazyAcceleratorStreams:
+    """Create accelerator streams only when a caller actually needs them."""
+
+    def __init__(self, devices: List[torch.device]):
+        self._devices = devices
+        self._streams: List[Optional[object]] = [None] * len(devices)
+
+    def __len__(self):
+        return max(1, len(self._devices))
+
+    def __iter__(self):
+        for index in range(len(self)):
+            yield self[index]
+
+    def __getitem__(self, index: int):
+        if not self._devices:
+            if index == 0:
+                return contextlib.nullcontext()
+            raise IndexError(index)
+
+        stream = self._streams[index]
+        if stream is not None:
+            return stream
+
+        device = self._devices[index]
+        if device.type == "cuda":
+            stream = torch.cuda.Stream(device=device)
+        elif device.type == "xpu":
+            stream = torch.xpu.Stream(device=device)
+        elif device.type == "npu":
+            stream = torch.npu.Stream(device=device)
+        else:
+            stream = contextlib.nullcontext()
+        self._streams[index] = stream
+        return stream
+
+
+class _LazyAcceleratorStreamRef:
+    """Reference one lazy stream without materializing it at import time."""
+
+    def __init__(self, streams: _LazyAcceleratorStreams, index: int):
+        self._streams = streams
+        self._index = index
+
+    def get(self):
+        return self._streams[self._index]
+
+
+ALL_STREAMS = _LazyAcceleratorStreams(ALL_DEVICES)
 
 DEVICE_0 = auto_select_torch_device(index=0)
 # device_1 may be same as device_0 if there is only 1 visible/active device
 DEVICE_1 = auto_select_torch_device(index=1)
 
-DEVICE_0_STREAM = ALL_STREAMS[0]
+DEVICE_0_STREAM = _LazyAcceleratorStreamRef(ALL_STREAMS, 0)
 
 NEXT_DEVICE_INDEX = 0
 
@@ -494,6 +549,8 @@ def last_npu_device_by_pci_bus_order() -> Optional[torch.device]:
 #     return device
 
 def torch_streamCtx(stream) -> StreamContext:
+    if isinstance(stream, _LazyAcceleratorStreamRef):
+        stream = stream.get()
     if HAS_CUDA:
         return torch.cuda.stream(stream)
     if HAS_XPU: