From 0c2294c1119973439697119048b86aab2b0030f1 Mon Sep 17 00:00:00 2001
From: Qubitium <qubitium@modelcloud.ai>
Date: Fri, 22 May 2026 06:36:10 +0000
Subject: [PATCH 1/2] Pin Ascend unit tests to NPU 7

---
 .github/scripts/ci_tests.py    | 41 ++++++++++++++++++++++++-------
 .github/scripts/ci_workflow.py |  7 +++++-
 tests/test_npu_linalg.py       | 44 +++++++++++++++++++++++++++-------
 3 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/.github/scripts/ci_tests.py b/.github/scripts/ci_tests.py
index faa8bf2cb..bda63d1f0 100644
--- a/.github/scripts/ci_tests.py
+++ b/.github/scripts/ci_tests.py
@@ -24,6 +24,27 @@
 ERROR_PATTERN = re.compile(
     r"nvcc fatal|error:|fatal error|ModuleNotFoundError|ImportError|AssertionError|Exception|is the correct path|No such file or directory|Repo id must be in"
 )
+DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"  # fallback ASCEND_RT_VISIBLE_DEVICES for NPU tests when the environment provides none
+
+
+def is_ascend_npu_test(test_script: str) -> bool:  # True when "npu" is a whole token of the script path/name (case-sensitive)
+    return "npu" in re.split(r"[/_.-]+", test_script.removesuffix(".py"))  # tokens are delimited by "/", "_", "." and "-", so "input" does not match
+
+
+def configure_ascend_npu_test_env(env: dict[str, str], test_script: str) -> bool:  # mutates env in place; returns True only for NPU test scripts
+    if not is_ascend_npu_test(test_script):
+        return False  # non-NPU scripts: env is left untouched
+
+    visible_devices = env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()  # a non-blank value already in env is kept as-is
+    if not visible_devices:  # unset, empty, or whitespace-only
+        visible_devices = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", DEFAULT_ASCEND_RT_VISIBLE_DEVICES)
+        env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices
+
+    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
+        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"
+
+    env["CUDA_VISIBLE_DEVICES"] = ""
+    return True
 
 
 def kill_process_group(proc: subprocess.Popen[str]) -> None:
@@ -112,6 +133,11 @@ def run_tests(args: argparse.Namespace) -> int:
         env["CUDA_VISIBLE_DEVICES"] = ""
         print("CUDA_VISIBLE_DEVICES=")
 
+    ascend_npu_test = configure_ascend_npu_test_env(env, args.test_script)
+    if ascend_npu_test:
+        print(f"ASCEND_RT_VISIBLE_DEVICES={env.get('ASCEND_RT_VISIBLE_DEVICES', '')}")
+        print(f"GPTQMODEL_TEST_NPU_DEVICE={env.get('GPTQMODEL_TEST_NPU_DEVICE', '')}")
+
     if args.xpu_mode:
         maybe_uninstall_vllm()
 
@@ -138,17 +164,16 @@ def run_tests(args: argparse.Namespace) -> int:
         start_new_session=True,
     )
 
-    keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
-    keepalive_payload = build_job_request(
-        runner_name=args.runner,
-        run_id=args.run_id,
-        test_name=args.test_script,
-    )
-
     monitor_thread = None
     monitor_stop = None
     monitor_state = {"forced_exit_code": 0}
-    if env.get("CUDA_VISIBLE_DEVICES", ""):
+    if env.get("CUDA_VISIBLE_DEVICES", "") and not ascend_npu_test:
+        keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
+        keepalive_payload = build_job_request(
+            runner_name=args.runner,
+            run_id=args.run_id,
+            test_name=args.test_script,
+        )
         monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor(
             proc=proc,
             keepalive_endpoint=keepalive_endpoint,
diff --git a/.github/scripts/ci_workflow.py b/.github/scripts/ci_workflow.py
index 7e256c39f..58112f383 100644
--- a/.github/scripts/ci_workflow.py
+++ b/.github/scripts/ci_workflow.py
@@ -121,6 +121,10 @@ def normalize_test_name(name: str) -> str:
     return strip_py_suffix(name.removeprefix("tests/"))
 
 
+def is_npu_test_name(name: str) -> bool:  # True when "npu" is a whole token of the normalized test name (case-sensitive)
+    return "npu" in re.split(r"[/_.-]+", normalize_test_name(name))  # tokens are delimited by "/", "_", "." and "-"
+
+
 def test_path_from_name(test_name: str, tests_root: str | Path = "tests") -> Path:
     normalized = normalize_test_name(test_name)
     return Path(tests_root) / f"{normalized}.py"
@@ -372,7 +376,8 @@ def resolve_test_runtime(test_name: str, tests_root: str | Path = "tests") -> Te
     normalized = normalize_test_name(test_name)
     test_path = test_path_from_name(normalized, tests_root=tests_root)
     xpu_mode = "xpu" in normalized
-    skip_gpu_allocation = xpu_mode or has_no_gpu_marker(test_path)
+    npu_mode = is_npu_test_name(normalized)
+    skip_gpu_allocation = xpu_mode or npu_mode or has_no_gpu_marker(test_path)
     return TestRuntime(
         test_name=normalized,
         test_path=str(test_path),
diff --git a/tests/test_npu_linalg.py b/tests/test_npu_linalg.py
index d865d0827..a6eb4a0a8 100644
--- a/tests/test_npu_linalg.py
+++ b/tests/test_npu_linalg.py
@@ -10,10 +10,37 @@
 import torch
 
 from gptqmodel.quantization.npu_linalg import npu_inverse_cholesky_factor
-from gptqmodel.utils.torch import HAS_NPU
+from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
 
 
 pytestmark = pytest.mark.skipif(not HAS_NPU, reason="Ascend NPU is required")
+DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"
+
+
+def _default_npu_test_device() -> str:
+    selected = last_npu_device_by_pci_bus_order()
+    return str(selected) if selected is not None else "npu:0"
+
+
+NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
+
+
+def _test_npu_device() -> torch.device:
+    device = torch.device(NPU_TEST_DEVICE)  # NPU_TEST_DEVICE is the module-level device string
+    if HAS_NPU:  # only touch torch.npu when the NPU backend was detected
+        torch.npu.set_device(device)  # presumably selects it as the current NPU device, like torch.cuda.set_device — confirm
+    return device
+
+
+def _default_subprocess_env() -> dict[str, str]:
+    env = os.environ.copy()
+    visible_devices = env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
+    if not visible_devices:
+        visible_devices = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", DEFAULT_ASCEND_RT_VISIBLE_DEVICES)
+        env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices
+    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
+        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"
+    return env
 
 
 def _spd_matrix(size: int, seed: int) -> torch.Tensor:
@@ -23,7 +50,7 @@ def _spd_matrix(size: int, seed: int) -> torch.Tensor:
 
 
 def test_npu_inverse_cholesky_factor_matches_cpu_reference():
-    device = torch.device("npu:0")
+    device = _test_npu_device()
 
     for size in (8, 64, 128):
         matrix_cpu = _spd_matrix(size, seed=1000 + size)
@@ -46,7 +73,7 @@ def test_npu_inverse_cholesky_factor_matches_cpu_reference():
 
 
 def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
-    matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device="npu:0")
+    matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device=_test_npu_device())
 
     with pytest.raises(torch._C._LinAlgError):
         npu_inverse_cholesky_factor(matrix)
@@ -55,6 +82,7 @@ def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
 def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
     script = textwrap.dedent(
         """
+        import os
         import torch
         import torch.nn as nn
         from gptqmodel.quantization.config import QuantizeConfig
@@ -64,15 +92,16 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         if not HAS_NPU:
             raise RuntimeError("Ascend NPU is not available")
 
-        torch.npu.set_device(0)
+        npu_test_device = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", "npu:0")
+        torch.npu.set_device(npu_test_device)
         torch.manual_seed(0)
 
-        module = nn.Linear(16, 16, bias=False, device="npu:0", dtype=torch.float16)
+        module = nn.Linear(16, 16, bias=False, device=npu_test_device, dtype=torch.float16)
         gptq = GPTQ(module, qcfg=QuantizeConfig(damp_percent=0.05, damp_auto_increment=0.05))
 
         base = torch.randn(16, 16, dtype=torch.float32)
         hessian_cpu = base.matmul(base.T) + torch.eye(16, dtype=torch.float32) * 0.25
-        hessian = hessian_cpu.to(device="npu:0")
+        hessian = hessian_cpu.to(device=npu_test_device)
 
         factor, damp = gptq.hessian_inverse(hessian)
         torch.npu.synchronize()
@@ -90,8 +119,7 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         """
     )
 
-    env = os.environ.copy()
-    env.setdefault("ASCEND_RT_VISIBLE_DEVICES", "0")
+    env = _default_subprocess_env()
     proc = subprocess.run(
         [sys.executable, "-c", script],
         cwd=os.getcwd(),

From 504b69f1803fe0dc43c7a4d0d010a7fc84baa6a1 Mon Sep 17 00:00:00 2001
From: Qubitium <qubitium@modelcloud.ai>
Date: Fri, 22 May 2026 08:54:13 +0000
Subject: [PATCH 2/2] Use logical NPU 0 in Ascend tests

---
 .github/scripts/ci_tests.py |  4 ----
 tests/test_npu_linalg.py    | 29 ++++-------------------------
 tests/test_npu_support.py   |  8 +-------
 3 files changed, 5 insertions(+), 36 deletions(-)

diff --git a/.github/scripts/ci_tests.py b/.github/scripts/ci_tests.py
index bda63d1f0..c6ff92175 100644
--- a/.github/scripts/ci_tests.py
+++ b/.github/scripts/ci_tests.py
@@ -40,9 +40,6 @@ def configure_ascend_npu_test_env(env: dict[str, str], test_script: str) -> bool
         visible_devices = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", DEFAULT_ASCEND_RT_VISIBLE_DEVICES)
         env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices
 
-    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
-        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"
-
     env["CUDA_VISIBLE_DEVICES"] = ""
     return True
 
@@ -136,7 +133,6 @@ def run_tests(args: argparse.Namespace) -> int:
     ascend_npu_test = configure_ascend_npu_test_env(env, args.test_script)
     if ascend_npu_test:
         print(f"ASCEND_RT_VISIBLE_DEVICES={env.get('ASCEND_RT_VISIBLE_DEVICES', '')}")
-        print(f"GPTQMODEL_TEST_NPU_DEVICE={env.get('GPTQMODEL_TEST_NPU_DEVICE', '')}")
 
     if args.xpu_mode:
         maybe_uninstall_vllm()
diff --git a/tests/test_npu_linalg.py b/tests/test_npu_linalg.py
index a6eb4a0a8..9ab810812 100644
--- a/tests/test_npu_linalg.py
+++ b/tests/test_npu_linalg.py
@@ -10,19 +10,11 @@
 import torch
 
 from gptqmodel.quantization.npu_linalg import npu_inverse_cholesky_factor
-from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
+from gptqmodel.utils.torch import HAS_NPU
 
 
 pytestmark = pytest.mark.skipif(not HAS_NPU, reason="Ascend NPU is required")
-DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"
-
-
-def _default_npu_test_device() -> str:
-    selected = last_npu_device_by_pci_bus_order()
-    return str(selected) if selected is not None else "npu:0"
-
-
-NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
+NPU_TEST_DEVICE = "npu:0"
 
 
 def _test_npu_device() -> torch.device:
@@ -32,17 +24,6 @@ def _test_npu_device() -> torch.device:
     return device
 
 
-def _default_subprocess_env() -> dict[str, str]:
-    env = os.environ.copy()
-    visible_devices = env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
-    if not visible_devices:
-        visible_devices = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", DEFAULT_ASCEND_RT_VISIBLE_DEVICES)
-        env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices
-    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
-        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"
-    return env
-
-
 def _spd_matrix(size: int, seed: int) -> torch.Tensor:
     generator = torch.Generator(device="cpu").manual_seed(seed)
     values = torch.randn(size, size, generator=generator, dtype=torch.float32)
@@ -82,7 +63,6 @@ def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
 def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
     script = textwrap.dedent(
         """
-        import os
         import torch
         import torch.nn as nn
         from gptqmodel.quantization.config import QuantizeConfig
@@ -92,7 +72,7 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         if not HAS_NPU:
             raise RuntimeError("Ascend NPU is not available")
 
-        npu_test_device = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", "npu:0")
+        npu_test_device = "npu:0"
         torch.npu.set_device(npu_test_device)
         torch.manual_seed(0)
 
@@ -119,11 +99,10 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         """
     )
 
-    env = _default_subprocess_env()
     proc = subprocess.run(
         [sys.executable, "-c", script],
         cwd=os.getcwd(),
-        env=env,
+        env=os.environ.copy(),
         text=True,
         capture_output=True,
         timeout=60,
diff --git a/tests/test_npu_support.py b/tests/test_npu_support.py
index e69d705b1..5a102abe1 100644
--- a/tests/test_npu_support.py
+++ b/tests/test_npu_support.py
@@ -1,5 +1,4 @@
 import copy
-import os
 import sys
 import warnings
 
@@ -29,12 +28,7 @@
 from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
 
 
-def _default_npu_test_device() -> str:
-    selected = last_npu_device_by_pci_bus_order()
-    return str(selected) if selected is not None else "npu:0"
-
-
-NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
+NPU_TEST_DEVICE = "npu:0"
 NPU_CPU_FALLBACK_MARKERS = (
     "not currently supported on the NPU backend",
     "fall back to run on the CPU",