ModelCloud · Qubitium · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/.github/scripts/ci_tests.py b/.github/scripts/ci_tests.py
@@ -24,6 +24,24 @@
 ERROR_PATTERN = re.compile(
     r"nvcc fatal|error:|fatal error|ModuleNotFoundError|ImportError|AssertionError|Exception|is the correct path|No such file or directory|Repo id must be in"
 )
+DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"  # fallback ASCEND_RT_VISIBLE_DEVICES for NPU tests when no env override supplies one
+
+
+def is_ascend_npu_test(test_script: str) -> bool:  # "npu" must be a whole path/name token
+    return any(part == "npu" for part in re.split(r"[/_.-]+", test_script.removesuffix(".py")))
+
+
+def configure_ascend_npu_test_env(env: dict[str, str], test_script: str) -> bool:
+    """Set up `env` in place for an Ascend NPU test; return False (no-op) for other tests."""
+    if not is_ascend_npu_test(test_script):
+        return False
+
+    if not env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip():
+        # Blank values (e.g. an unset CI variable expanded to "") count as unset at both levels.
+        override = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", "").strip()
+        env["ASCEND_RT_VISIBLE_DEVICES"] = override or DEFAULT_ASCEND_RT_VISIBLE_DEVICES
+    env["CUDA_VISIBLE_DEVICES"] = ""  # NPU tests always run with CUDA_VISIBLE_DEVICES emptied
+    return True
 
 
 def kill_process_group(proc: subprocess.Popen[str]) -> None:
@@ -112,6 +130,10 @@ def run_tests(args: argparse.Namespace) -> int:
         env["CUDA_VISIBLE_DEVICES"] = ""
         print("CUDA_VISIBLE_DEVICES=")
 
+    ascend_npu_test = configure_ascend_npu_test_env(env, args.test_script)
+    if ascend_npu_test:
+        print(f"ASCEND_RT_VISIBLE_DEVICES={env.get('ASCEND_RT_VISIBLE_DEVICES', '')}")
+
     if args.xpu_mode:
         maybe_uninstall_vllm()
 
@@ -138,17 +160,16 @@ def run_tests(args: argparse.Namespace) -> int:
         start_new_session=True,
     )
 
-    keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
-    keepalive_payload = build_job_request(
-        runner_name=args.runner,
-        run_id=args.run_id,
-        test_name=args.test_script,
-    )
-
     monitor_thread = None
     monitor_stop = None
     monitor_state = {"forced_exit_code": 0}
-    if env.get("CUDA_VISIBLE_DEVICES", ""):
+    if env.get("CUDA_VISIBLE_DEVICES", "") and not ascend_npu_test:
+        keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
+        keepalive_payload = build_job_request(
+            runner_name=args.runner,
+            run_id=args.run_id,
+            test_name=args.test_script,
+        )
         monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor(
             proc=proc,
             keepalive_endpoint=keepalive_endpoint,

diff --git a/.github/scripts/ci_workflow.py b/.github/scripts/ci_workflow.py
@@ -121,6 +121,10 @@ def normalize_test_name(name: str) -> str:
     return strip_py_suffix(name.removeprefix("tests/"))
 
 
+def is_npu_test_name(name: str) -> bool:  # "npu" must be a whole token of the normalized name
+    return any(token == "npu" for token in re.split(r"[/_.-]+", normalize_test_name(name)))
+
+
 def test_path_from_name(test_name: str, tests_root: str | Path = "tests") -> Path:
     normalized = normalize_test_name(test_name)
     return Path(tests_root) / f"{normalized}.py"
@@ -372,7 +376,8 @@ def resolve_test_runtime(test_name: str, tests_root: str | Path = "tests") -> Te
     normalized = normalize_test_name(test_name)
     test_path = test_path_from_name(normalized, tests_root=tests_root)
     xpu_mode = "xpu" in normalized
-    skip_gpu_allocation = xpu_mode or has_no_gpu_marker(test_path)
+    npu_mode = is_npu_test_name(normalized)
+    skip_gpu_allocation = xpu_mode or npu_mode or has_no_gpu_marker(test_path)
     return TestRuntime(
         test_name=normalized,
         test_path=str(test_path),

diff --git a/tests/test_npu_linalg.py b/tests/test_npu_linalg.py
@@ -14,6 +14,14 @@
 
 
 pytestmark = pytest.mark.skipif(not HAS_NPU, reason="Ascend NPU is required")
+NPU_TEST_DEVICE = "npu:0"  # device string resolved by _test_npu_device()
+
+
+def _test_npu_device() -> torch.device:
+    """Return torch.device(NPU_TEST_DEVICE), first passing it to torch.npu.set_device when HAS_NPU."""
+    if HAS_NPU:
+        torch.npu.set_device(torch.device(NPU_TEST_DEVICE))
+    return torch.device(NPU_TEST_DEVICE)
 
 
 def _spd_matrix(size: int, seed: int) -> torch.Tensor:
@@ -23,7 +31,7 @@ def _spd_matrix(size: int, seed: int) -> torch.Tensor:
 
 
 def test_npu_inverse_cholesky_factor_matches_cpu_reference():
-    device = torch.device("npu:0")
+    device = _test_npu_device()
 
     for size in (8, 64, 128):
         matrix_cpu = _spd_matrix(size, seed=1000 + size)
@@ -46,7 +54,7 @@ def test_npu_inverse_cholesky_factor_matches_cpu_reference():
 
 
 def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
-    matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device="npu:0")
+    matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device=_test_npu_device())
 
     with pytest.raises(torch._C._LinAlgError):
         npu_inverse_cholesky_factor(matrix)
@@ -64,15 +72,16 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         if not HAS_NPU:
             raise RuntimeError("Ascend NPU is not available")
 
-        torch.npu.set_device(0)
+        npu_test_device = "npu:0"
+        torch.npu.set_device(npu_test_device)
         torch.manual_seed(0)
 
-        module = nn.Linear(16, 16, bias=False, device="npu:0", dtype=torch.float16)
+        module = nn.Linear(16, 16, bias=False, device=npu_test_device, dtype=torch.float16)
         gptq = GPTQ(module, qcfg=QuantizeConfig(damp_percent=0.05, damp_auto_increment=0.05))
 
         base = torch.randn(16, 16, dtype=torch.float32)
         hessian_cpu = base.matmul(base.T) + torch.eye(16, dtype=torch.float32) * 0.25
-        hessian = hessian_cpu.to(device="npu:0")
+        hessian = hessian_cpu.to(device=npu_test_device)
 
         factor, damp = gptq.hessian_inverse(hessian)
         torch.npu.synchronize()
@@ -90,12 +99,10 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
         """
     )
 
-    env = os.environ.copy()
-    env.setdefault("ASCEND_RT_VISIBLE_DEVICES", "0")
     proc = subprocess.run(
         [sys.executable, "-c", script],
         cwd=os.getcwd(),
-        env=env,
+        env=os.environ.copy(),
         text=True,
         capture_output=True,
         timeout=60,

diff --git a/tests/test_npu_support.py b/tests/test_npu_support.py
@@ -1,5 +1,4 @@
 import copy
-import os
 import sys
 import warnings
 
@@ -29,12 +28,7 @@
 from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
 
 
-def _default_npu_test_device() -> str:
-    selected = last_npu_device_by_pci_bus_order()
-    return str(selected) if selected is not None else "npu:0"
-
-
-NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
+NPU_TEST_DEVICE = "npu:0"
 NPU_CPU_FALLBACK_MARKERS = (
     "not currently supported on the NPU backend",
     "fall back to run on the CPU",