Skip to content

Commit 0c2294c

Browse files
committed
Pin Ascend unit tests to NPU 7
1 parent 3ff0371 commit 0c2294c

3 files changed

Lines changed: 75 additions & 17 deletions

File tree

.github/scripts/ci_tests.py

Lines changed: 33 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,27 @@
2424
# Alternatives that make up ERROR_PATTERN, one regex fragment per entry.
# They are joined with "|" so the compiled pattern matches any one of them.
# NOTE(review): presumably searched against captured test output; the call
# site is outside this view.
_ERROR_PATTERN_ALTERNATIVES = (
    r"nvcc fatal",
    r"error:",
    r"fatal error",
    r"ModuleNotFoundError",
    r"ImportError",
    r"AssertionError",
    r"Exception",
    r"is the correct path",
    r"No such file or directory",
    r"Repo id must be in",
)
ERROR_PATTERN = re.compile("|".join(_ERROR_PATTERN_ALTERNATIVES))
27+
# Device list exported as ASCEND_RT_VISIBLE_DEVICES when neither that variable
# nor GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES supplies a usable value.
DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"


def is_ascend_npu_test(test_script: str) -> bool:
    """Return True when ``test_script`` has a standalone ``npu`` name component.

    A trailing ``.py`` is removed, then the name is split on runs of ``/``,
    ``_``, ``.`` and ``-``. The match is exact and case-sensitive, so
    ``tests/test_npu_linalg.py`` qualifies while ``tests/test_inputs.py``
    does not.
    """
    return "npu" in re.split(r"[/_.-]+", test_script.removesuffix(".py"))


def configure_ascend_npu_test_env(env: dict[str, str], test_script: str) -> bool:
    """Prepare ``env`` in place for an Ascend NPU test run.

    Args:
        env: Environment mapping for the test subprocess; mutated in place.
        test_script: Test script name or path, checked with
            ``is_ascend_npu_test``.

    Returns:
        False, leaving ``env`` untouched, when ``test_script`` is not an NPU
        test. True after ``env`` has been updated.

    For NPU tests the following happens:

    * ``ASCEND_RT_VISIBLE_DEVICES`` is filled in only when it is missing or
      blank. The value comes from ``GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES``
      and falls back to ``DEFAULT_ASCEND_RT_VISIBLE_DEVICES``.
    * ``GPTQMODEL_TEST_NPU_DEVICE`` is set to ``"npu:0"`` when it is absent
      and exactly one device is visible.
    * ``CUDA_VISIBLE_DEVICES`` is forced to the empty string.
    """
    if not is_ascend_npu_test(test_script):
        return False

    visible_devices = env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
    if not visible_devices:
        # A set-but-blank override (e.g. a CI variable that expands to "")
        # must not win over the default; otherwise an empty device list would
        # be exported and the single-device pin below would be skipped.
        override = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", "").strip()
        visible_devices = override or DEFAULT_ASCEND_RT_VISIBLE_DEVICES
        # NOTE(review): an explicit non-blank caller value is left untouched
        # (setdefault-like behaviour) — confirm this matches the intent.
        env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices

    # NOTE(review): "npu:0" presumably works because a single visible device
    # is addressed as index 0 inside the process — confirm against the Ascend
    # runtime documentation.
    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"

    env["CUDA_VISIBLE_DEVICES"] = ""
    return True
2748

2849

2950
def kill_process_group(proc: subprocess.Popen[str]) -> None:
@@ -112,6 +133,11 @@ def run_tests(args: argparse.Namespace) -> int:
112133
env["CUDA_VISIBLE_DEVICES"] = ""
113134
print("CUDA_VISIBLE_DEVICES=")
114135

136+
ascend_npu_test = configure_ascend_npu_test_env(env, args.test_script)
137+
if ascend_npu_test:
138+
print(f"ASCEND_RT_VISIBLE_DEVICES={env.get('ASCEND_RT_VISIBLE_DEVICES', '')}")
139+
print(f"GPTQMODEL_TEST_NPU_DEVICE={env.get('GPTQMODEL_TEST_NPU_DEVICE', '')}")
140+
115141
if args.xpu_mode:
116142
maybe_uninstall_vllm()
117143

@@ -138,17 +164,16 @@ def run_tests(args: argparse.Namespace) -> int:
138164
start_new_session=True,
139165
)
140166

141-
keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
142-
keepalive_payload = build_job_request(
143-
runner_name=args.runner,
144-
run_id=args.run_id,
145-
test_name=args.test_script,
146-
)
147-
148167
monitor_thread = None
149168
monitor_stop = None
150169
monitor_state = {"forced_exit_code": 0}
151-
if env.get("CUDA_VISIBLE_DEVICES", ""):
170+
if env.get("CUDA_VISIBLE_DEVICES", "") and not ascend_npu_test:
171+
keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
172+
keepalive_payload = build_job_request(
173+
runner_name=args.runner,
174+
run_id=args.run_id,
175+
test_name=args.test_script,
176+
)
152177
monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor(
153178
proc=proc,
154179
keepalive_endpoint=keepalive_endpoint,

.github/scripts/ci_workflow.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,10 @@ def normalize_test_name(name: str) -> str:
121121
return strip_py_suffix(name.removeprefix("tests/"))
122122

123123

124+
def is_npu_test_name(name: str) -> bool:
    """Return True when the normalized test name has a standalone ``npu`` part.

    ``name`` is first passed through ``normalize_test_name``; the result is
    split on runs of ``/``, ``_``, ``.`` and ``-`` and checked for an exact,
    case-sensitive ``npu`` component.
    """
    name_parts = re.split(r"[/_.-]+", normalize_test_name(name))
    return "npu" in name_parts
126+
127+
124128
def test_path_from_name(test_name: str, tests_root: str | Path = "tests") -> Path:
    """Build the path of the test file for ``test_name`` under ``tests_root``.

    The name is normalized with ``normalize_test_name`` and given a ``.py``
    suffix before being joined onto ``tests_root``. Nothing is checked
    against the filesystem.
    """
    stem = normalize_test_name(test_name)
    return Path(tests_root).joinpath(f"{stem}.py")
@@ -372,7 +376,8 @@ def resolve_test_runtime(test_name: str, tests_root: str | Path = "tests") -> Te
372376
normalized = normalize_test_name(test_name)
373377
test_path = test_path_from_name(normalized, tests_root=tests_root)
374378
xpu_mode = "xpu" in normalized
375-
skip_gpu_allocation = xpu_mode or has_no_gpu_marker(test_path)
379+
npu_mode = is_npu_test_name(normalized)
380+
skip_gpu_allocation = xpu_mode or npu_mode or has_no_gpu_marker(test_path)
376381
return TestRuntime(
377382
test_name=normalized,
378383
test_path=str(test_path),

tests/test_npu_linalg.py

Lines changed: 36 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,37 @@
1010
import torch
1111

1212
from gptqmodel.quantization.npu_linalg import npu_inverse_cholesky_factor
13-
from gptqmodel.utils.torch import HAS_NPU
13+
from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
1414

1515

1616
pytestmark = pytest.mark.skipif(not HAS_NPU, reason="Ascend NPU is required")
17+
# Device list exported to subprocesses as ASCEND_RT_VISIBLE_DEVICES when the
# environment does not provide a usable value.
DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"


def _default_npu_test_device() -> str:
    """Return the device string used when no explicit test device is set.

    Uses the result of ``last_npu_device_by_pci_bus_order()`` rendered with
    ``str``, or ``"npu:0"`` when that helper returns ``None``.
    """
    selected = last_npu_device_by_pci_bus_order()
    return "npu:0" if selected is None else str(selected)


# `or` instead of a `.get()` default: a default argument is evaluated eagerly,
# which ran the device probe at import time even when the variable was set.
# A blank GPTQMODEL_TEST_NPU_DEVICE now also falls back to the probed default
# instead of yielding an empty device string.
NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", "").strip() or _default_npu_test_device()
26+
27+
28+
def _test_npu_device() -> torch.device:
    """Return the ``torch.device`` for ``NPU_TEST_DEVICE``.

    When ``HAS_NPU`` is true the device is also made current through
    ``torch.npu.set_device`` before it is returned.
    """
    target = torch.device(NPU_TEST_DEVICE)
    if not HAS_NPU:
        return target
    torch.npu.set_device(target)
    return target
33+
34+
35+
def _default_subprocess_env() -> dict[str, str]:
    """Return a copy of ``os.environ`` prepared for an NPU test subprocess.

    * ``ASCEND_RT_VISIBLE_DEVICES`` is filled in only when it is missing or
      blank. The value comes from ``GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES``
      and falls back to ``DEFAULT_ASCEND_RT_VISIBLE_DEVICES``.
    * ``GPTQMODEL_TEST_NPU_DEVICE`` is set to ``"npu:0"`` when it is absent
      and exactly one device is visible.

    The process environment itself is not modified.
    """
    env = os.environ.copy()

    visible_devices = env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
    if not visible_devices:
        # A set-but-blank override must not defeat the default; otherwise the
        # subprocess would be started with an empty visible-device list.
        override = env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", "").strip()
        visible_devices = override or DEFAULT_ASCEND_RT_VISIBLE_DEVICES
        # NOTE(review): an explicit non-blank value is left as the caller set
        # it (setdefault-like behaviour) — confirm this matches the intent.
        env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices

    # NOTE(review): "npu:0" presumably works because a single visible device
    # is addressed as index 0 inside the subprocess — confirm against the
    # Ascend runtime documentation.
    if "GPTQMODEL_TEST_NPU_DEVICE" not in env and visible_devices and "," not in visible_devices:
        env["GPTQMODEL_TEST_NPU_DEVICE"] = "npu:0"

    return env
1744

1845

1946
def _spd_matrix(size: int, seed: int) -> torch.Tensor:
@@ -23,7 +50,7 @@ def _spd_matrix(size: int, seed: int) -> torch.Tensor:
2350

2451

2552
def test_npu_inverse_cholesky_factor_matches_cpu_reference():
26-
device = torch.device("npu:0")
53+
device = _test_npu_device()
2754

2855
for size in (8, 64, 128):
2956
matrix_cpu = _spd_matrix(size, seed=1000 + size)
@@ -46,7 +73,7 @@ def test_npu_inverse_cholesky_factor_matches_cpu_reference():
4673

4774

4875
def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
49-
matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device="npu:0")
76+
matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device=_test_npu_device())
5077

5178
with pytest.raises(torch._C._LinAlgError):
5279
npu_inverse_cholesky_factor(matrix)
@@ -55,6 +82,7 @@ def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
5582
def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
5683
script = textwrap.dedent(
5784
"""
85+
import os
5886
import torch
5987
import torch.nn as nn
6088
from gptqmodel.quantization.config import QuantizeConfig
@@ -64,15 +92,16 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
6492
if not HAS_NPU:
6593
raise RuntimeError("Ascend NPU is not available")
6694
67-
torch.npu.set_device(0)
95+
npu_test_device = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", "npu:0")
96+
torch.npu.set_device(npu_test_device)
6897
torch.manual_seed(0)
6998
70-
module = nn.Linear(16, 16, bias=False, device="npu:0", dtype=torch.float16)
99+
module = nn.Linear(16, 16, bias=False, device=npu_test_device, dtype=torch.float16)
71100
gptq = GPTQ(module, qcfg=QuantizeConfig(damp_percent=0.05, damp_auto_increment=0.05))
72101
73102
base = torch.randn(16, 16, dtype=torch.float32)
74103
hessian_cpu = base.matmul(base.T) + torch.eye(16, dtype=torch.float32) * 0.25
75-
hessian = hessian_cpu.to(device="npu:0")
104+
hessian = hessian_cpu.to(device=npu_test_device)
76105
77106
factor, damp = gptq.hessian_inverse(hessian)
78107
torch.npu.synchronize()
@@ -90,8 +119,7 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
90119
"""
91120
)
92121

93-
env = os.environ.copy()
94-
env.setdefault("ASCEND_RT_VISIBLE_DEVICES", "0")
122+
env = _default_subprocess_env()
95123
proc = subprocess.run(
96124
[sys.executable, "-c", script],
97125
cwd=os.getcwd(),

0 commit comments

Comments
 (0)