Skip to content

Commit 0f44a4c

Browse files
authored
Ascend tests (#2908)
* Pin Ascend unit tests to NPU 7
* Use logical NPU 0 in Ascend tests
1 parent 7bbcb28 commit 0f44a4c

4 files changed

Lines changed: 51 additions & 24 deletions

File tree

.github/scripts/ci_tests.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,24 @@
2424
# Alternation of literal fragments treated as error markers.
# NOTE(review): presumably searched against captured test output — the call
# site is outside this view; confirm before adding or removing alternatives.
ERROR_PATTERN = re.compile(
    "|".join(
        (
            r"nvcc fatal",
            r"error:",
            r"fatal error",
            r"ModuleNotFoundError",
            r"ImportError",
            r"AssertionError",
            r"Exception",
            r"is the correct path",
            r"No such file or directory",
            r"Repo id must be in",
        )
    )
)
27+
# Device id exported as ASCEND_RT_VISIBLE_DEVICES when neither the caller's
# environment nor the GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES override names one.
DEFAULT_ASCEND_RT_VISIBLE_DEVICES = "7"


def is_ascend_npu_test(test_script: str) -> bool:
    """Return True when ``npu`` is a standalone token of the script path.

    A trailing ``.py`` is removed and the remainder is split on runs of
    ``/``, ``_``, ``.`` and ``-``. ``tests/test_npu_linalg.py`` therefore
    matches, while a name that merely contains the letters
    (``tests/test_npuish.py``) does not. The match is case-sensitive.
    """
    return "npu" in re.split(r"[/_.-]+", test_script.removesuffix(".py"))


def configure_ascend_npu_test_env(env: dict[str, str], test_script: str) -> bool:
    """Prepare ``env`` in place for an Ascend NPU test script.

    Nothing is touched unless ``test_script`` is recognised by
    :func:`is_ascend_npu_test`. For an NPU test the device selection is
    resolved with this precedence, ignoring blank or whitespace-only values:

    1. ``ASCEND_RT_VISIBLE_DEVICES`` already present in ``env``
    2. ``GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES`` in ``env``
    3. ``DEFAULT_ASCEND_RT_VISIBLE_DEVICES``

    The resolved, whitespace-stripped value is written back to
    ``ASCEND_RT_VISIBLE_DEVICES`` and ``CUDA_VISIBLE_DEVICES`` is set to the
    empty string.

    Args:
        env: Environment mapping for the test subprocess; mutated in place.
        test_script: Path or name of the test script about to be run.

    Returns:
        True if ``env`` was configured for an NPU test, False otherwise.
    """
    if not is_ascend_npu_test(test_script):
        return False

    # A blank override must not win over the default: CI systems commonly
    # forward unset variables as "", which would otherwise export an empty
    # device list.
    visible_devices = (
        env.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
        or env.get("GPTQMODEL_TEST_ASCEND_RT_VISIBLE_DEVICES", "").strip()
        or DEFAULT_ASCEND_RT_VISIBLE_DEVICES
    )
    env["ASCEND_RT_VISIBLE_DEVICES"] = visible_devices
    env["CUDA_VISIBLE_DEVICES"] = ""
    return True
2745

2846

2947
def kill_process_group(proc: subprocess.Popen[str]) -> None:
@@ -112,6 +130,10 @@ def run_tests(args: argparse.Namespace) -> int:
112130
env["CUDA_VISIBLE_DEVICES"] = ""
113131
print("CUDA_VISIBLE_DEVICES=")
114132

133+
ascend_npu_test = configure_ascend_npu_test_env(env, args.test_script)
134+
if ascend_npu_test:
135+
print(f"ASCEND_RT_VISIBLE_DEVICES={env.get('ASCEND_RT_VISIBLE_DEVICES', '')}")
136+
115137
if args.xpu_mode:
116138
maybe_uninstall_vllm()
117139

@@ -138,17 +160,16 @@ def run_tests(args: argparse.Namespace) -> int:
138160
start_new_session=True,
139161
)
140162

141-
keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
142-
keepalive_payload = build_job_request(
143-
runner_name=args.runner,
144-
run_id=args.run_id,
145-
test_name=args.test_script,
146-
)
147-
148163
monitor_thread = None
149164
monitor_stop = None
150165
monitor_state = {"forced_exit_code": 0}
151-
if env.get("CUDA_VISIBLE_DEVICES", ""):
166+
if env.get("CUDA_VISIBLE_DEVICES", "") and not ascend_npu_test:
167+
keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive"
168+
keepalive_payload = build_job_request(
169+
runner_name=args.runner,
170+
run_id=args.run_id,
171+
test_name=args.test_script,
172+
)
152173
monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor(
153174
proc=proc,
154175
keepalive_endpoint=keepalive_endpoint,

.github/scripts/ci_workflow.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ def normalize_test_name(name: str) -> str:
121121
return strip_py_suffix(name.removeprefix("tests/"))
122122

123123

124+
def is_npu_test_name(name: str) -> bool:
    """Report whether ``npu`` is a standalone token of a test name.

    The name is first passed through ``normalize_test_name`` and then split
    on runs of ``/``, ``_``, ``.`` and ``-``; only an exact, case-sensitive
    ``npu`` token counts as a match.
    """
    tokens = re.split(r"[/_.-]+", normalize_test_name(name))
    return any(token == "npu" for token in tokens)
126+
127+
124128
def test_path_from_name(test_name: str, tests_root: str | Path = "tests") -> Path:
125129
normalized = normalize_test_name(test_name)
126130
return Path(tests_root) / f"{normalized}.py"
@@ -372,7 +376,8 @@ def resolve_test_runtime(test_name: str, tests_root: str | Path = "tests") -> Te
372376
normalized = normalize_test_name(test_name)
373377
test_path = test_path_from_name(normalized, tests_root=tests_root)
374378
xpu_mode = "xpu" in normalized
375-
skip_gpu_allocation = xpu_mode or has_no_gpu_marker(test_path)
379+
npu_mode = is_npu_test_name(normalized)
380+
skip_gpu_allocation = xpu_mode or npu_mode or has_no_gpu_marker(test_path)
376381
return TestRuntime(
377382
test_name=normalized,
378383
test_path=str(test_path),

tests/test_npu_linalg.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@
1414

1515

1616
pytestmark = pytest.mark.skipif(not HAS_NPU, reason="Ascend NPU is required")
17+
# Device string every test in this module targets.
NPU_TEST_DEVICE = "npu:0"


def _test_npu_device() -> torch.device:
    """Return the ``torch.device`` for ``NPU_TEST_DEVICE``.

    When ``HAS_NPU`` is truthy the device is also passed to
    ``torch.npu.set_device`` before being returned; otherwise the device
    object is returned without touching ``torch.npu``.
    """
    target = torch.device(NPU_TEST_DEVICE)
    if not HAS_NPU:
        return target
    torch.npu.set_device(target)
    return target
1725

1826

1927
def _spd_matrix(size: int, seed: int) -> torch.Tensor:
@@ -23,7 +31,7 @@ def _spd_matrix(size: int, seed: int) -> torch.Tensor:
2331

2432

2533
def test_npu_inverse_cholesky_factor_matches_cpu_reference():
26-
device = torch.device("npu:0")
34+
device = _test_npu_device()
2735

2836
for size in (8, 64, 128):
2937
matrix_cpu = _spd_matrix(size, seed=1000 + size)
@@ -46,7 +54,7 @@ def test_npu_inverse_cholesky_factor_matches_cpu_reference():
4654

4755

4856
def test_npu_inverse_cholesky_factor_rejects_non_positive_definite_matrix():
49-
matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device="npu:0")
57+
matrix = torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32, device=_test_npu_device())
5058

5159
with pytest.raises(torch._C._LinAlgError):
5260
npu_inverse_cholesky_factor(matrix)
@@ -64,15 +72,16 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
6472
if not HAS_NPU:
6573
raise RuntimeError("Ascend NPU is not available")
6674
67-
torch.npu.set_device(0)
75+
npu_test_device = "npu:0"
76+
torch.npu.set_device(npu_test_device)
6877
torch.manual_seed(0)
6978
70-
module = nn.Linear(16, 16, bias=False, device="npu:0", dtype=torch.float16)
79+
module = nn.Linear(16, 16, bias=False, device=npu_test_device, dtype=torch.float16)
7180
gptq = GPTQ(module, qcfg=QuantizeConfig(damp_percent=0.05, damp_auto_increment=0.05))
7281
7382
base = torch.randn(16, 16, dtype=torch.float32)
7483
hessian_cpu = base.matmul(base.T) + torch.eye(16, dtype=torch.float32) * 0.25
75-
hessian = hessian_cpu.to(device="npu:0")
84+
hessian = hessian_cpu.to(device=npu_test_device)
7685
7786
factor, damp = gptq.hessian_inverse(hessian)
7887
torch.npu.synchronize()
@@ -90,12 +99,10 @@ def test_gptq_npu_hessian_inverse_avoids_torch_npu_cpu_fallback_warnings():
9099
"""
91100
)
92101

93-
env = os.environ.copy()
94-
env.setdefault("ASCEND_RT_VISIBLE_DEVICES", "0")
95102
proc = subprocess.run(
96103
[sys.executable, "-c", script],
97104
cwd=os.getcwd(),
98-
env=env,
105+
env=os.environ.copy(),
99106
text=True,
100107
capture_output=True,
101108
timeout=60,

tests/test_npu_support.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import copy
2-
import os
32
import sys
43
import warnings
54

@@ -29,12 +28,7 @@
2928
from gptqmodel.utils.torch import HAS_NPU, last_npu_device_by_pci_bus_order
3029

3130

32-
def _default_npu_test_device() -> str:
33-
selected = last_npu_device_by_pci_bus_order()
34-
return str(selected) if selected is not None else "npu:0"
35-
36-
37-
NPU_TEST_DEVICE = os.environ.get("GPTQMODEL_TEST_NPU_DEVICE", _default_npu_test_device())
31+
NPU_TEST_DEVICE = "npu:0"
3832
NPU_CPU_FALLBACK_MARKERS = (
3933
"not currently supported on the NPU backend",
4034
"fall back to run on the CPU",

0 commit comments

Comments
 (0)