From f64ae22976318a9f14be8c1f5bf59ab6371d7606 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 09:35:33 +0800 Subject: [PATCH 1/9] [CI] refactor --- .github/scripts/allocate_gpu.py | 149 ------------- .github/scripts/ci_deps.py | 149 +++++++++++++ .github/scripts/ci_gpu.py | 315 +++++++++++++++++++++++++++ .github/scripts/ci_loop_versions.py | 37 ---- .github/scripts/ci_tests.py | 242 ++++++++++++++++++++ .github/scripts/ci_workflow.py | 221 +++++++++++++++++++ .github/scripts/install_deps.py | 138 ------------ .github/scripts/list_test_files.py | 140 ------------ .github/scripts/parse_test_config.py | 87 -------- .github/scripts/release_gpu.py | 45 ---- .github/scripts/run_tests.py | 200 ----------------- .github/scripts/uninstall_deps.py | 68 ------ .github/workflows/compatibility.yml | 4 +- .github/workflows/unit_tests.yml | 108 ++++----- scripts/arch.md | 6 +- 15 files changed, 974 insertions(+), 935 deletions(-) delete mode 100644 .github/scripts/allocate_gpu.py create mode 100644 .github/scripts/ci_deps.py create mode 100644 .github/scripts/ci_gpu.py delete mode 100644 .github/scripts/ci_loop_versions.py create mode 100644 .github/scripts/ci_tests.py create mode 100644 .github/scripts/ci_workflow.py delete mode 100644 .github/scripts/install_deps.py delete mode 100644 .github/scripts/list_test_files.py delete mode 100644 .github/scripts/parse_test_config.py delete mode 100644 .github/scripts/release_gpu.py delete mode 100644 .github/scripts/run_tests.py delete mode 100644 .github/scripts/uninstall_deps.py diff --git a/.github/scripts/allocate_gpu.py b/.github/scripts/allocate_gpu.py deleted file mode 100644 index e12dba035..000000000 --- a/.github/scripts/allocate_gpu.py +++ /dev/null @@ -1,149 +0,0 @@ -import argparse -import os -import subprocess -import sys -import time -import urllib.error -import urllib.parse -import urllib.request - - -def now_ms() -> int: - return time.time_ns() // 1_000_000 - - -def fetch_text(url: str, *, timeout: 
float, suppress_error: bool = False) -> str: - try: - with urllib.request.urlopen(url, timeout=timeout) as response: - return response.read().decode("utf-8", errors="replace") - except (urllib.error.URLError, TimeoutError, OSError) as exc: - if suppress_error: - print(f"Request failed for {url}: {exc}") - return "" - raise - - -def fetch_with_retry(url: str, *, timeout: float, retries: int, retry_delay: float) -> str: - last_error: Exception | None = None - for attempt in range(retries + 1): - try: - return fetch_text(url, timeout=timeout) - except (urllib.error.URLError, TimeoutError, OSError) as exc: - last_error = exc - if attempt < retries: - time.sleep(retry_delay) - if last_error is not None: - print(f"Request failed after retries: {last_error}") - return "" - - -def print_status(base_url: str) -> None: - status = fetch_text(f"{base_url}/gpu/status", timeout=10, suppress_error=True).strip() - if status: - print(status) - - -def append_github_env(name: str, value: str) -> None: - github_env = os.environ.get("GITHUB_ENV") - if not github_env: - return - with open(github_env, "a", encoding="utf-8") as fh: - fh.write(f"{name}={value}\n") - - -def is_valid_gpu_response(value: str) -> bool: - if not value: - return False - for part in value.split(","): - if not part: - return False - if part.startswith("-"): - if not part[1:].isdigit(): - return False - elif not part.isdigit(): - return False - return True - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--test", required=True) - parser.add_argument("--runner", required=True) - parser.add_argument("--count", required=True) - parser.add_argument("--sleep-sec", type=float, default=5) - parser.add_argument("--timeout-sec", type=int, default=18000) - parser.add_argument("--request-timeout", type=float, default=10) - parser.add_argument("--retries", type=int, default=3) - 
parser.add_argument("--retry-delay", type=float, default=1) - parser.add_argument("--require-single", action="store_true") - args = parser.parse_args() - - encoded_test = urllib.parse.quote(args.test, safe="") - encoded_runner = urllib.parse.quote(args.runner, safe="") - start_s = time.time() - - print("Requesting GPU from allocator") - print( - f"run_id={args.run_id} test={args.test} runner={args.runner} count={args.count}" - ) - - while True: - ts_ms = now_ms() - url = ( - f"{args.base_url}/gpu/get?runid={args.run_id}×tamp={ts_ms}" - f"&test={encoded_test}&runner={encoded_runner}&count={args.count}" - ) - print(f"requesting GPU with: {url}") - - resp = fetch_with_retry( - url, - timeout=args.request_timeout, - retries=args.retries, - retry_delay=args.retry_delay, - ).replace("\r", "").replace("\n", "").strip() - - print(f"resp={{{resp}}}") - - if not is_valid_gpu_response(resp): - print(f"Allocator returned invalid response: {resp!r} (temporary error)") - print_status(args.base_url) - time.sleep(args.sleep_sec) - continue - - if resp.startswith("-") and "," not in resp: - elapsed = int(time.time() - start_s) - if elapsed >= args.timeout_sec: - print( - f"Timed out after {args.timeout_sec}s waiting for GPU " - f"(last response={resp})" - ) - print_status(args.base_url) - return 1 - - print( - f"No GPU available (response={resp}). Waiting {args.sleep_sec}s..." 
- f" elapsed={elapsed}s" - ) - print_status(args.base_url) - time.sleep(args.sleep_sec) - continue - - if args.require_single and "," in resp: - print(f"Allocator returned multiple GPUs for job requiring one GPU: {resp}") - return 1 - - print(f"Allocated GPU ID: {resp}") - append_github_env("CUDA_VISIBLE_DEVICES", resp) - append_github_env("STEP_TIMESTAMP", str(now_ms())) - print(f"CUDA_VISIBLE_DEVICES set to {resp}") - - print(subprocess.getoutput(f'nvidia-smi -i {resp} --query-gpu=name --format=csv')) - - print_status(args.base_url) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/scripts/ci_deps.py b/.github/scripts/ci_deps.py new file mode 100644 index 000000000..445c8fa86 --- /dev/null +++ b/.github/scripts/ci_deps.py @@ -0,0 +1,149 @@ +import argparse +import re +import subprocess +import sys +from pathlib import Path +from typing import Any + +import yaml + +BASE_DIR = Path(__file__).resolve().parent +_PKG_NAME_RE = re.compile(r"^[A-Za-z0-9_.-]+") + + +def resolve_test_path(raw_name: str) -> Path: + return Path("tests") / f"{raw_name}.py" + + +def load_yaml(filename: str) -> dict[str, Any]: + with (BASE_DIR / filename).open("r", encoding="utf-8") as fh: + return yaml.safe_load(fh) or {} + + +def normalize_pkg_spec(spec: str) -> str: + spec = (spec or "").strip() + if not spec: + return spec + + if spec.startswith("git+"): + return spec + + if spec.startswith("https://github.com/"): + spec = spec.rstrip("/") + if not spec.endswith(".git"): + spec += ".git" + return "git+" + spec + + return spec + + +def pkg_key(spec: str) -> str: + spec = normalize_pkg_spec(spec) + if not spec: + return spec + + if spec.startswith("git+"): + repo = spec.rsplit("/", 1)[-1] + if repo.endswith(".git"): + repo = repo[:-4] + return repo.split("@", 1)[0].lower().replace("_", "-") + + if "://" in spec: + return spec + + spec = spec.split(";", 1)[0].strip() + if " @" in spec: + spec = spec.split(" @", 1)[0].strip() + + match = 
_PKG_NAME_RE.match(spec) + if not match: + return spec.lower() + + return match.group(0).lower().replace("_", "-") + + +def collect_pkgs(test_path: Path, deps: dict[str, Any], *, dedupe_common: bool) -> tuple[list[str], list[str]]: + specific_pkgs: set[str] = set() + common_pkgs: set[str] = set(deps.get("common") or []) + + specific_pkgs.update(deps.get("tests", {}).get(test_path.name) or []) + + test_path_str = test_path.as_posix() + for key, value in deps.items(): + if not (isinstance(key, str) and key.startswith("tests/")): + continue + if not test_path_str.startswith(key + "/"): + continue + + if isinstance(value, list): + specific_pkgs.update(value) + elif isinstance(value, dict): + specific_pkgs.update(value.get(test_path.name) or []) + + if dedupe_common: + specific_keys = {pkg_key(pkg) for pkg in specific_pkgs} + common_pkgs = {pkg for pkg in common_pkgs if pkg_key(pkg) not in specific_keys} + + return sorted(specific_pkgs), sorted(common_pkgs) + + +def run_uv_pip(action: str, pkgs: list[str], *, extra_args: list[str] | None = None) -> None: + if not pkgs: + return + + normalized = [normalize_pkg_spec(pkg) for pkg in pkgs] + print(f"--- {action.title()} deps with uv:") + for pkg in normalized: + print(" -", pkg) + + for pkg in normalized: + cmd = ["uv", "pip", action] + if extra_args: + cmd.extend(extra_args) + cmd.append(pkg) + print("+", " ".join(cmd)) + try: + subprocess.check_call(cmd, shell=False) + except Exception as exc: + print(f"{action.title()} failed for {pkg}: {exc}") + + +def install_deps(raw_name: str) -> int: + test_path = resolve_test_path(raw_name.removeprefix("tests/").removesuffix(".py")) + deps = load_yaml("deps.yaml") + specific_pkgs, common_pkgs = collect_pkgs(test_path, deps, dedupe_common=True) + run_uv_pip("install", specific_pkgs, extra_args=["--no-cache"]) + run_uv_pip("install", common_pkgs, extra_args=["--no-cache"]) + return 0 + + +def uninstall_deps(raw_name: str) -> int: + test_path = 
resolve_test_path(raw_name.removeprefix("tests/").removesuffix(".py")) + deps = load_yaml("blacklist.yaml") + specific_pkgs, common_pkgs = collect_pkgs(test_path, deps, dedupe_common=False) + run_uv_pip("uninstall", specific_pkgs) + run_uv_pip("uninstall", common_pkgs) + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + install_parser = subparsers.add_parser("install") + install_parser.add_argument("test_name") + + uninstall_parser = subparsers.add_parser("uninstall") + uninstall_parser.add_argument("test_name") + + args = parser.parse_args() + + if args.command == "install": + return install_deps(args.test_name) + if args.command == "uninstall": + return uninstall_deps(args.test_name) + raise AssertionError(f"Unhandled command: {args.command}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/ci_gpu.py b/.github/scripts/ci_gpu.py new file mode 100644 index 000000000..6d9e8a328 --- /dev/null +++ b/.github/scripts/ci_gpu.py @@ -0,0 +1,315 @@ +import argparse +import json +import os +import platform as py_platform +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request + + +def now_ms() -> int: + return time.time_ns() // 1_000_000 + + +def normalize_base_url(base_url: str) -> str: + return base_url.rstrip("/") + + +def build_server_info(runner_name: str | None) -> dict[str, str]: + machine = py_platform.machine().lower() or "unknown" + system = py_platform.system().lower() or "unknown" + platform_name = ( + os.environ.get("GPU_ALLOCATOR_PLATFORM") + or runner_name + or os.environ.get("RUNNER_NAME") + or os.environ.get("HOSTNAME") + or f"{system}-{machine}" + ) + return { + "platform": platform_name, + "arch": machine, + "system": system, + } + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict[str, object] | None = None, + timeout: float, +) -> object | None: + data = 
None + headers: dict[str, str] = {} + if body is not None: + data = json.dumps(body).encode("utf-8") + headers["Content-Type"] = "application/json" + + request = urllib.request.Request(url, data=data, headers=headers, method=method) + with urllib.request.urlopen(request, timeout=timeout) as response: + raw = response.read().decode("utf-8", errors="replace") + if not raw.strip(): + return None + return json.loads(raw) + + +def request_json_with_retry( + url: str, + *, + method: str, + body: dict[str, object] | None, + timeout: float, + retries: int, + retry_delay: float, +) -> object | None: + last_error: Exception | None = None + for attempt in range(retries + 1): + try: + return request_json(url, method=method, body=body, timeout=timeout) + except Exception as exc: + last_error = exc + if attempt < retries: + time.sleep(retry_delay) + if last_error is not None: + print(f"Request failed after retries: {last_error}") + return None + + +def extract_gpu_ids(response: object | None) -> str: + if not isinstance(response, dict): + return "" + gpu_ids = response.get("gpuIds") + return gpu_ids.strip() if isinstance(gpu_ids, str) else "" + + +def is_valid_gpu_response(value: str) -> bool: + if not value: + return False + for part in value.split(","): + if not part: + return False + if part.startswith("-"): + if not part[1:].isdigit(): + return False + elif not part.isdigit(): + return False + return True + + +def query_gpu_inventory() -> list[dict[str, object]]: + command = [ + "nvidia-smi", + "--query-gpu=index,uuid,utilization.gpu,memory.used,memory.free,memory.total,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu", + "--format=csv,noheader,nounits", + ] + result = subprocess.run(command, check=True, capture_output=True, text=True) + gpus: list[dict[str, object]] = [] + + for line in result.stdout.splitlines(): + if not line.strip(): + continue + fields = [field.strip() for field in line.split(",")] + if len(fields) != 12: + raise 
ValueError(f"Unexpected nvidia-smi output line: {line}") + gpus.append( + { + "index": int(fields[0]), + "uuid": fields[1], + "util": int(fields[2]), + "memUsed": int(fields[3]), + "memFree": int(fields[4]), + "memTotal": int(fields[5]), + "driver": fields[6], + "name": fields[7], + "serial": fields[8], + "displayActive": fields[9].lower() == "enabled", + "displayMode": fields[10].lower() == "enabled", + "temperature": int(fields[11]), + } + ) + return gpus + + +def build_get_request( + *, + runner_name: str, + run_id: str, + test_name: str, + count: str, +) -> dict[str, object]: + return { + "server": build_server_info(runner_name), + "job": { + "jobId": int(run_id), + "count": int(count), + "test": test_name, + "exclusive": True, + "timestamp": now_ms(), + }, + "gpu": query_gpu_inventory(), + } + + +def build_job_request(*, runner_name: str, run_id: str, test_name: str) -> dict[str, object]: + return { + "server": build_server_info(runner_name), + "job": { + "jobId": int(run_id), + "test": test_name, + }, + } + + +def format_info_url(base_url: str, platform_name: str) -> str: + query = urllib.parse.urlencode({"platform": platform_name, "plain": "true"}) + return f"{normalize_base_url(base_url)}/info?{query}" + + +def append_github_env(name: str, value: str) -> None: + github_env = os.environ.get("GITHUB_ENV") + if not github_env: + return + with open(github_env, "a", encoding="utf-8") as fh: + fh.write(f"{name}={value}\n") + + +def print_status(base_url: str, runner_name: str) -> None: + server = build_server_info(runner_name) + try: + status = request_json( + format_info_url(base_url, server["platform"]), + timeout=10, + ) + except Exception as exc: + print(f"Request failed for allocator info: {exc}") + return + if status is not None: + print(status) + + +def allocate_gpu(args: argparse.Namespace) -> int: + start_s = time.time() + endpoint = f"{normalize_base_url(args.base_url)}/get" + + print("Requesting GPU from allocator") + print( + f"run_id={args.run_id} 
test={args.test} runner={args.runner} count={args.count}" + ) + + while True: + request_body = build_get_request( + runner_name=args.runner, + run_id=args.run_id, + test_name=args.test, + count=args.count, + ) + print(f"requesting GPU with: {endpoint}") + + response = request_json_with_retry( + endpoint, + method="POST", + body=request_body, + timeout=args.request_timeout, + retries=args.retries, + retry_delay=args.retry_delay, + ) + resp = extract_gpu_ids(response) + + print(f"resp={{{resp}}}") + + if not is_valid_gpu_response(resp): + print(f"Allocator returned invalid response: {resp!r} (temporary error)") + print_status(args.base_url, args.runner) + time.sleep(args.sleep_sec) + continue + + if resp.startswith("-") and "," not in resp: + elapsed = int(time.time() - start_s) + if elapsed >= args.timeout_sec: + print( + f"Timed out after {args.timeout_sec}s waiting for GPU " + f"(last response={resp})" + ) + print_status(args.base_url, args.runner) + return 1 + + print( + f"No GPU available (response={resp}). Waiting {args.sleep_sec}s..." 
+ f" elapsed={elapsed}s" + ) + print_status(args.base_url, args.runner) + time.sleep(args.sleep_sec) + continue + + if args.require_single and "," in resp: + print(f"Allocator returned multiple GPUs for job requiring one GPU: {resp}") + return 1 + + print(f"Allocated GPU ID: {resp}") + append_github_env("CUDA_VISIBLE_DEVICES", resp) + print(f"CUDA_VISIBLE_DEVICES set to {resp}") + print(subprocess.getoutput(f"nvidia-smi -i {resp} --query-gpu=name --format=csv")) + print_status(args.base_url, args.runner) + return 0 + + +def release_gpu(args: argparse.Namespace) -> int: + request_body = build_job_request( + runner_name=args.runner, + run_id=args.run_id, + test_name=args.test, + ) + url = f"{normalize_base_url(args.base_url)}/release" + print(url) + + try: + response = request_json(url, method="POST", body=request_body, timeout=args.timeout) + except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc: + print(f"Failed to release GPU: {exc}") + return 0 + + resp = extract_gpu_ids(response) + print(f"response: {resp}") + if args.gpu_id and resp not in {args.gpu_id, "-1"}: + print(f"Error: response ({resp}) != expected ({args.gpu_id})") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + allocate_parser = subparsers.add_parser("allocate") + allocate_parser.add_argument("--base-url", required=True) + allocate_parser.add_argument("--run-id", required=True) + allocate_parser.add_argument("--test", required=True) + allocate_parser.add_argument("--runner", required=True) + allocate_parser.add_argument("--count", required=True) + allocate_parser.add_argument("--sleep-sec", type=float, default=5) + allocate_parser.add_argument("--timeout-sec", type=int, default=18000) + allocate_parser.add_argument("--request-timeout", type=float, default=10) + allocate_parser.add_argument("--retries", type=int, default=3) + allocate_parser.add_argument("--retry-delay", type=float, 
default=1) + allocate_parser.add_argument("--require-single", action="store_true") + + release_parser = subparsers.add_parser("release") + release_parser.add_argument("--base-url", required=True) + release_parser.add_argument("--run-id", required=True) + release_parser.add_argument("--gpu-id", default="") + release_parser.add_argument("--test", required=True) + release_parser.add_argument("--runner", required=True) + release_parser.add_argument("--timeout", type=float, default=10) + + args = parser.parse_args() + if args.command == "allocate": + return allocate_gpu(args) + if args.command == "release": + return release_gpu(args) + raise AssertionError(f"Unhandled command: {args.command}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/ci_loop_versions.py b/.github/scripts/ci_loop_versions.py deleted file mode 100644 index c65b14d63..000000000 --- a/.github/scripts/ci_loop_versions.py +++ /dev/null @@ -1,37 +0,0 @@ -import argparse -import json - -import requests -from packaging.specifiers import SpecifierSet -from packaging.version import Version - - -def get_versions(package: str, version_spec: str) -> list[str]: - specifier = SpecifierSet(version_spec) - - url = f"https://pypi.org/pypi/{package}/json" - resp = requests.get(url, timeout=30) - resp.raise_for_status() - data = resp.json() - - all_versions = data["releases"].keys() - - matched = sorted( - (Version(v) for v in all_versions if Version(v) in specifier), - reverse=True, - ) - return [str(v) for v in matched] - - -def main(): - parser = argparse.ArgumentParser(description="List matching PyPI versions as JSON") - parser.add_argument("package", help="package name, e.g. setuptools") - parser.add_argument("version", help='version spec, e.g. 
">=77.0.1,<83"') - args = parser.parse_args() - - versions = get_versions(args.package, args.version) - print(json.dumps(versions)) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/ci_tests.py b/.github/scripts/ci_tests.py new file mode 100644 index 000000000..22ce86212 --- /dev/null +++ b/.github/scripts/ci_tests.py @@ -0,0 +1,242 @@ +import argparse +import os +import re +import signal +import subprocess +import sys +import threading +import time +import urllib.error +from pathlib import Path + +from ci_gpu import ( + build_job_request, + extract_gpu_ids, + normalize_base_url, + request_json, +) + +ERROR_PATTERN = re.compile( + r"nvcc fatal|error:|fatal error|ModuleNotFoundError|ImportError|AssertionError|Exception|is the correct path|No such file or directory|Repo id must be in" +) + + +def append_github_env(name: str, value: str) -> None: + github_env = os.environ.get("GITHUB_ENV") + if not github_env: + return + with open(github_env, "a", encoding="utf-8") as fh: + fh.write(f"{name}={value}\n") + + +def kill_process_group(proc: subprocess.Popen[str]) -> None: + try: + os.killpg(proc.pid, signal.SIGKILL) + except ProcessLookupError: + pass + + +def start_keepalive_monitor( + *, + proc: subprocess.Popen[str], + keepalive_endpoint: str, + keepalive_payload: dict[str, object], + expected_gpu_ids: str, + interval_sec: int, +) -> tuple[threading.Thread, threading.Event, dict[str, int]]: + stop_event = threading.Event() + state = {"forced_exit_code": 0} + + def worker() -> None: + print(f"start to keep alive... 
{keepalive_endpoint}") + while not stop_event.wait(interval_sec): + try: + response = request_json( + keepalive_endpoint, + method="POST", + body=keepalive_payload, + timeout=10, + ) + except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc: + print(f"Keepalive request failed: {exc}") + continue + + resp = extract_gpu_ids(response) + if resp == "-1": + print(f"Server returned {resp}, terminating job...") + state["forced_exit_code"] = 3 + kill_process_group(proc) + stop_event.set() + return + if expected_gpu_ids and resp != expected_gpu_ids: + print(f"Keepalive returned mismatched GPUs {resp}, expected {expected_gpu_ids}.") + state["forced_exit_code"] = 3 + kill_process_group(proc) + stop_event.set() + return + print("gpu is kept alive...") + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + return thread, stop_event, state + + +def stream_process_output(proc: subprocess.Popen[str], log_file: Path) -> int: + assert proc.stdout is not None + log_file.parent.mkdir(parents=True, exist_ok=True) + with log_file.open("w", encoding="utf-8") as fh: + for line in proc.stdout: + print(line, end="") + fh.write(line) + return proc.wait() + + +def maybe_uninstall_vllm() -> None: + for cmd in (["uv", "pip", "uninstall", "vllm", "-y"], ["uv", "pip", "list"]): + print(f"+ {' '.join(cmd)}") + subprocess.run(cmd, check=False) + + +def run_tests(args: argparse.Namespace) -> int: + env = os.environ.copy() + if args.clear_cuda: + env["CUDA_VISIBLE_DEVICES"] = "" + print("CUDA_VISIBLE_DEVICES=") + + if args.xpu_mode: + maybe_uninstall_vllm() + + if args.model_test_mode is not None: + env["GPTQMODEL_MODEL_TEST_MODE"] = args.model_test_mode + print(f"GPTQMODEL_MODEL_TEST_MODE={args.model_test_mode}") + + print(f"CUDA_VISIBLE_DEVICES={env.get('CUDA_VISIBLE_DEVICES', '')}") + + log_dir = Path(f"/opt/dist/GPTQModel/{args.run_id}/logs") + log_file = log_dir / f"{args.test_script}.log" + log_dir.mkdir(parents=True, exist_ok=True) + + pytest_cmd = 
["pytest", "--durations=0", f"tests/{args.test_script}.py"] + print(f"+ {' '.join(pytest_cmd)}") + + proc = subprocess.Popen( + pytest_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + start_new_session=True, + ) + + keepalive_endpoint = f"{normalize_base_url(args.base_url)}/keepalive" + keepalive_payload = build_job_request( + runner_name=args.runner, + run_id=args.run_id, + test_name=args.test_script, + ) + + monitor_thread = None + monitor_stop = None + monitor_state = {"forced_exit_code": 0} + if env.get("CUDA_VISIBLE_DEVICES", ""): + monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor( + proc=proc, + keepalive_endpoint=keepalive_endpoint, + keepalive_payload=keepalive_payload, + expected_gpu_ids=env.get("CUDA_VISIBLE_DEVICES", ""), + interval_sec=args.monitor_interval_sec, + ) + + start_time = time.time() + try: + return_code = stream_process_output(proc, log_file) + finally: + if monitor_stop is not None: + print("trap cleanup EXIT...") + monitor_stop.set() + if monitor_thread is not None: + monitor_thread.join(timeout=5) + + if monitor_state["forced_exit_code"]: + append_github_env("ERROR", "22") + return 22 + + if return_code != 0: + append_github_env("ERROR", "22") + print(f"pipe status wrong: {return_code}") + return 22 + + execution_time = int(time.time() - start_time) + print(f"{execution_time // 60}m {execution_time % 60}s") + + try: + for entry in sorted(log_dir.iterdir()): + stat = entry.stat() + print(f"{stat.st_size:>10} {entry.name}") + except OSError as exc: + print(f"Failed to list log dir: {exc}") + + return 0 + + +def check_log(args: argparse.Namespace) -> int: + log_file = Path(args.log_root) / args.run_id / "logs" / f"{args.test_script}.log" + if not log_file.exists(): + print(f"Log file not found: {log_file}") + return 1 + + try: + lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines() + except OSError as exc: + print(f"Failed to read log file {log_file}: 
{exc}") + return 1 + + matched = 0 + for lineno, line in enumerate(lines, start=1): + if ERROR_PATTERN.search(line): + print(f"{lineno}:{line}") + matched += 1 + if matched >= args.max_matches: + break + + tail = lines[-args.tail_lines:] + if tail: + print(f"--- tail -n {args.tail_lines} {log_file}") + for line in tail: + print(line) + return 1 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + run_parser = subparsers.add_parser("run") + run_parser.add_argument("--base-url", required=True) + run_parser.add_argument("--run-id", required=True) + run_parser.add_argument("--test-script", required=True) + run_parser.add_argument("--runner", required=True) + run_parser.add_argument("--gpu-id", default="") + run_parser.add_argument("--model-test-mode") + run_parser.add_argument("--clear-cuda", action="store_true") + run_parser.add_argument("--xpu-mode", action="store_true") + run_parser.add_argument("--monitor-interval-sec", type=int, default=60) + + check_parser = subparsers.add_parser("check-log") + check_parser.add_argument("--run-id", required=True) + check_parser.add_argument("--test-script", required=True) + check_parser.add_argument("--log-root", default="/opt/dist/GPTQModel") + check_parser.add_argument("--tail-lines", type=int, default=200) + check_parser.add_argument("--max-matches", type=int, default=50) + + args = parser.parse_args() + if args.command == "run": + return run_tests(args) + if args.command == "check-log": + return check_log(args) + raise AssertionError(f"Unhandled command: {args.command}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/ci_workflow.py b/.github/scripts/ci_workflow.py new file mode 100644 index 000000000..4d1d34d67 --- /dev/null +++ b/.github/scripts/ci_workflow.py @@ -0,0 +1,221 @@ +import argparse +import json +import os +import re +import shlex +import sys +from pathlib import Path +from typing import Any + +import 
requests +import yaml +from packaging.specifiers import SpecifierSet +from packaging.version import Version + + +def split_csv(raw: str | None) -> list[str]: + if not raw: + return [] + return [value.strip() for value in raw.split(",") if value.strip()] + + +def strip_py_suffix(name: str) -> str: + return name.removesuffix(".py") + + +def append_github_env(name: str, value: str) -> None: + github_env = os.environ.get("GITHUB_ENV") + if not github_env: + return + with open(github_env, "a", encoding="utf-8") as fh: + fh.write(f"{name}={value}\n") + + +def parse_test_config( + yaml_file: str | Path, + group: str, + test_name: str | None = None, +) -> dict[str, Any]: + yaml_path = Path(yaml_file) + with yaml_path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + + result: dict[str, Any] = {} + common_data = data.get("common") or {} + if not isinstance(common_data, dict): + raise ValueError("group must be a mapping: common") + + for key, value in common_data.items(): + if not isinstance(value, dict): + result[key] = value + + if group not in data: + raise KeyError(f"group not found: {group}") + + group_data = data[group] or {} + if not isinstance(group_data, dict): + raise ValueError(f"group must be a mapping: {group}") + + for key, value in group_data.items(): + if not isinstance(value, dict): + result[key] = value + + if test_name is not None: + test_config = group_data.get(test_name) + if test_config is not None: + if not isinstance(test_config, dict): + raise ValueError(f"test config must be a mapping: {test_name}") + result.update(test_config) + + return result + + +def sort_key(path: str) -> tuple[bool, bool, str]: + return ("moe" in path, "/" in path, path) + + +def is_model_compat_test(rel_path: str, file_path: Path) -> bool: + if not rel_path.startswith("models/"): + return False + try: + contents = file_path.read_text(encoding="utf-8") + except OSError: + return False + markers = ("quantize_and_evaluate(", "self.evaluate_model(", 
"check_results(") + return any(marker in contents for marker in markers) + + +def list_tests(ignored_test_files: str | list[str], test_names: str, test_regex: str, tests_root: str | Path) -> tuple[list[str], list[str], list[str]]: + tests_root = Path(tests_root) + input_tests = [strip_py_suffix(name) for name in split_csv(test_names)] + ignored_raw = ignored_test_files if isinstance(ignored_test_files, list) else split_csv(ignored_test_files) + ignored_set = {strip_py_suffix(name) for name in ignored_raw} + + all_tests = { + rel: path + for path in tests_root.rglob("test_*.py") + for rel in [str(path.relative_to(tests_root).with_suffix(""))] + if rel not in ignored_set and path.stem not in ignored_set + } + + model_tests = { + rel + for rel, path in all_tests.items() + if (not input_tests or rel in input_tests) + and "mlx" not in rel + and "ipex" not in rel + and "xpu" not in rel + and re.match(test_regex, rel) + and is_model_compat_test(rel, path) + } + + torch_tests = { + rel + for rel in all_tests + if (not input_tests or rel in input_tests) + and rel not in model_tests + and "mlx" not in rel + and "ipex" not in rel + and "xpu" not in rel + and re.match(test_regex, rel) + } + + mlx_tests = { + rel + for rel in all_tests + if ("mlx" in rel or "apple" in rel) + and ((rel in input_tests) if input_tests else True) + and re.match(test_regex, rel) + } + + return ( + sorted(torch_tests, key=sort_key), + sorted(model_tests, key=sort_key), + sorted(mlx_tests, key=sort_key), + ) + + +def list_matching_versions(package: str, version_spec: str) -> list[str]: + specifier = SpecifierSet(version_spec) + response = requests.get(f"https://pypi.org/pypi/{package}/json", timeout=30) + response.raise_for_status() + data = response.json() + matched = sorted( + (Version(version) for version in data["releases"].keys() if Version(version) in specifier), + reverse=True, + ) + return [str(version) for version in matched] + + +def cmd_list_tests(args: argparse.Namespace) -> int: + 
torch_files, model_files, mlx_files = list_tests( + ignored_test_files=args.ignored_test_files, + test_names=args.test_names, + test_regex=args.test_regex, + tests_root=args.tests_root, + ) + print(f"{json.dumps(torch_files)}|{json.dumps(model_files)}|{json.dumps(mlx_files)}") + return 0 + + +def cmd_resolve_env(args: argparse.Namespace) -> int: + config = parse_test_config(args.yaml_file, args.group, args.test_name) + py = str(config["py"]) + gpu = str(config["gpu"]) + env_name = ( + f"{args.env_prefix}_cu{args.cuda_version}_torch{args.torch_version}" + f"_py{py}_{args.test_name}" + ) + + append_github_env("PYTHON_VERSION", py) + append_github_env("GPU_COUNT", gpu) + append_github_env("ENV_NAME", env_name) + + print(f"using py={py} gpu={gpu} env={env_name} for test {args.test_name}") + if args.shell: + print(f"PYTHON_VERSION={shlex.quote(py)}") + print(f"GPU_COUNT={shlex.quote(gpu)}") + print(f"ENV_NAME={shlex.quote(env_name)}") + return 0 + + +def cmd_loop_versions(args: argparse.Namespace) -> int: + print(json.dumps(list_matching_versions(args.package, args.version))) + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="command", required=True) + + list_parser = subparsers.add_parser("list-tests") + list_parser.add_argument("--ignored-test-files", default=os.getenv("IGNORED_TEST_FILES", "")) + list_parser.add_argument("--test-names", default=os.getenv("TEST_NAMES", "")) + list_parser.add_argument("--test-regex", default=os.getenv("TEST_REGEX", ".*")) + list_parser.add_argument("--tests-root", default="tests") + + resolve_parser = subparsers.add_parser("resolve-env") + resolve_parser.add_argument("--yaml-file", default=Path(__file__).with_name("test.yaml")) + resolve_parser.add_argument("--group", required=True) + resolve_parser.add_argument("--test-name", required=True) + resolve_parser.add_argument("--cuda-version", required=True) + resolve_parser.add_argument("--torch-version", required=True) + 
resolve_parser.add_argument("--env-prefix", default="gptqmodel_test") + resolve_parser.add_argument("--shell", action="store_true") + + versions_parser = subparsers.add_parser("loop-versions") + versions_parser.add_argument("package") + versions_parser.add_argument("version") + + args = parser.parse_args() + if args.command == "list-tests": + return cmd_list_tests(args) + if args.command == "resolve-env": + return cmd_resolve_env(args) + if args.command == "loop-versions": + return cmd_loop_versions(args) + raise AssertionError(f"Unhandled command: {args.command}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/install_deps.py b/.github/scripts/install_deps.py deleted file mode 100644 index 4c9ce346e..000000000 --- a/.github/scripts/install_deps.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import re -import subprocess -import sys -from pathlib import Path - -import yaml - -base_dir = os.path.dirname(os.path.abspath(__file__)) -_PKG_NAME_RE = re.compile(r"^[A-Za-z0-9_.-]+") - - -def resolve_test_path(raw_name: str) -> Path: - return Path("tests") / f"{raw_name}.py" - - -def normalize_pkg_spec(s: str) -> str: - s = (s or "").strip() - if not s: - return s - - if s.startswith("git+"): - return s - - if s.startswith("https://github.com/"): - s = s.rstrip("/") - if not s.endswith(".git"): - s += ".git" - return "git+" + s - - return s - - -def pkg_key(spec: str) -> str: - spec = normalize_pkg_spec(spec) - if not spec: - return spec - - if spec.startswith("git+"): - repo = spec.rsplit("/", 1)[-1] - if repo.endswith(".git"): - repo = repo[:-4] - return repo.split("@", 1)[0].lower().replace("_", "-") - - if "://" in spec: - return spec - - spec = spec.split(";", 1)[0].strip() - if " @" in spec: - spec = spec.split(" @", 1)[0].strip() - - match = _PKG_NAME_RE.match(spec) - if not match: - return spec.lower() - - return match.group(0).lower().replace("_", "-") - - -def collect_pkgs(test_path: Path, deps: dict): - specific_pkgs = set() - - 
common_pkgs = set(deps.get("common") or []) - - specific_pkgs.update(deps.get("tests", {}).get(test_path.name) or []) - - test_path_str = test_path.as_posix() - for key, value in deps.items(): - if not (isinstance(key, str) and key.startswith("tests/")): - continue - if not test_path_str.startswith(key + "/"): - continue - - if isinstance(value, list): - specific_pkgs.update(value) - - elif isinstance(value, dict): - specific_pkgs.update(value.get(test_path.name) or []) - - else: - pass - - specific_pkg_keys = {pkg_key(pkg) for pkg in specific_pkgs} - common_pkgs = {pkg for pkg in common_pkgs if pkg_key(pkg) not in specific_pkg_keys} - - return specific_pkgs, common_pkgs - - -def pip_install(pkgs): - if not pkgs: - return - - print("--- Installing deps:") - for p in pkgs: - print(" -", p) - - cmd = [ - sys.executable, - "-m", "pip", "install", - "--disable-pip-version-check", - "--no-cache-dir", - ] - pkgs = [normalize_pkg_spec(p) for p in pkgs] - cmd.extend(pkgs) - - subprocess.check_call(cmd, shell=False) - - -def uv_install(pkgs): - if not pkgs: - return - - pkgs = [normalize_pkg_spec(p) for p in pkgs] - - print("--- Installing deps with uv:") - for p in pkgs: - print(" -", p) - - for p in pkgs: - cmd = ["uv", "pip", "install", "--no-cache", p] - print("installing: ", cmd) - try: - subprocess.check_call(cmd, shell=False) - except Exception as e: - print(f"Install failed: {e}") - - -if __name__ == "__main__": - raw_name = sys.argv[1].removeprefix("tests/").removesuffix(".py") - test_path = resolve_test_path(raw_name) - - with open(os.path.join(base_dir, "deps.yaml")) as f: - deps = yaml.safe_load(f) - - specific_pkgs, common_pkgs = collect_pkgs(test_path, deps) - - uv_install(specific_pkgs) - - uv_install(common_pkgs) diff --git a/.github/scripts/list_test_files.py b/.github/scripts/list_test_files.py deleted file mode 100644 index 0eeec8812..000000000 --- a/.github/scripts/list_test_files.py +++ /dev/null @@ -1,140 +0,0 @@ -# file: 
.github/scripts/list_test_files.py -import json -import os -import re -from pathlib import Path -from typing import Dict, List, Tuple, Union, Optional - - -def _sort_key(p: str): - return ("moe" in p, "/" in p, p) - - -def _split_csv(s: Optional[str]) -> List[str]: - if not s: - return [] - return [x.strip() for x in s.split(",") if x.strip()] - - -def _strip_py_suffix(name: str) -> str: - return name.removesuffix(".py") - - -def _is_model_compat_test(rel_path: str, file_path: Path) -> bool: - if not rel_path.startswith("models/"): - return False - - try: - contents = file_path.read_text(encoding="utf-8") - except OSError: - return False - - compat_markers = ( - "quantize_and_evaluate(", - "self.evaluate_model(", - "check_results(", - ) - return any(marker in contents for marker in compat_markers) - - -def getFiles( - ignored_test_files: Union[str, List[str]], - test_names: str = "", - test_regex: str = ".*", - tests_root: Union[str, Path] = "tests", -) -> Tuple[List[str], List[str], List[str]]: - """ - Returns: - (torch_test_files, model_compat_test_files, m4_test_files) - - torch_test_files: tests/**/test_*.py excluding mlx / ipex / xpu and model compat files - - model_compat_test_files: tests/models/**/test_*.py files that run model quantize + evaluation compat flows - - m4_test_files: tests/**/test_*.py that contains mlx or apple - """ - tests_root = Path(tests_root) - - input_test_files_list = [_strip_py_suffix(f) for f in _split_csv(test_names)] - - ignored_list = ( - ignored_test_files if isinstance(ignored_test_files, list) else _split_csv(ignored_test_files) - ) - ignored_set = set(_strip_py_suffix(x) for x in ignored_list) - - # all tests under tests/**/test_*.py (includes tests/models/**) - all_tests: Dict[str, Path] = { - rel: p - for p in tests_root.rglob("test_*.py") - for rel in [str(p.relative_to(tests_root).with_suffix(""))] - if rel not in ignored_set and p.stem not in ignored_set - } - - model_compat_test_files = { - rel - for rel, path in 
all_tests.items() - if (not input_test_files_list or rel in input_test_files_list) - and "mlx" not in rel - and "ipex" not in rel - and "xpu" not in rel - and re.match(test_regex, rel) - and _is_model_compat_test(rel, path) - } - - # torch tests - torch_test_files = { - f - for f in all_tests - if (not input_test_files_list or f in input_test_files_list) - and f not in model_compat_test_files - and "mlx" not in f - and "ipex" not in f - and "xpu" not in f - and re.match(test_regex, f) - } - - # m4 tests - m4_test_files = { - f - for f in all_tests - if ("mlx" in f or "apple" in f) - and ((f in input_test_files_list) if input_test_files_list else True) - and re.match(test_regex, f) - } - - return ( - sorted(torch_test_files, key=_sort_key), - sorted(model_compat_test_files, key=_sort_key), - sorted(m4_test_files, key=_sort_key), - ) - - -def main() -> None: - """ - Usage: - test_files=$(python3 .github/scripts/list_test_files.py \ - --ignored-test-files "$IGNORED_TEST_FILES" \ - --test-names "${{ github.event.inputs.test_names }}" \ - --test-regex "${{ github.event.inputs.test_regex }}") - - Output: - | - """ - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--ignored-test-files", default=os.getenv("IGNORED_TEST_FILES", "")) - parser.add_argument("--test-names", default=os.getenv("TEST_NAMES", "")) - parser.add_argument("--test-regex", default=os.getenv("TEST_REGEX", ".*")) - parser.add_argument("--tests-root", default="tests") - args = parser.parse_args() - - torch_files, model_compat_files, m4_files = getFiles( - ignored_test_files=args.ignored_test_files, - test_names=args.test_names, - test_regex=args.test_regex, - tests_root=args.tests_root, - ) - - print(f"{json.dumps(torch_files)}|{json.dumps(model_compat_files)}|{json.dumps(m4_files)}") - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/parse_test_config.py b/.github/scripts/parse_test_config.py deleted file mode 100644 index 3d8b808b3..000000000 --- 
a/.github/scripts/parse_test_config.py +++ /dev/null @@ -1,87 +0,0 @@ -import json -from pathlib import Path -from typing import Any - -import yaml - - -def parse_test_config( - yaml_file: str | Path, - group: str, - test_name: str | None = None, -) -> dict[str, Any]: - yaml_path = Path(yaml_file) - with yaml_path.open("r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - - result: dict[str, Any] = {} - - common_data = data.get("common") or {} - if not isinstance(common_data, dict): - raise ValueError("group must be a mapping: common") - - for key, value in common_data.items(): - if not isinstance(value, dict): - result[key] = value - - if group not in data: - raise KeyError(f"group not found: {group}") - - group_data = data[group] or {} - if not isinstance(group_data, dict): - raise ValueError(f"group must be a mapping: {group}") - - # Group-level shared config overrides common defaults. - for key, value in group_data.items(): - if not isinstance(value, dict): - result[key] = value - - # Per-test config overrides group/common defaults. Missing per-test entries - # fall back to the merged defaults. 
- if test_name is not None: - test_config = group_data.get(test_name) - if test_config is None: - return result - if not isinstance(test_config, dict): - raise ValueError(f"test config must be a mapping: {test_name}") - result.update(test_config) - - return result - - -def parse_test_config_flags( - yaml_file: str | Path, - group: str, - test_name: str | None = None, -) -> dict[str, int]: - config = parse_test_config(yaml_file, group, test_name) - return {key: 1 for key in config} - - -def main() -> None: - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--yaml-file", - default=Path(__file__).with_name("test.yaml"), - ) - parser.add_argument("--group", required=True) - parser.add_argument("--test-name") - parser.add_argument( - "--flags-only", - action="store_true", - help="Return only config keys with value 1, for example: {'py': 1, 'gpu': 1}", - ) - args = parser.parse_args() - - if args.flags_only: - result = parse_test_config_flags(args.yaml_file, args.group, args.test_name) - else: - result = parse_test_config(args.yaml_file, args.group, args.test_name) - - print(json.dumps(result, ensure_ascii=False)) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/release_gpu.py b/.github/scripts/release_gpu.py deleted file mode 100644 index dceae3580..000000000 --- a/.github/scripts/release_gpu.py +++ /dev/null @@ -1,45 +0,0 @@ -import argparse -import sys -import urllib.error -import urllib.parse -import urllib.request - - -def fetch_text(url: str, *, timeout: float) -> str: - with urllib.request.urlopen(url, timeout=timeout) as response: - return response.read().decode("utf-8", errors="replace") - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--gpu-id", required=True) - parser.add_argument("--timestamp", required=True) - parser.add_argument("--test", required=True) - 
parser.add_argument("--runner", required=True) - parser.add_argument("--timeout", type=float, default=10) - args = parser.parse_args() - - encoded_test = urllib.parse.quote(args.test, safe="") - encoded_runner = urllib.parse.quote(args.runner, safe="") - url = ( - f"{args.base_url}/gpu/release?runid={args.run_id}&gpu={args.gpu_id}" - f"&timestamp={args.timestamp}&test={encoded_test}&runner={encoded_runner}" - ) - print(url) - - try: - resp = fetch_text(url, timeout=args.timeout).strip() - except (urllib.error.URLError, TimeoutError, OSError) as exc: - print(f"Failed to release GPU: {exc}") - return 0 - - print(f"response: {resp}") - if resp != args.gpu_id: - print(f"Error: response ({resp}) != expected ({args.gpu_id})") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/scripts/run_tests.py b/.github/scripts/run_tests.py deleted file mode 100644 index 662c6c938..000000000 --- a/.github/scripts/run_tests.py +++ /dev/null @@ -1,200 +0,0 @@ -import argparse -import os -import signal -import subprocess -import sys -import threading -import time -import urllib.error -import urllib.parse -import urllib.request -from pathlib import Path - - -def append_github_env(name: str, value: str) -> None: - github_env = os.environ.get("GITHUB_ENV") - if not github_env: - return - with open(github_env, "a", encoding="utf-8") as fh: - fh.write(f"{name}={value}\n") - - -def fetch_text(url: str, *, timeout: float, suppress_error: bool = False) -> str: - try: - with urllib.request.urlopen(url, timeout=timeout) as response: - return response.read().decode("utf-8", errors="replace") - except (urllib.error.URLError, TimeoutError, OSError) as exc: - if suppress_error: - print(f"Request failed for {url}: {exc}") - return "" - raise - - -def kill_process_group(proc: subprocess.Popen[str]) -> None: - try: - os.killpg(proc.pid, signal.SIGKILL) - except ProcessLookupError: - # Process (or process group) is already gone; nothing to do.
- pass - - -def start_keepalive_monitor( - *, - proc: subprocess.Popen[str], - keep_alive_url: str, - interval_sec: int, -) -> tuple[threading.Thread, threading.Event, dict[str, int]]: - stop_event = threading.Event() - state = {"forced_exit_code": 0} - - def worker() -> None: - print(f"start to keep alive... {keep_alive_url}") - while not stop_event.wait(interval_sec): - resp = fetch_text(keep_alive_url, timeout=10, suppress_error=True) - # if resp.strip() == "-1": - if int(resp.strip()) < 0: - print(f"Server returned {resp.strip()}, terminating job...") - state["forced_exit_code"] = 3 - kill_process_group(proc) - stop_event.set() - return - print("gpu is kept alive...") - - thread = threading.Thread(target=worker, daemon=True) - thread.start() - return thread, stop_event, state - - -def stream_process_output(proc: subprocess.Popen[str], log_file: Path) -> int: - assert proc.stdout is not None - log_file.parent.mkdir(parents=True, exist_ok=True) - with log_file.open("w", encoding="utf-8") as fh: - for line in proc.stdout: - print(line, end="") - fh.write(line) - return proc.wait() - - -def maybe_uninstall_vllm() -> None: - uninstall_cmd = ["uv", "pip", "uninstall", "vllm", "-y"] - print(f"+ {' '.join(uninstall_cmd)}") - subprocess.run(uninstall_cmd, check=False) - - list_cmd = ["uv", "pip", "list"] - print(f"+ {' '.join(list_cmd)}") - subprocess.run(list_cmd, check=False) - - -def log_vram(base_url: str, run_id: str, gpu_id: str, execution_time: int, test: str) -> None: - encoded_test = urllib.parse.quote(test, safe="") - url = ( - f"{base_url}/gpu/logVram?runid={run_id}&gpu={gpu_id}" - f"&range={execution_time}&unit=second&test={encoded_test}" - ) - try: - print(fetch_text(url, timeout=30, suppress_error=True)) - except Exception: - # Logging VRAM usage is best-effort; failures must not affect the main test flow. 
- pass - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", required=True) - parser.add_argument("--run-id", required=True) - parser.add_argument("--test-script", required=True) - parser.add_argument("--runner", required=True) - parser.add_argument("--gpu-id", default="") - parser.add_argument("--model-test-mode") - parser.add_argument("--clear-cuda", action="store_true") - parser.add_argument("--xpu-mode", action="store_true") - parser.add_argument("--monitor-interval-sec", type=int, default=60) - args = parser.parse_args() - - env = os.environ.copy() - if args.clear_cuda: - env["CUDA_VISIBLE_DEVICES"] = "" - print("CUDA_VISIBLE_DEVICES=") - - if args.xpu_mode: - maybe_uninstall_vllm() - - if args.model_test_mode is not None: - env["GPTQMODEL_MODEL_TEST_MODE"] = args.model_test_mode - print(f"GPTQMODEL_MODEL_TEST_MODE={args.model_test_mode}") - - print(f"CUDA_VISIBLE_DEVICES={env.get('CUDA_VISIBLE_DEVICES', '')}") - - log_dir = Path(f"/opt/dist/GPTQModel/{args.run_id}/logs") - log_file = log_dir / f"{args.test_script}.log" - log_dir.mkdir(parents=True, exist_ok=True) - - pytest_cmd = ["pytest", "--durations=0", f"tests/{args.test_script}.py"] - print(f"+ {' '.join(pytest_cmd)}") - - proc = subprocess.Popen( - pytest_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, - start_new_session=True, - ) - - encoded_test = urllib.parse.quote(args.test_script, safe="") - encoded_runner = urllib.parse.quote(args.runner, safe="") - keep_alive_url = ( - f"{args.base_url}/gpu/keepalive?runid={args.run_id}&test={encoded_test}" - f"&runner={encoded_runner}&timestamp={int(time.time())}&gpu={env.get('CUDA_VISIBLE_DEVICES', '')}" - ) - - monitor_thread = None - monitor_stop = None - monitor_state = {"forced_exit_code": 0} - if env.get("CUDA_VISIBLE_DEVICES", ""): - monitor_thread, monitor_stop, monitor_state = start_keepalive_monitor( - proc=proc, - keep_alive_url=keep_alive_url, -
interval_sec=args.monitor_interval_sec, - ) - - start_time = time.time() - try: - return_code = stream_process_output(proc, log_file) - finally: - if monitor_stop is not None: - print("trap cleanup EXIT...") - monitor_stop.set() - if monitor_thread is not None: - monitor_thread.join(timeout=5) - - if monitor_state["forced_exit_code"]: - append_github_env("ERROR", "22") - return 22 - - if return_code != 0: - append_github_env("ERROR", "22") - print(f"pipe status wrong: {return_code}") - return 22 - - execution_time = int(time.time() - start_time) - print(f"{execution_time // 60}m {execution_time % 60}s") - - try: - for entry in sorted(log_dir.iterdir()): - stat = entry.stat() - size = stat.st_size - print(f"{size:>10} {entry.name}") - except OSError as exc: - print(f"Failed to list log dir: {exc}") - - gpu_id = args.gpu_id or env.get("CUDA_VISIBLE_DEVICES", "") - if gpu_id: - log_vram(args.base_url, args.run_id, gpu_id, execution_time, args.test_script) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/scripts/uninstall_deps.py b/.github/scripts/uninstall_deps.py deleted file mode 100644 index 5e1763e0a..000000000 --- a/.github/scripts/uninstall_deps.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import subprocess -import sys -from pathlib import Path - -import yaml - -base_dir = os.path.dirname(os.path.abspath(__file__)) - - -def resolve_test_path(raw_name: str) -> Path: - return Path("tests") / f"{raw_name}.py" - - -def collect_pkgs(test_path: Path, deps: dict): - specific_pkgs = set() - - common_pkgs = set(deps.get("common") or []) - - specific_pkgs.update(deps.get("tests", {}).get(test_path.name) or []) - - test_path_str = test_path.as_posix() - for key, value in deps.items(): - if not (isinstance(key, str) and key.startswith("tests/")): - continue - if not test_path_str.startswith(key + "/"): - continue - - if isinstance(value, list): - specific_pkgs.update(value) - - elif isinstance(value, dict): - 
specific_pkgs.update(value.get(test_path.name) or []) - - else: - pass - - return specific_pkgs, common_pkgs - - -def uv_uninstall(pkgs): - if not pkgs: - return - - print("--- Uninstalling deps with uv:") - for p in pkgs: - print(" -", p) - - for p in pkgs: - cmd = ["uv", "pip", "uninstall", p] - try: - subprocess.check_call(cmd, shell=False) - except Exception as e: - print(f"--- Unnstall failed: {e}") - - -if __name__ == "__main__": - raw_name = sys.argv[1].removeprefix("tests/").removesuffix(".py") - test_path = resolve_test_path(raw_name) - - with open(os.path.join(base_dir, "blacklist.yaml")) as f: - deps = yaml.safe_load(f) - - specific_pkgs, common_pkgs = collect_pkgs(test_path, deps) - - uv_uninstall(sorted(specific_pkgs)) - - uv_uninstall(sorted(common_pkgs)) diff --git a/.github/workflows/compatibility.yml b/.github/workflows/compatibility.yml index 2637d5ed3..588db28b5 100644 --- a/.github/workflows/compatibility.yml +++ b/.github/workflows/compatibility.yml @@ -24,7 +24,7 @@ jobs: id: parser run: | python -m pip install --upgrade requests packaging - versions=$(python .github/scripts/ci_loop_versions.py setuptools ">=77.0.1,<83") + versions=$(python .github/scripts/ci_workflow.py loop-versions setuptools ">=77.0.1,<83") echo "versions=$versions" >> "$GITHUB_OUTPUT" check-setuptools: @@ -51,4 +51,4 @@ jobs: - name: Show versions run: | python --version - python -m pip show setuptools \ No newline at end of file + python -m pip show setuptools diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index d6fb11b15..4f4227237 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -63,6 +63,7 @@ env: PYTORCH_ALLOC_CONF: 'expandable_segments:True' RUNNER: 10.0.13.31 XEON5: 10.0.14.249 + GPU_ALLOCATOR_URL: http://10.0.13.31/gpu LOGBAR_ANIMATION: '0' CUDA_VERSION: 130 UV_TORCH_BACKEND: cu130 @@ -152,7 +153,7 @@ jobs: - name: List files id: files run: | - test_files=$(python3 
.github/scripts/list_test_files.py \ + test_files=$(python3 .github/scripts/ci_workflow.py list-tests \ --ignored-test-files "$IGNORED_TEST_FILES" \ --test-regex "${{ github.event.inputs.test_regex }}") @@ -264,27 +265,18 @@ jobs: run: | echo "-- loading unit test's config --" source /opt/uv/setup_uv_venv.sh unit_test_env - - config_json="$(python3 .github/scripts/parse_test_config.py \ + eval "$(python3 .github/scripts/ci_workflow.py resolve-env \ --group tests \ - --test-name "${{ matrix.test_script }}")" - - py="$(printf '%s' "$config_json" | python3 -c 'import json, sys; print(json.load(sys.stdin)["py"])')" - gpu="$(printf '%s' "$config_json" | python3 -c 'import json, sys; print(json.load(sys.stdin)["gpu"])')" - - echo "PYTHON_VERSION=$py" >> "$GITHUB_ENV" - echo "GPU_COUNT=$gpu" >> "$GITHUB_ENV" - - echo "using py=$py gpu=$gpu for test ${{ matrix.test_script }}" - echo "-- loaded --" + --test-name "${{ matrix.test_script }}" \ + --cuda-version "${{ needs.check-vm.outputs.cuda_version }}" \ + --torch-version "${{ env.TORCH_VERSION }}" \ + --shell)" echo "-- setting up env --" - safe_ref=$(printf '%s' "${{ env.ref }}" | sed -e 's/[\\/]/_/g' -e 's/^refs_heads_//') - env_name="gptqmodel_${safe_ref}_test_${{ matrix.test_script }}" - echo "env_name: $env_name" + echo "env_name: $ENV_NAME" # will clean later - mv /opt/uv/venvs/$env_name "/opt/uv/tmp/${env_name}_$(date +%s)" || true - /opt/uv/setup_uv_venv.sh $env_name + mv "/opt/uv/venvs/$ENV_NAME" "/opt/uv/tmp/${ENV_NAME}_$(date +%s)" || true + /opt/uv/setup_uv_venv.sh "$ENV_NAME" echo "-- set --" - name: Setup uv env @@ -300,13 +292,13 @@ jobs: echo "" echo "--- installing required deps..." - python .github/scripts/install_deps.py ${{ matrix.test_script }} + python .github/scripts/ci_deps.py install ${{ matrix.test_script }} echo "" echo "" echo "--- uninstalling required deps..." 
- python .github/scripts/uninstall_deps.py ${{ matrix.test_script }} + python .github/scripts/ci_deps.py uninstall ${{ matrix.test_script }} # - name: Install Evalution # run: | @@ -400,8 +392,8 @@ jobs: - name: Find suitable GPU if: ${{ !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') && !cancelled() }} run: | - python .github/scripts/allocate_gpu.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_gpu.py allocate \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id "${{ github.run_id }}" \ --test "${{ matrix.test_script }}" \ --runner "${RUNNER_NAME:-unknown}" \ @@ -417,8 +409,8 @@ jobs: if [[ "${{ matrix.test_script }}" == *xpu* ]]; then extra_args+=(--clear-cuda --xpu-mode) fi - python .github/scripts/run_tests.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_tests.py run \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id "${{ github.run_id }}" \ --test-script "${{ matrix.test_script }}" \ --runner "${RUNNER_NAME}" \ @@ -429,22 +421,17 @@ jobs: if: ${{ !cancelled() && failure() }} continue-on-error: true run: | - log_dir=/opt/dist/GPTQModel/${{ github.run_id }}/logs - log_file=$log_dir/${{ matrix.test_script }}.log - - grep -nE "nvcc fatal|error:|fatal error|ModuleNotFoundError|ImportError|AssertionError|Exception|is the correct path|No such file or directory|Repo id must be in" "$log_file" | head -n 50 || true - - tail -n 200 $log_file - exit 1 + python .github/scripts/ci_tests.py check-log \ + --run-id "${{ github.run_id }}" \ + --test-script "${{ matrix.test_script }}" - name: Release GPU if: always() && !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') run: | - python .github/scripts/release_gpu.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_gpu.py release \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id "${{ github.run_id }}" \ --gpu-id "${{ env.CUDA_VISIBLE_DEVICES }}" \ - --timestamp "${{ env.STEP_TIMESTAMP }}" \ --test "${{ matrix.test_script }}" \ 
--runner "${RUNNER_NAME}" @@ -552,27 +539,18 @@ jobs: run: | echo "-- loading unit test's config --" source /opt/uv/setup_uv_venv.sh unit_test_env - - config_json="$(python3 .github/scripts/parse_test_config.py \ + eval "$(python3 .github/scripts/ci_workflow.py resolve-env \ --group tests \ - --test-name "${{ matrix.test_script }}")" - - py="$(printf '%s' "$config_json" | python3 -c 'import json, sys; print(json.load(sys.stdin)["py"])')" - gpu="$(printf '%s' "$config_json" | python3 -c 'import json, sys; print(json.load(sys.stdin)["gpu"])')" - - echo "PYTHON_VERSION=$py" >> "$GITHUB_ENV" - echo "GPU_COUNT=$gpu" >> "$GITHUB_ENV" - - echo "using py=$py gpu=$gpu for test ${{ matrix.test_script }}" - echo "-- loaded --" + --test-name "${{ matrix.test_script }}" \ + --cuda-version "${{ needs.check-vm.outputs.cuda_version }}" \ + --torch-version "${{ env.TORCH_VERSION }}" \ + --shell)" echo "-- setting up env --" - safe_ref=$(printf '%s' "${{ env.ref }}" | sed -e 's/[\\/]/_/g' -e 's/^refs_heads_//') - env_name="gptqmodel_${safe_ref}_test_${{ matrix.test_script }}" - echo "env_name: $env_name" + echo "env_name: $ENV_NAME" # will clean later - mv /opt/uv/venvs/$env_name "/opt/uv/tmp/${env_name}_$(date +%s)" || true - /opt/uv/setup_uv_venv.sh $env_name + mv "/opt/uv/venvs/$ENV_NAME" "/opt/uv/tmp/${ENV_NAME}_$(date +%s)" || true + /opt/uv/setup_uv_venv.sh "$ENV_NAME" echo "-- set --" - name: Setup uv env @@ -584,8 +562,8 @@ jobs: echo "setting env... 
cuda=${{ needs.check-vm.outputs.cuda_version }} torch=${{ env.TORCH_VERSION }} python=${{ env.PYTHON_VERSION }}" bash /opt/env/init_compiler_no_env.sh ${{ needs.check-vm.outputs.cuda_version }} ${{ env.TORCH_VERSION }} ${{ env.PYTHON_VERSION }} - python .github/scripts/install_deps.py ${{ matrix.test_script }} - python .github/scripts/uninstall_deps.py ${{ matrix.test_script }} + python .github/scripts/ci_deps.py install ${{ matrix.test_script }} + python .github/scripts/ci_deps.py uninstall ${{ matrix.test_script }} - name: Install package from source run: | @@ -638,8 +616,8 @@ jobs: - name: Find suitable GPU if: ${{ !cancelled() }} run: | - python .github/scripts/allocate_gpu.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_gpu.py allocate \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id "${{ github.run_id }}" \ --test "${{ matrix.test_script }}" \ --runner "${RUNNER_NAME:-unknown}" \ @@ -648,8 +626,8 @@ jobs: - name: Run tests run: | - python .github/scripts/run_tests.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_tests.py run \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id "${{ github.run_id }}" \ --test-script "${{ matrix.test_script }}" \ --runner "${RUNNER_NAME}" \ @@ -660,21 +638,17 @@ jobs: if: ${{ !cancelled() && failure() }} continue-on-error: true run: | - log_dir=/opt/dist/GPTQModel/${{ github.run_id }}/logs - log_file=$log_dir/${{ matrix.test_script }}.log - - grep -nE "nvcc fatal|error:|fatal error|ModuleNotFoundError|ImportError|AssertionError|Exception|is the correct path|No such file or directory|Repo id must be in" "$log_file" | head -n 50 || true - tail -n 200 $log_file - exit 1 + python .github/scripts/ci_tests.py check-log \ + --run-id "${{ github.run_id }}" \ + --test-script "${{ matrix.test_script }}" - name: Release GPU if: always() run: | - python .github/scripts/release_gpu.py \ - --base-url "http://${XEON5}" \ + python .github/scripts/ci_gpu.py release \ + --base-url "${GPU_ALLOCATOR_URL}" \ --run-id 
"${{ github.run_id }}" \ --gpu-id "${{ env.CUDA_VISIBLE_DEVICES }}" \ - --timestamp "${{ env.STEP_TIMESTAMP }}" \ --test "${{ matrix.test_script }}" \ --runner "${RUNNER_NAME}" @@ -745,7 +719,7 @@ jobs: id: parser run: | python -m pip install --upgrade requests packaging - versions=$(python .github/scripts/ci_loop_versions.py setuptools ">=77.0.1,<83") + versions=$(python .github/scripts/ci_workflow.py loop-versions setuptools ">=77.0.1,<83") echo "versions=$versions" >> "$GITHUB_OUTPUT" check-setuptools: diff --git a/scripts/arch.md b/scripts/arch.md index f4af337a2..7129b18f4 100644 --- a/scripts/arch.md +++ b/scripts/arch.md @@ -21,8 +21,10 @@ - `ci_workflow.py activate-test-env` resolves `GPU_COUNT`, `HAS_SPECIFIC_DEPS`, `ENV_NAME`, and `UV_CACHE_DIR`. - `ci_workflow.py setup-specific-env` applies per-test compiler/python settings and test-specific install/uninstall package rules from `deps.yaml` and `blacklist.yaml`. - `ci_workflow.py install-package` serializes source installs with lock files so only one job populates a dedicated env at a time. -- `ci_gpu.py allocate` and `ci_gpu.py release` talk to the shared GPU allocator service. -- `ci_tests.py run` executes pytest, streams logs, keeps GPU leases alive, and logs VRAM usage. +- `.github/scripts/ci_workflow.py` owns workflow-oriented commands: test discovery, per-test env resolution, and package-version matrix generation. +- `.github/scripts/ci_deps.py` owns dependency install/uninstall commands and their shared YAML/package logic. +- `.github/scripts/ci_gpu.py` owns allocator lease commands and the shared allocator client logic. +- `.github/scripts/ci_tests.py` owns pytest execution and failure-log extraction. - `ci_tests.py check-log` prints the same failure excerpts the old shell step grepped from the test log. 
## Config files From ea5a423b477117b8a77460f1ac7fc263b628659f Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 09:48:26 +0800 Subject: [PATCH 2/9] [CI] install pkgs --- .github/workflows/unit_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 4f4227237..abd538a52 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -265,6 +265,7 @@ jobs: run: | echo "-- loading unit test's config --" source /opt/uv/setup_uv_venv.sh unit_test_env + uv pip install requests packaging -U eval "$(python3 .github/scripts/ci_workflow.py resolve-env \ --group tests \ --test-name "${{ matrix.test_script }}" \ @@ -718,7 +719,7 @@ jobs: - name: Generate version matrix id: parser run: | - python -m pip install --upgrade requests packaging + python -m pip install --upgrade requests packaging pyyaml versions=$(python .github/scripts/ci_workflow.py loop-versions setuptools ">=77.0.1,<83") echo "versions=$versions" >> "$GITHUB_OUTPUT" From 695c17f270af9f0f563c1f7e1b296f776d82f301 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 09:56:24 +0800 Subject: [PATCH 3/9] [CI] install pkgs --- .github/workflows/unit_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index abd538a52..a6ab87a1d 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -540,6 +540,7 @@ jobs: run: | echo "-- loading unit test's config --" source /opt/uv/setup_uv_venv.sh unit_test_env + uv pip install requests packaging -U eval "$(python3 .github/scripts/ci_workflow.py resolve-env \ --group tests \ --test-name "${{ matrix.test_script }}" \ From d08d8ea33fbb3c11b1ac508b12ed77e3d252a5d0 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 10:00:04 +0800 Subject: [PATCH 4/9] [CI] test torch with github runner --- 
.github/workflows/unit_tests.yml | 43 ++++++++++++++------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a6ab87a1d..5a8dec701 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -663,16 +663,7 @@ jobs: rm -rf "${{ env.VIRTUAL_ENV }}" check-torch: - runs-on: [ self-hosted, xeon5 ] - container: - image: 10.0.13.31:5000/nvidia/cuda:130-ubuntu24.04_0325 - options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all - volumes: - - /monster/ci/env/entrypoint.sh:/entrypoint.sh - - /monster/ci/env/entrypoint.sh:/etc/profile.d/01-entrypoint.sh - - /dev/dri/by-path:/dev/dri/by-path - - /monster/ci/uv:/opt/uv - - /monster/ci/env:/opt/env + runs-on: ubuntu-latest steps: - name: Checkout Codes uses: actions/checkout@v6 @@ -680,31 +671,35 @@ jobs: repository: ${{ env.repo }} ref: ${{ env.ref }} + - uses: actions/setup-python@v6 + with: + python-version: "3.14" + cache: pip + + - name: Install package with selected setuptools + run: | + python -m pip install --upgrade pip + python -m pip install . "setuptools==${{ matrix.version }}" + - name: Test pypi pip run: | - uv venv pypi_pip_env - source pypi_pip_env/bin/activate - uv pip install pip -U - pip install gptqmodel torch -U + python -m pip install --upgrade pip + python -m pip install gptqmodel torch - name: Test pypi uv run: | - uv venv pypi_uv_env - source pypi_uv_env/bin/activate - uv pip install gptqmodel torch -U + python -m pip install --upgrade uv + uv pip install gptqmodel torch - name: test local pip run: | - uv venv local_pip_env - source local_pip_env/bin/activate - uv pip install pip -U - pip install . torch -U + python -m pip install --upgrade pip + pip install . torch - name: test local uv run: | - uv venv local_uv_env - source local_uv_env/bin/activate - uv pip install . torch -U + python -m pip install --upgrade uv + uv pip install . 
torch prepare-setuptools: From f823d9287f8f200d65f1e376553a204e1c41112d Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 10:05:17 +0800 Subject: [PATCH 5/9] [CI] fix eval caused command not found --- .github/scripts/ci_workflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/ci_workflow.py b/.github/scripts/ci_workflow.py index 4d1d34d67..6c0c881fc 100644 --- a/.github/scripts/ci_workflow.py +++ b/.github/scripts/ci_workflow.py @@ -171,11 +171,12 @@ def cmd_resolve_env(args: argparse.Namespace) -> int: append_github_env("GPU_COUNT", gpu) append_github_env("ENV_NAME", env_name) - print(f"using py={py} gpu={gpu} env={env_name} for test {args.test_name}") if args.shell: print(f"PYTHON_VERSION={shlex.quote(py)}") print(f"GPU_COUNT={shlex.quote(gpu)}") print(f"ENV_NAME={shlex.quote(env_name)}") + else: + print(f"using py={py} gpu={gpu} env={env_name} for test {args.test_name}") return 0 From fae54b035863aa4a318296979d627c3ff30825b5 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 10:08:22 +0800 Subject: [PATCH 6/9] [CI] remove unused --- .github/workflows/unit_tests.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 5a8dec701..5257e1dc1 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -676,11 +676,6 @@ jobs: python-version: "3.14" cache: pip - - name: Install package with selected setuptools - run: | - python -m pip install --upgrade pip - python -m pip install . 
"setuptools==${{ matrix.version }}" - - name: Test pypi pip run: | python -m pip install --upgrade pip From d3fb31391a8230085f212f09d4f999f05de64cdf Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 10:12:07 +0800 Subject: [PATCH 7/9] Revert "[CI] test torch with github runner" This reverts commit d08d8ea3 --- .github/workflows/unit_tests.yml | 38 ++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 5257e1dc1..a6ab87a1d 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -663,7 +663,16 @@ jobs: rm -rf "${{ env.VIRTUAL_ENV }}" check-torch: - runs-on: ubuntu-latest + runs-on: [ self-hosted, xeon5 ] + container: + image: 10.0.13.31:5000/nvidia/cuda:130-ubuntu24.04_0325 + options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all + volumes: + - /monster/ci/env/entrypoint.sh:/entrypoint.sh + - /monster/ci/env/entrypoint.sh:/etc/profile.d/01-entrypoint.sh + - /dev/dri/by-path:/dev/dri/by-path + - /monster/ci/uv:/opt/uv + - /monster/ci/env:/opt/env steps: - name: Checkout Codes uses: actions/checkout@v6 @@ -671,30 +680,31 @@ jobs: repository: ${{ env.repo }} ref: ${{ env.ref }} - - uses: actions/setup-python@v6 - with: - python-version: "3.14" - cache: pip - - name: Test pypi pip run: | - python -m pip install --upgrade pip - python -m pip install gptqmodel torch + uv venv pypi_pip_env + source pypi_pip_env/bin/activate + uv pip install pip -U + pip install gptqmodel torch -U - name: Test pypi uv run: | - python -m pip install --upgrade uv - uv pip install gptqmodel torch + uv venv pypi_uv_env + source pypi_uv_env/bin/activate + uv pip install gptqmodel torch -U - name: test local pip run: | - python -m pip install --upgrade pip - pip install . torch + uv venv local_pip_env + source local_pip_env/bin/activate + uv pip install pip -U + pip install . 
torch -U - name: test local uv run: | - python -m pip install --upgrade uv - uv pip install . torch + uv venv local_uv_env + source local_uv_env/bin/activate + uv pip install . torch -U prepare-setuptools: From 39fac559d514b805717e6703e0d9ec7f1c5396ca Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 11:17:44 +0800 Subject: [PATCH 8/9] [CI] skip gpu request if no gpu was needed --- .github/workflows/unit_tests.yml | 24 +++++++++++++++---- tests/module_tree/test_auto_detect.py | 1 + tests/module_tree/test_moe_flag_parsing.py | 3 +-- tests/qcfg/test_activation.py | 1 + tests/qcfg/test_fallback_meta.py | 1 + tests/qcfg/test_zero_point.py | 2 +- tests/test_adapter_config.py | 3 +-- tests/test_awq_jit_include_paths.py | 2 +- tests/test_compute_device_filter.py | 1 + tests/test_cpp_jit_progress.py | 2 +- tests/test_cpp_nvcc_flags.py | 2 +- tests/test_evalution_suite_stream_defaults.py | 1 + tests/test_format_conversion_flow.py | 2 +- tests/test_hf_init_guard.py | 2 +- tests/test_hf_utils.py | 2 +- tests/test_logger.py | 2 +- tests/test_qzero_offsets.py | 2 +- tests/test_random_string.py | 1 + tests/test_require_pkgs.py | 2 +- tests/test_threadpoolctl.py | 3 +-- tests/test_weight_only_config.py | 2 +- 21 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a6ab87a1d..47cb08fef 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -390,8 +390,16 @@ jobs: uv pip list echo "::endgroup::" + - name: Detect GPU marker + run: | + if grep -q '^# GPU=-1$' "${{ matrix.test_script }}"; then + echo "SKIP_GPU_ALLOCATION=true" >> "$GITHUB_ENV" + else + echo "SKIP_GPU_ALLOCATION=false" >> "$GITHUB_ENV" + fi + - name: Find suitable GPU - if: ${{ !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') && !cancelled() }} + if: ${{ !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') && !cancelled() && 
env.SKIP_GPU_ALLOCATION != 'true' }} run: | python .github/scripts/ci_gpu.py allocate \ --base-url "${GPU_ALLOCATOR_URL}" \ @@ -427,7 +435,7 @@ jobs: --test-script "${{ matrix.test_script }}" - name: Release GPU - if: always() && !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') + if: always() && !contains(matrix.test_script, 'ipex') && !contains(matrix.test_script, 'xpu') && env.SKIP_GPU_ALLOCATION != 'true' run: | python .github/scripts/ci_gpu.py release \ --base-url "${GPU_ALLOCATOR_URL}" \ @@ -615,8 +623,16 @@ jobs: uv pip list echo "::endgroup::" + - name: Detect GPU marker + run: | + if grep -q '^# GPU=-1$' "${{ matrix.test_script }}"; then + echo "SKIP_GPU_ALLOCATION=true" >> "$GITHUB_ENV" + else + echo "SKIP_GPU_ALLOCATION=false" >> "$GITHUB_ENV" + fi + - name: Find suitable GPU - if: ${{ !cancelled() }} + if: ${{ !cancelled() && env.SKIP_GPU_ALLOCATION != 'true' }} run: | python .github/scripts/ci_gpu.py allocate \ --base-url "${GPU_ALLOCATOR_URL}" \ @@ -645,7 +661,7 @@ jobs: --test-script "${{ matrix.test_script }}" - name: Release GPU - if: always() + if: always() && env.SKIP_GPU_ALLOCATION != 'true' run: | python .github/scripts/ci_gpu.py release \ --base-url "${GPU_ALLOCATOR_URL}" \ diff --git a/tests/module_tree/test_auto_detect.py b/tests/module_tree/test_auto_detect.py index c7dd04403..39179a726 100644 --- a/tests/module_tree/test_auto_detect.py +++ b/tests/module_tree/test_auto_detect.py @@ -1,3 +1,4 @@ +# GPU=-1 import unittest import torch.nn as nn diff --git a/tests/module_tree/test_moe_flag_parsing.py b/tests/module_tree/test_moe_flag_parsing.py index f56920d80..754d12361 100644 --- a/tests/module_tree/test_moe_flag_parsing.py +++ b/tests/module_tree/test_moe_flag_parsing.py @@ -4,8 +4,7 @@ """ Unit tests for :moe flag parsing and MoE module detection. 
""" - - +# GPU=-1 from gptqmodel.models.base import MOE_FLAG, BaseQModel diff --git a/tests/qcfg/test_activation.py b/tests/qcfg/test_activation.py index 1e2b27123..b65f25293 100644 --- a/tests/qcfg/test_activation.py +++ b/tests/qcfg/test_activation.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium +# GPU=-1 import pytest from gptqmodel.quantization import METHOD, QuantizeConfig diff --git a/tests/qcfg/test_fallback_meta.py b/tests/qcfg/test_fallback_meta.py index b09755208..69a2aa3ec 100644 --- a/tests/qcfg/test_fallback_meta.py +++ b/tests/qcfg/test_fallback_meta.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium +# GPU=-1 from gptqmodel.quantization.config import Fallback, QuantizeConfig, SmoothMAD diff --git a/tests/qcfg/test_zero_point.py b/tests/qcfg/test_zero_point.py index cba76a18c..215428a24 100644 --- a/tests/qcfg/test_zero_point.py +++ b/tests/qcfg/test_zero_point.py @@ -1,4 +1,4 @@ - +# GPU=-1 from gptqmodel.quantization.config import FORMAT, METHOD, QuantizeConfig diff --git a/tests/test_adapter_config.py b/tests/test_adapter_config.py index dc635087a..dc16942ba 100644 --- a/tests/test_adapter_config.py +++ b/tests/test_adapter_config.py @@ -14,12 +14,12 @@ # limitations under the License. 
# -- do not touch +# GPU=-1 import os from gptqmodel import QuantizeConfig from gptqmodel.adapter.adapter import Lora, normalize_adapter - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -90,4 +90,3 @@ def test_extension_embed(self): assert qconfig.adapter.rank == rank - diff --git a/tests/test_awq_jit_include_paths.py b/tests/test_awq_jit_include_paths.py index 19c2f0d63..cf61ad557 100644 --- a/tests/test_awq_jit_include_paths.py +++ b/tests/test_awq_jit_include_paths.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 from __future__ import annotations import gptqmodel.utils.awq as awq_module diff --git a/tests/test_compute_device_filter.py b/tests/test_compute_device_filter.py index d666da8d5..1114416af 100644 --- a/tests/test_compute_device_filter.py +++ b/tests/test_compute_device_filter.py @@ -1,3 +1,4 @@ +# GPU=-1 import types import torch diff --git a/tests/test_cpp_jit_progress.py b/tests/test_cpp_jit_progress.py index 9d7949eb8..0feff61a4 100644 --- a/tests/test_cpp_jit_progress.py +++ b/tests/test_cpp_jit_progress.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 import time import pytest diff --git a/tests/test_cpp_nvcc_flags.py b/tests/test_cpp_nvcc_flags.py index 589676e20..9f3876320 100644 --- a/tests/test_cpp_nvcc_flags.py +++ b/tests/test_cpp_nvcc_flags.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 from __future__ import annotations import subprocess diff --git a/tests/test_evalution_suite_stream_defaults.py b/tests/test_evalution_suite_stream_defaults.py index 3a618c72d..826883543 100644 --- a/tests/test_evalution_suite_stream_defaults.py +++ b/tests/test_evalution_suite_stream_defaults.py @@ -1,3 +1,4 @@ +# GPU=-1 from __future__ import annotations from types import SimpleNamespace diff --git a/tests/test_format_conversion_flow.py 
b/tests/test_format_conversion_flow.py index 2d073f3ad..c4554615b 100644 --- a/tests/test_format_conversion_flow.py +++ b/tests/test_format_conversion_flow.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium - +# GPU=-1 import threading from unittest import mock diff --git a/tests/test_hf_init_guard.py b/tests/test_hf_init_guard.py index 2bd98de02..581eabae2 100644 --- a/tests/test_hf_init_guard.py +++ b/tests/test_hf_init_guard.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 import pytest import torch import torch.nn as nn diff --git a/tests/test_hf_utils.py b/tests/test_hf_utils.py index d36d3ab16..6e743fdaf 100644 --- a/tests/test_hf_utils.py +++ b/tests/test_hf_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium - +# GPU=-1 import tempfile from torch import nn diff --git a/tests/test_logger.py b/tests/test_logger.py index 83d42c995..5a51d4a71 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 import sys from gptqmodel.utils.logger import setup_logger diff --git a/tests/test_qzero_offsets.py b/tests/test_qzero_offsets.py index 0a769196c..0deedc63e 100644 --- a/tests/test_qzero_offsets.py +++ b/tests/test_qzero_offsets.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 from types import SimpleNamespace import pytest diff --git a/tests/test_random_string.py b/tests/test_random_string.py index 2c385b273..f6f645523 100644 --- a/tests/test_random_string.py +++ b/tests/test_random_string.py @@ -1,3 +1,4 @@ +# GPU=-1 import random import string diff --git a/tests/test_require_pkgs.py 
b/tests/test_require_pkgs.py index 227d8b090..c639ff1bf 100644 --- a/tests/test_require_pkgs.py +++ b/tests/test_require_pkgs.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium - +# GPU=-1 from importlib.metadata import PackageNotFoundError import pytest diff --git a/tests/test_threadpoolctl.py b/tests/test_threadpoolctl.py index 68bc96ee1..6535fa401 100644 --- a/tests/test_threadpoolctl.py +++ b/tests/test_threadpoolctl.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2025 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 import time from typing import Dict, List @@ -76,4 +76,3 @@ def test_threadpool_limits_inside_device_threadpool(): ) finally: pool.shutdown(wait=True) - diff --git a/tests/test_weight_only_config.py b/tests/test_weight_only_config.py index a04bdbf67..04917e652 100644 --- a/tests/test_weight_only_config.py +++ b/tests/test_weight_only_config.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 ModelCloud.ai # SPDX-License-Identifier: Apache-2.0 - +# GPU=-1 from dataclasses import fields from inspect import signature From 7ad54cf5d1d4cb9ec7bccb823870a30758c9485d Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Fri, 17 Apr 2026 11:37:27 +0800 Subject: [PATCH 9/9] Potential fix for pull request finding 'Empty except' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- .github/scripts/ci_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/ci_tests.py b/.github/scripts/ci_tests.py index 22ce86212..9bdcfe85d 100644 --- a/.github/scripts/ci_tests.py +++ b/.github/scripts/ci_tests.py @@ -33,7 +33,8 @@ def kill_process_group(proc: subprocess.Popen[str]) -> None: try: os.killpg(proc.pid, signal.SIGKILL) except ProcessLookupError: - pass + # Process group may already have exited; nothing to kill. + return def start_keepalive_monitor(