From 0b76e5d7f4638c9bd8288e3de8f499289d528447 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 11 May 2026 17:39:56 +0800 Subject: [PATCH 1/4] CI: stabilize Aiter wheel prebuilds Lower prebuild concurrency and emit timing summaries so CI can distinguish runner slowdowns from kernel count changes. --- .github/scripts/aiter_prebuild_summary.py | 76 +++++++++++++++++++++++ .github/workflows/aiter-test.yaml | 27 +++++++- 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 .github/scripts/aiter_prebuild_summary.py diff --git a/.github/scripts/aiter_prebuild_summary.py b/.github/scripts/aiter_prebuild_summary.py new file mode 100644 index 0000000000..97da3f5901 --- /dev/null +++ b/.github/scripts/aiter_prebuild_summary.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""Summarize Aiter prebuild timing from setup.py build logs.""" + +from __future__ import annotations + +import argparse +import glob +import os +import re +from pathlib import Path + + +FINISH_RE = re.compile(r"finish build \[([^\]]+)\], cost ([0-9.]+)s") + + +def parse_module_costs(log_path: Path) -> list[tuple[str, float]]: + module_costs: list[tuple[str, float]] = [] + try: + with log_path.open(encoding="utf-8", errors="replace") as log: + for line in log: + match = FINISH_RE.search(line) + if match: + module_costs.append((match.group(1), float(match.group(2)))) + except OSError as exc: + print(f"::warning::Unable to read prebuild log {log_path}: {exc}") + return module_costs + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--log", required=True, type=Path, help="Path to the tee'd prebuild log") + parser.add_argument("--build-status", required=True, type=int, help="setup.py exit code") + parser.add_argument("--start", required=True, type=int, help="Prebuild start timestamp in seconds") + parser.add_argument("--end", required=True, type=int, help="Prebuild end timestamp in seconds") + parser.add_argument( + "--kernel-glob", + default="aiter/jit/*.so", + help="Glob used to count prebuilt kernel shared objects", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + module_costs = parse_module_costs(args.log) + kernels = sorted(glob.glob(args.kernel_glob)) + wall_seconds = max(0, args.end - args.start) + total_module_seconds = sum(cost for _, cost in module_costs) + + print("=== Aiter prebuild summary ===") + print(f"Runner: {os.environ.get('AITER_RUNNER_NAME', 'unknown')}") + print(f"GPU_ARCHS: {os.environ.get('GPU_ARCHS', 'unknown')}") + print(f"PREBUILD_KERNELS: {os.environ.get('PREBUILD_KERNELS', 'unknown')}") + print(f"MAX_JOBS: {os.environ.get('MAX_JOBS', 'unknown')}") + print(f"Build status: {args.build_status}") + print(f"Prebuild wall time: {wall_seconds}s ({wall_seconds / 60:.1f} min)") + print(f"Kernel count: {len(kernels)}") + print(f"Module builds observed: {len(module_costs)}") + print(f"Total module compile cost: {total_module_seconds:.1f}s ({total_module_seconds / 60:.1f} min)") + + if module_costs: + print("Top slowest module builds:") + for name, cost in sorted(module_costs, key=lambda item: item[1], reverse=True)[:20]: + print(f" {name}: {cost:.1f}s") + + print("All module build costs (seconds):") + for name, cost in sorted(module_costs): + print(f" {name}: {cost:.1f}") + else: + print("::warning::No module build cost lines were found in the prebuild log") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml index 6a71d8b7ae..5ab4fcf658 100644 --- a/.github/workflows/aiter-test.yaml +++ b/.github/workflows/aiter-test.yaml @@ -26,6 +26,7 @@ env: GPU_ARCH_LIST: "gfx942;gfx950" AITER_TEST: "op_tests" AITER_WHEEL_ARTIFACT_NAME: aiter-whl-${{ github.run_id }} + AITER_PREBUILD_MAX_JOBS: "64" jobs: check-signal: @@ -93,10 +94,13 @@ jobs: set -euo pipefail docker run --rm \ --network=host \ + -e AITER_PREBUILD_MAX_JOBS="${{ env.AITER_PREBUILD_MAX_JOBS }}" \ + -e AITER_RUNNER_NAME="${RUNNER_NAME:-unknown}" \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ ${{ env.DOCKER_IMAGE }} \ bash -lc ' + set -euo pipefail git config --global --add safe.directory /workspace && git -C /workspace rev-parse HEAD >/dev/null && shopt -s nullglob && @@ -107,8 +111,27 @@ jobs: pip install --upgrade "ninja>=1.11.1" && pip install --upgrade setuptools_scm && pip install tabulate && - echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }}, PREBUILD_KERNELS: 1, and MAX_JOBS: 128" && - PREBUILD_KERNELS=1 MAX_JOBS=128 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py bdist_wheel && + echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }}, PREBUILD_KERNELS: 1, and MAX_JOBS: ${AITER_PREBUILD_MAX_JOBS}" && + export PREBUILD_KERNELS=1 && + export MAX_JOBS="${AITER_PREBUILD_MAX_JOBS}" && + export GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" && + prebuild_start=$(date +%s) && + set +e && + python setup.py bdist_wheel 2>&1 | tee /tmp/aiter-prebuild.log + build_status=${PIPESTATUS[0]} + set -e + prebuild_end=$(date +%s) + if ! python3 .github/scripts/aiter_prebuild_summary.py \ + --log /tmp/aiter-prebuild.log \ + --build-status "${build_status}" \ + --start "${prebuild_start}" \ + --end "${prebuild_end}" + then + echo "::warning::Failed to produce Aiter prebuild summary" + fi + if [ "${build_status}" -ne 0 ]; then + exit "${build_status}" + fi ls -lh dist/*.whl ' From 85c545f5001eb53fbf2784eefea34c410cbf847c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 11 May 2026 17:45:23 +0800 Subject: [PATCH 2/4] CI: format prebuild summary script Apply Black formatting to keep the CI style check green. --- .github/scripts/aiter_prebuild_summary.py | 25 ++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/.github/scripts/aiter_prebuild_summary.py b/.github/scripts/aiter_prebuild_summary.py index 97da3f5901..00570500b3 100644 --- a/.github/scripts/aiter_prebuild_summary.py +++ b/.github/scripts/aiter_prebuild_summary.py @@ -9,7 +9,6 @@ import re from pathlib import Path - FINISH_RE = re.compile(r"finish build \[([^\]]+)\], cost ([0-9.]+)s") @@ -28,10 +27,18 @@ def parse_module_costs(log_path: Path) -> list[tuple[str, float]]: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--log", required=True, type=Path, help="Path to the tee'd prebuild log") - parser.add_argument("--build-status", required=True, type=int, help="setup.py exit code") - parser.add_argument("--start", required=True, type=int, help="Prebuild start timestamp in seconds") - parser.add_argument("--end", required=True, type=int, help="Prebuild end timestamp in seconds") + parser.add_argument( + "--log", required=True, type=Path, help="Path to the tee'd prebuild log" + ) + parser.add_argument( + "--build-status", required=True, type=int, help="setup.py exit code" + ) + parser.add_argument( + "--start", required=True, type=int, help="Prebuild start timestamp in seconds" + ) + parser.add_argument( + "--end", required=True, type=int, help="Prebuild end timestamp in seconds" + ) parser.add_argument( "--kernel-glob", default="aiter/jit/*.so", @@ -56,11 +63,15 @@ def main() -> int: print(f"Prebuild wall time: {wall_seconds}s ({wall_seconds / 60:.1f} min)") print(f"Kernel count: {len(kernels)}") print(f"Module builds observed: {len(module_costs)}") - print(f"Total module compile cost: {total_module_seconds:.1f}s ({total_module_seconds / 60:.1f} min)") + print( + f"Total module compile cost: {total_module_seconds:.1f}s ({total_module_seconds / 60:.1f} min)" + ) if module_costs: print("Top slowest module builds:") - for name, cost in sorted(module_costs, key=lambda item: item[1], reverse=True)[:20]: + for name, cost in sorted(module_costs, key=lambda item: item[1], reverse=True)[ + :20 + ]: print(f" {name}: {cost:.1f}s") print("All module build costs (seconds):") From 766457f032b41a2d8aa9986cf310204946aec512 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 11 May 2026 22:59:18 +0800 Subject: [PATCH 3/4] CI: split Aiter prebuild summary into its own step Persist the prebuild log and timing metadata so the summary is easy to find in the Actions UI. --- .github/workflows/aiter-test.yaml | 34 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml index 5ab4fcf658..a2ce1c3a51 100644 --- a/.github/workflows/aiter-test.yaml +++ b/.github/workflows/aiter-test.yaml @@ -117,24 +117,40 @@ jobs: export GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" && prebuild_start=$(date +%s) && set +e && - python setup.py bdist_wheel 2>&1 | tee /tmp/aiter-prebuild.log + python setup.py bdist_wheel 2>&1 | tee .aiter-prebuild.log build_status=${PIPESTATUS[0]} set -e prebuild_end=$(date +%s) - if ! python3 .github/scripts/aiter_prebuild_summary.py \ - --log /tmp/aiter-prebuild.log \ - --build-status "${build_status}" \ - --start "${prebuild_start}" \ - --end "${prebuild_end}" - then - echo "::warning::Failed to produce Aiter prebuild summary" - fi + { + echo "BUILD_STATUS=${build_status}" + echo "PREBUILD_START=${prebuild_start}" + echo "PREBUILD_END=${prebuild_end}" + } > .aiter-prebuild.env if [ "${build_status}" -ne 0 ]; then exit "${build_status}" fi ls -lh dist/*.whl ' + - name: Summarize Aiter prebuild + if: ${{ always() }} + run: | + set -euo pipefail + if [ ! -f .aiter-prebuild.env ]; then + echo "::warning::Aiter prebuild metadata was not generated" + exit 0 + fi + . ./.aiter-prebuild.env + AITER_RUNNER_NAME="${RUNNER_NAME:-unknown}" \ + GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" \ + PREBUILD_KERNELS=1 \ + MAX_JOBS="${{ env.AITER_PREBUILD_MAX_JOBS }}" \ + python3 .github/scripts/aiter_prebuild_summary.py \ + --log .aiter-prebuild.log \ + --build-status "${BUILD_STATUS}" \ + --start "${PREBUILD_START}" \ + --end "${PREBUILD_END}" + - name: Verify prebuilt kernels in wheel run: | set -euo pipefail From d041b0f1ece020cfec09464cb6a876c7df8ff648 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 12 May 2026 10:01:16 +0800 Subject: [PATCH 4/4] CI: tune Aiter prebuild concurrency Raise the prebuild MAX_JOBS default to 96 to recover compiler parallelism while keeping it below the previous 128 setting. --- .github/workflows/aiter-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml index a2ce1c3a51..b6fe9ac58d 100644 --- a/.github/workflows/aiter-test.yaml +++ b/.github/workflows/aiter-test.yaml @@ -26,7 +26,7 @@ env: GPU_ARCH_LIST: "gfx942;gfx950" AITER_TEST: "op_tests" AITER_WHEEL_ARTIFACT_NAME: aiter-whl-${{ github.run_id }} - AITER_PREBUILD_MAX_JOBS: "64" + AITER_PREBUILD_MAX_JOBS: "96" jobs: check-signal: