Add an Aiter prebuild timing summary to the aiter-test workflow.

NOTE(review): the patch text below appears to be whitespace-collapsed (the
original line breaks and indentation are not visible), so it presumably has
to be regenerated from the commit before a patch tool will accept it.

.github/scripts/aiter_prebuild_summary.py (new file)
  * parse_module_costs(log_path) scans the log line by line for
    "finish build [<module>], cost <seconds>s" and returns a list of
    (module, seconds) pairs. If the log cannot be read (OSError) it prints a
    "::warning::" line and returns whatever was collected so far.
  * parse_args() requires --log, --build-status, --start and --end; the
    optional --kernel-glob defaults to "aiter/jit/*.so".
  * main() prints the runner name and the GPU_ARCHS, PREBUILD_KERNELS and
    MAX_JOBS environment values (each falls back to "unknown"), the build
    status, the wall time computed as max(0, end - start), the number of
    paths matching --kernel-glob, the number of module builds observed, the
    summed module cost, the 20 slowest modules, and every module cost in
    sorted (name, cost) order. When no cost lines were found it prints a
    "::warning::" line instead of the two listings. It always returns 0.
  * NOTE(review): the cost group in FINISH_RE is [0-9.]+, which can also
    capture text such as "1.2.3"; float() would then raise ValueError, and
    only OSError is caught. Confirm the log format rules this out.

.github/workflows/aiter-test.yaml
  * Adds the workflow-level variable AITER_PREBUILD_MAX_JOBS ("96") and uses
    it for MAX_JOBS in place of the previously hard-coded 128.
  * Passes AITER_PREBUILD_MAX_JOBS and AITER_RUNNER_NAME into the container
    via "docker run -e".
  * Runs "python setup.py bdist_wheel" with errexit disabled, tees its output
    to .aiter-prebuild.log, takes the build status from PIPESTATUS[0], writes
    BUILD_STATUS / PREBUILD_START / PREBUILD_END to .aiter-prebuild.env, and
    exits with the build status when it is non-zero.
  * Adds a "Summarize Aiter prebuild" step (if: always()) that sources
    .aiter-prebuild.env and runs the summary script; when the env file is
    missing it emits a "::warning::" and exits 0.
  * NOTE(review): the setup commands ahead of the build appear to be joined
    with "&&" under "set -e". A failure in the middle of an "&&" list does
    not trigger errexit, so execution would presumably continue to the
    build_status assignment and then hit an unset prebuild_start under
    "set -u". Verify the intended failure path.

diff --git a/.github/scripts/aiter_prebuild_summary.py b/.github/scripts/aiter_prebuild_summary.py new file mode 100644 index 0000000000..00570500b3 --- /dev/null +++ b/.github/scripts/aiter_prebuild_summary.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""Summarize Aiter prebuild timing from setup.py build logs.""" + +from __future__ import annotations + +import argparse +import glob +import os +import re +from pathlib import Path + +FINISH_RE = re.compile(r"finish build \[([^\]]+)\], cost ([0-9.]+)s") + + +def parse_module_costs(log_path: Path) -> list[tuple[str, float]]: + module_costs: list[tuple[str, float]] = [] + try: + with log_path.open(encoding="utf-8", errors="replace") as log: + for line in log: + match = FINISH_RE.search(line) + if match: + module_costs.append((match.group(1), float(match.group(2)))) + except OSError as exc: + print(f"::warning::Unable to read prebuild log {log_path}: {exc}") + return module_costs + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--log", required=True, type=Path, help="Path to the tee'd prebuild log" + ) + parser.add_argument( + "--build-status", required=True, type=int, help="setup.py exit code" + ) + parser.add_argument( + "--start", required=True, type=int, help="Prebuild start timestamp in seconds" + ) + parser.add_argument( + "--end", required=True, type=int, help="Prebuild end timestamp in seconds" + ) + parser.add_argument( + "--kernel-glob", + default="aiter/jit/*.so", + help="Glob used to count prebuilt kernel shared objects", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + module_costs = parse_module_costs(args.log) + kernels = sorted(glob.glob(args.kernel_glob)) + wall_seconds = max(0, args.end - args.start) + total_module_seconds = sum(cost for _, cost in module_costs) + + print("=== Aiter prebuild summary ===") + print(f"Runner: {os.environ.get('AITER_RUNNER_NAME', 'unknown')}") + print(f"GPU_ARCHS: 
{os.environ.get('GPU_ARCHS', 'unknown')}") + print(f"PREBUILD_KERNELS: {os.environ.get('PREBUILD_KERNELS', 'unknown')}") + print(f"MAX_JOBS: {os.environ.get('MAX_JOBS', 'unknown')}") + print(f"Build status: {args.build_status}") + print(f"Prebuild wall time: {wall_seconds}s ({wall_seconds / 60:.1f} min)") + print(f"Kernel count: {len(kernels)}") + print(f"Module builds observed: {len(module_costs)}") + print( + f"Total module compile cost: {total_module_seconds:.1f}s ({total_module_seconds / 60:.1f} min)" + ) + + if module_costs: + print("Top slowest module builds:") + for name, cost in sorted(module_costs, key=lambda item: item[1], reverse=True)[ + :20 + ]: + print(f" {name}: {cost:.1f}s") + + print("All module build costs (seconds):") + for name, cost in sorted(module_costs): + print(f" {name}: {cost:.1f}") + else: + print("::warning::No module build cost lines were found in the prebuild log") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml index 6a71d8b7ae..b6fe9ac58d 100644 --- a/.github/workflows/aiter-test.yaml +++ b/.github/workflows/aiter-test.yaml @@ -26,6 +26,7 @@ env: GPU_ARCH_LIST: "gfx942;gfx950" AITER_TEST: "op_tests" AITER_WHEEL_ARTIFACT_NAME: aiter-whl-${{ github.run_id }} + AITER_PREBUILD_MAX_JOBS: "96" jobs: check-signal: @@ -93,10 +94,13 @@ jobs: set -euo pipefail docker run --rm \ --network=host \ + -e AITER_PREBUILD_MAX_JOBS="${{ env.AITER_PREBUILD_MAX_JOBS }}" \ + -e AITER_RUNNER_NAME="${RUNNER_NAME:-unknown}" \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ ${{ env.DOCKER_IMAGE }} \ bash -lc ' + set -euo pipefail git config --global --add safe.directory /workspace && git -C /workspace rev-parse HEAD >/dev/null && shopt -s nullglob && @@ -107,11 +111,46 @@ jobs: pip install --upgrade "ninja>=1.11.1" && pip install --upgrade setuptools_scm && pip install tabulate && - echo "Prebuilding kernels with GPU_ARCHS: ${{ 
env.GPU_ARCH_LIST }}, PREBUILD_KERNELS: 1, and MAX_JOBS: 128" && - PREBUILD_KERNELS=1 MAX_JOBS=128 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py bdist_wheel && + echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }}, PREBUILD_KERNELS: 1, and MAX_JOBS: ${AITER_PREBUILD_MAX_JOBS}" && + export PREBUILD_KERNELS=1 && + export MAX_JOBS="${AITER_PREBUILD_MAX_JOBS}" && + export GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" && + prebuild_start=$(date +%s) && + set +e && + python setup.py bdist_wheel 2>&1 | tee .aiter-prebuild.log + build_status=${PIPESTATUS[0]} + set -e + prebuild_end=$(date +%s) + { + echo "BUILD_STATUS=${build_status}" + echo "PREBUILD_START=${prebuild_start}" + echo "PREBUILD_END=${prebuild_end}" + } > .aiter-prebuild.env + if [ "${build_status}" -ne 0 ]; then + exit "${build_status}" + fi ls -lh dist/*.whl ' + - name: Summarize Aiter prebuild + if: ${{ always() }} + run: | + set -euo pipefail + if [ ! -f .aiter-prebuild.env ]; then + echo "::warning::Aiter prebuild metadata was not generated" + exit 0 + fi + . ./.aiter-prebuild.env + AITER_RUNNER_NAME="${RUNNER_NAME:-unknown}" \ + GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" \ + PREBUILD_KERNELS=1 \ + MAX_JOBS="${{ env.AITER_PREBUILD_MAX_JOBS }}" \ + python3 .github/scripts/aiter_prebuild_summary.py \ + --log .aiter-prebuild.log \ + --build-status "${BUILD_STATUS}" \ + --start "${PREBUILD_START}" \ + --end "${PREBUILD_END}" + - name: Verify prebuilt kernels in wheel run: | set -euo pipefail