|
9 | 9 | - main |
10 | 10 | workflow_dispatch: |
11 | 11 |
|
# Default GITHUB_TOKEN scopes for every job in this workflow: read-only.
permissions:
  actions: read
  contents: read
  pull-requests: read
| 16 | + |
# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: '${{ github.workflow }}-${{ github.ref }}'
|
19 | 24 | GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }} |
20 | 25 |
|
21 | 26 | jobs: |
| 27 | + # --------------------------------------------------------------------------- |
| 28 | + # Single-GPU tests: kernels, unit, examples, MLIR FileCheck, benchmarks. |
| 29 | + # Runs on 1-GPU and Navi runners only. |
| 30 | + # --------------------------------------------------------------------------- |
22 | 31 | test: |
23 | 32 | strategy: |
24 | 33 | matrix: |
25 | | - runners: [ 'linux-flydsl-mi325-1', 'linux-flydsl-mi355-1', 'linux-flydsl-navi-2' ] |
| 34 | + runners: [ |
| 35 | + 'linux-flydsl-mi325-1', |
| 36 | + 'linux-flydsl-mi355-1', |
| 37 | + 'linux-flydsl-navi-2', |
| 38 | + ] |
26 | 39 | fail-fast: false |
27 | 40 | runs-on: ${{ matrix.runners }} |
28 | 41 | steps: |
@@ -169,3 +182,167 @@ jobs: |
169 | 182 | run: | |
170 | 183 | docker stop flydsl_test |
171 | 184 | docker rm flydsl_test |
| 185 | +
|
# ---------------------------------------------------------------------------
# Multi-GPU allreduce tests: ONLY for 8-GPU runners.
# Runs on BOTH linux-flydsl-mi325-8 AND linux-flydsl-mi355-8 independently.
# fail-fast: false ensures both runners always complete even if one fails.
#
# Flow: build the PR inside a container, run the multi_gpu pytest selection,
# benchmark the PR, then (best effort) build and benchmark origin/main and
# compare the two CSV files.
# ---------------------------------------------------------------------------
multi-gpu:
  needs: test
  name: Multi-GPU AllReduce Tests (${{ matrix.runners }})
  timeout-minutes: 120
  strategy:
    matrix:
      runners: [
        'linux-flydsl-mi325-8',
        'linux-flydsl-mi355-8',
      ]
    fail-fast: false
  runs-on: ${{ matrix.runners }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        repository: ${{ env.GITHUB_REPO_NAME }}
        ref: ${{ env.GITHUB_COMMIT_SHA }}
        path: flydsl-test

    - name: Start CI container
      run: |
        echo "Clean up containers..."
        docker ps -aq -f name=flydsl_test | xargs -r docker stop | xargs -r docker rm || true

        echo "Start CI container..."
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi

        # DEVICE_FLAG is left unquoted on purpose so it splits into separate
        # docker arguments.
        docker run -dt --network=host --user root --device=/dev/kfd $DEVICE_FLAG \
          -v "${GITHUB_WORKSPACE:-$PWD}/flydsl-test:/flydsl-test" \
          --ipc=host --group-add video \
          --shm-size 16g \
          --cap-add=SYS_PTRACE \
          --security-opt seccomp=unconfined \
          -w /flydsl-test \
          --name flydsl_test \
          ${{ env.DOCKER_IMAGE }}
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        docker exec flydsl_test bash -c "apt-get update && apt-get install -y cmake build-essential patchelf"
        docker exec flydsl_test bash -c "python3 -m pip install -U pip setuptools wheel"
        # The requirement must be quoted: a bare '>' is a shell redirection,
        # which would install an unconstrained ninja and write pip's output
        # to a file named '=1.11.1'.
        docker exec flydsl_test bash -c "python3 -m pip install 'ninja>=1.11.1'"
        docker exec flydsl_test bash -c "python3 -m pip install -U 'hypothesis>=6.82.0'"
        docker exec flydsl_test bash -c "git config --global --add safe.directory /flydsl-test && cd /flydsl-test && git log"

    - name: Restore cached MLIR install tarball (if available)
      id: mlir-cache
      uses: actions/cache@v4
      with:
        path: mlir_install.tgz
        key: mlir-install-${{ matrix.runners }}-${{ hashFiles('flydsl-test/thirdparty/llvm-hash.txt', 'flydsl-test/scripts/build_llvm.sh', 'flydsl-test/CMakeLists.txt', 'flydsl-test/.github/workflows/flydsl.yaml') }}

    - name: Use cached MLIR install tarball (skip LLVM build)
      if: steps.mlir-cache.outputs.cache-hit == 'true'
      run: |
        ls -lh mlir_install.tgz
        docker cp mlir_install.tgz flydsl_test:/tmp/mlir_install.tgz
        docker exec flydsl_test bash -c "rm -rf /llvm-project/mlir_install && mkdir -p /llvm-project && tar -xzf /tmp/mlir_install.tgz -C /llvm-project"
        docker exec flydsl_test bash -c "ls -la /llvm-project/mlir_install/lib/cmake/mlir"

    - name: Build LLVM
      if: steps.mlir-cache.outputs.cache-hit != 'true'
      run: |
        set -ex
        docker exec flydsl_test bash -c "cd /flydsl-test && bash scripts/build_llvm.sh"
        docker exec flydsl_test bash -c "ls -la /llvm-project/mlir_install/lib/cmake/mlir"
        # Best effort: copy the tarball to the path the cache step watches.
        docker cp flydsl_test:/llvm-project/mlir_install.tgz ./mlir_install.tgz || true

    - name: Build FlyDSL (uses MLIR install prefix)
      run: |
        docker exec flydsl_test bash -c "export MLIR_PATH=/llvm-project/mlir_install && cd /flydsl-test && python3 -m pip install -e . --use-pep517"

    - name: Run multi-GPU allreduce tests
      timeout-minutes: 30
      run: |
        docker exec flydsl_test bash -c "
          cd /flydsl-test
          python3 -m pytest tests/kernels/test_allreduce.py \
            -m multi_gpu -v --no-header --tb=short
        "

    - name: Run allreduce benchmark (PR)
      timeout-minutes: 30
      run: |
        docker exec flydsl_test bash -c "
          cd /flydsl-test
          python3 tests/kernels/test_allreduce.py \
            --world_size 8 --iters 51 --warmup 5 \
            --allreduce_impl flydsl --mode cudagraph \
            --shapes '2,7168,fp16;32,8192,fp32;128,8192,fp16;1024,7168,bf16;4096,8192,bf16' \
            --output_csv /tmp/bench_pr.csv
        "

    # The baseline is best effort (continue-on-error): a broken main must not
    # fail the PR job.
    - name: Build main branch baseline
      id: build-main
      timeout-minutes: 20
      continue-on-error: true
      run: |
        # 'set -eo pipefail' makes the step outcome reflect real failures.
        # Without it the script's status is that of the trailing 'tail', so a
        # failed fetch/worktree/pip would still be reported as success.
        # 'git worktree prune' drops a stale registration of /tmp/flydsl-main
        # that may survive in the bind-mounted .git from an earlier run
        # (NOTE(review): only relevant if the runner workspace persists).
        docker exec flydsl_test bash -c "
          set -eo pipefail
          cd /flydsl-test
          git worktree prune
          git fetch origin main --depth=1
          git worktree add /tmp/flydsl-main origin/main
          cd /tmp/flydsl-main
          export MLIR_PATH=/llvm-project/mlir_install
          python3 -m pip install -e . --use-pep517 2>&1 | tail -5
        "

    - name: Run allreduce benchmark (main)
      id: bench-main
      if: steps.build-main.outcome == 'success'
      timeout-minutes: 30
      continue-on-error: true
      run: |
        # The PR's benchmark script is copied over main's so both runs use the
        # same driver. 'set -e' stops at a failed cp/cd instead of benchmarking
        # from the wrong directory.
        docker exec flydsl_test bash -c "
          set -e
          cp /flydsl-test/tests/kernels/test_allreduce.py \
             /tmp/flydsl-main/tests/kernels/test_allreduce.py
          cd /tmp/flydsl-main
          python3 tests/kernels/test_allreduce.py \
            --world_size 8 --iters 51 --warmup 5 \
            --allreduce_impl flydsl --mode cudagraph \
            --shapes '2,7168,fp16;32,8192,fp32;128,8192,fp16;1024,7168,bf16;4096,8192,bf16' \
            --output_csv /tmp/bench_main.csv
        "

    # Compare only when the baseline benchmark actually succeeded. 'outcome'
    # is the result before continue-on-error is applied, so a failed baseline
    # reads 'failure' here and the comparison is skipped rather than run
    # against a missing or partial /tmp/bench_main.csv.
    - name: Check performance regression (PR vs main)
      if: steps.bench-main.outcome == 'success'
      timeout-minutes: 5
      run: |
        docker exec flydsl_test bash -c "
          cd /flydsl-test
          python3 tests/kernels/compare_allreduce_benchmark.py \
            /tmp/bench_main.csv /tmp/bench_pr.csv
        "

    - name: Show test logs
      if: failure()
      run: |
        docker exec flydsl_test bash -c 'cd /tmp && tar czf /tmp/logs.tgz *.log 2>/dev/null || echo "no logs"'
        docker cp flydsl_test:/tmp/logs.tgz . || true
        if [ -f logs.tgz ]; then
          tar -xzf logs.tgz || true
          cat *.log || true
        else
          echo "logs.tgz not found; skipping log extraction"
        fi

    - name: Clean up
      if: always()
      run: |
        # Tolerate a container that was never created (e.g. checkout or
        # 'docker run' failed) so cleanup does not add a second failure.
        docker stop flydsl_test || true
        docker rm -f flydsl_test || true
0 commit comments