
Commit 6e635c6

yanboshao, yashao@amd.com, coderfeli, and claude authored

Add CI testcases and benchmark for allreduce (#387)

---------
Co-authored-by: yashao@amd.com <yashao@amd.com@tus1-p3-g24.cluster.local>
Co-authored-by: Felix Li <felix.li@amd.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f65e930 commit 6e635c6

15 files changed: 805 additions & 383 deletions

.github/workflows/flydsl.yaml

Lines changed: 178 additions & 1 deletion
@@ -9,6 +9,11 @@ on:
       - main
   workflow_dispatch:
 
+permissions:
+  contents: read
+  actions: read
+  pull-requests: read
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
@@ -19,10 +24,18 @@ env:
   GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}
 
 jobs:
+  # ---------------------------------------------------------------------------
+  # Single-GPU tests: kernels, unit, examples, MLIR FileCheck, benchmarks.
+  # Runs on 1-GPU and Navi runners only.
+  # ---------------------------------------------------------------------------
   test:
     strategy:
       matrix:
-        runners: [ 'linux-flydsl-mi325-1', 'linux-flydsl-mi355-1', 'linux-flydsl-navi-2' ]
+        runners: [
+          'linux-flydsl-mi325-1',
+          'linux-flydsl-mi355-1',
+          'linux-flydsl-navi-2',
+        ]
       fail-fast: false
     runs-on: ${{ matrix.runners }}
     steps:
@@ -169,3 +182,167 @@ jobs:
         run: |
           docker stop flydsl_test
           docker rm flydsl_test
+
+  # ---------------------------------------------------------------------------
+  # Multi-GPU allreduce tests: ONLY for 8-GPU runners.
+  # Runs on BOTH linux-flydsl-mi325-8 AND linux-flydsl-mi355-8 independently.
+  # fail-fast: false ensures both runners always complete even if one fails.
+  # ---------------------------------------------------------------------------
+  multi-gpu:
+    needs: test
+    name: Multi-GPU AllReduce Tests (${{ matrix.runners }})
+    timeout-minutes: 120
+    strategy:
+      matrix:
+        runners: [
+          'linux-flydsl-mi325-8',
+          'linux-flydsl-mi355-8',
+        ]
+      fail-fast: false
+    runs-on: ${{ matrix.runners }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ env.GITHUB_REPO_NAME }}
+          ref: ${{ env.GITHUB_COMMIT_SHA }}
+          path: flydsl-test
+
+      - name: Start CI container
+        run: |
+          echo "Clean up containers..."
+          docker ps -aq -f name=flydsl_test | xargs -r docker stop | xargs -r docker rm || true
+
+          echo "Start CI container..."
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+
+          docker run -dt --network=host --user root --device=/dev/kfd $DEVICE_FLAG \
+            -v "${GITHUB_WORKSPACE:-$PWD}/flydsl-test:/flydsl-test" \
+            --ipc=host --group-add video \
+            --shm-size 16g \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            -w /flydsl-test \
+            --name flydsl_test \
+            ${{ env.DOCKER_IMAGE }}
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          docker exec flydsl_test bash -c "apt-get update && apt-get install -y cmake build-essential patchelf"
+          docker exec flydsl_test bash -c "python3 -m pip install -U pip setuptools wheel"
+          docker exec flydsl_test bash -c "python3 -m pip install ninja>=1.11.1"
+          docker exec flydsl_test bash -c "python3 -m pip install -U 'hypothesis>=6.82.0'"
+          docker exec flydsl_test bash -c "git config --global --add safe.directory /flydsl-test && cd /flydsl-test && git log"
+
+      - name: Restore cached MLIR install tarball (if available)
+        id: mlir-cache
+        uses: actions/cache@v4
+        with:
+          path: mlir_install.tgz
+          key: mlir-install-${{ matrix.runners }}-${{ hashFiles('flydsl-test/thirdparty/llvm-hash.txt', 'flydsl-test/scripts/build_llvm.sh', 'flydsl-test/CMakeLists.txt', 'flydsl-test/.github/workflows/flydsl.yaml') }}
+
+      - name: Use cached MLIR install tarball (skip LLVM build)
+        if: steps.mlir-cache.outputs.cache-hit == 'true'
+        run: |
+          ls -lh mlir_install.tgz
+          docker cp mlir_install.tgz flydsl_test:/tmp/mlir_install.tgz
+          docker exec flydsl_test bash -c "rm -rf /llvm-project/mlir_install && mkdir -p /llvm-project && tar -xzf /tmp/mlir_install.tgz -C /llvm-project"
+          docker exec flydsl_test bash -c "ls -la /llvm-project/mlir_install/lib/cmake/mlir"
+
+      - name: Build LLVM
+        if: steps.mlir-cache.outputs.cache-hit != 'true'
+        run: |
+          set -ex
+          docker exec flydsl_test bash -c "cd /flydsl-test && bash scripts/build_llvm.sh"
+          docker exec flydsl_test bash -c "ls -la /llvm-project/mlir_install/lib/cmake/mlir"
+          docker cp flydsl_test:/llvm-project/mlir_install.tgz ./mlir_install.tgz || true
+
+      - name: Build FlyDSL (uses MLIR install prefix)
+        run: |
+          docker exec flydsl_test bash -c "export MLIR_PATH=/llvm-project/mlir_install && cd /flydsl-test && python3 -m pip install -e . --use-pep517"
+
+      - name: Run multi-GPU allreduce tests
+        timeout-minutes: 30
+        run: |
+          docker exec flydsl_test bash -c "
+            cd /flydsl-test
+            python3 -m pytest tests/kernels/test_allreduce.py \
+              -m multi_gpu -v --no-header --tb=short
+          "
+
+      - name: Run allreduce benchmark (PR)
+        timeout-minutes: 30
+        run: |
+          docker exec flydsl_test bash -c "
+            cd /flydsl-test
+            python3 tests/kernels/test_allreduce.py \
+              --world_size 8 --iters 51 --warmup 5 \
+              --allreduce_impl flydsl --mode cudagraph \
+              --shapes '2,7168,fp16;32,8192,fp32;128,8192,fp16;1024,7168,bf16;4096,8192,bf16' \
+              --output_csv /tmp/bench_pr.csv
+          "
+
+      - name: Build main branch baseline
+        id: build-main
+        timeout-minutes: 20
+        continue-on-error: true
+        run: |
+          docker exec flydsl_test bash -c "
+            cd /flydsl-test
+            git fetch origin main --depth=1
+            git worktree add /tmp/flydsl-main origin/main
+            cd /tmp/flydsl-main
+            export MLIR_PATH=/llvm-project/mlir_install
+            python3 -m pip install -e . --use-pep517 2>&1 | tail -5
+          "
+
+      - name: Run allreduce benchmark (main)
+        id: bench-main
+        if: steps.build-main.outcome == 'success'
+        timeout-minutes: 30
+        continue-on-error: true
+        run: |
+          docker exec flydsl_test bash -c "
+            cp /flydsl-test/tests/kernels/test_allreduce.py \
+              /tmp/flydsl-main/tests/kernels/test_allreduce.py
+            cd /tmp/flydsl-main
+            python3 tests/kernels/test_allreduce.py \
+              --world_size 8 --iters 51 --warmup 5 \
+              --allreduce_impl flydsl --mode cudagraph \
+              --shapes '2,7168,fp16;32,8192,fp32;128,8192,fp16;1024,7168,bf16;4096,8192,bf16' \
+              --output_csv /tmp/bench_main.csv
+          "
+
+      - name: Check performance regression (PR vs main)
+        if: steps.bench-main.outcome != 'skipped'
+        timeout-minutes: 5
+        run: |
+          docker exec flydsl_test bash -c "
+            cd /flydsl-test
+            python3 tests/kernels/compare_allreduce_benchmark.py \
+              /tmp/bench_main.csv /tmp/bench_pr.csv
+          "
+
+      - name: Show test logs
+        if: failure()
+        run: |
+          docker exec flydsl_test bash -c 'cd /tmp && tar czf /tmp/logs.tgz *.log 2>/dev/null || echo "no logs"'
+          docker cp flydsl_test:/tmp/logs.tgz . || true
+          if [ -f logs.tgz ]; then
+            tar -xzf logs.tgz || true
+            cat *.log || true
+          else
+            echo "logs.tgz not found; skipping log extraction"
+          fi
+
+      - name: Clean up
+        if: always()
+        run: |
+          docker stop flydsl_test
+          docker rm flydsl_test
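The regression gate above feeds the two CSVs written by the PR and main benchmark runs into tests/kernels/compare_allreduce_benchmark.py, invoked exactly as in the CI step (python3 tests/kernels/compare_allreduce_benchmark.py /tmp/bench_main.csv /tmp/bench_pr.csv). As a rough illustration of what such a gate can look like, the sketch below compares per-shape latencies and fails when the PR run is slower than main beyond a threshold; the column names ("shape", "dtype", "latency_us"), the 5% threshold, and the helper names load/main are assumptions for this sketch, not the actual interface of the script in the repository.

# Hypothetical sketch of a PR-vs-main allreduce benchmark comparison.
# Column names and the 5% regression threshold are assumptions for illustration.
import csv
import sys

def load(path):
    # Map (shape, dtype) -> mean latency in microseconds.
    with open(path, newline="") as f:
        return {(r["shape"], r["dtype"]): float(r["latency_us"]) for r in csv.DictReader(f)}

def main(baseline_csv, pr_csv, threshold=0.05):
    base, pr = load(baseline_csv), load(pr_csv)
    regressions = []
    for key, base_lat in base.items():
        pr_lat = pr.get(key)
        if pr_lat is None:
            continue  # shape not measured in the PR run
        ratio = (pr_lat - base_lat) / base_lat
        print(f"{key}: main={base_lat:.2f}us pr={pr_lat:.2f}us ({ratio:+.1%})")
        if ratio > threshold:
            regressions.append((key, ratio))
    if regressions:
        print(f"FAIL: {len(regressions)} shape(s) regressed by more than {threshold:.0%}")
        sys.exit(1)
    print("OK: no regression above threshold")

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])

Because the main-branch benchmark step is continue-on-error, the comparison only runs when a baseline CSV was actually produced, which matches the if: steps.bench-main.outcome != 'skipped' guard in the workflow.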

README.md

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ See `examples/` for more examples including tiled copy (`02-tiledCopy.py`), tile
 | **RMSNorm** | `test_rmsnorm.py` | RMSNorm (layout API) |
 | **Softmax** | `test_softmax.py` | Softmax (layout API) |
 | **Fused RoPE** | `test_fused_rope_cache.py` | Fused RoPE + KV cache |
-| **AllReduce** | `test_flydsl_allreduce.py` | Multi-GPU all-reduce |
+| **AllReduce** | `test_allreduce.py` | Multi-GPU all-reduce |
 | **RDNA GEMM** | `test_rdna_gemm.py` | RDNA FP16/FP8 GEMM |
 | **GFX1250 GEMM** | `test_gemm_fp8fp4_gfx1250.py` | GFX1250 FP8/FP4 GEMM |
 | **WMMA GEMM** | `test_wmma_gemm_gfx1250.py` | GFX1250 WMMA GEMM |

docs/prebuilt_kernels_guide.md

Lines changed: 1 addition & 1 deletion
@@ -338,7 +338,7 @@ What operation do you need?
 | `tests/kernels/test_rmsnorm.py` | RMSNorm |
 | `tests/kernels/test_softmax.py` | Softmax |
 | `tests/kernels/test_fused_rope_cache.py` | Fused RoPE + KV cache |
-| `tests/kernels/test_flydsl_allreduce.py` | Multi-GPU all-reduce |
+| `tests/kernels/test_allreduce.py` | Multi-GPU all-reduce |
 | `tests/kernels/test_rdna_gemm.py` | RDNA GEMM |
 | `tests/kernels/test_gemm_fp8fp4_gfx1250.py` | GFX1250 FP8/FP4 GEMM |
 | `tests/kernels/test_wmma_gemm_gfx1250.py` | GFX1250 WMMA GEMM |

kernels/custom_all_reduce.py

Lines changed: 11 additions & 1 deletion
@@ -268,6 +268,14 @@ def __init__(self, *, group, device, max_size: int, world_size: int, rank: int,
         if self.world_size not in {2, 4, 8}:
             raise ValueError(f"world_size must be one of {{2, 4, 8}}, got {self.world_size}")
 
+        # Pre-initialize resource attributes so close() is safe on partial init failure.
+        self._meta_ptr = None
+        self._meta_bases = [None] * self.world_size
+        self._input_buffer_bases = [None] * self.world_size
+        self._output_buffer_bases = [None] * self.world_size
+        self._graph_ipc_reg_list = []
+        self._out_ptrs_cache = None
+
         alloc_size = self._SIGNAL_SIZE + int(self.max_size)
         self._meta_ptr = self._alloc_uncached(alloc_size)
 
@@ -373,7 +381,9 @@ def __init__(self, *, group, device, max_size: int, world_size: int, rank: int,
 
     def close(self):
         """Release IPC memory handles for peer GPU buffers."""
-        for bases in [self._meta_bases, self._input_buffer_bases, self._output_buffer_bases]:
+        for bases in [getattr(self, '_meta_bases', []),
+                      getattr(self, '_input_buffer_bases', []),
+                      getattr(self, '_output_buffer_bases', [])]:
             for b in bases:
                 if b is not None:
                     self._close_mem_handle(int(b))
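The two hunks above work together: the constructor assigns safe defaults to every IPC-related attribute before the first allocation, and close() looks the lists up with getattr() so it can run even if __init__ raised before they were ever assigned. The world_size validation runs before the defaults are set, so the getattr() fallback still matters for that early failure path. Below is a minimal, self-contained sketch of the same pattern; the class and method names are illustrative stand-ins, not the FlyDSL API.

# Illustrative sketch of "pre-initialize, then guard in close()"; names are hypothetical.
class PeerBuffers:
    def __init__(self, world_size: int):
        # Safe defaults first: nothing has been allocated yet.
        self._handles = [None] * world_size
        self._meta = None
        # Real allocations follow; an exception here leaves the defaults intact,
        # so close() can still run without raising AttributeError.
        self._meta = self._alloc()
        for rank in range(world_size):
            self._handles[rank] = self._open_peer_handle(rank)

    def close(self):
        # getattr() guards the case where __init__ failed before the defaults
        # were assigned (e.g. during argument validation).
        for h in getattr(self, "_handles", []):
            if h is not None:
                self._release(h)
        if getattr(self, "_meta", None) is not None:
            self._release(self._meta)

    # Stand-ins for the real allocation / IPC-handle calls.
    def _alloc(self):
        return object()

    def _open_peer_handle(self, rank: int):
        return object()

    def _release(self, handle):
        pass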
