# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

# Multi-GPU distributed training validation
#
# This workflow validates that multi-GPU training works correctly across:
#   - Physics backends: PhysX, Newton
#   - Rendering backends: none (physics-only), Isaac RTX, Newton Warp
#
# Runs on a dedicated multi-GPU runner (separate from standard CI) to minimize costs.
# Only triggered on PRs that touch distributed training code paths, or manually
# through workflow_dispatch.

name: Multi-GPU Training Tests

on:
  pull_request:
    paths:
      - "source/isaaclab/isaaclab/app/app_launcher.py"
      - "source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py"
      - "scripts/reinforcement_learning/**/train.py"
      - ".github/workflows/test-multi-gpu.yaml"
  workflow_dispatch:

# The job only needs to read the repository contents (checkout); do not grant
# the GITHUB_TOKEN anything beyond that.
permissions:
  contents: read

# A newer run for the same workflow and ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-multi-gpu:
    name: Multi-GPU (${{ matrix.physics }}, ${{ matrix.renderer }})
    # Use dedicated multi-GPU runner to avoid blocking standard CI resources
    # Configure this label on a runner with 2+ GPUs (e.g., g5.12xlarge with 4x A10G)
    # NOTE(review): pull_request runs execute PR code on this self-hosted runner;
    # confirm that fork PRs are gated (e.g. approval required) in the repository settings.
    runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
    timeout-minutes: 30
    strategy:
      # Let every physics/renderer combination report its own result.
      fail-fast: false
      matrix:
        # Entries without a `trainer` key fall back to rsl_rl in the training step.
        include:
          # PhysX physics-only
          - physics: physx
            renderer: none
            task: Isaac-Cartpole-Direct-v0
            extra_args: ""

          # PhysX + Isaac RTX renderer
          - physics: physx
            renderer: isaac-rtx
            task: Isaac-Cartpole-Camera-Presets-Direct-v0
            extra_args: ""
            trainer: skrl

          # PhysX + Newton Warp renderer (hybrid)
          - physics: physx
            renderer: newton-warp
            task: Isaac-Cartpole-Camera-Presets-Direct-v0
            extra_args: "env.tiled_camera.renderer_cfg=newton_renderer"
            trainer: skrl

          # Newton physics-only
          - physics: newton
            renderer: none
            task: Isaac-Cartpole-Direct-v0
            extra_args: "+sim=newton"

          # Newton + Newton Warp renderer
          - physics: newton
            renderer: newton-warp
            task: Isaac-Cartpole-Camera-Presets-Direct-v0
            extra_args: "+sim=newton env.tiled_camera.renderer_cfg=newton_renderer"
            trainer: skrl

          # Newton + Isaac RTX renderer (hybrid)
          - physics: newton
            renderer: isaac-rtx
            task: Isaac-Cartpole-Camera-Presets-Direct-v0
            extra_args: "+sim=newton"
            trainer: skrl

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # NOTE(review): confirm that 3.10 matches the Python version required by the
      # Isaac Lab / Isaac Sim release under test.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install Isaac Lab
        run: |
          ./isaaclab.sh --install

      # Fail early with a clear error if the runner does not expose at least 2 GPUs.
      - name: Verify multi-GPU availability
        run: |
          echo "=== GPU Info ==="
          nvidia-smi --query-gpu=index,name,memory.total --format=csv

          # NOTE(review): this uses the bare `python` on PATH while training runs through
          # `./isaaclab.sh -p`; confirm both resolve to an interpreter that has torch installed.
          GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
          echo "Detected $GPU_COUNT GPU(s)"

          if [ "$GPU_COUNT" -lt 2 ]; then
            echo "::error::At least 2 GPUs required for multi-GPU tests, found $GPU_COUNT"
            exit 1
          fi

      - name: Run multi-GPU training (${{ matrix.physics }}, ${{ matrix.renderer }})
        env:
          NCCL_DEBUG: WARN
          # Matrix values are handed to the script as environment variables rather than
          # being spliced into the shell source, so they are always treated as data.
          MATRIX_PHYSICS: ${{ matrix.physics }}
          MATRIX_RENDERER: ${{ matrix.renderer }}
          MATRIX_TASK: ${{ matrix.task }}
          MATRIX_TRAINER: ${{ matrix.trainer || 'rsl_rl' }}
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          echo "=========================================="
          echo "Physics: ${MATRIX_PHYSICS}"
          echo "Renderer: ${MATRIX_RENDERER}"
          echo "Task: ${MATRIX_TASK}"
          echo "Trainer: ${MATRIX_TRAINER}"
          echo "Extra args: ${MATRIX_EXTRA_ARGS}"
          echo "=========================================="

          # Run 2-GPU distributed training for 3 iterations.
          # MATRIX_EXTRA_ARGS is intentionally left unquoted: it holds zero or more
          # space-separated overrides that must be passed as separate arguments.
          # shellcheck disable=SC2086
          ./isaaclab.sh -p -m torch.distributed.run --nproc_per_node=2 \
            "scripts/reinforcement_learning/${MATRIX_TRAINER}/train.py" \
            --task="${MATRIX_TASK}" \
            --headless \
            --distributed \
            --max_iterations=3 \
            --num_envs=16 \
            ${MATRIX_EXTRA_ARGS}

      # A run counts as successful only if the newest log directory contains
      # at least one *.pt checkpoint file.
      - name: Verify training completed
        env:
          MATRIX_PHYSICS: ${{ matrix.physics }}
          MATRIX_RENDERER: ${{ matrix.renderer }}
        run: |
          # Find the most recent log directory (three levels below logs/)
          LATEST_LOG=$(ls -td logs/*/*/*/ 2>/dev/null | head -1)

          if [ -z "$LATEST_LOG" ]; then
            echo "::error::No training log directory found"
            exit 1
          fi

          echo "Log directory: $LATEST_LOG"
          ls -la "$LATEST_LOG"

          # Check for model checkpoints
          MODELS=$(find "$LATEST_LOG" -name "*.pt" | wc -l)
          echo "Model checkpoints found: $MODELS"

          if [ "$MODELS" -lt 1 ]; then
            echo "::error::No model checkpoints found - training may have failed"
            exit 1
          fi

          echo "✅ Multi-GPU training completed successfully (${MATRIX_PHYSICS}, ${MATRIX_RENDERER})"
0 commit comments