-
Notifications
You must be signed in to change notification settings - Fork 4.8k
380 lines (346 loc) · 15.3 KB
/
aws-torch-latest-full.yml
File metadata and controls
380 lines (346 loc) · 15.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
################################################################################
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
# - Parallel tests with pytest-xdist (-n 8)
# - Sequential tests marked with @pytest.mark.sequential
# - Nightly schedule: skips if no new commits since last successful run
################################################################################
name: aws-torch-latest-full

on:
  # Nightly trigger. 08:00 UTC corresponds to midnight PST (UTC-8).
  schedule:
    - cron: '0 8 * * *'

  # Manual trigger: lets the caller choose the PyTorch preset and where
  # Hugging Face Transformers is installed from.
  workflow_dispatch:
    inputs:
      torch_preset:
        description: PyTorch preset to install for manual runs
        type: choice
        required: false
        default: '2.10.0-cu126'
        options:
          - '2.7.1-cu126'
          - '2.8.0-cu126'
          - '2.9.1-cu126'
          - '2.10.0-cu126'
          - '2.11.0-cu126'
      transformers_version:
        description: Hugging Face Transformers PyPI package version to install
        type: string
        required: false
        default: '4.50.0'
      transformers_source:
        description: Hugging Face Transformers source for manual runs
        type: choice
        required: false
        default: 'git'
        options:
          - 'pypi'
          - 'git'
      transformers_ref:
        description: Hugging Face Transformers git ref to install when source is git
        type: string
        required: false
        default: 'main'

# One active run per workflow and ref: a newly started run cancels the one
# already in progress for the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
check-changes:
  # Gate for scheduled runs: reports through the `has_changes` output whether
  # the triggering commit differs from the head commit of the most recent
  # successful scheduled run of this workflow.
  name: Check for new commits
  runs-on: ubuntu-latest
  if: github.event_name == 'schedule'
  # Least privilege: the only API call made here lists workflow runs.
  permissions:
    actions: read
  outputs:
    has_changes: ${{ steps.check.outputs.has_changes }}
  steps:
    - name: Check for commits since last successful run
      id: check
      env:
        GH_TOKEN: ${{ github.token }}
        # Context values reach the script through the environment instead of
        # being expanded into the script text itself.
        REPO: ${{ github.repository }}
        DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
        CURRENT_SHA: ${{ github.sha }}
        WORKFLOW_FILE: aws-torch-latest-full.yml
      run: |
        runs_endpoint="repos/${REPO}/actions/workflows/${WORKFLOW_FILE}/runs?status=success&event=schedule&branch=${DEFAULT_BRANCH}&per_page=1"

        # Fail open: this lookup only exists to skip redundant runs, so an API
        # error must not turn into a skipped nightly.
        if ! last_sha=$(gh api "$runs_endpoint" --jq '.workflow_runs[0].head_sha // empty'); then
          echo "::warning::Could not query previous workflow runs - running tests"
          echo "has_changes=true" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        if [ -z "$last_sha" ]; then
          echo "No previous successful run found - running tests"
          echo "has_changes=true" >> "$GITHUB_OUTPUT"
        elif [ "$last_sha" = "$CURRENT_SHA" ]; then
          echo "No new commits since last successful run ($last_sha) - skipping"
          echo "has_changes=false" >> "$GITHUB_OUTPUT"
        else
          echo "New commits detected: $last_sha -> $CURRENT_SHA - running tests"
          echo "has_changes=true" >> "$GITHUB_OUTPUT"
        fi
unit-tests:
  name: Unit Tests (Full)
  needs:
    - check-changes
  # always() keeps this job eligible when check-changes did not run (it only
  # runs for scheduled events). Manual runs always proceed; any other run
  # proceeds only when check-changes reported has_changes == 'true'.
  if: |
    always() &&
    (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
  runs-on:
    - self-hosted
    - gpu-ci
    - gpu-l40s
    - l40s-4gpu
    - aws
  timeout-minutes: 180
  container:
    image: nvidia/cuda:12.6.3-devel-ubuntu22.04
    # Mount /mnt/aio for async I/O tests (O_DIRECT requires native filesystem, not overlayfs)
    options: '--gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio'
  env:
    # Fallbacks used by the "Resolve dependency inputs" step when no manual
    # input is supplied.
    DEFAULT_TORCH_PRESET: '2.10.0-cu126'
    DEFAULT_TRANSFORMERS_SOURCE: 'git'
    DEFAULT_TRANSFORMERS_VERSION: '4.50.0'
    DEFAULT_TRANSFORMERS_REF: 'main'
    # Location of the CUTLASS checkout created by the "Install CUTLASS" step.
    CUTLASS_PATH: /opt/cutlass
    # Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
    DS_DISABLE_REUSE_DIST_ENV: '1'
steps:
- name: Install system dependencies
  env:
    # Keep apt/debconf from waiting on interactive prompts; nothing can answer
    # them in CI.
    DEBIAN_FRONTEND: noninteractive
  run: |
    # Two separate commands so that a failed `apt-get update` stops the step
    # at once rather than surfacing later as a missing package.
    apt-get update
    apt-get install -y git git-lfs libaio-dev pdsh python3 python3-pip
    git lfs install
    # Later steps invoke `python`; point it at the python3 installed above.
    ln -sf /usr/bin/python3 /usr/bin/python
# Runs after git and git-lfs are installed; `lfs: true` also fetches LFS objects.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    lfs: true
- name: Resolve dependency inputs
  env:
    GITHUB_EVENT_NAME: ${{ github.event_name }}
    MANUAL_TORCH_PRESET: ${{ github.event.inputs.torch_preset || '' }}
    MANUAL_TRANSFORMERS_SOURCE: ${{ github.event.inputs.transformers_source || '' }}
    MANUAL_TRANSFORMERS_VERSION: ${{ github.event.inputs.transformers_version || '' }}
    MANUAL_TRANSFORMERS_REF: ${{ github.event.inputs.transformers_ref || '' }}
  run: |
    # Manual (workflow_dispatch) runs use each non-empty input; everything
    # else falls back to the job-level DEFAULT_* values.
    if [ "$GITHUB_EVENT_NAME" = 'workflow_dispatch' ]; then
      selected_preset="${MANUAL_TORCH_PRESET:-$DEFAULT_TORCH_PRESET}"
      transformers_source="${MANUAL_TRANSFORMERS_SOURCE:-$DEFAULT_TRANSFORMERS_SOURCE}"
      transformers_version="${MANUAL_TRANSFORMERS_VERSION:-$DEFAULT_TRANSFORMERS_VERSION}"
      transformers_ref="${MANUAL_TRANSFORMERS_REF:-$DEFAULT_TRANSFORMERS_REF}"
    else
      selected_preset="$DEFAULT_TORCH_PRESET"
      transformers_source="$DEFAULT_TRANSFORMERS_SOURCE"
      transformers_version="$DEFAULT_TRANSFORMERS_VERSION"
      transformers_ref="$DEFAULT_TRANSFORMERS_REF"
    fi

    # A git install always needs a ref to check out.
    if [ "$transformers_source" = 'git' ]; then
      transformers_ref="${transformers_ref:-main}"
    fi

    # Map the preset name to the package versions to install and to the
    # torch version the test suite is told to expect.
    case "$selected_preset" in
      '2.7.1-cu126')
        torch_install_version='2.7.1'
        torchvision_install_version='0.22.1'
        torch_test_version='2.7'
        ;;
      '2.8.0-cu126')
        torch_install_version='2.8.0'
        torchvision_install_version='0.23.0'
        torch_test_version='2.8'
        ;;
      '2.9.1-cu126')
        torch_install_version='2.9.1'
        torchvision_install_version='0.24.1'
        torch_test_version='2.9'
        ;;
      '2.10.0-cu126')
        torch_install_version='2.10.0'
        torchvision_install_version='0.25.0'
        torch_test_version='2.10'
        ;;
      '2.11.0-cu126')
        torch_install_version='2.11.0'
        torchvision_install_version='0.26.0'
        torch_test_version='2.11'
        ;;
      *)
        echo "Unsupported torch_preset: $selected_preset" >&2
        exit 1
        ;;
    esac

    # Shared by every preset listed above: torchaudio carries the same version
    # number as torch, and all presets are CUDA 12.6 builds from the cu126 index.
    torchaudio_install_version="$torch_install_version"
    cuda_test_version='12.6'
    pytorch_index_url='https://download.pytorch.org/whl/cu126'

    # Export the resolved values to all later steps of this job.
    {
      echo "SELECTED_TORCH_PRESET=$selected_preset"
      echo "TORCH_INSTALL_VERSION=$torch_install_version"
      echo "TORCHVISION_INSTALL_VERSION=$torchvision_install_version"
      echo "TORCHAUDIO_INSTALL_VERSION=$torchaudio_install_version"
      echo "TORCH_TEST_VERSION=$torch_test_version"
      echo "CUDA_TEST_VERSION=$cuda_test_version"
      echo "PYTORCH_INDEX_URL=$pytorch_index_url"
      echo "TRANSFORMERS_SOURCE=$transformers_source"
      echo "TRANSFORMERS_VERSION=$transformers_version"
      echo "TRANSFORMERS_REF=$transformers_ref"
    } >> "$GITHUB_ENV"

    # Log the resolution for whoever reads the run output.
    echo "Selected PyTorch preset: $selected_preset"
    echo "Resolved install tuple: torch==$torch_install_version torchvision==$torchvision_install_version torchaudio==$torchaudio_install_version"
    echo "Resolved test expectations: torch=$torch_test_version cuda=$cuda_test_version"
    echo "Resolved PyTorch index: $pytorch_index_url"
    echo "Resolved Transformers source: $transformers_source"
    echo "Resolved Transformers version: $transformers_version"
    echo "Resolved Transformers ref: $transformers_ref"
- name: Install CUTLASS
  run: |
    # Clone into the job-level CUTLASS_PATH so the checkout location and the
    # path later steps read from the environment cannot drift apart.
    git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git "$CUTLASS_PATH"
    echo "CUTLASS installed at $CUTLASS_PATH"
    # Quick visual confirmation that the headers are present.
    ls -la "$CUTLASS_PATH"/include/ | head -10
- name: Install PyTorch
  run: |
    # Versions and wheel index were exported through GITHUB_ENV by the
    # "Resolve dependency inputs" step.
    pip install --index-url "$PYTORCH_INDEX_URL" \
      "torch==$TORCH_INSTALL_VERSION" \
      "torchvision==$TORCHVISION_INSTALL_VERSION" \
      "torchaudio==$TORCHAUDIO_INSTALL_VERSION"
- name: Install Transformers
  run: |
    if [ "$TRANSFORMERS_SOURCE" = 'pypi' ]; then
      # Pinned release from PyPI.
      pip install "transformers==$TRANSFORMERS_VERSION"
    elif [ "$TRANSFORMERS_SOURCE" = 'git' ]; then
      # Source install: clone, check out the requested ref, and record the
      # commit it resolved to for the "Check environment" step.
      git clone --filter=blob:none https://github.com/huggingface/transformers /tmp/transformers
      cd /tmp/transformers
      git checkout "$TRANSFORMERS_REF"
      resolved_ref="$(git rev-parse HEAD)"
      echo "TRANSFORMERS_RESOLVED_REF=$resolved_ref" >> "$GITHUB_ENV"
      echo "Resolved Transformers git ref: $resolved_ref"
      pip install .
    else
      echo "Unsupported TRANSFORMERS_SOURCE: $TRANSFORMERS_SOURCE" >&2
      exit 1
    fi

    # Show which transformers build is importable.
    python -c "import transformers; print('transformers:', transformers.__version__, transformers)"
- name: Install Python dependencies
  run: |
    pip install --upgrade pip
    # Each requirements file is installed by its own pip invocation, in this order.
    for requirements_file in requirements.txt requirements-dev.txt requirements-deepcompile.txt; do
      pip install -r "requirements/$requirements_file"
    done
    # Plugins behind the --timeout and --instafail flags used by the test steps.
    pip install pytest-timeout pytest-instafail
- name: Check environment
  run: |
    # Print a section heading in the job log.
    banner() {
      echo "=== $1 ==="
    }

    banner "Selected PyTorch Preset"
    echo "Preset: $SELECTED_TORCH_PRESET"
    echo "Install tuple: torch==$TORCH_INSTALL_VERSION torchvision==$TORCHVISION_INSTALL_VERSION torchaudio==$TORCHAUDIO_INSTALL_VERSION"
    echo "PyTorch index URL: $PYTORCH_INDEX_URL"
    echo "Expected test versions: torch=$TORCH_TEST_VERSION cuda=$CUDA_TEST_VERSION"
    echo "Transformers source: $TRANSFORMERS_SOURCE"
    echo "Transformers version: $TRANSFORMERS_VERSION"
    echo "Transformers ref: $TRANSFORMERS_REF"
    # Only set when Transformers was installed from git; prints empty otherwise.
    echo "Transformers resolved ref: ${TRANSFORMERS_RESOLVED_REF:-}"
    echo ""

    banner "GPU Information"
    nvidia-smi
    echo ""

    banner "CUDA Version"
    nvcc --version
    echo ""

    banner "Python/PyTorch Info"
    python --version
    python -c "import torch; print(f'PyTorch: {torch.__version__}')"
    python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
    python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
    python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
    echo ""

    banner "CUTLASS"
    echo "CUTLASS_PATH: $CUTLASS_PATH"
    ls -la "$CUTLASS_PATH"/include/ | head -5
- name: Detect GPU architecture
  run: |
    # Query device 0 through PyTorch and export TORCH_CUDA_ARCH_LIST and
    # GPU_COUNT to later steps through GITHUB_ENV.
    python - <<'PY'
    import os

    import torch

    torch.cuda.init()
    capability = torch.cuda.get_device_capability(0)
    compute_capability = f"{capability[0]}.{capability[1]}"
    device_total = torch.cuda.device_count()
    device_name = torch.cuda.get_device_name(0)

    exported = (
        f"TORCH_CUDA_ARCH_LIST={compute_capability}\n"
        f"GPU_COUNT={device_total}\n"
    )
    with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
        env_file.write(exported)

    print(f"Detected GPU: {device_name}")
    print(f"Detected compute capability: {compute_capability}")
    print(f"Detected GPU count: {device_total}")
    PY
- name: Install DeepSpeed
  run: |
    echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
    # Initialize CUDA before install so setup.py can detect NCCL version
    python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
    # Use --no-build-isolation so setup.py can access pre-installed PyTorch.
    # The requirement is quoted because an unquoted `.[...]` is a shell glob
    # pattern and would be replaced by any matching file name in the
    # working directory.
    pip install --no-build-isolation '.[dev,1bit,autotuning,deepcompile]'
    ds_report
- name: Reinstall selected Transformers
  run: |
    # Force the selected Transformers build back into place without touching
    # its dependencies. NOTE(review): presumably needed because installing
    # DeepSpeed and its extras can replace the earlier install - confirm.
    if [ "$TRANSFORMERS_SOURCE" = 'pypi' ]; then
      pip install --no-deps --force-reinstall "transformers==$TRANSFORMERS_VERSION"
    elif [ "$TRANSFORMERS_SOURCE" = 'git' ]; then
      # Reuses the checkout made by the "Install Transformers" step.
      cd /tmp/transformers
      pip install --no-deps --force-reinstall .
    else
      echo "Unsupported TRANSFORMERS_SOURCE: $TRANSFORMERS_SOURCE" >&2
      exit 1
    fi

    # Show which transformers build is importable after the reinstall.
    python -c "import transformers; print('transformers:', transformers.__version__, transformers)"
# Record the final set of installed packages in the job log.
- name: Python environment
  run: pip list
- name: Unit tests (parallel)
  run: |
    echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
    cd tests

    # Start from an empty pytest temp root on the /mnt/aio mount.
    rm -rf /mnt/aio/pytest

    # Skip tests requiring unavailable hardware or known issues:
    # - nvme checkpointing: no nvme device
    # - GDS tests: no GPUDirect Storage support
    # - launcher user_args: pdsh requires SSH server
    # - zenflow: Stage 3 tests have pre-existing bugs + CUDA/fork issues
    set -- \
      --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
      --ignore=unit/ops/aio/test_gds.py \
      --ignore=unit/launcher/test_user_args.py \
      --ignore=unit/runtime/zenflow \
      --ignore=unit/ops/adam/test_zf_torch_adam.py

    # The ignore list is passed in the same argument position it had before.
    pytest --instafail --timeout 600 --forked -n 8 --basetemp=/mnt/aio/pytest unit/ "$@" \
      --torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"
- name: Unit tests (sequential)
  run: |
    echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
    cd tests

    # Start from an empty pytest temp root on the /mnt/aio mount.
    rm -rf /mnt/aio/pytest

    # Paths excluded from the sequential pass.
    set -- \
      --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
      --ignore=unit/ops/aio/test_gds.py \
      --ignore=unit/launcher/test_user_args.py \
      --ignore=unit/runtime/zenflow \
      --ignore=unit/ops/adam/test_zf_torch_adam.py \
      --ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py

    # -m 'sequential' selects only the tests carrying the sequential marker;
    # no --forked / -n here, unlike the parallel step.
    pytest --instafail --timeout 600 -m 'sequential' --basetemp=/mnt/aio/pytest unit/ "$@" \
      --torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"