From a0400428abc76eed0058aec3d6939f9bd99852bd Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:05:42 +0000 Subject: [PATCH 01/18] refactor: version PyTorch directory structure for multi-version support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move all PyTorch build artifacts under docker/pytorch/2.11/ to support maintaining multiple PyTorch versions concurrently (1-year support window per version). Structure change: docker/pytorch/Dockerfile.cuda → docker/pytorch/2.11/Dockerfile.cuda docker/pytorch/versions-cuda.env → docker/pytorch/2.11/versions-cuda.env docker/pytorch/cuda/pyproject.toml → docker/pytorch/2.11/cuda/pyproject.toml (same pattern for CPU) .github/config/image/pytorch-ec2-cuda.yml → pytorch-2.11-ec2-cuda.yml .github/workflows/pr-pytorch-ec2-cuda.yml → pr-pytorch-2.11-ec2-cuda.yml (same pattern for all 4 variants × PR + autorelease) Adding PyTorch 2.12 when it releases means creating docker/pytorch/2.12/, new configs, and new workflows — without touching 2.11. Co-Authored-By: Claude Opus 4.6 (1M context) --- ...h-ec2-cpu.yml => pytorch-2.11-ec2-cpu.yml} | 0 ...ec2-cuda.yml => pytorch-2.11-ec2-cuda.yml} | 0 ...cpu.yml => pytorch-2.11-sagemaker-cpu.yml} | 0 ...da.yml => pytorch-2.11-sagemaker-cuda.yml} | 0 ...l => autorelease-pytorch-2.11-ec2-cpu.yml} | 8 ++--- ... 
=> autorelease-pytorch-2.11-ec2-cuda.yml} | 18 +++++------ ...utorelease-pytorch-2.11-sagemaker-cpu.yml} | 8 ++--- ...torelease-pytorch-2.11-sagemaker-cuda.yml} | 16 +++++----- ...c2-cpu.yml => pr-pytorch-2.11-ec2-cpu.yml} | 20 ++++++------- ...-cuda.yml => pr-pytorch-2.11-ec2-cuda.yml} | 30 +++++++++---------- ....yml => pr-pytorch-2.11-sagemaker-cpu.yml} | 20 ++++++------- ...yml => pr-pytorch-2.11-sagemaker-cuda.yml} | 26 ++++++++-------- docker/pytorch/{ => 2.11}/Dockerfile.cpu | 4 +-- docker/pytorch/{ => 2.11}/Dockerfile.cuda | 10 +++---- docker/pytorch/{ => 2.11}/cpu/pyproject.toml | 0 docker/pytorch/{ => 2.11}/cpu/uv.lock | 0 docker/pytorch/{ => 2.11}/cuda/pyproject.toml | 0 docker/pytorch/{ => 2.11}/cuda/uv.lock | 0 docker/pytorch/{ => 2.11}/versions-cpu.env | 0 docker/pytorch/{ => 2.11}/versions-cuda.env | 0 20 files changed, 80 insertions(+), 80 deletions(-) rename .github/config/image/{pytorch-ec2-cpu.yml => pytorch-2.11-ec2-cpu.yml} (100%) rename .github/config/image/{pytorch-ec2-cuda.yml => pytorch-2.11-ec2-cuda.yml} (100%) rename .github/config/image/{pytorch-sagemaker-cpu.yml => pytorch-2.11-sagemaker-cpu.yml} (100%) rename .github/config/image/{pytorch-sagemaker-cuda.yml => pytorch-2.11-sagemaker-cuda.yml} (100%) rename .github/workflows/{autorelease-pytorch-ec2-cpu.yml => autorelease-pytorch-2.11-ec2-cpu.yml} (97%) rename .github/workflows/{autorelease-pytorch-ec2-cuda.yml => autorelease-pytorch-2.11-ec2-cuda.yml} (96%) rename .github/workflows/{autorelease-pytorch-sagemaker-cpu.yml => autorelease-pytorch-2.11-sagemaker-cpu.yml} (97%) rename .github/workflows/{autorelease-pytorch-sagemaker-cuda.yml => autorelease-pytorch-2.11-sagemaker-cuda.yml} (96%) rename .github/workflows/{pr-pytorch-ec2-cpu.yml => pr-pytorch-2.11-ec2-cpu.yml} (95%) rename .github/workflows/{pr-pytorch-ec2-cuda.yml => pr-pytorch-2.11-ec2-cuda.yml} (96%) rename .github/workflows/{pr-pytorch-sagemaker-cpu.yml => pr-pytorch-2.11-sagemaker-cpu.yml} (95%) rename 
.github/workflows/{pr-pytorch-sagemaker-cuda.yml => pr-pytorch-2.11-sagemaker-cuda.yml} (95%) rename docker/pytorch/{ => 2.11}/Dockerfile.cpu (98%) rename docker/pytorch/{ => 2.11}/Dockerfile.cuda (97%) rename docker/pytorch/{ => 2.11}/cpu/pyproject.toml (100%) rename docker/pytorch/{ => 2.11}/cpu/uv.lock (100%) rename docker/pytorch/{ => 2.11}/cuda/pyproject.toml (100%) rename docker/pytorch/{ => 2.11}/cuda/uv.lock (100%) rename docker/pytorch/{ => 2.11}/versions-cpu.env (100%) rename docker/pytorch/{ => 2.11}/versions-cuda.env (100%) diff --git a/.github/config/image/pytorch-ec2-cpu.yml b/.github/config/image/pytorch-2.11-ec2-cpu.yml similarity index 100% rename from .github/config/image/pytorch-ec2-cpu.yml rename to .github/config/image/pytorch-2.11-ec2-cpu.yml diff --git a/.github/config/image/pytorch-ec2-cuda.yml b/.github/config/image/pytorch-2.11-ec2-cuda.yml similarity index 100% rename from .github/config/image/pytorch-ec2-cuda.yml rename to .github/config/image/pytorch-2.11-ec2-cuda.yml diff --git a/.github/config/image/pytorch-sagemaker-cpu.yml b/.github/config/image/pytorch-2.11-sagemaker-cpu.yml similarity index 100% rename from .github/config/image/pytorch-sagemaker-cpu.yml rename to .github/config/image/pytorch-2.11-sagemaker-cpu.yml diff --git a/.github/config/image/pytorch-sagemaker-cuda.yml b/.github/config/image/pytorch-2.11-sagemaker-cuda.yml similarity index 100% rename from .github/config/image/pytorch-sagemaker-cuda.yml rename to .github/config/image/pytorch-2.11-sagemaker-cuda.yml diff --git a/.github/workflows/autorelease-pytorch-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml similarity index 97% rename from .github/workflows/autorelease-pytorch-ec2-cpu.yml rename to .github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml index cd41b1e713b8..65167f2cf0c2 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml @@ -1,4 +1,4 @@ -name: Auto Release - 
PyTorch EC2 CPU +name: Auto Release - PyTorch 2.11 EC2 CPU on: schedule: @@ -15,7 +15,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cpu.yml" jobs: load-config: @@ -83,7 +83,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cpu.env + source docker/pytorch/2.11/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ @@ -98,7 +98,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/2.11/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/autorelease-pytorch-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml similarity index 96% rename from .github/workflows/autorelease-pytorch-ec2-cuda.yml rename to .github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml index f17a7fd13cb3..3e5f98fbe703 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml @@ -1,4 +1,4 @@ -name: Auto Release - PyTorch EC2 CUDA +name: Auto Release - PyTorch 2.11 EC2 CUDA on: schedule: @@ -16,7 +16,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cuda.yml" jobs: load-config: @@ -85,17 +85,17 @@ jobs: - name: Source versions id: versions run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo 
"python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + source docker/pytorch/2.11/versions-cuda.env + mkdir -p docker/pytorch/2.11/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/2.11/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -105,7 +105,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ @@ -125,13 +125,13 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/2.11/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml similarity index 97% rename from .github/workflows/autorelease-pytorch-sagemaker-cpu.yml rename to .github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml index 95c8780f3277..ee13aef5638a 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml @@ -1,4 +1,4 @@ -name: Auto Release - PyTorch SageMaker CPU +name: Auto Release - PyTorch 2.11 SageMaker CPU on: schedule: @@ -15,7 +15,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" jobs: load-config: @@ -83,7 +83,7 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cpu.env + source docker/pytorch/2.11/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-cpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -110,7 +110,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/2.11/Dockerfile.cpu . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml similarity index 96% rename from .github/workflows/autorelease-pytorch-sagemaker-cuda.yml rename to .github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml index c40d70c44bfd..da4e5314bc19 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml @@ -1,4 +1,4 @@ -name: Auto Release - PyTorch SageMaker CUDA +name: Auto Release - PyTorch 2.11 SageMaker CUDA on: schedule: @@ -15,7 +15,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cuda.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" jobs: load-config: @@ -83,10 +83,10 @@ jobs: - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + source docker/pytorch/2.11/versions-cuda.env + mkdir -p docker/pytorch/2.11/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/2.11/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -96,7 +96,7 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -129,13 +129,13 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/2.11/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml similarity index 95% rename from .github/workflows/pr-pytorch-ec2-cpu.yml rename to .github/workflows/pr-pytorch-2.11-ec2-cpu.yml index a6264f2df988..7ea638a71c18 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch EC2 CPU +name: PR - PyTorch 2.11 EC2 CPU on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-ec2-cpu.yml" - - ".github/workflows/pr-pytorch-ec2-cpu.yml" - - "docker/pytorch/**" + - ".github/config/image/pytorch-2.11-ec2-cpu.yml" + - ".github/workflows/pr-pytorch-2.11-ec2-cpu.yml" + - "docker/pytorch/2.11/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +22,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cpu.yml" jobs: # ============================================================ @@ -122,9 +122,9 @@ jobs: with: filters: | build-change: - - ".github/config/image/pytorch-ec2-cpu.yml" - - "docker/pytorch/Dockerfile.cpu" - - "docker/pytorch/cpu/**" + - ".github/config/image/pytorch-2.11-ec2-cpu.yml" + - "docker/pytorch/2.11/Dockerfile.cpu" + - "docker/pytorch/2.11/cpu/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/configure_ssh.sh" - "scripts/telemetry/bash_telemetry.sh.template" @@ -164,7 +164,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cpu.env + source 
docker/pytorch/2.11/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ @@ -179,7 +179,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/2.11/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml similarity index 96% rename from .github/workflows/pr-pytorch-ec2-cuda.yml rename to .github/workflows/pr-pytorch-2.11-ec2-cuda.yml index 09d07e1b68ae..632da680d12a 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch EC2 CUDA +name: PR - PyTorch 2.11 EC2 CUDA on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-ec2-cuda.yml" - - ".github/workflows/pr-pytorch-ec2-cuda.yml" - - "docker/pytorch/**" + - ".github/config/image/pytorch-2.11-ec2-cuda.yml" + - ".github/workflows/pr-pytorch-2.11-ec2-cuda.yml" + - "docker/pytorch/2.11/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -25,7 +25,7 @@ env: FORCE_COLOR: "1" # Config file path - CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cuda.yml" jobs: # ============================================================ @@ -125,9 +125,9 @@ jobs: with: filters: | build-change: - - ".github/config/image/pytorch-ec2-cuda.yml" - - "docker/pytorch/Dockerfile.cuda" - - "docker/pytorch/cuda/**" + - ".github/config/image/pytorch-2.11-ec2-cuda.yml" + - "docker/pytorch/2.11/Dockerfile.cuda" + - "docker/pytorch/2.11/cuda/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/*" - "scripts/telemetry/bash_telemetry.sh.template" @@ -167,17 +167,17 @@ jobs: - 
name: Source versions id: versions run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + source docker/pytorch/2.11/versions-cuda.env + mkdir -p docker/pytorch/2.11/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/2.11/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -187,7 +187,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ @@ -207,13 +207,13 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/2.11/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml similarity index 95% rename from .github/workflows/pr-pytorch-sagemaker-cpu.yml rename to .github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml index 5cbb926d7917..99accf9564b9 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch SageMaker CPU +name: PR - PyTorch 2.11 SageMaker CPU on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-sagemaker-cpu.yml" - - ".github/workflows/pr-pytorch-sagemaker-cpu.yml" - - "docker/pytorch/**" + - ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" + - ".github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml" + - "docker/pytorch/2.11/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +22,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" jobs: # ============================================================ @@ -122,9 +122,9 @@ jobs: with: filters: | build-change: - - ".github/config/image/pytorch-sagemaker-cpu.yml" - - "docker/pytorch/Dockerfile.cpu" - - "docker/pytorch/cpu/**" + - ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" + - "docker/pytorch/2.11/Dockerfile.cpu" + - "docker/pytorch/2.11/cpu/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/configure_ssh.sh" - "scripts/pytorch/changehostname.c" @@ -166,7 +166,7 @@ jobs: - name: Build sagemaker image id: 
build-sagemaker run: | - source docker/pytorch/versions-cpu.env + source docker/pytorch/2.11/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations @@ -193,7 +193,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/2.11/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml similarity index 95% rename from .github/workflows/pr-pytorch-sagemaker-cuda.yml rename to .github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml index e8a6249d4559..ee5401ac6266 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch SageMaker CUDA +name: PR - PyTorch 2.11 SageMaker CUDA on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-sagemaker-cuda.yml" - - ".github/workflows/pr-pytorch-sagemaker-cuda.yml" - - "docker/pytorch/**" + - ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" + - ".github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml" + - "docker/pytorch/2.11/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -24,7 +24,7 @@ env: FORCE_COLOR: "1" # Config file path - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cuda.yml" + CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" jobs: # ============================================================ @@ -124,9 +124,9 @@ jobs: with: filters: | build-change: - - ".github/config/image/pytorch-sagemaker-cuda.yml" - - "docker/pytorch/Dockerfile.cuda" - - "docker/pytorch/cuda/**" + - ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" + - 
"docker/pytorch/2.11/Dockerfile.cuda" + - "docker/pytorch/2.11/cuda/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/*" - "scripts/telemetry/bash_telemetry.sh.template" @@ -165,10 +165,10 @@ jobs: - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + source docker/pytorch/2.11/versions-cuda.env + mkdir -p docker/pytorch/2.11/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/2.11/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -178,7 +178,7 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cuda.env + source docker/pytorch/2.11/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations @@ -211,7 +211,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/2.11/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT diff --git a/docker/pytorch/Dockerfile.cpu b/docker/pytorch/2.11/Dockerfile.cpu similarity index 98% rename from docker/pytorch/Dockerfile.cpu rename to docker/pytorch/2.11/Dockerfile.cpu index 1a2d9a15288d..409fbcfd69e7 100644 --- a/docker/pytorch/Dockerfile.cpu +++ b/docker/pytorch/2.11/Dockerfile.cpu @@ -35,7 +35,7 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/ +COPY docker/pytorch/2.11/cpu/pyproject.toml docker/pytorch/2.11/cpu/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project @@ -160,7 +160,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cpu/pyproject.toml [project.optional-dependencies.sagemaker]) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/ +COPY docker/pytorch/2.11/cpu/pyproject.toml docker/pytorch/2.11/cpu/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/2.11/Dockerfile.cuda similarity index 97% rename from docker/pytorch/Dockerfile.cuda rename to docker/pytorch/2.11/Dockerfile.cuda index 93effb4ff91d..ce69e3874fb6 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/2.11/Dockerfile.cuda @@ -46,14 +46,14 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/2.11/cuda/pyproject.toml 
docker/pytorch/2.11/cuda/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project # transformer-engine requires torch + cudnn.h at build time; point it to the # cudnn headers shipped inside the nvidia-cudnn pip package. ARG TRANSFORMER_ENGINE_VERSION -COPY docker/pytorch/wheel[s]/ /tmp/wheels/ +COPY docker/pytorch/2.11/wheel[s]/ /tmp/wheels/ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0])") && \ NCCL_HOME=$(python -c "import nvidia.nccl; print(nvidia.nccl.__path__[0])") && \ cp ${CUDNN_HOME}/include/*.h /usr/local/cuda/include/ && \ @@ -81,8 +81,8 @@ ARG MAX_JOBS # If a cached wheel exists in the build context, install it; otherwise build from source. # When building from source, the wheel is saved to /tmp/built_wheels/ for later S3 upload. -# docker/pytorch/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/wheel[s]/ /tmp/wheels/ +# docker/pytorch/2.11/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. 
+COPY docker/pytorch/2.11/cuda/pyproject.toml docker/pytorch/2.11/wheel[s]/ /tmp/wheels/ RUN --mount=type=cache,target=/root/.cache/uv \ mkdir -p /tmp/built_wheels && \ WHL=$(find /tmp/wheels -name "flash*attn*.whl" 2>/dev/null | head -1) && \ @@ -244,7 +244,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cuda/pyproject.toml [project.optional-dependencies.sagemaker]) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/2.11/cuda/pyproject.toml docker/pytorch/2.11/cuda/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* diff --git a/docker/pytorch/cpu/pyproject.toml b/docker/pytorch/2.11/cpu/pyproject.toml similarity index 100% rename from docker/pytorch/cpu/pyproject.toml rename to docker/pytorch/2.11/cpu/pyproject.toml diff --git a/docker/pytorch/cpu/uv.lock b/docker/pytorch/2.11/cpu/uv.lock similarity index 100% rename from docker/pytorch/cpu/uv.lock rename to docker/pytorch/2.11/cpu/uv.lock diff --git a/docker/pytorch/cuda/pyproject.toml b/docker/pytorch/2.11/cuda/pyproject.toml similarity index 100% rename from docker/pytorch/cuda/pyproject.toml rename to docker/pytorch/2.11/cuda/pyproject.toml diff --git a/docker/pytorch/cuda/uv.lock b/docker/pytorch/2.11/cuda/uv.lock similarity index 100% rename from docker/pytorch/cuda/uv.lock rename to docker/pytorch/2.11/cuda/uv.lock diff --git a/docker/pytorch/versions-cpu.env b/docker/pytorch/2.11/versions-cpu.env similarity index 100% rename from docker/pytorch/versions-cpu.env rename to docker/pytorch/2.11/versions-cpu.env diff --git a/docker/pytorch/versions-cuda.env b/docker/pytorch/2.11/versions-cuda.env similarity index 100% rename from docker/pytorch/versions-cuda.env 
rename to docker/pytorch/2.11/versions-cuda.env From 949ed3988f5945a8d2997db3cdaaf611cc3dc0ce Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:17:36 +0000 Subject: [PATCH 02/18] fix: update test_versions.py to discover versioned env file path The versions env file moved from docker/pytorch/versions-cuda.env to docker/pytorch/2.11/versions-cuda.env. Use glob to find the file under any version directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/pytorch/unit/test_versions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/pytorch/unit/test_versions.py b/test/pytorch/unit/test_versions.py index ddefd85f2319..6afdd7dd6cc0 100644 --- a/test/pytorch/unit/test_versions.py +++ b/test/pytorch/unit/test_versions.py @@ -1,5 +1,6 @@ """Verify installed package versions match pins from versions.env.""" +import glob import os import re import subprocess @@ -10,7 +11,10 @@ _WORKDIR = os.environ.get("DLC_WORKDIR", "/workdir") IS_CUDA = os.path.isdir("/usr/local/cuda") _VERSIONS_FILE = "versions-cuda.env" if IS_CUDA else "versions-cpu.env" -VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _VERSIONS_FILE) +_candidates = sorted(glob.glob(os.path.join(_WORKDIR, "docker", "pytorch", "*", _VERSIONS_FILE))) +VERSIONS_ENV = ( + _candidates[0] if _candidates else os.path.join(_WORKDIR, "docker", "pytorch", _VERSIONS_FILE) +) cuda_only = pytest.mark.skipif(not IS_CUDA, reason="CUDA-only test") From d43920c1d4ae03d3efd24f59ff04fa5db633329a Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:19:46 +0000 Subject: [PATCH 03/18] fix: pass DLC_PYTORCH_VERSION to unit tests for versioned env lookup test_versions.py now requires DLC_PYTORCH_VERSION env var (e.g., "2.11") to locate the correct versions-{cuda,cpu}.env under the versioned directory. All 4 PR workflows pass it via docker run -e. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-2.11-ec2-cpu.yml | 2 +- .github/workflows/pr-pytorch-2.11-ec2-cuda.yml | 6 +++--- .github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml | 2 +- .github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml | 2 +- test/pytorch/unit/test_versions.py | 10 ++++------ 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml b/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml index 7ea638a71c18..8bedf196689a 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml @@ -269,7 +269,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.runtime-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml index 632da680d12a..6b54ed135943 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml @@ -308,7 +308,7 @@ jobs: IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -344,7 +344,7 @@ jobs: docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -393,7 +393,7 @@ jobs: # docker 
pull ${IMAGE} # CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ # --entrypoint /bin/bash \ - # -e DLC_WORKDIR=/workdir \ + # -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ # -v $(pwd):/workdir --workdir /workdir \ # ${IMAGE} -c 'sleep infinity') # docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml index 99accf9564b9..a8840a3f0b6d 100644 --- a/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml @@ -263,7 +263,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml index ee5401ac6266..3cbdcdcd4a72 100644 --- a/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml @@ -301,7 +301,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/test/pytorch/unit/test_versions.py b/test/pytorch/unit/test_versions.py index 6afdd7dd6cc0..17ae1d61d9ea 100644 --- a/test/pytorch/unit/test_versions.py +++ b/test/pytorch/unit/test_versions.py @@ -1,20 +1,18 @@ """Verify installed package versions match pins from versions.env.""" -import glob import os import re import subprocess import pytest -# Detect GPU vs CPU 
image by checking for CUDA, then pick the right versions file. +# DLC_PYTORCH_VERSION selects which versioned directory to read (e.g., "2.11"). _WORKDIR = os.environ.get("DLC_WORKDIR", "/workdir") +_PT_VERSION = os.environ.get("DLC_PYTORCH_VERSION", "") +assert _PT_VERSION, "DLC_PYTORCH_VERSION env var is required (e.g., '2.11')" IS_CUDA = os.path.isdir("/usr/local/cuda") _VERSIONS_FILE = "versions-cuda.env" if IS_CUDA else "versions-cpu.env" -_candidates = sorted(glob.glob(os.path.join(_WORKDIR, "docker", "pytorch", "*", _VERSIONS_FILE))) -VERSIONS_ENV = ( - _candidates[0] if _candidates else os.path.join(_WORKDIR, "docker", "pytorch", _VERSIONS_FILE) -) +VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _PT_VERSION, _VERSIONS_FILE) cuda_only = pytest.mark.skipif(not IS_CUDA, reason="CUDA-only test") From dcc1f70edcb5c0ad57538bad7eb2a180625774b5 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:23:54 +0000 Subject: [PATCH 04/18] fix: pass DLC_PYTORCH_VERSION to autorelease unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same fix as the PR workflows — autorelease workflows also run unit tests that need the versioned env file path. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml | 2 +- .github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml | 4 ++-- .github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml | 2 +- .github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml index 65167f2cf0c2..289e57ad3dd6 100644 --- a/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml @@ -162,7 +162,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml index 3e5f98fbe703..b9986512a525 100644 --- a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml @@ -200,7 +200,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -230,7 +230,7 @@ jobs: docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q 
diff --git a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml index ee13aef5638a..16c97693aa05 100644 --- a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml @@ -174,7 +174,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml index da4e5314bc19..b875d3562519 100644 --- a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml @@ -204,7 +204,7 @@ jobs: IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q From 5995526c2fbe82349ef0cf8423b9ba94f52a65af Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:28:12 +0000 Subject: [PATCH 05/18] fix: remove DLC_PYTORCH_VERSION from non-unit-test jobs Only unit tests (test_versions.py) need this env var. Removed from single-GPU and multi-GPU test containers in EC2 CUDA workflows. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml | 2 +- .github/workflows/pr-pytorch-2.11-ec2-cuda.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml index b9986512a525..06ba35ee18df 100644 --- a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml @@ -230,7 +230,7 @@ jobs: docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml index 6b54ed135943..f060fddc5a59 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml @@ -344,7 +344,7 @@ jobs: docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -393,7 +393,7 @@ jobs: # docker pull ${IMAGE} # CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ # --entrypoint /bin/bash \ - # -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + # -e DLC_WORKDIR=/workdir \ # -v $(pwd):/workdir --workdir /workdir \ # ${IMAGE} -c 'sleep infinity') # docker exec ${CONTAINER_ID} pip install pytest -q From 172e3cc2c6d45914a7760dc8a76966d1ba14acd1 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 12 May 2026 23:30:47 +0000 Subject: [PATCH 06/18] fix: parameterize Dockerfile path in upload_cached_wheels.sh 
The script had a hardcoded docker/pytorch/Dockerfile path that no longer exists after the versioned restructure. Accept the Dockerfile path as a parameter and update all 3 callers to pass it. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml | 1 + .../workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml | 1 + .github/workflows/pr-pytorch-2.11-ec2-cuda.yml | 1 + scripts/pytorch/upload_cached_wheels.sh | 8 ++++---- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml index 06ba35ee18df..b5cf3d2bdc78 100644 --- a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml @@ -136,6 +136,7 @@ jobs: "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ + "docker/pytorch/2.11/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true diff --git a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml index b875d3562519..05b05077d854 100644 --- a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml @@ -140,6 +140,7 @@ jobs: "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-sagemaker.outputs.image-uri }}" \ + "docker/pytorch/2.11/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml index f060fddc5a59..e510ca43aa5b 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml +++ 
b/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml @@ -218,6 +218,7 @@ jobs: "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ + "docker/pytorch/2.11/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true diff --git a/scripts/pytorch/upload_cached_wheels.sh b/scripts/pytorch/upload_cached_wheels.sh index be4d2f8c5eb7..e2f59c486cf3 100755 --- a/scripts/pytorch/upload_cached_wheels.sh +++ b/scripts/pytorch/upload_cached_wheels.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # upload_cached_wheels.sh — Extract built wheels from Docker wheel-export stage and upload to S3. # -# Usage: upload_cached_wheels.sh <bucket> <cuda-version> <torch-version> <python-version> <image-uri> [<package>:<version>...] +# Usage: upload_cached_wheels.sh <bucket> <cuda-version> <torch-version> <python-version> <image-uri> <dockerfile> [<package>:<version>...] set -euo pipefail -BUCKET="$1"; CUDA="$2"; IMAGE="$5" -shift 5 +BUCKET="$1"; CUDA="$2"; IMAGE="$5"; DOCKERFILE="$6" +shift 6 if [ -z "${BUCKET}" ]; then echo "⚠️ No wheel cache bucket configured — skipping upload" @@ -15,7 +15,7 @@ fi # Build the wheel-export stage and extract to local dir EXPORT_DIR=$(mktemp -d) docker buildx build --progress=plain --target wheel-export --output "type=local,dest=${EXPORT_DIR}" \ - -f docker/pytorch/Dockerfile . 2>/dev/null || { + -f "${DOCKERFILE}" . 2>/dev/null || { echo "⚠️ wheel-export stage not available — extracting from runtime image" CID=$(docker create "${IMAGE}" /bin/true) docker cp "${CID}:/tmp/built_wheels/" "${EXPORT_DIR}/wheels/" 2>/dev/null || true From 1fc41f92d59b00b58477e3363045208f52ef7279 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 13 May 2026 00:56:54 +0000 Subject: [PATCH 07/18] refactor: make PyTorch workflows version-agnostic PR workflows detect the PyTorch version from changed file paths (docker/pytorch/X.Y/ or pytorch-X.Y-*.yml), falling back to LATEST_PYTORCH_VERSION env var for shared file changes.
Autorelease workflows use multi-cron scheduling with a case mapping from cron expression to version. Staggered 10 min apart per version. Also supports workflow_dispatch with an explicit pytorch-version input. Adding PyTorch 2.12: create docker/pytorch/2.12/ + config files, then add one cron line + one case entry per autorelease workflow. No new workflow files needed. Co-Authored-By: Claude Opus 4.6 (1M context) --- ...pu.yml => autorelease-pytorch-ec2-cpu.yml} | 49 ++++-- ...a.yml => autorelease-pytorch-ec2-cuda.yml} | 65 ++++++-- ... => autorelease-pytorch-sagemaker-cpu.yml} | 49 ++++-- ...=> autorelease-pytorch-sagemaker-cuda.yml} | 61 ++++++-- ....11-ec2-cpu.yml => pr-pytorch-ec2-cpu.yml} | 128 ++++++++------- ...1-ec2-cuda.yml => pr-pytorch-ec2-cuda.yml} | 146 ++++++++++-------- ...r-cpu.yml => pr-pytorch-sagemaker-cpu.yml} | 132 +++++++++------- ...cuda.yml => pr-pytorch-sagemaker-cuda.yml} | 137 ++++++++-------- 8 files changed, 475 insertions(+), 292 deletions(-) rename .github/workflows/{autorelease-pytorch-2.11-ec2-cpu.yml => autorelease-pytorch-ec2-cpu.yml} (82%) rename .github/workflows/{autorelease-pytorch-2.11-ec2-cuda.yml => autorelease-pytorch-ec2-cuda.yml} (83%) rename .github/workflows/{autorelease-pytorch-2.11-sagemaker-cpu.yml => autorelease-pytorch-sagemaker-cpu.yml} (85%) rename .github/workflows/{autorelease-pytorch-2.11-sagemaker-cuda.yml => autorelease-pytorch-sagemaker-cuda.yml} (84%) rename .github/workflows/{pr-pytorch-2.11-ec2-cpu.yml => pr-pytorch-ec2-cpu.yml} (88%) rename .github/workflows/{pr-pytorch-2.11-ec2-cuda.yml => pr-pytorch-ec2-cuda.yml} (90%) rename .github/workflows/{pr-pytorch-2.11-sagemaker-cpu.yml => pr-pytorch-sagemaker-cpu.yml} (89%) rename .github/workflows/{pr-pytorch-2.11-sagemaker-cuda.yml => pr-pytorch-sagemaker-cuda.yml} (89%) diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-ec2-cpu.yml similarity index 82% rename from 
.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml rename to .github/workflows/autorelease-pytorch-ec2-cpu.yml index 289e57ad3dd6..f3aff605dc0c 100644 --- a/.github/workflows/autorelease-pytorch-2.11-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cpu.yml @@ -1,10 +1,15 @@ -name: Auto Release - PyTorch 2.11 EC2 CPU +name: Auto Release - PyTorch EC2 CPU on: schedule: - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + pytorch-version: + description: "PyTorch version directory (e.g., 2.11)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,32 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cpu.yml" jobs: + determine-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + config-file: ${{ steps.version.outputs.config-file }} + steps: + - name: Determine PyTorch version + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.pytorch-version }}" + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") VERSION="2.11" ;; + # "10 17 * * 1,3") VERSION="2.12" ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "config-file=.github/config/image/pytorch-${VERSION}-ec2-cpu.yml" >> $GITHUB_OUTPUT + load-config: + needs: [determine-version] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -41,7 +68,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-version.outputs.config-file }} - name: Parse configuration id: parse @@ -60,7 +87,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [load-config] 
+ needs: [determine-version, load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -83,7 +110,8 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/2.11/versions-cpu.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ @@ -98,7 +126,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/2.11/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -142,7 +170,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [determine-version, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -159,10 +187,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.determine-version.outputs.version }}" IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-ec2-cuda.yml similarity index 83% rename from .github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml rename to .github/workflows/autorelease-pytorch-ec2-cuda.yml index b5cf3d2bdc78..dce2c12c07a6 100644 --- 
a/.github/workflows/autorelease-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cuda.yml @@ -1,11 +1,15 @@ -name: Auto Release - PyTorch 2.11 EC2 CUDA +name: Auto Release - PyTorch EC2 CUDA on: schedule: - # Runs at 9AM/10AM PST/PDT on Mondays and Wednesdays - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + pytorch-version: + description: "PyTorch version directory (e.g., 2.11)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -16,10 +20,32 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cuda.yml" jobs: + determine-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + config-file: ${{ steps.version.outputs.config-file }} + steps: + - name: Determine PyTorch version + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.pytorch-version }}" + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") VERSION="2.11" ;; + # "10 17 * * 1,3") VERSION="2.12" ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "config-file=.github/config/image/pytorch-${VERSION}-ec2-cuda.yml" >> $GITHUB_OUTPUT + load-config: + needs: [determine-version] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -42,7 +68,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-version.outputs.config-file }} - name: Parse configuration id: parse @@ -61,7 +87,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [load-config] + needs: [determine-version, load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ 
github.run_attempt }} fleet:x86-build-runner @@ -85,17 +111,19 @@ jobs: - name: Source versions id: versions run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/2.11/versions-cuda.env - mkdir -p docker/pytorch/2.11/wheels + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/2.11/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -105,7 +133,8 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ @@ -125,18 +154,19 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/2.11/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ - "docker/pytorch/2.11/Dockerfile.cuda" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true @@ -181,7 +211,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [determine-version, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -198,10 +228,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.determine-version.outputs.version }}" IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml similarity index 85% rename from .github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml rename to .github/workflows/autorelease-pytorch-sagemaker-cpu.yml index 16c97693aa05..5bf252bbd7ad 100644 --- a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml @@ -1,10 +1,15 @@ -name: Auto Release - PyTorch 2.11 SageMaker CPU +name: Auto Release - PyTorch SageMaker CPU on: schedule: - - cron: 
'00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + pytorch-version: + description: "PyTorch version directory (e.g., 2.11)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,32 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" jobs: + determine-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + config-file: ${{ steps.version.outputs.config-file }} + steps: + - name: Determine PyTorch version + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.pytorch-version }}" + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") VERSION="2.11" ;; + # "10 17 * * 1,3") VERSION="2.12" ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "config-file=.github/config/image/pytorch-${VERSION}-sagemaker-cpu.yml" >> $GITHUB_OUTPUT + load-config: + needs: [determine-version] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -41,7 +68,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-version.outputs.config-file }} - name: Parse configuration id: parse @@ -60,7 +87,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [load-config] + needs: [determine-version, load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -83,7 +110,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/2.11/versions-cpu.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cpu.env 
CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-cpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -110,7 +138,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/2.11/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -154,7 +182,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [determine-version, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -171,10 +199,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.determine-version.outputs.version }}" IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml similarity index 84% rename from .github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml rename to .github/workflows/autorelease-pytorch-sagemaker-cuda.yml index 05b05077d854..e9579682bfa8 100644 --- a/.github/workflows/autorelease-pytorch-2.11-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml @@ -1,10 +1,15 @@ -name: Auto Release - PyTorch 2.11 SageMaker CUDA +name: Auto Release - PyTorch SageMaker CUDA on: schedule: - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # 
PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + pytorch-version: + description: "PyTorch version directory (e.g., 2.11)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,32 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" jobs: + determine-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + config-file: ${{ steps.version.outputs.config-file }} + steps: + - name: Determine PyTorch version + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.pytorch-version }}" + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") VERSION="2.11" ;; + # "10 17 * * 1,3") VERSION="2.12" ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "config-file=.github/config/image/pytorch-${VERSION}-sagemaker-cuda.yml" >> $GITHUB_OUTPUT + load-config: + needs: [determine-version] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -41,7 +68,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-version.outputs.config-file }} - name: Parse configuration id: parse @@ -60,7 +87,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [load-config] + needs: [determine-version, load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner @@ -83,10 +110,11 @@ jobs: - name: Fetch cached wheels run: | - source docker/pytorch/2.11/versions-cuda.env - mkdir -p docker/pytorch/2.11/wheels + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - 
docker/pytorch/2.11/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -96,7 +124,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -129,18 +158,19 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/2.11/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.determine-version.outputs.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-sagemaker.outputs.image-uri }}" \ - "docker/pytorch/2.11/Dockerfile.cuda" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true @@ -185,7 +215,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [determine-version, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -202,10 +232,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ 
needs.determine-version.outputs.version }}" IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml similarity index 88% rename from .github/workflows/pr-pytorch-2.11-ec2-cpu.yml rename to .github/workflows/pr-pytorch-ec2-cpu.yml index 8bedf196689a..4453999c8f7d 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch 2.11 EC2 CPU +name: PR - PyTorch EC2 CPU on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-2.11-ec2-cpu.yml" - - ".github/workflows/pr-pytorch-2.11-ec2-cpu.yml" - - "docker/pytorch/2.11/**" + - ".github/config/image/pytorch-*-ec2-cpu.yml" + - ".github/workflows/pr-pytorch-ec2-cpu.yml" + - "docker/pytorch/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +22,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cpu.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -43,11 +43,70 @@ jobs: - name: Run permission gate (from base) uses: ./.github/actions/pr-permission-gate + # ============================================================ + # Pre-commit + change detection + # ============================================================ + check-changes: + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + build-change: ${{ 
steps.changes.outputs.build-change }} + sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} + telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + pytorch-version: ${{ steps.version.outputs.version }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files + + - name: Detect PyTorch version + id: version + run: | + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + if [ -z "$VERSION" ]; then + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + fi + echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + + - name: Detect file changes + id: changes + uses: dorny/paths-filter@v4 + with: + filters: | + build-change: + - ".github/config/image/pytorch-*-ec2-cpu.yml" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" + - "scripts/common/setup_oss_compliance.sh" + - "scripts/pytorch/configure_ssh.sh" + - "scripts/telemetry/bash_telemetry.sh.template" + sanity-test-change: + - "test/sanity/**" + telemetry-test-change: + - "test/telemetry/**" + # ============================================================ # Load configuration from YAML # ============================================================ load-config: - needs: [gatekeeper] + needs: [gatekeeper, check-changes] if: success() runs-on: ubuntu-latest outputs: @@ -70,7 +129,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-ec2-cpu.yml" - name: Parse configuration id: parse @@ -88,51 +147,6 @@ jobs: echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> 
$GITHUB_OUTPUT echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} - cancel-in-progress: true - outputs: - build-change: ${{ steps.changes.outputs.build-change }} - sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} - telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Setup python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - - - name: Run pre-commit - uses: pre-commit/action@v3.0.1 - with: - extra_args: --all-files - - - name: Detect file changes - id: changes - uses: dorny/paths-filter@v4 - with: - filters: | - build-change: - - ".github/config/image/pytorch-2.11-ec2-cpu.yml" - - "docker/pytorch/2.11/Dockerfile.cpu" - - "docker/pytorch/2.11/cpu/**" - - "scripts/common/setup_oss_compliance.sh" - - "scripts/pytorch/configure_ssh.sh" - - "scripts/telemetry/bash_telemetry.sh.template" - sanity-test-change: - - "test/sanity/**" - telemetry-test-change: - - "test/telemetry/**" - # ============================================================ # Build CPU runtime image # ============================================================ @@ -164,7 +178,8 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/2.11/versions-cpu.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ @@ -179,7 +194,7 @@ jobs: 
--tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/2.11/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -245,7 +260,7 @@ jobs: # Unit tests # ============================================================ unit-test: - needs: [build-image] + needs: [check-changes, build-image] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -266,10 +281,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" IMAGE="${{ needs.build-image.outputs.runtime-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml similarity index 90% rename from .github/workflows/pr-pytorch-2.11-ec2-cuda.yml rename to .github/workflows/pr-pytorch-ec2-cuda.yml index e510ca43aa5b..3e0a1140d2a5 100644 --- a/.github/workflows/pr-pytorch-2.11-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch 2.11 EC2 CUDA +name: PR - PyTorch EC2 CUDA on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-2.11-ec2-cuda.yml" - - ".github/workflows/pr-pytorch-2.11-ec2-cuda.yml" - - "docker/pytorch/2.11/**" + - ".github/config/image/pytorch-*-ec2-cuda.yml" + - ".github/workflows/pr-pytorch-ec2-cuda.yml" + - "docker/pytorch/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -23,9 +23,7 @@ permissions: env: FORCE_COLOR: "1" - - # Config file path - CONFIG_FILE: ".github/config/image/pytorch-2.11-ec2-cuda.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: 
# ============================================================ @@ -46,11 +44,70 @@ jobs: - name: Run permission gate (from base) uses: ./.github/actions/pr-permission-gate + # ============================================================ + # Pre-commit + change detection + # ============================================================ + check-changes: + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + build-change: ${{ steps.changes.outputs.build-change }} + sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} + telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + pytorch-version: ${{ steps.version.outputs.version }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files + + - name: Detect PyTorch version + id: version + run: | + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + if [ -z "$VERSION" ]; then + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + fi + echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + + - name: Detect file changes + id: changes + uses: dorny/paths-filter@v4 + with: + filters: | + build-change: + - ".github/config/image/pytorch-*-ec2-cuda.yml" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" + - "scripts/common/setup_oss_compliance.sh" + - "scripts/pytorch/*" + - "scripts/telemetry/bash_telemetry.sh.template" + sanity-test-change: + - "test/sanity/**" + telemetry-test-change: + - "test/telemetry/**" + # ============================================================ # Load configuration from 
YAML # ============================================================ load-config: - needs: [gatekeeper] + needs: [gatekeeper, check-changes] if: success() runs-on: ubuntu-latest outputs: @@ -73,7 +130,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-ec2-cuda.yml" - name: Parse configuration id: parse @@ -91,51 +148,6 @@ jobs: echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} - cancel-in-progress: true - outputs: - build-change: ${{ steps.changes.outputs.build-change }} - sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} - telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Setup python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - - - name: Run pre-commit - uses: pre-commit/action@v3.0.1 - with: - extra_args: --all-files - - - name: Detect file changes - id: changes - uses: dorny/paths-filter@v4 - with: - filters: | - build-change: - - ".github/config/image/pytorch-2.11-ec2-cuda.yml" - - "docker/pytorch/2.11/Dockerfile.cuda" - - "docker/pytorch/2.11/cuda/**" - - "scripts/common/setup_oss_compliance.sh" - - "scripts/pytorch/*" - - "scripts/telemetry/bash_telemetry.sh.template" - sanity-test-change: - - "test/sanity/**" - telemetry-test-change: - - "test/telemetry/**" - # ============================================================ # Build 
runtime image # ============================================================ @@ -167,17 +179,19 @@ jobs: - name: Source versions id: versions run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/2.11/versions-cuda.env - mkdir -p docker/pytorch/2.11/wheels + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/2.11/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -187,7 +201,8 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ @@ -207,18 +222,19 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/2.11/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ - "docker/pytorch/2.11/Dockerfile.cuda" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true @@ -285,7 +301,7 @@ jobs: # Unit tests (CPU-only, no GPU needed) # ============================================================ unit-test: - needs: [build-images] + needs: [check-changes, build-images] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -306,10 +322,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -410,7 +427,6 @@ jobs: # ============================================================ # Multi-node tests (need 2+ containers on Docker network) - # ============================================================ # Multi-node tests (need 2+ containers on Docker network) # TODO: Re-enable when GPU capacity is available # ============================================================ diff --git a/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml similarity index 89% rename from 
.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml rename to .github/workflows/pr-pytorch-sagemaker-cpu.yml index a8840a3f0b6d..5b277da1a835 100644 --- a/.github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch 2.11 SageMaker CPU +name: PR - PyTorch SageMaker CPU on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" - - ".github/workflows/pr-pytorch-2.11-sagemaker-cpu.yml" - - "docker/pytorch/2.11/**" + - ".github/config/image/pytorch-*-sagemaker-cpu.yml" + - ".github/workflows/pr-pytorch-sagemaker-cpu.yml" + - "docker/pytorch/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +22,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -43,51 +43,6 @@ jobs: - name: Run permission gate (from base) uses: ./.github/actions/pr-permission-gate - # ============================================================ - # Load configuration from YAML - # ============================================================ - load-config: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: 
Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ${{ env.CONFIG_FILE }} - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - # ============================================================ # Pre-commit + change detection # ============================================================ @@ -102,6 +57,7 @@ jobs: build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + pytorch-version: ${{ steps.version.outputs.version }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -116,15 +72,28 @@ jobs: with: extra_args: --all-files + - name: Detect PyTorch version + id: version + run: | + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + if [ -z "$VERSION" ]; then + 
VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + fi + echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + - name: Detect file changes id: changes uses: dorny/paths-filter@v4 with: filters: | build-change: - - ".github/config/image/pytorch-2.11-sagemaker-cpu.yml" - - "docker/pytorch/2.11/Dockerfile.cpu" - - "docker/pytorch/2.11/cpu/**" + - ".github/config/image/pytorch-*-sagemaker-cpu.yml" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/configure_ssh.sh" - "scripts/pytorch/changehostname.c" @@ -135,6 +104,51 @@ jobs: telemetry-test-change: - "test/telemetry/**" + # ============================================================ + # Load configuration from YAML + # ============================================================ + load-config: + needs: [gatekeeper, check-changes] + if: success() + runs-on: ubuntu-latest + outputs: + framework: ${{ steps.parse.outputs.framework }} + framework-version: ${{ steps.parse.outputs.framework-version }} + python-version: ${{ steps.parse.outputs.python-version }} + cuda-version: ${{ steps.parse.outputs.cuda-version }} + os-version: ${{ steps.parse.outputs.os-version }} + container-type: ${{ steps.parse.outputs.container-type }} + device-type: ${{ steps.parse.outputs.device-type }} + arch-type: ${{ steps.parse.outputs.arch-type }} + contributor: ${{ steps.parse.outputs.contributor }} + customer-type: ${{ steps.parse.outputs.customer-type }} + prod-image: ${{ steps.parse.outputs.prod-image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Load configuration + id: load + uses: ./.github/actions/load-config + with: + config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-sagemaker-cpu.yml" + + - name: Parse configuration + id: parse + run: | + echo '${{ steps.load.outputs.config }}' > config.json + echo 
"framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT + echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT + echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT + echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT + echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT + echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT + echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT + echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT + echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT + echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT + echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT + # ============================================================ # Build CPU SageMaker image # ============================================================ @@ -166,7 +180,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/2.11/versions-cpu.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations @@ -193,7 +208,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/2.11/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -239,7 +254,7 @@ jobs: # Unit tests # ============================================================ unit-test: - needs: [build-image] + needs: [check-changes, build-image] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -260,10 +275,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q @@ -271,7 +287,7 @@ jobs: docker kill ${CONTAINER_ID} # ============================================================ - # SageMaker integration tests (CPU — gloo backend) + # SageMaker integration tests (CPU -- gloo backend) # ============================================================ sagemaker-test: needs: [build-image, sanity-test, security-test, unit-test] diff --git a/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml similarity index 89% rename from .github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml rename to .github/workflows/pr-pytorch-sagemaker-cuda.yml index 3cbdcdcd4a72..62431e604d7e 100644 --- a/.github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -1,13 +1,13 @@ -name: PR - PyTorch 2.11 SageMaker CUDA +name: PR - PyTorch SageMaker CUDA on: pull_request: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" - - ".github/workflows/pr-pytorch-2.11-sagemaker-cuda.yml" - - "docker/pytorch/2.11/**" + - ".github/config/image/pytorch-*-sagemaker-cuda.yml" + - 
".github/workflows/pr-pytorch-sagemaker-cuda.yml" + - "docker/pytorch/**" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,9 +22,7 @@ permissions: env: FORCE_COLOR: "1" - - # Config file path - CONFIG_FILE: ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -45,11 +43,70 @@ jobs: - name: Run permission gate (from base) uses: ./.github/actions/pr-permission-gate + # ============================================================ + # Pre-commit + change detection + # ============================================================ + check-changes: + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + build-change: ${{ steps.changes.outputs.build-change }} + sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} + telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + pytorch-version: ${{ steps.version.outputs.version }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files + + - name: Detect PyTorch version + id: version + run: | + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + if [ -z "$VERSION" ]; then + VERSION=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u | head -1) + fi + echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + + - name: Detect file changes + id: changes + uses: dorny/paths-filter@v4 + with: + filters: | + build-change: + - ".github/config/image/pytorch-*-sagemaker-cuda.yml" + - "docker/pytorch/*/Dockerfile.cuda" + 
- "docker/pytorch/*/cuda/**" + - "scripts/common/setup_oss_compliance.sh" + - "scripts/pytorch/*" + - "scripts/telemetry/bash_telemetry.sh.template" + sanity-test-change: + - "test/sanity/**" + telemetry-test-change: + - "test/telemetry/**" + # ============================================================ # Load configuration from YAML # ============================================================ load-config: - needs: [gatekeeper] + needs: [gatekeeper, check-changes] if: success() runs-on: ubuntu-latest outputs: @@ -72,7 +129,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-sagemaker-cuda.yml" - name: Parse configuration id: parse @@ -90,51 +147,6 @@ jobs: echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} - cancel-in-progress: true - outputs: - build-change: ${{ steps.changes.outputs.build-change }} - sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} - telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Setup python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - - - name: Run pre-commit - uses: pre-commit/action@v3.0.1 - with: - extra_args: --all-files - - - name: Detect file changes - id: changes - uses: dorny/paths-filter@v4 - with: - filters: | - build-change: - - ".github/config/image/pytorch-2.11-sagemaker-cuda.yml" - - 
"docker/pytorch/2.11/Dockerfile.cuda" - - "docker/pytorch/2.11/cuda/**" - - "scripts/common/setup_oss_compliance.sh" - - "scripts/pytorch/*" - - "scripts/telemetry/bash_telemetry.sh.template" - sanity-test-change: - - "test/sanity/**" - telemetry-test-change: - - "test/telemetry/**" - # ============================================================ # Build SageMaker image # ============================================================ @@ -165,10 +177,11 @@ jobs: - name: Fetch cached wheels run: | - source docker/pytorch/2.11/versions-cuda.env - mkdir -p docker/pytorch/2.11/wheels + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/2.11/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -178,7 +191,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/2.11/versions-cuda.env + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations @@ -211,7 +225,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/2.11/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -277,7 +291,7 @@ jobs: # Unit tests # ============================================================ unit-test: - needs: [build-image] + needs: [check-changes, build-image] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -298,10 +312,11 @@ jobs: - name: Run unit tests run: | + VERSION="${{ needs.check-changes.outputs.pytorch-version }}" IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=2.11 \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q From 77db83a618bf2d1c5bba191e3eb1f9539ef4a6e8 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 19:56:06 +0000 Subject: [PATCH 08/18] =?UTF-8?q?refactor:=20simplify=20autorelease=20to?= =?UTF-8?q?=20map=20cron=20=E2=86=92=20config=20file=20directly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per team feedback: remove the redundant version output from the determine-config job. Map cron directly to config file path. Derive the docker directory version from load-config's framework-version output (cut major.minor from "2.11.0" → "2.11"). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workflows/autorelease-pytorch-ec2-cpu.yml | 33 +++++++--------- .../autorelease-pytorch-ec2-cuda.yml | 39 +++++++++---------- .../autorelease-pytorch-sagemaker-cpu.yml | 33 +++++++--------- .../autorelease-pytorch-sagemaker-cuda.yml | 37 ++++++++---------- 4 files changed, 65 insertions(+), 77 deletions(-) diff --git a/.github/workflows/autorelease-pytorch-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-ec2-cpu.yml index f3aff605dc0c..3c3f809c5374 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cpu.yml @@ -6,8 +6,8 @@ on: # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: inputs: - pytorch-version: - description: "PyTorch version directory (e.g., 2.11)" + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cpu.yml)" required: true type: string @@ -22,30 +22,27 @@ env: FORCE_COLOR: "1" jobs: - determine-version: + determine-config: runs-on: ubuntu-latest outputs: - version: ${{ steps.version.outputs.version }} - config-file: ${{ steps.version.outputs.config-file }} + config-file: ${{ steps.config.outputs.config-file }} steps: - - name: Determine PyTorch version - id: version + - name: Determine config file + id: config run: | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ inputs.pytorch-version }}" + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT else CRON="${{ github.event.schedule }}" case "$CRON" in - "00 17 * * 1,3") VERSION="2.11" ;; - # "10 17 * * 1,3") VERSION="2.12" ;; + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cpu.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cpu.yml" >> $GITHUB_OUTPUT ;; *) echo "::error::Unknown cron: $CRON"; exit 1 ;; esac fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - echo 
"config-file=.github/config/image/pytorch-${VERSION}-ec2-cpu.yml" >> $GITHUB_OUTPUT load-config: - needs: [determine-version] + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -68,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ needs.determine-version.outputs.config-file }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -87,7 +84,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [determine-version, load-config] + needs: [load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -110,7 +107,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" @@ -170,7 +167,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [determine-version, build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -187,7 +184,7 @@ jobs: - name: Run unit tests run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ diff --git a/.github/workflows/autorelease-pytorch-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-ec2-cuda.yml index dce2c12c07a6..4bfa283808e7 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cuda.yml @@ -6,8 +6,8 @@ on: # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: inputs: - pytorch-version: - description: "PyTorch version directory (e.g., 2.11)" + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cuda.yml)" required: true type: string @@ -22,30 +22,27 @@ env: FORCE_COLOR: "1" jobs: - determine-version: + determine-config: runs-on: ubuntu-latest outputs: - version: ${{ steps.version.outputs.version }} - config-file: ${{ steps.version.outputs.config-file }} + config-file: ${{ steps.config.outputs.config-file }} steps: - - name: Determine PyTorch version - id: version + - name: Determine config file + id: config run: | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ inputs.pytorch-version }}" + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT else CRON="${{ github.event.schedule }}" case "$CRON" in - "00 17 * * 1,3") VERSION="2.11" ;; - # "10 17 * * 1,3") VERSION="2.12" ;; + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cuda.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cuda.yml" >> $GITHUB_OUTPUT ;; *) echo "::error::Unknown cron: $CRON"; exit 1 ;; esac fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - echo "config-file=.github/config/image/pytorch-${VERSION}-ec2-cuda.yml" >> $GITHUB_OUTPUT load-config: - needs: [determine-version] + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -68,7 +65,7 @@ 
jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ needs.determine-version.outputs.config-file }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -87,7 +84,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [determine-version, load-config] + needs: [load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner @@ -111,7 +108,7 @@ jobs: - name: Source versions id: versions run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT @@ -119,7 +116,7 @@ jobs: - name: Fetch cached wheels run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cuda.env mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ @@ -133,7 +130,7 @@ jobs: - name: Build runtime image id: build-runtime run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" @@ -160,7 +157,7 @@ jobs: - name: Upload built wheels to cache run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ @@ -211,7 +208,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [determine-version, build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -228,7 +225,7 @@ jobs: - name: Run unit tests run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml index 5bf252bbd7ad..9b052493909a 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml @@ -6,8 +6,8 @@ on: # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: inputs: - pytorch-version: - description: "PyTorch version directory (e.g., 2.11)" + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cpu.yml)" required: true type: string @@ -22,30 +22,27 @@ env: FORCE_COLOR: "1" jobs: - determine-version: + determine-config: runs-on: ubuntu-latest outputs: - version: ${{ steps.version.outputs.version }} - config-file: ${{ steps.version.outputs.config-file }} + config-file: ${{ steps.config.outputs.config-file }} steps: - - name: Determine PyTorch version - id: version + - name: Determine config file + id: config run: | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ inputs.pytorch-version }}" + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT else CRON="${{ github.event.schedule }}" case "$CRON" in - "00 17 * * 1,3") VERSION="2.11" ;; - # "10 17 * * 1,3") VERSION="2.12" ;; + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cpu.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cpu.yml" >> $GITHUB_OUTPUT ;; *) echo "::error::Unknown cron: $CRON"; exit 1 ;; esac fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - echo "config-file=.github/config/image/pytorch-${VERSION}-sagemaker-cpu.yml" >> $GITHUB_OUTPUT load-config: - needs: [determine-version] + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ 
steps.load.outputs.config }} @@ -68,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ needs.determine-version.outputs.config-file }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -87,7 +84,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [determine-version, load-config] + needs: [load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -110,7 +107,7 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-cpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" @@ -182,7 +179,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [determine-version, build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -199,7 +196,7 @@ jobs: - name: Run unit tests run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml index e9579682bfa8..79cfb42a90d9 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml @@ -6,8 +6,8 @@ on: # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: inputs: - pytorch-version: - description: "PyTorch version directory (e.g., 2.11)" + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cuda.yml)" required: true type: string @@ -22,30 +22,27 @@ env: FORCE_COLOR: "1" jobs: - determine-version: + determine-config: runs-on: ubuntu-latest outputs: - version: ${{ steps.version.outputs.version }} - config-file: ${{ steps.version.outputs.config-file }} + config-file: ${{ steps.config.outputs.config-file }} steps: - - name: Determine PyTorch version - id: version + - name: Determine config file + id: config run: | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ inputs.pytorch-version }}" + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT else CRON="${{ github.event.schedule }}" case "$CRON" in - "00 17 * * 1,3") VERSION="2.11" ;; - # "10 17 * * 1,3") VERSION="2.12" ;; + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cuda.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cuda.yml" >> $GITHUB_OUTPUT ;; *) echo "::error::Unknown cron: $CRON"; exit 1 ;; esac fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - echo "config-file=.github/config/image/pytorch-${VERSION}-sagemaker-cuda.yml" >> $GITHUB_OUTPUT load-config: - needs: [determine-version] + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ 
steps.load.outputs.config }} @@ -68,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ needs.determine-version.outputs.config-file }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -87,7 +84,7 @@ jobs: echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT build-image: - needs: [determine-version, load-config] + needs: [load-config] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner @@ -110,7 +107,7 @@ jobs: - name: Fetch cached wheels run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cuda.env mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ @@ -124,7 +121,7 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" @@ -164,7 +161,7 @@ jobs: - name: Upload built wheels to cache run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ @@ -215,7 +212,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [determine-version, build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -232,7 +229,7 @@ jobs: - name: Run unit tests run: | - VERSION="${{ needs.determine-version.outputs.version }}" + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ From c329c46e483680d29c608336d96935e6c1394482 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 20:11:17 +0000 Subject: [PATCH 09/18] refactor: PR workflows use matrix over detected PyTorch versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses team feedback: - Multi-version PRs now build ALL changed versions in parallel (not just the first detected one) - Removed separate load-config job — config parsing inlined into build-images and detect-versions (eliminates 30s serialization) - Uses strategy.matrix with fail-fast: false for parallel builds Structure: gatekeeper → detect-versions → build-images (matrix) → test jobs Only the latest version runs the full test suite (sanity, security, telemetry, single-gpu). All versions validate that the build compiles. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-ec2-cpu.yml | 251 ++++++++------- .github/workflows/pr-pytorch-ec2-cuda.yml | 263 ++++++++-------- .../workflows/pr-pytorch-sagemaker-cpu.yml | 255 ++++++++-------- .../workflows/pr-pytorch-sagemaker-cuda.yml | 286 +++++++++--------- 4 files changed, 523 insertions(+), 532 deletions(-) diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index 4453999c8f7d..6cfb644b268c 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -44,20 +44,32 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Pre-commit + change detection + # Detect all changed PyTorch versions + file changes # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} + latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - pytorch-version: ${{ steps.version.outputs.version }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type 
}} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -72,18 +84,50 @@ jobs: with: extra_args: --all-files - - name: Detect PyTorch version - id: version + - name: Detect PyTorch versions + id: versions run: | - VERSION=$(git diff --name-only origin/main...HEAD \ + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) - if [ -z "$VERSION" ]; then - VERSION=$(git diff --name-only origin/main...HEAD \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) + | sort -u) fi - echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + LATEST=$(echo "$VERSIONS" | tail -1) + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + echo "Latest version: ${LATEST}" + + - name: Load config for latest version + id: config + run: | + LATEST="${{ steps.versions.outputs.latest-version }}" + CONFIG_FILE=".github/config/image/pytorch-${LATEST}-ec2-cpu.yml" + pip install pyyaml -q + python3 -c " + import yaml, os + with open('${CONFIG_FILE}') as f: + cfg = yaml.safe_load(f)['common'] + with open(os.environ['GITHUB_OUTPUT'], 'a') as out: + out.write(f\"framework={cfg.get('framework', '')}\\n\") + out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") + out.write(f\"python-version={cfg.get('python_version', '')}\\n\") + out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") + out.write(f\"os-version={cfg.get('os_version', '')}\\n\") + out.write(f\"container-type={cfg.get('job_type', 
'')}\\n\") + out.write(f\"device-type={cfg.get('device_type', 'cpu')}\\n\") + out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") + out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") + out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") + out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") + " - name: Detect file changes id: changes @@ -103,62 +147,21 @@ jobs: - "test/telemetry/**" # ============================================================ - # Load configuration from YAML - # ============================================================ - load-config: - needs: [gatekeeper, check-changes] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-ec2-cpu.yml" - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> 
$GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Build CPU runtime image + # Build CPU images (matrix over detected versions) # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} @@ -175,15 +178,31 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cpu.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq 
'.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "cpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Build runtime image id: build-runtime run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cpu.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-pr-${{ github.event.pull_request.number }}" + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \ @@ -198,96 +217,74 @@ jobs: echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Run unit tests + if: matrix.version == needs.detect-versions.outputs.latest-version + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-runtime.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e 
DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests # ============================================================ sanity-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ 
needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + python-version: ${{ needs.detect-versions.outputs.python-version }} + cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} + os-version: ${{ needs.detect-versions.outputs.os-version }} + customer-type: ${{ needs.detect-versions.outputs.customer-type }} + arch-type: ${{ needs.detect-versions.outputs.arch-type }} + device-type: ${{ needs.detect-versions.outputs.device-type }} + contributor: ${{ needs.detect-versions.outputs.contributor }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # Security tests # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} # ============================================================ # Telemetry tests # ============================================================ telemetry-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && 
!cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') concurrency: group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} cancel-in-progress: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [check-changes, build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" - IMAGE="${{ needs.build-image.outputs.runtime-image-uri }}" - docker pull ${IMAGE} - 
CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + container-type: ${{ needs.detect-versions.outputs.container-type }} diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index 3e0a1140d2a5..121c8cef8192 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -45,20 +45,32 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Pre-commit + change detection + # Detect all changed PyTorch versions + file changes # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} + latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - pytorch-version: ${{ steps.version.outputs.version }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ 
steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -73,18 +85,50 @@ jobs: with: extra_args: --all-files - - name: Detect PyTorch version - id: version + - name: Detect PyTorch versions + id: versions run: | - VERSION=$(git diff --name-only origin/main...HEAD \ + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) - if [ -z "$VERSION" ]; then - VERSION=$(git diff --name-only origin/main...HEAD \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) + | sort -u) + fi + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" fi - echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + LATEST=$(echo "$VERSIONS" | tail -1) + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + echo "Latest version: ${LATEST}" + + - name: Load config for latest version + id: config + run: | + LATEST="${{ steps.versions.outputs.latest-version }}" + CONFIG_FILE=".github/config/image/pytorch-${LATEST}-ec2-cuda.yml" + pip install pyyaml -q + python3 -c " + import yaml, os + with open('${CONFIG_FILE}') as f: + cfg = yaml.safe_load(f)['common'] + with open(os.environ['GITHUB_OUTPUT'], 'a') as out: + out.write(f\"framework={cfg.get('framework', '')}\\n\") + out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") + 
out.write(f\"python-version={cfg.get('python_version', '')}\\n\") + out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") + out.write(f\"os-version={cfg.get('os_version', '')}\\n\") + out.write(f\"container-type={cfg.get('job_type', '')}\\n\") + out.write(f\"device-type={cfg.get('device_type', 'gpu')}\\n\") + out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") + out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") + out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") + out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") + " - name: Detect file changes id: changes @@ -104,62 +148,21 @@ jobs: - "test/telemetry/**" # ============================================================ - # Load configuration from YAML - # ============================================================ - load-config: - needs: [gatekeeper, check-changes] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-ec2-cuda.yml" - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo 
"framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Build runtime image + # Build images (matrix over detected versions) # ============================================================ build-images: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} @@ -176,18 +179,25 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - - name: Source versions - id: versions + - name: Load and parse config + id: config run: | - VERSION="${{ 
needs.check-changes.outputs.pytorch-version }}" - source docker/pytorch/${VERSION}/versions-cuda.env - echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT - echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT - echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cuda.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "gpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cuda.env mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ @@ -201,12 +211,12 @@ jobs: - name: Build runtime image id: build-runtime run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cuda.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-pr-${{ github.event.pull_request.number }}" + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ 
vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -228,7 +238,7 @@ jobs: - name: Upload built wheels to cache run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ @@ -239,105 +249,83 @@ jobs: "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true + - name: Run unit tests + if: matrix.version == needs.detect-versions.outputs.latest-version + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-runtime.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests (labels, filesystem, OSS compliance) # ============================================================ sanity-test: - needs: [check-changes, build-images, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') uses: 
./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + python-version: ${{ needs.detect-versions.outputs.python-version }} + cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} + os-version: ${{ needs.detect-versions.outputs.os-version }} + customer-type: ${{ needs.detect-versions.outputs.customer-type }} + arch-type: ${{ needs.detect-versions.outputs.arch-type }} + device-type: ${{ needs.detect-versions.outputs.device-type }} + contributor: ${{ needs.detect-versions.outputs.contributor }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) # 
============================================================ security-test: - needs: [build-images, load-config] + needs: [detect-versions, build-images] if: success() uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) # ============================================================ telemetry-test: - needs: [check-changes, build-images, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') concurrency: group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} cancel-in-progress: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - 
aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests (CPU-only, no GPU needed) - # ============================================================ - unit-test: - needs: [check-changes, build-images] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" - IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # Single-GPU tests # 
============================================================ single-gpu-test: - needs: [build-images, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -373,7 +361,7 @@ jobs: # EFA integration test (2x p4d.24xlarge, NCCL over EFA) # ============================================================ efa-test: - needs: [build-images, sanity-test, security-test, unit-test] + needs: [build-images, sanity-test, security-test] if: success() uses: ./.github/workflows/reusable-efa-tests.yml with: @@ -427,7 +415,6 @@ jobs: # ============================================================ # Multi-node tests (need 2+ containers on Docker network) - # Multi-node tests (need 2+ containers on Docker network) # TODO: Re-enable when GPU capacity is available # ============================================================ # multi-node-test: diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 5b277da1a835..1f0039ffbf04 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -44,20 +44,32 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Pre-commit + change detection + # Detect all changed PyTorch versions + file changes # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} + latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} 
sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - pytorch-version: ${{ steps.version.outputs.version }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -72,18 +84,50 @@ jobs: with: extra_args: --all-files - - name: Detect PyTorch version - id: version + - name: Detect PyTorch versions + id: versions run: | - VERSION=$(git diff --name-only origin/main...HEAD \ + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) - if [ -z "$VERSION" ]; then - VERSION=$(git diff --name-only origin/main...HEAD \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) + | sort -u) fi - echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + LATEST=$(echo "$VERSIONS" | tail -1) + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + echo "Latest version: ${LATEST}" + + - name: Load config for latest version + id: config + run: | + LATEST="${{ 
steps.versions.outputs.latest-version }}" + CONFIG_FILE=".github/config/image/pytorch-${LATEST}-sagemaker-cpu.yml" + pip install pyyaml -q + python3 -c " + import yaml, os + with open('${CONFIG_FILE}') as f: + cfg = yaml.safe_load(f)['common'] + with open(os.environ['GITHUB_OUTPUT'], 'a') as out: + out.write(f\"framework={cfg.get('framework', '')}\\n\") + out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") + out.write(f\"python-version={cfg.get('python_version', '')}\\n\") + out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") + out.write(f\"os-version={cfg.get('os_version', '')}\\n\") + out.write(f\"container-type={cfg.get('job_type', '')}\\n\") + out.write(f\"device-type={cfg.get('device_type', 'cpu')}\\n\") + out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") + out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") + out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") + out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") + " - name: Detect file changes id: changes @@ -105,62 +149,21 @@ jobs: - "test/telemetry/**" # ============================================================ - # Load configuration from YAML - # ============================================================ - load-config: - needs: [gatekeeper, check-changes] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: 
Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-sagemaker-cpu.yml" - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Build CPU SageMaker image + # Build CPU SageMaker images (matrix over detected versions) # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ 
github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} @@ -177,20 +180,36 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cpu.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "cpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Build sagemaker image id: build-sagemaker run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cpu.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-${{ matrix.version }}-pr-${{ 
github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations - FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-') - FWK_VER_LABEL=$(echo "${{ needs.load-config.outputs.framework-version }}" | tr '.' '-') - OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') + FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-') + FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-') + OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \ @@ -198,11 +217,11 @@ jobs: --build-arg OPEN_MPI_VERSION=${OPEN_MPI_VERSION} \ --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.device.cpu=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type }}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \ + --label 
"com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \ --cache-to=type=inline \ --cache-from=type=registry,ref=${CI_IMAGE_URI} \ --tag ${CI_IMAGE_URI} \ @@ -212,85 +231,63 @@ jobs: echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Run unit tests + if: matrix.version == needs.detect-versions.outputs.latest-version + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests # ============================================================ sanity-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - 
framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + python-version: ${{ needs.detect-versions.outputs.python-version }} + cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} + os-version: ${{ needs.detect-versions.outputs.os-version }} + customer-type: ${{ needs.detect-versions.outputs.customer-type }} + arch-type: ${{ needs.detect-versions.outputs.arch-type }} + device-type: ${{ needs.detect-versions.outputs.device-type }} + contributor: ${{ needs.detect-versions.outputs.contributor }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # Security tests # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} 
aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [check-changes, build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" - IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} # ============================================================ # SageMaker integration tests (CPU -- gloo backend) # ============================================================ sagemaker-test: - needs: [build-image, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -316,7 +313,7 @@ jobs: - name: Run SageMaker CPU training tests env: 
PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ needs.build-images.outputs.sagemaker-image-uri }} SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cpu.py -v diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index 62431e604d7e..397cc26bbcca 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -44,20 +44,32 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Pre-commit + change detection + # Detect all changed PyTorch versions + file changes # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} + latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - pytorch-version: ${{ steps.version.outputs.version }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} 
+ contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -72,18 +84,50 @@ jobs: with: extra_args: --all-files - - name: Detect PyTorch version - id: version + - name: Detect PyTorch versions + id: versions run: | - VERSION=$(git diff --name-only origin/main...HEAD \ + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) - if [ -z "$VERSION" ]; then - VERSION=$(git diff --name-only origin/main...HEAD \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ - | sort -u | head -1) + | sort -u) fi - echo "version=${VERSION:-$LATEST_PYTORCH_VERSION}" >> $GITHUB_OUTPUT + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + LATEST=$(echo "$VERSIONS" | tail -1) + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + echo "Latest version: ${LATEST}" + + - name: Load config for latest version + id: config + run: | + LATEST="${{ steps.versions.outputs.latest-version }}" + CONFIG_FILE=".github/config/image/pytorch-${LATEST}-sagemaker-cuda.yml" + pip install pyyaml -q + python3 -c " + import yaml, os + with open('${CONFIG_FILE}') as f: + cfg = yaml.safe_load(f)['common'] + with open(os.environ['GITHUB_OUTPUT'], 'a') as out: + out.write(f\"framework={cfg.get('framework', '')}\\n\") + out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") + out.write(f\"python-version={cfg.get('python_version', '')}\\n\") + out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") + out.write(f\"os-version={cfg.get('os_version', '')}\\n\") + 
out.write(f\"container-type={cfg.get('job_type', '')}\\n\") + out.write(f\"device-type={cfg.get('device_type', 'gpu')}\\n\") + out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") + out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") + out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") + out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") + " - name: Detect file changes id: changes @@ -103,62 +147,21 @@ jobs: - "test/telemetry/**" # ============================================================ - # Load configuration from YAML + # Build SageMaker images (matrix over detected versions) # ============================================================ - load-config: - needs: [gatekeeper, check-changes] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ".github/config/image/pytorch-${{ needs.check-changes.outputs.pytorch-version }}-sagemaker-cuda.yml" - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r 
'.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Build SageMaker image - # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} @@ -175,9 +178,25 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cuda.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq 
'.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "gpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Fetch cached wheels run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cuda.env mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ @@ -191,18 +210,18 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" + VERSION="${{ matrix.version }}" source docker/pytorch/${VERSION}/versions-cuda.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations - FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-') - FWK_VER_LABEL=$(echo "${{ needs.load-config.outputs.framework-version }}" | tr '.' 
'-') - CUDA_LABEL="${{ needs.load-config.outputs.cuda-version }}" - OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') + FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-') + FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-') + CUDA_LABEL="${{ steps.config.outputs.cuda-version }}" + OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -215,11 +234,11 @@ jobs: --build-arg MAX_JOBS=${MAX_JOBS} \ --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.device.gpu.${CUDA_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type }}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \ --cache-to=type=inline \ 
--cache-from=type=registry,ref=${CI_IMAGE_URI} \ --tag ${CI_IMAGE_URI} \ @@ -229,105 +248,96 @@ jobs: echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Upload built wheels to cache + run: | + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + bash scripts/pytorch/upload_cached_wheels.sh \ + "${{ vars.WHEEL_CACHE_BUCKET }}" \ + "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ + "${{ steps.build-sagemaker.outputs.image-uri }}" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ + "flash-attn:${FLASH_ATTN_VERSION}" \ + "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ + continue-on-error: true + + - name: Run unit tests + if: matrix.version == needs.detect-versions.outputs.latest-version + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests (labels, filesystem, OSS compliance) # ============================================================ sanity-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || 
format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + python-version: ${{ needs.detect-versions.outputs.python-version }} + cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} + os-version: ${{ needs.detect-versions.outputs.os-version }} + customer-type: ${{ needs.detect-versions.outputs.customer-type }} + arch-type: ${{ needs.detect-versions.outputs.arch-type }} + device-type: ${{ needs.detect-versions.outputs.device-type }} + contributor: ${{ needs.detect-versions.outputs.contributor }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() uses: 
./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) # ============================================================ telemetry-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') concurrency: group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} cancel-in-progress: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + 
aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [check-changes, build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - VERSION="${{ needs.check-changes.outputs.pytorch-version }}" - IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.detect-versions.outputs.framework }} + framework-version: ${{ needs.detect-versions.outputs.framework-version }} + container-type: ${{ needs.detect-versions.outputs.container-type }} # ============================================================ # SageMaker integration tests (launch real SM training jobs) # ============================================================ sagemaker-test: - needs: [build-image, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, 
sanity-test, security-test] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} @@ -353,7 +363,7 @@ jobs: - name: Run SageMaker training tests env: PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ needs.build-images.outputs.sagemaker-image-uri }} SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cuda.py -v From 50e73fd5cfd45e1902066db85b5692f6747dbf8f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 20:22:18 +0000 Subject: [PATCH 10/18] refactor: remove config loading from detect-versions, use build-images outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per team feedback: detect-versions should only detect versions and path changes. Config values are now output by the build-images matrix job (which already loads config per version). Downstream test jobs reference build-images outputs instead. Also removes the latest-version guard on unit tests — all matrix versions now run unit tests. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-ec2-cpu.yml | 81 ++++++------------- .github/workflows/pr-pytorch-ec2-cuda.yml | 81 ++++++------------- .../workflows/pr-pytorch-sagemaker-cpu.yml | 75 ++++++----------- .../workflows/pr-pytorch-sagemaker-cuda.yml | 81 ++++++------------- 4 files changed, 101 insertions(+), 217 deletions(-) diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index 6cfb644b268c..a84a6b9b8ad0 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -55,21 +55,9 @@ jobs: cancel-in-progress: true outputs: versions: ${{ steps.versions.outputs.versions }} - latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - framework: ${{ steps.config.outputs.framework }} - framework-version: ${{ steps.config.outputs.framework-version }} - python-version: ${{ steps.config.outputs.python-version }} - cuda-version: ${{ steps.config.outputs.cuda-version }} - os-version: ${{ steps.config.outputs.os-version }} - container-type: ${{ steps.config.outputs.container-type }} - device-type: ${{ steps.config.outputs.device-type }} - arch-type: ${{ steps.config.outputs.arch-type }} - contributor: ${{ steps.config.outputs.contributor }} - customer-type: ${{ steps.config.outputs.customer-type }} - prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -99,35 +87,8 @@ jobs: VERSIONS="$LATEST_PYTORCH_VERSION" fi JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') - LATEST=$(echo "$VERSIONS" | tail -1) echo "versions=${JSON}" >> $GITHUB_OUTPUT - echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT echo "Detected versions: ${JSON}" - echo "Latest version: ${LATEST}" - - - 
name: Load config for latest version - id: config - run: | - LATEST="${{ steps.versions.outputs.latest-version }}" - CONFIG_FILE=".github/config/image/pytorch-${LATEST}-ec2-cpu.yml" - pip install pyyaml -q - python3 -c " - import yaml, os - with open('${CONFIG_FILE}') as f: - cfg = yaml.safe_load(f)['common'] - with open(os.environ['GITHUB_OUTPUT'], 'a') as out: - out.write(f\"framework={cfg.get('framework', '')}\\n\") - out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") - out.write(f\"python-version={cfg.get('python_version', '')}\\n\") - out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") - out.write(f\"os-version={cfg.get('os_version', '')}\\n\") - out.write(f\"container-type={cfg.get('job_type', '')}\\n\") - out.write(f\"device-type={cfg.get('device_type', 'cpu')}\\n\") - out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") - out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") - out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") - out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") - " - name: Detect file changes id: changes @@ -165,6 +126,17 @@ jobs: cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -218,7 +190,6 @@ jobs: echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - 
name: Run unit tests - if: matrix.version == needs.detect-versions.outputs.latest-version run: | VERSION="${{ matrix.version }}" IMAGE="${{ steps.build-runtime.outputs.image-uri }}" @@ -244,16 +215,16 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - python-version: ${{ needs.detect-versions.outputs.python-version }} - cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} - os-version: ${{ needs.detect-versions.outputs.os-version }} - customer-type: ${{ needs.detect-versions.outputs.customer-type }} - arch-type: ${{ needs.detect-versions.outputs.arch-type }} - device-type: ${{ needs.detect-versions.outputs.device-type }} - contributor: ${{ needs.detect-versions.outputs.contributor }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests @@ -266,8 +237,8 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ 
needs.detect-versions.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests @@ -285,6 +256,6 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ needs.build-images.outputs.container-type }} diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index 121c8cef8192..fb1b3b65127a 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -56,21 +56,9 @@ jobs: cancel-in-progress: true outputs: versions: ${{ steps.versions.outputs.versions }} - latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - framework: ${{ steps.config.outputs.framework }} - framework-version: ${{ steps.config.outputs.framework-version }} - python-version: ${{ steps.config.outputs.python-version }} - cuda-version: ${{ steps.config.outputs.cuda-version }} - os-version: ${{ steps.config.outputs.os-version }} - container-type: ${{ steps.config.outputs.container-type }} - device-type: ${{ steps.config.outputs.device-type }} - arch-type: ${{ steps.config.outputs.arch-type }} - contributor: ${{ steps.config.outputs.contributor }} - customer-type: ${{ 
steps.config.outputs.customer-type }} - prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -100,35 +88,8 @@ jobs: VERSIONS="$LATEST_PYTORCH_VERSION" fi JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') - LATEST=$(echo "$VERSIONS" | tail -1) echo "versions=${JSON}" >> $GITHUB_OUTPUT - echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT echo "Detected versions: ${JSON}" - echo "Latest version: ${LATEST}" - - - name: Load config for latest version - id: config - run: | - LATEST="${{ steps.versions.outputs.latest-version }}" - CONFIG_FILE=".github/config/image/pytorch-${LATEST}-ec2-cuda.yml" - pip install pyyaml -q - python3 -c " - import yaml, os - with open('${CONFIG_FILE}') as f: - cfg = yaml.safe_load(f)['common'] - with open(os.environ['GITHUB_OUTPUT'], 'a') as out: - out.write(f\"framework={cfg.get('framework', '')}\\n\") - out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") - out.write(f\"python-version={cfg.get('python_version', '')}\\n\") - out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") - out.write(f\"os-version={cfg.get('os_version', '')}\\n\") - out.write(f\"container-type={cfg.get('job_type', '')}\\n\") - out.write(f\"device-type={cfg.get('device_type', 'gpu')}\\n\") - out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") - out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") - out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") - out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") - " - name: Detect file changes id: changes @@ -166,6 +127,17 @@ jobs: cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ 
steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -250,7 +222,6 @@ jobs: continue-on-error: true - name: Run unit tests - if: matrix.version == needs.detect-versions.outputs.latest-version run: | VERSION="${{ matrix.version }}" IMAGE="${{ steps.build-runtime.outputs.image-uri }}" @@ -276,16 +247,16 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - python-version: ${{ needs.detect-versions.outputs.python-version }} - cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} - os-version: ${{ needs.detect-versions.outputs.os-version }} - customer-type: ${{ needs.detect-versions.outputs.customer-type }} - arch-type: ${{ needs.detect-versions.outputs.arch-type }} - device-type: ${{ needs.detect-versions.outputs.device-type }} - contributor: ${{ needs.detect-versions.outputs.contributor }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + 
contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) @@ -298,8 +269,8 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) @@ -317,9 +288,9 @@ jobs: image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Single-GPU tests diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 1f0039ffbf04..749f194dcca3 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -55,21 +55,9 @@ jobs: cancel-in-progress: true outputs: versions: ${{ steps.versions.outputs.versions }} - latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} 
telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - framework: ${{ steps.config.outputs.framework }} - framework-version: ${{ steps.config.outputs.framework-version }} - python-version: ${{ steps.config.outputs.python-version }} - cuda-version: ${{ steps.config.outputs.cuda-version }} - os-version: ${{ steps.config.outputs.os-version }} - container-type: ${{ steps.config.outputs.container-type }} - device-type: ${{ steps.config.outputs.device-type }} - arch-type: ${{ steps.config.outputs.arch-type }} - contributor: ${{ steps.config.outputs.contributor }} - customer-type: ${{ steps.config.outputs.customer-type }} - prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -99,35 +87,8 @@ jobs: VERSIONS="$LATEST_PYTORCH_VERSION" fi JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') - LATEST=$(echo "$VERSIONS" | tail -1) echo "versions=${JSON}" >> $GITHUB_OUTPUT - echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT echo "Detected versions: ${JSON}" - echo "Latest version: ${LATEST}" - - - name: Load config for latest version - id: config - run: | - LATEST="${{ steps.versions.outputs.latest-version }}" - CONFIG_FILE=".github/config/image/pytorch-${LATEST}-sagemaker-cpu.yml" - pip install pyyaml -q - python3 -c " - import yaml, os - with open('${CONFIG_FILE}') as f: - cfg = yaml.safe_load(f)['common'] - with open(os.environ['GITHUB_OUTPUT'], 'a') as out: - out.write(f\"framework={cfg.get('framework', '')}\\n\") - out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") - out.write(f\"python-version={cfg.get('python_version', '')}\\n\") - out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") - out.write(f\"os-version={cfg.get('os_version', '')}\\n\") - out.write(f\"container-type={cfg.get('job_type', '')}\\n\") - out.write(f\"device-type={cfg.get('device_type', 'cpu')}\\n\") - out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") - 
out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") - out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") - out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") - " - name: Detect file changes id: changes @@ -167,6 +128,17 @@ jobs: cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -232,7 +204,6 @@ jobs: echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Run unit tests - if: matrix.version == needs.detect-versions.outputs.latest-version run: | VERSION="${{ matrix.version }}" IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" @@ -258,16 +229,16 @@ jobs: image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - python-version: ${{ needs.detect-versions.outputs.python-version }} - cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} - os-version: ${{ needs.detect-versions.outputs.os-version }} - customer-type: ${{ needs.detect-versions.outputs.customer-type }} - arch-type: ${{ needs.detect-versions.outputs.arch-type }} - device-type: ${{ 
needs.detect-versions.outputs.device-type }} - contributor: ${{ needs.detect-versions.outputs.contributor }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests @@ -280,8 +251,8 @@ jobs: image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # SageMaker integration tests (CPU -- gloo backend) diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index 397cc26bbcca..d52506fa3bc2 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -55,21 +55,9 @@ jobs: cancel-in-progress: true outputs: versions: ${{ steps.versions.outputs.versions }} - latest-version: ${{ steps.versions.outputs.latest-version }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} 
telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} - framework: ${{ steps.config.outputs.framework }} - framework-version: ${{ steps.config.outputs.framework-version }} - python-version: ${{ steps.config.outputs.python-version }} - cuda-version: ${{ steps.config.outputs.cuda-version }} - os-version: ${{ steps.config.outputs.os-version }} - container-type: ${{ steps.config.outputs.container-type }} - device-type: ${{ steps.config.outputs.device-type }} - arch-type: ${{ steps.config.outputs.arch-type }} - contributor: ${{ steps.config.outputs.contributor }} - customer-type: ${{ steps.config.outputs.customer-type }} - prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -99,35 +87,8 @@ jobs: VERSIONS="$LATEST_PYTORCH_VERSION" fi JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') - LATEST=$(echo "$VERSIONS" | tail -1) echo "versions=${JSON}" >> $GITHUB_OUTPUT - echo "latest-version=${LATEST}" >> $GITHUB_OUTPUT echo "Detected versions: ${JSON}" - echo "Latest version: ${LATEST}" - - - name: Load config for latest version - id: config - run: | - LATEST="${{ steps.versions.outputs.latest-version }}" - CONFIG_FILE=".github/config/image/pytorch-${LATEST}-sagemaker-cuda.yml" - pip install pyyaml -q - python3 -c " - import yaml, os - with open('${CONFIG_FILE}') as f: - cfg = yaml.safe_load(f)['common'] - with open(os.environ['GITHUB_OUTPUT'], 'a') as out: - out.write(f\"framework={cfg.get('framework', '')}\\n\") - out.write(f\"framework-version={cfg.get('framework_version', '')}\\n\") - out.write(f\"python-version={cfg.get('python_version', '')}\\n\") - out.write(f\"cuda-version={cfg.get('cuda_version', '')}\\n\") - out.write(f\"os-version={cfg.get('os_version', '')}\\n\") - out.write(f\"container-type={cfg.get('job_type', '')}\\n\") - out.write(f\"device-type={cfg.get('device_type', 'gpu')}\\n\") - out.write(f\"arch-type={cfg.get('arch_type', 'x86')}\\n\") - 
out.write(f\"contributor={cfg.get('contributor', 'None')}\\n\") - out.write(f\"customer-type={cfg.get('customer_type', '')}\\n\") - out.write(f\"prod-image={cfg.get('prod_image', '')}\\n\") - " - name: Detect file changes id: changes @@ -165,6 +126,17 @@ jobs: cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -262,7 +234,6 @@ jobs: continue-on-error: true - name: Run unit tests - if: matrix.version == needs.detect-versions.outputs.latest-version run: | VERSION="${{ matrix.version }}" IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" @@ -288,16 +259,16 @@ jobs: image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - python-version: ${{ needs.detect-versions.outputs.python-version }} - cuda-version: ${{ needs.detect-versions.outputs.cuda-version }} - os-version: ${{ needs.detect-versions.outputs.os-version }} - customer-type: ${{ needs.detect-versions.outputs.customer-type }} - arch-type: ${{ needs.detect-versions.outputs.arch-type }} - device-type: ${{ needs.detect-versions.outputs.device-type }} - contributor: ${{ 
needs.detect-versions.outputs.contributor }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) @@ -310,8 +281,8 @@ jobs: image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) @@ -329,9 +300,9 @@ jobs: image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.detect-versions.outputs.framework }} - framework-version: ${{ needs.detect-versions.outputs.framework-version }} - container-type: ${{ needs.detect-versions.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ needs.build-images.outputs.container-type }} # 
============================================================ # SageMaker integration tests (launch real SM training jobs) From eb45d7a6026be64d7858def1b769d751896d4190 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 20:40:42 +0000 Subject: [PATCH 11/18] refactor: use ARG DLC_PYTORCH_VERSION in Dockerfiles instead of hardcoded paths Per team feedback: Dockerfiles now use ARG DLC_PYTORCH_VERSION=2.11 for COPY paths instead of hardcoding "docker/pytorch/2.11/..." throughout. Workflows pass --build-arg DLC_PYTORCH_VERSION to ensure the value matches the matrix version. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/autorelease-pytorch-ec2-cpu.yml | 1 + .github/workflows/autorelease-pytorch-ec2-cuda.yml | 1 + .../workflows/autorelease-pytorch-sagemaker-cpu.yml | 1 + .../workflows/autorelease-pytorch-sagemaker-cuda.yml | 1 + .github/workflows/pr-pytorch-ec2-cpu.yml | 1 + .github/workflows/pr-pytorch-ec2-cuda.yml | 1 + .github/workflows/pr-pytorch-sagemaker-cpu.yml | 1 + .github/workflows/pr-pytorch-sagemaker-cuda.yml | 1 + docker/pytorch/2.11/Dockerfile.cpu | 5 +++-- docker/pytorch/2.11/Dockerfile.cuda | 11 ++++++----- 10 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/workflows/autorelease-pytorch-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-ec2-cpu.yml index 3c3f809c5374..028ba63c9cae 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cpu.yml @@ -112,6 +112,7 @@ jobs: CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ 
--build-arg TORCH_VERSION=${TORCH_VERSION} \ diff --git a/.github/workflows/autorelease-pytorch-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-ec2-cuda.yml index 4bfa283808e7..730b3875609f 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cuda.yml @@ -135,6 +135,7 @@ jobs: CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml index 9b052493909a..e733adea4679 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml @@ -117,6 +117,7 @@ jobs: OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml index 79cfb42a90d9..691236d8a0ac 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml @@ -132,6 +132,7 @@ jobs: OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' 
'-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index a84a6b9b8ad0..8562aa98605b 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -174,6 +174,7 @@ jobs: CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index fb1b3b65127a..383e59ebdea9 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -188,6 +188,7 @@ jobs: CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 749f194dcca3..5cf97e316068 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -181,6 +181,7 @@ jobs: OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' 
'-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index d52506fa3bc2..d51d5c5137cf 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -193,6 +193,7 @@ jobs: OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ diff --git a/docker/pytorch/2.11/Dockerfile.cpu b/docker/pytorch/2.11/Dockerfile.cpu index 409fbcfd69e7..ffce19b7eb3e 100644 --- a/docker/pytorch/2.11/Dockerfile.cpu +++ b/docker/pytorch/2.11/Dockerfile.cpu @@ -12,6 +12,7 @@ # ============================================================================ # ── Global ARGs (available to all stages) ─────────────────────────────────── +ARG DLC_PYTORCH_VERSION=2.11 ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 ARG PYTHON_VERSION=3.12 @@ -35,7 +36,7 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/2.11/cpu/pyproject.toml docker/pytorch/2.11/cpu/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project @@ -160,7 +161,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cpu/pyproject.toml [project.optional-dependencies.sagemaker]) COPY 
--from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/2.11/cpu/pyproject.toml docker/pytorch/2.11/cpu/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* diff --git a/docker/pytorch/2.11/Dockerfile.cuda b/docker/pytorch/2.11/Dockerfile.cuda index ce69e3874fb6..2f5c1e9d058e 100644 --- a/docker/pytorch/2.11/Dockerfile.cuda +++ b/docker/pytorch/2.11/Dockerfile.cuda @@ -16,6 +16,7 @@ # ============================================================================ # ── Global ARGs (available to all stages) ─────────────────────────────────── +ARG DLC_PYTORCH_VERSION=2.11 ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 ARG CUDA_VERSION=13.0.2 @@ -46,14 +47,14 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/2.11/cuda/pyproject.toml docker/pytorch/2.11/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project # transformer-engine requires torch + cudnn.h at build time; point it to the # cudnn headers shipped inside the nvidia-cudnn pip package. 
ARG TRANSFORMER_ENGINE_VERSION -COPY docker/pytorch/2.11/wheel[s]/ /tmp/wheels/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0])") && \ NCCL_HOME=$(python -c "import nvidia.nccl; print(nvidia.nccl.__path__[0])") && \ cp ${CUDNN_HOME}/include/*.h /usr/local/cuda/include/ && \ @@ -81,8 +82,8 @@ ARG MAX_JOBS # If a cached wheel exists in the build context, install it; otherwise build from source. # When building from source, the wheel is saved to /tmp/built_wheels/ for later S3 upload. -# docker/pytorch/2.11/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. -COPY docker/pytorch/2.11/cuda/pyproject.toml docker/pytorch/2.11/wheel[s]/ /tmp/wheels/ +# docker/pytorch/${DLC_PYTORCH_VERSION}/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/ RUN --mount=type=cache,target=/root/.cache/uv \ mkdir -p /tmp/built_wheels && \ WHL=$(find /tmp/wheels -name "flash*attn*.whl" 2>/dev/null | head -1) && \ @@ -244,7 +245,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cuda/pyproject.toml [project.optional-dependencies.sagemaker]) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/2.11/cuda/pyproject.toml docker/pytorch/2.11/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* From 1356f3390cdfe4e88eaa1bc0787fb8a6788fa604 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 20:58:20 +0000 Subject: [PATCH 12/18] fix: 
tighten PR workflow path triggers to avoid cross-triggering CUDA workflows only trigger on CUDA-related paths: docker/pytorch/*/Dockerfile.cuda, docker/pytorch/*/cuda/**, versions-cuda.env CPU workflows only trigger on CPU-related paths: docker/pytorch/*/Dockerfile.cpu, docker/pytorch/*/cpu/**, versions-cpu.env Previously all 4 workflows used docker/pytorch/** which meant a CUDA Dockerfile change triggered the CPU workflow (and vice versa). Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-ec2-cpu.yml | 4 +++- .github/workflows/pr-pytorch-ec2-cuda.yml | 4 +++- .github/workflows/pr-pytorch-sagemaker-cpu.yml | 4 +++- .github/workflows/pr-pytorch-sagemaker-cuda.yml | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index 8562aa98605b..cc91bd1cf132 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -7,7 +7,9 @@ on: paths: - ".github/config/image/pytorch-*-ec2-cpu.yml" - ".github/workflows/pr-pytorch-ec2-cpu.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" + - "docker/pytorch/*/versions-cpu.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index 383e59ebdea9..6a21ee981963 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -7,7 +7,9 @@ on: paths: - ".github/config/image/pytorch-*-ec2-cuda.yml" - ".github/workflows/pr-pytorch-ec2-cuda.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" + - "docker/pytorch/*/versions-cuda.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 
5cf97e316068..1f3cd46b47d0 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -7,7 +7,9 @@ on: paths: - ".github/config/image/pytorch-*-sagemaker-cpu.yml" - ".github/workflows/pr-pytorch-sagemaker-cpu.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" + - "docker/pytorch/*/versions-cpu.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index d51d5c5137cf..2a01a221703c 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -7,7 +7,9 @@ on: paths: - ".github/config/image/pytorch-*-sagemaker-cuda.yml" - ".github/workflows/pr-pytorch-sagemaker-cuda.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" + - "docker/pytorch/*/versions-cuda.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" From 4e22f4b37de0dfc7b37e7ff56cd1582460530660 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 21:00:11 +0000 Subject: [PATCH 13/18] fix: re-declare ARG DLC_PYTORCH_VERSION in each Dockerfile stage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docker requires ARGs to be re-declared after each FROM — global ARGs are only available in FROM lines, not in stage instructions like COPY. Without the re-declaration, DLC_PYTORCH_VERSION resolves to empty string causing "not found" errors. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/pytorch/2.11/Dockerfile.cpu | 2 ++ docker/pytorch/2.11/Dockerfile.cuda | 3 +++ 2 files changed, 5 insertions(+) diff --git a/docker/pytorch/2.11/Dockerfile.cpu b/docker/pytorch/2.11/Dockerfile.cpu index ffce19b7eb3e..aaf705935925 100644 --- a/docker/pytorch/2.11/Dockerfile.cpu +++ b/docker/pytorch/2.11/Dockerfile.cpu @@ -22,6 +22,7 @@ ARG OPEN_MPI_VERSION=4.1.7 # ── Stage: builder-base (shared Python venv with lockfile deps) ───────────── FROM amazonlinux:2023 AS builder-base +ARG DLC_PYTORCH_VERSION ARG PYTHON_VERSION RUN dnf install -y --allowerasing \ @@ -151,6 +152,7 @@ CMD ["bash"] # ── Stage: sagemaker (SageMaker Training) ──────────────────────────────────── FROM runtime-base AS sagemaker +ARG DLC_PYTORCH_VERSION ARG TORCH_VERSION # SageMaker BYOC paths diff --git a/docker/pytorch/2.11/Dockerfile.cuda b/docker/pytorch/2.11/Dockerfile.cuda index 2f5c1e9d058e..ba34aa7247b0 100644 --- a/docker/pytorch/2.11/Dockerfile.cuda +++ b/docker/pytorch/2.11/Dockerfile.cuda @@ -33,6 +33,7 @@ ARG MAX_JOBS=8 # ── Stage: builder-base (shared Python venv with lockfile deps) ───────────── FROM nvidia/cuda:${CUDA_VERSION}-devel-amzn2023 AS builder-base +ARG DLC_PYTORCH_VERSION ARG PYTHON_VERSION RUN dnf install -y --allowerasing \ @@ -77,6 +78,7 @@ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0]) # ── Stage: builder-flash-attn (parallel — needs torch only) ───────────────── FROM builder-base AS builder-flash-attn +ARG DLC_PYTORCH_VERSION ARG FLASH_ATTN_VERSION ARG MAX_JOBS @@ -235,6 +237,7 @@ CMD ["bash"] # ── Stage: sagemaker (SageMaker Training) ──────────────────────────────────── FROM runtime-base AS sagemaker +ARG DLC_PYTORCH_VERSION ARG TORCH_VERSION # SageMaker BYOC paths From edd4c39d0f33c37dd2879802246e168a29827f6e Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 21:37:09 +0000 Subject: [PATCH 14/18] debug: log config file content to diagnose empty matrix outputs --- 
.github/workflows/pr-pytorch-ec2-cuda.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index 6a21ee981963..c9a0fc63e477 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -157,6 +157,9 @@ jobs: id: config run: | CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cuda.yml" + echo "Loading config from: ${CONFIG_FILE}" + cat "${CONFIG_FILE}" + echo "---" echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT From c2ce384003da255a18be2435b6f6803074c8a398 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 21:47:55 +0000 Subject: [PATCH 15/18] feat: add matrix to all downstream test jobs for per-version testing All test jobs (sanity, security, telemetry, single-gpu, efa, sagemaker) now matrix over detected versions. Each version gets its own test run with a constructed image-uri based on the version-specific CI tag. GitHub Actions supports strategy.matrix on reusable workflow calls. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-ec2-cpu.yml | 21 ++++++++--- .github/workflows/pr-pytorch-ec2-cuda.yml | 37 ++++++++++++++----- .../workflows/pr-pytorch-sagemaker-cpu.yml | 20 ++++++++-- .../workflows/pr-pytorch-sagemaker-cuda.yml | 29 +++++++++++---- 4 files changed, 79 insertions(+), 28 deletions(-) diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index cc91bd1cf132..8861e2700216 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -213,9 +213,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -235,9 +239,13 @@ jobs: security-test: needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -251,12 +259,13 @@ 
jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index c9a0fc63e477..b80eed57e8a1 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -248,9 +248,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -270,9 +274,13 @@ jobs: security-test: needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ 
fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -286,12 +294,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -304,12 +313,16 @@ jobs: single-gpu-test: needs: [detect-versions, build-images, sanity-test, security-test] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-g6xl-runner buildspec-override:true concurrency: - group: ${{ github.workflow }}-single-gpu-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-single-gpu-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code 
@@ -323,7 +336,7 @@ jobs: - name: Run single-GPU tests run: | - IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" + IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ @@ -338,11 +351,15 @@ jobs: # EFA integration test (2x p4d.24xlarge, NCCL over EFA) # ============================================================ efa-test: - needs: [build-images, sanity-test, security-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-efa-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 1f3cd46b47d0..c0b8e06b1408 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -227,9 +227,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, 
vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -249,9 +253,13 @@ jobs: security-test: needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -267,8 +275,12 @@ jobs: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code @@ -287,7 +299,7 @@ jobs: - name: Run SageMaker CPU training tests env: PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-images.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cpu.py -v diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml 
b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index 2a01a221703c..627fe25733a5 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -257,9 +257,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -279,9 +283,13 @@ jobs: security-test: needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -295,12 +303,13 @@ jobs: if: | always() && !failure() && !cancelled() && (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + strategy: + matrix: + version: ${{ 
fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-images.outputs.sagemaker-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} framework: ${{ needs.build-images.outputs.framework }} @@ -317,8 +326,12 @@ jobs: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code @@ -337,7 +350,7 @@ jobs: - name: Run SageMaker training tests env: PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-images.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cuda.py -v From 40c511a89402fa6991a2ce3387309cdadcd69370 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 21:53:46 +0000 Subject: [PATCH 16/18] fix: install yq before config parsing in build-images job The load-config action installs yq but our inlined config step didn't. yq is not available by default on CodeBuild runners, causing all config outputs (framework, container-type, etc.) to be empty. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pr-pytorch-ec2-cpu.yml | 7 +++++++ .github/workflows/pr-pytorch-ec2-cuda.yml | 7 +++++++ .github/workflows/pr-pytorch-sagemaker-cpu.yml | 7 +++++++ .github/workflows/pr-pytorch-sagemaker-cuda.yml | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index 8861e2700216..4fba570ea808 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -152,6 +152,13 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + - name: Load and parse config id: config run: | diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index b80eed57e8a1..4d0d3925ce74 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -153,6 +153,13 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + - name: Load and parse config id: config run: | diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index c0b8e06b1408..7bb491b2510a 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -154,6 +154,13 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! 
command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + - name: Load and parse config id: config run: | diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index 627fe25733a5..2bcafa1a65d7 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -152,6 +152,13 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + - name: Load and parse config id: config run: | From b0dccfaf070c99770a97f6bf9470129b0be9bec9 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 22:50:20 +0000 Subject: [PATCH 17/18] fix: add defensive cleanup for stale EFA test instances and EIPs When a workflow is cancelled mid-EFA-test, the finally block may not execute, leaking p4d.24xlarge instances and EIPs. This adds a cleanup_stale_efa_instances() call at the start of each test run that terminates instances tagged "CI-CD EFA efa-test" older than 4 hours and releases orphaned EIPs. Prevents: AddressLimitExceeded errors from accumulated leaked EIPs. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/scripts/efa/ec2_helpers.py | 41 ++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py index 0cba98608ab5..85de0136eaf5 100644 --- a/.github/scripts/efa/ec2_helpers.py +++ b/.github/scripts/efa/ec2_helpers.py @@ -386,6 +386,44 @@ def release_eip(aws_session, alloc_id): LOGGER.warning(f"Failed to release EIP {alloc_id}: {e}") +def cleanup_stale_efa_instances(aws_session, max_age_hours=4): + """Terminate EFA test instances older than max_age_hours and release their EIPs. + + Prevents resource leaks from cancelled/crashed workflow runs that didn't reach cleanup. + """ + from datetime import datetime, timezone + + cutoff = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600) + + try: + resp = aws_session.ec2.describe_instances( + Filters=[ + {"Name": "tag:Name", "Values": ["CI-CD EFA efa-test"]}, + {"Name": "instance-state-name", "Values": ["running", "stopped"]}, + ] + ) + for reservation in resp.get("Reservations", []): + for instance in reservation.get("Instances", []): + launch_time = instance["LaunchTime"].timestamp() + if launch_time < cutoff: + instance_id = instance["InstanceId"] + LOGGER.warning( + f"Terminating stale EFA instance {instance_id} (launched {instance['LaunchTime']})" + ) + aws_session.ec2.terminate_instances(InstanceIds=[instance_id]) + + # Release unassociated EIPs (leaked from terminated instances) + addresses = aws_session.ec2.describe_addresses().get("Addresses", []) + for addr in addresses: + if not addr.get("AssociationId") and addr.get("AllocationId"): + LOGGER.warning( + f"Releasing orphaned EIP {addr['AllocationId']} ({addr.get('PublicIp', '')})" + ) + release_eip(aws_session, addr["AllocationId"]) + except Exception as e: + LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}") + + @contextmanager def efa_instances(image_uri, instance_type="p4d.24xlarge", 
region=DEFAULT_REGION): """Context manager that launches 2 EFA instances, sets up containers + SSH, and cleans up. @@ -396,6 +434,9 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION ami_id = aws_session.get_latest_ami() sg_id = get_efa_security_group_id(aws_session) + # Clean up leaked resources from previous cancelled/crashed runs + cleanup_stale_efa_instances(aws_session) + key_name = None key_path = None runner_ip = None From 5c8953fcfa85aa3e31914efbbf7a5f6fda8932f7 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 19 May 2026 23:58:14 +0000 Subject: [PATCH 18/18] debug: add diagnostic logging to EFA NCCL test Logs LD_LIBRARY_PATH, ofi-nccl lib presence, all_reduce_perf binary, fi_info output, NCCL lib path, and full NCCL log on failure. The fi_info output is captured before being truncated to 5 lines: piping fi_info straight into head would report head's exit status, so a missing or failing fi_info would never trigger the "fi_info failed" message. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/efa/scripts/nccl_allreduce.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index bd185bfb8169..5b01a996cc33 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,6 +52,15 @@ check_efa_nccl_all_reduce_performance(){ fi } +echo "=== Debug: Environment and library info ===" +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +echo "CUDA_HOME=$CUDA_HOME" +ls -la /opt/amazon/ofi-nccl/lib64/ 2>/dev/null || echo "/opt/amazon/ofi-nccl/lib64/ NOT FOUND" +ls -la /usr/local/bin/all_reduce_perf 2>/dev/null || echo "all_reduce_perf NOT FOUND" +if FI_INFO_OUT=$(fi_info -p efa 2>&1); then head -5 <<<"${FI_INFO_OUT}"; else echo "fi_info failed: ${FI_INFO_OUT}"; fi +echo "NCCL lib: $(ls /opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib/libnccl.so* 2>/dev/null || echo 'not found')" +echo "=== End debug ===" + echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ @@ -63,7 +72,10 @@
RETURN_VAL=${PIPESTATUS[0]} if [ ${RETURN_VAL} -eq 0 ]; then echo "check_efa_nccl_all_reduce passed" else - echo "check_efa_nccl_all_reduce failed" + echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})" + echo "=== Full NCCL log ===" + cat "${TRAINING_LOG}" + echo "=== End NCCL log ===" fi validate_all_reduce_performance_logs