diff --git a/.github/config/image/pytorch-ec2-cpu.yml b/.github/config/image/pytorch-2.11-ec2-cpu.yml similarity index 100% rename from .github/config/image/pytorch-ec2-cpu.yml rename to .github/config/image/pytorch-2.11-ec2-cpu.yml diff --git a/.github/config/image/pytorch-ec2-cuda.yml b/.github/config/image/pytorch-2.11-ec2-cuda.yml similarity index 100% rename from .github/config/image/pytorch-ec2-cuda.yml rename to .github/config/image/pytorch-2.11-ec2-cuda.yml diff --git a/.github/config/image/pytorch-sagemaker-cpu.yml b/.github/config/image/pytorch-2.11-sagemaker-cpu.yml similarity index 100% rename from .github/config/image/pytorch-sagemaker-cpu.yml rename to .github/config/image/pytorch-2.11-sagemaker-cpu.yml diff --git a/.github/config/image/pytorch-sagemaker-cuda.yml b/.github/config/image/pytorch-2.11-sagemaker-cuda.yml similarity index 100% rename from .github/config/image/pytorch-sagemaker-cuda.yml rename to .github/config/image/pytorch-2.11-sagemaker-cuda.yml diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py index 0cba98608ab5..85de0136eaf5 100644 --- a/.github/scripts/efa/ec2_helpers.py +++ b/.github/scripts/efa/ec2_helpers.py @@ -386,6 +386,44 @@ def release_eip(aws_session, alloc_id): LOGGER.warning(f"Failed to release EIP {alloc_id}: {e}") +def cleanup_stale_efa_instances(aws_session, max_age_hours=4): + """Terminate EFA test instances older than max_age_hours and release their EIPs. + + Prevents resource leaks from cancelled/crashed workflow runs that didn't reach cleanup. 
+ """ + from datetime import datetime, timezone + + cutoff = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600) + + try: + resp = aws_session.ec2.describe_instances( + Filters=[ + {"Name": "tag:Name", "Values": ["CI-CD EFA efa-test"]}, + {"Name": "instance-state-name", "Values": ["running", "stopped"]}, + ] + ) + for reservation in resp.get("Reservations", []): + for instance in reservation.get("Instances", []): + launch_time = instance["LaunchTime"].timestamp() + if launch_time < cutoff: + instance_id = instance["InstanceId"] + LOGGER.warning( + f"Terminating stale EFA instance {instance_id} (launched {instance['LaunchTime']})" + ) + aws_session.ec2.terminate_instances(InstanceIds=[instance_id]) + + # Release unassociated EIPs (leaked from terminated instances) + addresses = aws_session.ec2.describe_addresses().get("Addresses", []) + for addr in addresses: + if not addr.get("AssociationId") and addr.get("AllocationId"): + LOGGER.warning( + f"Releasing orphaned EIP {addr['AllocationId']} ({addr.get('PublicIp', '')})" + ) + release_eip(aws_session, addr["AllocationId"]) + except Exception as e: + LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}") + + @contextmanager def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION): """Context manager that launches 2 EFA instances, sets up containers + SSH, and cleans up. 
@@ -396,6 +434,9 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION ami_id = aws_session.get_latest_ami() sg_id = get_efa_security_group_id(aws_session) + # Clean up leaked resources from previous cancelled/crashed runs + cleanup_stale_efa_instances(aws_session) + key_name = None key_path = None runner_ip = None diff --git a/.github/workflows/autorelease-pytorch-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-ec2-cpu.yml index cd41b1e713b8..028ba63c9cae 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cpu.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cpu.yml @@ -2,9 +2,14 @@ name: Auto Release - PyTorch EC2 CPU on: schedule: - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cpu.yml)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,29 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml" jobs: + determine-config: + runs-on: ubuntu-latest + outputs: + config-file: ${{ steps.config.outputs.config-file }} + steps: + - name: Determine config file + id: config + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cpu.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cpu.yml" >> $GITHUB_OUTPUT ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + load-config: + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -41,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config 
with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -83,10 +107,12 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cpu.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -98,7 +124,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -142,7 +168,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -159,10 +185,11 @@ jobs: - name: Run unit tests run: | + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-ec2-cuda.yml index f17a7fd13cb3..730b3875609f 100644 --- a/.github/workflows/autorelease-pytorch-ec2-cuda.yml +++ b/.github/workflows/autorelease-pytorch-ec2-cuda.yml @@ -2,10 +2,14 @@ name: Auto Release - PyTorch EC2 CUDA on: schedule: - # Runs at 9AM/10AM PST/PDT on Mondays and Wednesdays - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cuda.yml)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -16,10 +20,29 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml" jobs: + determine-config: + runs-on: ubuntu-latest + outputs: + config-file: ${{ steps.config.outputs.config-file }} + steps: + - name: Determine config file + id: config + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cuda.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cuda.yml" >> $GITHUB_OUTPUT ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + load-config: + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ 
-42,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -85,17 +108,19 @@ jobs: - name: Source versions id: versions run: | - source docker/pytorch/versions-cuda.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -105,10 +130,12 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cuda.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}" docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ @@ -125,17 +152,19 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true @@ -180,7 +209,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -197,10 +226,11 @@ jobs: - name: Run unit tests run: | + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml index 95c8780f3277..e733adea4679 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml @@ -2,9 +2,14 @@ name: Auto Release - PyTorch SageMaker CPU on: schedule: - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cpu.yml)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,29 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml" jobs: + determine-config: + runs-on: ubuntu-latest + outputs: + config-file: ${{ steps.config.outputs.config-file }} + steps: + - name: Determine config file + id: config + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cpu.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cpu.yml" >> $GITHUB_OUTPUT ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + load-config: + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ -41,7 +65,7 
@@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -83,7 +107,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cpu.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cpu.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-cpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -92,6 +117,7 @@ jobs: OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -110,7 +136,7 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT @@ -154,7 +180,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -171,10 +197,11 @@ jobs: - name: Run unit tests run: | + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml index c40d70c44bfd..691236d8a0ac 100644 --- a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml @@ -2,9 +2,14 @@ name: Auto Release - PyTorch SageMaker CUDA on: schedule: - - cron: '00 17 * * 1,3' - + - cron: '00 17 * * 1,3' # PyTorch 2.11 — Mon/Wed 9:00 AM PST + # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future) workflow_dispatch: + inputs: + config-file: + description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cuda.yml)" + required: true + type: string concurrency: group: ${{ github.workflow }} @@ -15,10 +20,29 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cuda.yml" jobs: + determine-config: + runs-on: ubuntu-latest + outputs: + config-file: ${{ steps.config.outputs.config-file }} + steps: + - name: Determine config file + id: config + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "config-file=${{ inputs.config-file }}" >> $GITHUB_OUTPUT + else + CRON="${{ github.event.schedule }}" + case "$CRON" in + "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cuda.yml" >> $GITHUB_OUTPUT ;; + # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cuda.yml" >> $GITHUB_OUTPUT ;; + *) echo "::error::Unknown cron: $CRON"; exit 1 ;; + esac + fi + load-config: + needs: [determine-config] runs-on: ubuntu-latest outputs: config: ${{ steps.load.outputs.config }} @@ 
-41,7 +65,7 @@ jobs: id: load uses: ./.github/actions/load-config with: - config-file: ${{ env.CONFIG_FILE }} + config-file: ${{ needs.determine-config.outputs.config-file }} - name: Parse configuration id: parse @@ -83,10 +107,11 @@ jobs: - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -96,7 +121,8 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cuda.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}" # Derive label values to match check_labels.py expectations @@ -106,6 +132,7 @@ jobs: OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ + --build-arg DLC_PYTORCH_VERSION=${VERSION} \ --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ @@ -129,17 +156,19 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2) + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-sagemaker.outputs.image-uri }}" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true @@ -184,7 +213,7 @@ jobs: container-type: ${{ needs.load-config.outputs.container-type }} unit-test: - needs: [build-image] + needs: [load-config, build-image] runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner @@ -201,10 +230,11 @@ jobs: - name: Run unit tests run: | + VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. 
-f1,2) IMAGE="${{ needs.build-image.outputs.ci-image }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ -v $(pwd):/workdir --workdir /workdir \ ${IMAGE} -c 'sleep infinity') docker exec ${CONTAINER_ID} pip install pytest -q diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml index a6264f2df988..4fba570ea808 100644 --- a/.github/workflows/pr-pytorch-ec2-cpu.yml +++ b/.github/workflows/pr-pytorch-ec2-cpu.yml @@ -5,9 +5,11 @@ on: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-ec2-cpu.yml" + - ".github/config/image/pytorch-*-ec2-cpu.yml" - ".github/workflows/pr-pytorch-ec2-cpu.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" + - "docker/pytorch/*/versions-cpu.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +24,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -44,61 +46,17 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Load configuration from YAML + # Detect all changed PyTorch versions + file changes # ============================================================ - load-config: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - 
arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ${{ env.CONFIG_FILE }} - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} build-change: ${{ 
steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} @@ -116,15 +74,33 @@ jobs: with: extra_args: --all-files + - name: Detect PyTorch versions + id: versions + run: | + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u) + fi + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + - name: Detect file changes id: changes uses: dorny/paths-filter@v4 with: filters: | build-change: - - ".github/config/image/pytorch-ec2-cpu.yml" - - "docker/pytorch/Dockerfile.cpu" - - "docker/pytorch/cpu/**" + - ".github/config/image/pytorch-*-ec2-cpu.yml" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/configure_ssh.sh" - "scripts/telemetry/bash_telemetry.sh.template" @@ -134,20 +110,35 @@ jobs: - "test/telemetry/**" # ============================================================ - # Build CPU runtime image + # Build CPU images (matrix over detected versions) # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ 
github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -161,14 +152,39 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! 
command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cpu.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "cpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cpu.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-pr-${{ github.event.pull_request.number }}" + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cpu.env + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ 
--build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \ @@ -179,99 +195,86 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Run unit tests + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-runtime.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests # ============================================================ sanity-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', 
vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', 
vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests # ============================================================ telemetry-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ 
format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - IMAGE="${{ needs.build-image.outputs.runtime-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ needs.build-images.outputs.container-type }} diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml index 09d07e1b68ae..4d0d3925ce74 100644 --- a/.github/workflows/pr-pytorch-ec2-cuda.yml +++ b/.github/workflows/pr-pytorch-ec2-cuda.yml @@ -5,9 +5,11 @@ on: 
branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-ec2-cuda.yml" + - ".github/config/image/pytorch-*-ec2-cuda.yml" - ".github/workflows/pr-pytorch-ec2-cuda.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" + - "docker/pytorch/*/versions-cuda.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -23,9 +25,7 @@ permissions: env: FORCE_COLOR: "1" - - # Config file path - CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -47,61 +47,17 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Load configuration from YAML + # Detect all changed PyTorch versions + file changes # ============================================================ - load-config: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ${{ env.CONFIG_FILE }} - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> 
$GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} @@ -119,15 +75,33 @@ jobs: with: extra_args: --all-files + - name: Detect PyTorch versions + id: versions + run: | + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u) + fi + if [ -z "$VERSIONS" ]; 
then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + - name: Detect file changes id: changes uses: dorny/paths-filter@v4 with: filters: | build-change: - - ".github/config/image/pytorch-ec2-cuda.yml" - - "docker/pytorch/Dockerfile.cuda" - - "docker/pytorch/cuda/**" + - ".github/config/image/pytorch-*-ec2-cuda.yml" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/*" - "scripts/telemetry/bash_telemetry.sh.template" @@ -137,20 +111,35 @@ jobs: - "test/telemetry/**" # ============================================================ - # Build runtime image + # Build images (matrix over detected versions) # ============================================================ build-images: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ 
steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -164,20 +153,39 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - - name: Source versions - id: versions + - name: Install yq + run: | + if ! command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + - name: Load and parse config + id: config run: | - source docker/pytorch/versions-cuda.env - echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT - echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT - echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cuda.yml" + echo "Loading config from: ${CONFIG_FILE}" + cat "${CONFIG_FILE}" + echo "---" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "gpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' 
$CONFIG_FILE)" >> $GITHUB_OUTPUT - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -187,11 +195,13 @@ jobs: - name: Build runtime image id: build-runtime run: | - source docker/pytorch/versions-cuda.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-pr-${{ github.event.pull_request.number }}" + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -207,126 +217,119 @@ jobs: --tag ${CI_IMAGE_URI} \ --push \ --target runtime \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . 
echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT - name: Upload built wheels to cache run: | - source docker/pytorch/versions-cuda.env + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env bash scripts/pytorch/upload_cached_wheels.sh \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "${{ steps.build-runtime.outputs.image-uri }}" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ "flash-attn:${FLASH_ATTN_VERSION}" \ "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ continue-on-error: true + - name: Run unit tests + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-runtime.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests (labels, filesystem, OSS compliance) # ============================================================ sanity-test: - needs: [check-changes, build-images, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, 
vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) # ============================================================ security-test: - needs: [build-images, load-config] + needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: 
${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) # ============================================================ telemetry-test: - needs: [check-changes, build-images, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || 
format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests (CPU-only, no GPU needed) - # ============================================================ - unit-test: - needs: [build-images] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ 
needs.build-images.outputs.container-type }} # ============================================================ # Single-GPU tests # ============================================================ single-gpu-test: - needs: [build-images, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-g6xl-runner buildspec-override:true concurrency: - group: ${{ github.workflow }}-single-gpu-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-single-gpu-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code @@ -340,7 +343,7 @@ jobs: - name: Run single-GPU tests run: | - IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}" + IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" docker pull ${IMAGE} CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \ --entrypoint /bin/bash \ @@ -355,11 +358,15 @@ jobs: # EFA integration test (2x p4d.24xlarge, NCCL over EFA) # ============================================================ efa-test: - needs: [build-images, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-efa-tests.yml with: - image-uri: ${{ needs.build-images.outputs.runtime-image-uri }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} 
aws-region: ${{ vars.AWS_REGION }} @@ -407,8 +414,6 @@ jobs: # # docker kill ${CONTAINER_ID} - # ============================================================ - # Multi-node tests (need 2+ containers on Docker network) # ============================================================ # Multi-node tests (need 2+ containers on Docker network) # TODO: Re-enable when GPU capacity is available diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml index 5cbb926d7917..7bb491b2510a 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml @@ -5,9 +5,11 @@ on: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-sagemaker-cpu.yml" + - ".github/config/image/pytorch-*-sagemaker-cpu.yml" - ".github/workflows/pr-pytorch-sagemaker-cpu.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" + - "docker/pytorch/*/versions-cpu.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,7 +24,7 @@ permissions: env: FORCE_COLOR: "1" - CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -44,61 +46,17 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Load configuration from YAML + # Detect all changed PyTorch versions + file changes # ============================================================ - load-config: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ 
steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ${{ env.CONFIG_FILE }} - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: 
true outputs: + versions: ${{ steps.versions.outputs.versions }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} @@ -116,15 +74,33 @@ jobs: with: extra_args: --all-files + - name: Detect PyTorch versions + id: versions + run: | + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u) + fi + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + - name: Detect file changes id: changes uses: dorny/paths-filter@v4 with: filters: | build-change: - - ".github/config/image/pytorch-sagemaker-cpu.yml" - - "docker/pytorch/Dockerfile.cpu" - - "docker/pytorch/cpu/**" + - ".github/config/image/pytorch-*-sagemaker-cpu.yml" + - "docker/pytorch/*/Dockerfile.cpu" + - "docker/pytorch/*/cpu/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/configure_ssh.sh" - "scripts/pytorch/changehostname.c" @@ -136,20 +112,35 @@ jobs: - "test/telemetry/**" # ============================================================ - # Build CPU SageMaker image + # Build CPU SageMaker images (matrix over detected versions) # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ 
fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -163,19 +154,44 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! 
command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cpu.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "cpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cpu.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-pr-${{ github.event.pull_request.number }}" + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cpu.env + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations - FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-') - FWK_VER_LABEL=$(echo "${{ 
needs.load-config.outputs.framework-version }}" | tr '.' '-') - OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') + FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-') + FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-') + OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \ @@ -183,105 +199,95 @@ jobs: --build-arg OPEN_MPI_VERSION=${OPEN_MPI_VERSION} \ --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.device.cpu=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type }}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \ --cache-to=type=inline \ 
--cache-from=type=registry,ref=${CI_IMAGE_URI} \ --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cpu . + -f docker/pytorch/${VERSION}/Dockerfile.cpu . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Run unit tests + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests # ============================================================ sanity-test: - needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ 
vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: ${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ 
needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ - # SageMaker integration tests (CPU — gloo backend) + # SageMaker integration tests (CPU -- gloo backend) # ============================================================ sagemaker-test: - needs: 
[build-image, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code @@ -300,7 +306,7 @@ jobs: - name: Run SageMaker CPU training tests env: PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cpu.py -v diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml index e8a6249d4559..2bcafa1a65d7 100644 --- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml +++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml @@ -5,9 +5,11 @@ on: branches: [main] types: [opened, reopened, synchronize] paths: - - ".github/config/image/pytorch-sagemaker-cuda.yml" + - ".github/config/image/pytorch-*-sagemaker-cuda.yml" - ".github/workflows/pr-pytorch-sagemaker-cuda.yml" - - "docker/pytorch/**" + - "docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" + - "docker/pytorch/*/versions-cuda.env" - "scripts/common/**" - "scripts/pytorch/**" - "scripts/telemetry/**" @@ -22,9 +24,7 @@ permissions: env: FORCE_COLOR: "1" - - # Config file path - CONFIG_FILE: 
".github/config/image/pytorch-sagemaker-cuda.yml" + LATEST_PYTORCH_VERSION: "2.11" jobs: # ============================================================ @@ -46,61 +46,17 @@ jobs: uses: ./.github/actions/pr-permission-gate # ============================================================ - # Load configuration from YAML + # Detect all changed PyTorch versions + file changes # ============================================================ - load-config: - needs: [gatekeeper] - if: success() - runs-on: ubuntu-latest - outputs: - framework: ${{ steps.parse.outputs.framework }} - framework-version: ${{ steps.parse.outputs.framework-version }} - python-version: ${{ steps.parse.outputs.python-version }} - cuda-version: ${{ steps.parse.outputs.cuda-version }} - os-version: ${{ steps.parse.outputs.os-version }} - container-type: ${{ steps.parse.outputs.container-type }} - device-type: ${{ steps.parse.outputs.device-type }} - arch-type: ${{ steps.parse.outputs.arch-type }} - contributor: ${{ steps.parse.outputs.contributor }} - customer-type: ${{ steps.parse.outputs.customer-type }} - prod-image: ${{ steps.parse.outputs.prod-image }} - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Load configuration - id: load - uses: ./.github/actions/load-config - with: - config-file: ${{ env.CONFIG_FILE }} - - - name: Parse configuration - id: parse - run: | - echo '${{ steps.load.outputs.config }}' > config.json - echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT - echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT - echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT - echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT - echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT - echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT - echo "device-type=$(jq -r '.common.device_type // "gpu"' 
config.json)" >> $GITHUB_OUTPUT - echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT - echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT - echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT - echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT - - # ============================================================ - # Pre-commit + change detection - # ============================================================ - check-changes: + detect-versions: needs: [gatekeeper] if: success() runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: + versions: ${{ steps.versions.outputs.versions }} build-change: ${{ steps.changes.outputs.build-change }} sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }} telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} @@ -118,15 +74,33 @@ jobs: with: extra_args: --all-files + - name: Detect PyTorch versions + id: versions + run: | + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \ + | sort -u) + if [ -z "$VERSIONS" ]; then + VERSIONS=$(git diff --name-only origin/main...HEAD \ + | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \ + | sort -u) + fi + if [ -z "$VERSIONS" ]; then + VERSIONS="$LATEST_PYTORCH_VERSION" + fi + JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "versions=${JSON}" >> $GITHUB_OUTPUT + echo "Detected versions: ${JSON}" + - name: Detect file changes id: changes uses: dorny/paths-filter@v4 with: filters: | build-change: - - ".github/config/image/pytorch-sagemaker-cuda.yml" - - "docker/pytorch/Dockerfile.cuda" - - "docker/pytorch/cuda/**" + - ".github/config/image/pytorch-*-sagemaker-cuda.yml" + - 
"docker/pytorch/*/Dockerfile.cuda" + - "docker/pytorch/*/cuda/**" - "scripts/common/setup_oss_compliance.sh" - "scripts/pytorch/*" - "scripts/telemetry/bash_telemetry.sh.template" @@ -136,20 +110,35 @@ jobs: - "test/telemetry/**" # ============================================================ - # Build SageMaker image + # Build SageMaker images (matrix over detected versions) # ============================================================ - build-image: - needs: [check-changes, load-config] - if: needs.check-changes.outputs.build-change == 'true' + build-images: + needs: [detect-versions] + if: needs.detect-versions.outputs.build-change == 'true' runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:x86-build-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true outputs: sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }} + framework: ${{ steps.config.outputs.framework }} + framework-version: ${{ steps.config.outputs.framework-version }} + python-version: ${{ steps.config.outputs.python-version }} + cuda-version: ${{ steps.config.outputs.cuda-version }} + os-version: ${{ steps.config.outputs.os-version }} + container-type: ${{ steps.config.outputs.container-type }} + device-type: ${{ steps.config.outputs.device-type }} + arch-type: ${{ steps.config.outputs.arch-type }} + contributor: ${{ steps.config.outputs.contributor }} + customer-type: ${{ steps.config.outputs.customer-type }} + prod-image: ${{ steps.config.outputs.prod-image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -163,12 +152,36 @@ jobs: aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} + - name: Install yq + run: | + if ! 
command -v yq &> /dev/null; then + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + - name: Load and parse config + id: config + run: | + CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cuda.yml" + echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "device-type=$(yq '.common.device_type // "gpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT + echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT + - name: Fetch cached wheels run: | - source docker/pytorch/versions-cuda.env - mkdir -p docker/pytorch/wheels + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + mkdir -p docker/pytorch/${VERSION}/wheels bash scripts/pytorch/fetch_cached_wheels.sh \ - docker/pytorch/wheels \ + docker/pytorch/${VERSION}/wheels \ "${{ vars.WHEEL_CACHE_BUCKET }}" \ "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ "flash-attn:${FLASH_ATTN_VERSION}" \ @@ -178,17 +191,19 @@ jobs: - name: Build sagemaker image id: build-sagemaker run: | - source docker/pytorch/versions-cuda.env - CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-pr-${{ 
github.event.pull_request.number }}" + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}" # Derive label values to match check_labels.py expectations - FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-') - FWK_VER_LABEL=$(echo "${{ needs.load-config.outputs.framework-version }}" | tr '.' '-') - CUDA_LABEL="${{ needs.load-config.outputs.cuda-version }}" - OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-') + FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-') + FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-') + CUDA_LABEL="${{ steps.config.outputs.cuda-version }}" + OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-') docker buildx build --progress plain \ - --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \ + --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \ + --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ --build-arg TORCH_VERSION=${TORCH_VERSION} \ @@ -201,125 +216,129 @@ jobs: --build-arg MAX_JOBS=${MAX_JOBS} \ --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.device.gpu.${CUDA_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type 
}}=true" \ --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \ - --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \ + --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \ --cache-to=type=inline \ --cache-from=type=registry,ref=${CI_IMAGE_URI} \ --tag ${CI_IMAGE_URI} \ --push \ --target sagemaker \ - -f docker/pytorch/Dockerfile.cuda . + -f docker/pytorch/${VERSION}/Dockerfile.cuda . echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT + - name: Upload built wheels to cache + run: | + VERSION="${{ matrix.version }}" + source docker/pytorch/${VERSION}/versions-cuda.env + bash scripts/pytorch/upload_cached_wheels.sh \ + "${{ vars.WHEEL_CACHE_BUCKET }}" \ + "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \ + "${{ steps.build-sagemaker.outputs.image-uri }}" \ + "docker/pytorch/${VERSION}/Dockerfile.cuda" \ + "flash-attn:${FLASH_ATTN_VERSION}" \ + "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \ + continue-on-error: true + + - name: Run unit tests + run: | + VERSION="${{ matrix.version }}" + IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}" + docker pull ${IMAGE} + CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ + -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \ + -v $(pwd):/workdir --workdir /workdir \ + ${IMAGE} -c 'sleep infinity') + docker exec ${CONTAINER_ID} pip install pytest -q + docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v + docker kill ${CONTAINER_ID} + # ============================================================ # Sanity tests (labels, filesystem, OSS compliance) # ============================================================ sanity-test: - 
needs: [check-changes, build-image, load-config] + needs: [detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true') + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-sanity-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - python-version: ${{ needs.load-config.outputs.python-version }} - cuda-version: ${{ needs.load-config.outputs.cuda-version }} - os-version: ${{ needs.load-config.outputs.os-version }} - customer-type: ${{ needs.load-config.outputs.customer-type }} - arch-type: ${{ needs.load-config.outputs.arch-type }} - device-type: ${{ needs.load-config.outputs.device-type }} - contributor: ${{ needs.load-config.outputs.contributor }} - container-type: ${{ needs.load-config.outputs.container-type }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + python-version: ${{ needs.build-images.outputs.python-version }} + cuda-version: 
${{ needs.build-images.outputs.cuda-version }} + os-version: ${{ needs.build-images.outputs.os-version }} + customer-type: ${{ needs.build-images.outputs.customer-type }} + arch-type: ${{ needs.build-images.outputs.arch-type }} + device-type: ${{ needs.build-images.outputs.device-type }} + contributor: ${{ needs.build-images.outputs.contributor }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # Security tests (ECR scan, CVE allowlist) # ============================================================ security-test: - needs: [build-image, load-config] + needs: [detect-versions, build-images] if: success() + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-security-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} # ============================================================ # Telemetry tests (opt-out, environment variables) # ============================================================ telemetry-test: - needs: [check-changes, build-image, load-config] + needs: 
[detect-versions, build-images] if: | always() && !failure() && !cancelled() && - (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') - concurrency: - group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} - cancel-in-progress: false + (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true') + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false uses: ./.github/workflows/reusable-telemetry-tests.yml with: - image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }} - aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }} + image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - container-type: ${{ needs.load-config.outputs.container-type }} - - # ============================================================ - # Unit tests - # ============================================================ - unit-test: - needs: [build-image] - if: success() - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }} - cancel-in-progress: true - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: ECR login - uses: 
./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Run unit tests - run: | - IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}" - docker pull ${IMAGE} - CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \ - -e DLC_WORKDIR=/workdir \ - -v $(pwd):/workdir --workdir /workdir \ - ${IMAGE} -c 'sleep infinity') - docker exec ${CONTAINER_ID} pip install pytest -q - docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v - docker kill ${CONTAINER_ID} + framework: ${{ needs.build-images.outputs.framework }} + framework-version: ${{ needs.build-images.outputs.framework-version }} + container-type: ${{ needs.build-images.outputs.container-type }} # ============================================================ # SageMaker integration tests (launch real SM training jobs) # ============================================================ sagemaker-test: - needs: [build-image, sanity-test, security-test, unit-test] + needs: [detect-versions, build-images, sanity-test, security-test] if: success() runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true + strategy: + matrix: + version: ${{ fromJson(needs.detect-versions.outputs.versions) }} + fail-fast: false concurrency: - group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }} cancel-in-progress: true steps: - name: Checkout code @@ -338,7 +357,7 @@ jobs: - name: Run SageMaker training tests env: PYTHONPATH: ${{ github.workspace }}/test - TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }} + TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }} SM_ROLE_ARN: arn:aws:iam::${{ 
vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole run: | pytest test/pytorch/integration/sagemaker/test_sm_training_cuda.py -v diff --git a/docker/pytorch/Dockerfile.cpu b/docker/pytorch/2.11/Dockerfile.cpu similarity index 96% rename from docker/pytorch/Dockerfile.cpu rename to docker/pytorch/2.11/Dockerfile.cpu index 1a2d9a15288d..aaf705935925 100644 --- a/docker/pytorch/Dockerfile.cpu +++ b/docker/pytorch/2.11/Dockerfile.cpu @@ -12,6 +12,7 @@ # ============================================================================ # ── Global ARGs (available to all stages) ─────────────────────────────────── +ARG DLC_PYTORCH_VERSION=2.11 ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 ARG PYTHON_VERSION=3.12 @@ -21,6 +22,7 @@ ARG OPEN_MPI_VERSION=4.1.7 # ── Stage: builder-base (shared Python venv with lockfile deps) ───────────── FROM amazonlinux:2023 AS builder-base +ARG DLC_PYTORCH_VERSION ARG PYTHON_VERSION RUN dnf install -y --allowerasing \ @@ -35,7 +37,7 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project @@ -150,6 +152,7 @@ CMD ["bash"] # ── Stage: sagemaker (SageMaker Training) ──────────────────────────────────── FROM runtime-base AS sagemaker +ARG DLC_PYTORCH_VERSION ARG TORCH_VERSION # SageMaker BYOC paths @@ -160,7 +163,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cpu/pyproject.toml [project.optional-dependencies.sagemaker]) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/ +COPY 
docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/2.11/Dockerfile.cuda similarity index 95% rename from docker/pytorch/Dockerfile.cuda rename to docker/pytorch/2.11/Dockerfile.cuda index 93effb4ff91d..ba34aa7247b0 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/2.11/Dockerfile.cuda @@ -16,6 +16,7 @@ # ============================================================================ # ── Global ARGs (available to all stages) ─────────────────────────────────── +ARG DLC_PYTORCH_VERSION=2.11 ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 ARG CUDA_VERSION=13.0.2 @@ -32,6 +33,7 @@ ARG MAX_JOBS=8 # ── Stage: builder-base (shared Python venv with lockfile deps) ───────────── FROM nvidia/cuda:${CUDA_VERSION}-devel-amzn2023 AS builder-base +ARG DLC_PYTORCH_VERSION ARG PYTHON_VERSION RUN dnf install -y --allowerasing \ @@ -46,14 +48,14 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv" RUN python${PYTHON_VERSION} -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/ WORKDIR /tmp/build RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project # transformer-engine requires torch + cudnn.h at build time; point it to the # cudnn headers shipped inside the nvidia-cudnn pip package. 
ARG TRANSFORMER_ENGINE_VERSION -COPY docker/pytorch/wheel[s]/ /tmp/wheels/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0])") && \ NCCL_HOME=$(python -c "import nvidia.nccl; print(nvidia.nccl.__path__[0])") && \ cp ${CUDNN_HOME}/include/*.h /usr/local/cuda/include/ && \ @@ -76,13 +78,14 @@ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0]) # ── Stage: builder-flash-attn (parallel — needs torch only) ───────────────── FROM builder-base AS builder-flash-attn +ARG DLC_PYTORCH_VERSION ARG FLASH_ATTN_VERSION ARG MAX_JOBS # If a cached wheel exists in the build context, install it; otherwise build from source. # When building from source, the wheel is saved to /tmp/built_wheels/ for later S3 upload. -# docker/pytorch/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/wheel[s]/ /tmp/wheels/ +# docker/pytorch/${DLC_PYTORCH_VERSION}/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally. 
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/ RUN --mount=type=cache,target=/root/.cache/uv \ mkdir -p /tmp/built_wheels && \ WHL=$(find /tmp/wheels -name "flash*attn*.whl" 2>/dev/null | head -1) && \ @@ -234,6 +237,7 @@ CMD ["bash"] # ── Stage: sagemaker (SageMaker Training) ──────────────────────────────────── FROM runtime-base AS sagemaker +ARG DLC_PYTORCH_VERSION ARG TORCH_VERSION # SageMaker BYOC paths @@ -244,7 +248,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main # SageMaker packages (defined in cuda/pyproject.toml [project.optional-dependencies.sagemaker]) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv ENV UV_PROJECT_ENVIRONMENT="/opt/venv" -COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/ +COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/ RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \ && rm -rf /tmp/build /tmp/uv-* diff --git a/docker/pytorch/cpu/pyproject.toml b/docker/pytorch/2.11/cpu/pyproject.toml similarity index 100% rename from docker/pytorch/cpu/pyproject.toml rename to docker/pytorch/2.11/cpu/pyproject.toml diff --git a/docker/pytorch/cpu/uv.lock b/docker/pytorch/2.11/cpu/uv.lock similarity index 100% rename from docker/pytorch/cpu/uv.lock rename to docker/pytorch/2.11/cpu/uv.lock diff --git a/docker/pytorch/cuda/pyproject.toml b/docker/pytorch/2.11/cuda/pyproject.toml similarity index 100% rename from docker/pytorch/cuda/pyproject.toml rename to docker/pytorch/2.11/cuda/pyproject.toml diff --git a/docker/pytorch/cuda/uv.lock b/docker/pytorch/2.11/cuda/uv.lock similarity index 100% rename from docker/pytorch/cuda/uv.lock rename to docker/pytorch/2.11/cuda/uv.lock diff --git a/docker/pytorch/versions-cpu.env 
b/docker/pytorch/2.11/versions-cpu.env similarity index 100% rename from docker/pytorch/versions-cpu.env rename to docker/pytorch/2.11/versions-cpu.env diff --git a/docker/pytorch/versions-cuda.env b/docker/pytorch/2.11/versions-cuda.env similarity index 100% rename from docker/pytorch/versions-cuda.env rename to docker/pytorch/2.11/versions-cuda.env diff --git a/scripts/pytorch/upload_cached_wheels.sh b/scripts/pytorch/upload_cached_wheels.sh index be4d2f8c5eb7..e2f59c486cf3 100755 --- a/scripts/pytorch/upload_cached_wheels.sh +++ b/scripts/pytorch/upload_cached_wheels.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # upload_cached_wheels.sh — Extract built wheels from Docker wheel-export stage and upload to S3. # -# Usage: upload_cached_wheels.sh [...] +# Usage: upload_cached_wheels.sh [...] set -euo pipefail -BUCKET="$1"; CUDA="$2"; IMAGE="$5" -shift 5 +BUCKET="$1"; CUDA="$2"; IMAGE="$5"; DOCKERFILE="$6" +shift 6 if [ -z "${BUCKET}" ]; then echo "⚠️ No wheel cache bucket configured — skipping upload" @@ -15,7 +15,7 @@ fi # Build the wheel-export stage and extract to local dir EXPORT_DIR=$(mktemp -d) docker buildx build --progress=plain --target wheel-export --output "type=local,dest=${EXPORT_DIR}" \ - -f docker/pytorch/Dockerfile . 2>/dev/null || { + -f "${DOCKERFILE}" . 
2>/dev/null || { echo "⚠️ wheel-export stage not available — extracting from runtime image" CID=$(docker create "${IMAGE}" /bin/true) docker cp "${CID}:/tmp/built_wheels/" "${EXPORT_DIR}/wheels/" 2>/dev/null || true diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index bd185bfb8169..5b01a996cc33 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,6 +52,17 @@ check_efa_nccl_all_reduce_performance(){ fi } +echo "=== Debug: Environment and library info ===" +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +echo "CUDA_HOME=$CUDA_HOME" +ls -la /opt/amazon/ofi-nccl/lib64/ 2>/dev/null || echo "/opt/amazon/ofi-nccl/lib64/ NOT FOUND" +ls -la /usr/local/bin/all_reduce_perf 2>/dev/null || echo "all_reduce_perf NOT FOUND" +# Capture fi_info's own exit status: in `fi_info | head || echo`, the `||` only sees head's status. +FI_INFO_OUT=$(fi_info -p efa 2>&1) || echo "fi_info failed" +printf '%s\n' "${FI_INFO_OUT}" | head -5 +echo "NCCL lib: $(ls /opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib/libnccl.so* 2>/dev/null || echo 'not found')" +echo "=== End debug ===" + echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ @@ -63,7 +74,10 @@ RETURN_VAL=${PIPESTATUS[0]} if [ ${RETURN_VAL} -eq 0 ]; then echo "check_efa_nccl_all_reduce passed" else - echo "check_efa_nccl_all_reduce failed" + echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})" + echo "=== Full NCCL log ===" + cat "${TRAINING_LOG}" + echo "=== End NCCL log ===" fi validate_all_reduce_performance_logs diff --git a/test/pytorch/unit/test_versions.py b/test/pytorch/unit/test_versions.py index ddefd85f2319..17ae1d61d9ea 100644 --- a/test/pytorch/unit/test_versions.py +++ b/test/pytorch/unit/test_versions.py @@ -6,11 +6,15 @@ import pytest -# Detect GPU vs CPU image by checking for CUDA, then pick the right versions file. 
+# DLC_PYTORCH_VERSION selects which versioned directory to read (e.g., "2.11"). _WORKDIR = os.environ.get("DLC_WORKDIR", "/workdir") +_PT_VERSION = os.environ.get("DLC_PYTORCH_VERSION", "") +if not _PT_VERSION: +    # Raise explicitly: a bare assert is stripped under `python -O`. +    raise RuntimeError("DLC_PYTORCH_VERSION env var is required (e.g., '2.11')") IS_CUDA = os.path.isdir("/usr/local/cuda") _VERSIONS_FILE = "versions-cuda.env" if IS_CUDA else "versions-cpu.env" -VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _VERSIONS_FILE) +VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _PT_VERSION, _VERSIONS_FILE) cuda_only = pytest.mark.skipif(not IS_CUDA, reason="CUDA-only test")