Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a040042
refactor: version PyTorch directory structure for multi-version support
Eren-Jeager123 May 12, 2026
949ed39
fix: update test_versions.py to discover versioned env file path
Eren-Jeager123 May 12, 2026
d43920c
fix: pass DLC_PYTORCH_VERSION to unit tests for versioned env lookup
Eren-Jeager123 May 12, 2026
dcc1f70
fix: pass DLC_PYTORCH_VERSION to autorelease unit tests
Eren-Jeager123 May 12, 2026
5995526
fix: remove DLC_PYTORCH_VERSION from non-unit-test jobs
Eren-Jeager123 May 12, 2026
172e3cc
fix: parameterize Dockerfile path in upload_cached_wheels.sh
Eren-Jeager123 May 12, 2026
1fc41f9
refactor: make PyTorch workflows version-agnostic
Eren-Jeager123 May 13, 2026
77db83a
refactor: simplify autorelease to map cron → config file directly
Eren-Jeager123 May 19, 2026
c329c46
refactor: PR workflows use matrix over detected PyTorch versions
Eren-Jeager123 May 19, 2026
50e73fd
refactor: remove config loading from detect-versions, use build-image…
Eren-Jeager123 May 19, 2026
eb45d7a
refactor: use ARG DLC_PYTORCH_VERSION in Dockerfiles instead of hardc…
Eren-Jeager123 May 19, 2026
1356f33
fix: tighten PR workflow path triggers to avoid cross-triggering
Eren-Jeager123 May 19, 2026
4e22f4b
fix: re-declare ARG DLC_PYTORCH_VERSION in each Dockerfile stage
Eren-Jeager123 May 19, 2026
edd4c39
debug: log config file content to diagnose empty matrix outputs
Eren-Jeager123 May 19, 2026
c2ce384
feat: add matrix to all downstream test jobs for per-version testing
Eren-Jeager123 May 19, 2026
40c511a
fix: install yq before config parsing in build-images job
Eren-Jeager123 May 19, 2026
b0dccfa
fix: add defensive cleanup for stale EFA test instances and EIPs
Eren-Jeager123 May 19, 2026
5c8953f
debug: add diagnostic logging to EFA NCCL test
Eren-Jeager123 May 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/scripts/efa/ec2_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,44 @@ def release_eip(aws_session, alloc_id):
LOGGER.warning(f"Failed to release EIP {alloc_id}: {e}")


def cleanup_stale_efa_instances(aws_session, max_age_hours=4):
    """Terminate stale EFA test instances and release unassociated Elastic IPs.

    Best-effort housekeeping for resources leaked by cancelled or crashed
    workflow runs that never reached their own cleanup. Failures are logged as
    warnings and swallowed so that they never block the caller. The instance
    sweep and the EIP sweep are isolated from each other: a failure in one does
    not prevent the other from running, and a failure to terminate one instance
    does not prevent the remaining instances from being terminated.

    Args:
        aws_session: Session object exposing an EC2 client as ``aws_session.ec2``.
        max_age_hours: Instances launched more than this many hours ago are
            considered stale. Defaults to 4.

    Returns:
        None.
    """
    from datetime import datetime, timezone

    # Compare as POSIX timestamps so the check does not depend on tz handling.
    cutoff = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600)

    # --- Phase 1: terminate EFA test instances launched before the cutoff ---
    try:
        resp = aws_session.ec2.describe_instances(
            Filters=[
                {"Name": "tag:Name", "Values": ["CI-CD EFA efa-test"]},
                {"Name": "instance-state-name", "Values": ["running", "stopped"]},
            ]
        )
        for reservation in resp.get("Reservations", []):
            for instance in reservation.get("Instances", []):
                if instance["LaunchTime"].timestamp() >= cutoff:
                    continue
                instance_id = instance["InstanceId"]
                LOGGER.warning(
                    f"Terminating stale EFA instance {instance_id} (launched {instance['LaunchTime']})"
                )
                # Isolate each termination so one bad instance (already gone,
                # termination protection, throttling) does not abort the sweep.
                try:
                    aws_session.ec2.terminate_instances(InstanceIds=[instance_id])
                except Exception as e:
                    LOGGER.warning(f"Failed to terminate stale EFA instance {instance_id}: {e}")
    except Exception as e:
        LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}")

    # --- Phase 2: release Elastic IPs that are not associated with anything ---
    # NOTE(review): this sweep is NOT scoped to EFA test resources. It releases
    # every unassociated EIP visible to the session, which can include addresses
    # reserved by other workloads or an address a concurrent run has allocated
    # but not yet associated. Consider restricting describe_addresses with a tag
    # filter once the allocation path tags its EIPs - TODO confirm tagging.
    try:
        addresses = aws_session.ec2.describe_addresses().get("Addresses", [])
        for addr in addresses:
            if not addr.get("AssociationId") and addr.get("AllocationId"):
                LOGGER.warning(
                    f"Releasing orphaned EIP {addr['AllocationId']} ({addr.get('PublicIp', '')})"
                )
                release_eip(aws_session, addr["AllocationId"])
    except Exception as e:
        LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}")


@contextmanager
def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION):
"""Context manager that launches 2 EFA instances, sets up containers + SSH, and cleans up.
Expand All @@ -396,6 +434,9 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION
ami_id = aws_session.get_latest_ami()
sg_id = get_efa_security_group_id(aws_session)

# Clean up leaked resources from previous cancelled/crashed runs
cleanup_stale_efa_instances(aws_session)

key_name = None
key_path = None
runner_ip = None
Expand Down
43 changes: 35 additions & 8 deletions .github/workflows/autorelease-pytorch-ec2-cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@ name: Auto Release - PyTorch EC2 CPU

on:
schedule:
- cron: '00 17 * * 1,3'

- cron: '00 17 * * 1,3'  # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
# - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future)
workflow_dispatch:
inputs:
config-file:
description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cpu.yml)"
required: true
type: string

concurrency:
group: ${{ github.workflow }}
Expand All @@ -15,10 +20,29 @@ permissions:

env:
FORCE_COLOR: "1"
CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml"

jobs:
determine-config:
  # Resolves which image config file this run uses and exposes it as the
  # `config-file` job output: the caller-supplied path for workflow_dispatch,
  # otherwise the path mapped to the cron expression that fired.
  runs-on: ubuntu-latest
  outputs:
    config-file: ${{ steps.config.outputs.config-file }}
  steps:
    - name: Determine config file
      id: config
      # Context values are handed to the script through environment variables
      # instead of being expanded into the script text, so a crafted
      # `config-file` input cannot inject shell commands.
      env:
        EVENT_NAME: ${{ github.event_name }}
        DISPATCH_CONFIG_FILE: ${{ inputs.config-file }}
        CRON: ${{ github.event.schedule }}
      run: |
        if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
          CONFIG_FILE="$DISPATCH_CONFIG_FILE"
        else
          # Each schedule entry in `on.schedule` must have a matching arm here.
          case "$CRON" in
            "00 17 * * 1,3") CONFIG_FILE=".github/config/image/pytorch-2.11-ec2-cpu.yml" ;;
            # "10 17 * * 1,3") CONFIG_FILE=".github/config/image/pytorch-2.12-ec2-cpu.yml" ;;
            *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
          esac
        fi
        echo "config-file=${CONFIG_FILE}" >> "$GITHUB_OUTPUT"

load-config:
needs: [determine-config]
runs-on: ubuntu-latest
outputs:
config: ${{ steps.load.outputs.config }}
Expand All @@ -41,7 +65,7 @@ jobs:
id: load
uses: ./.github/actions/load-config
with:
config-file: ${{ env.CONFIG_FILE }}
config-file: ${{ needs.determine-config.outputs.config-file }}

- name: Parse configuration
id: parse
Expand Down Expand Up @@ -83,10 +107,12 @@ jobs:
- name: Build runtime image
id: build-runtime
run: |
source docker/pytorch/versions-cpu.env
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
source docker/pytorch/${VERSION}/versions-cpu.env
CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}"

docker buildx build --progress plain \
--build-arg DLC_PYTORCH_VERSION=${VERSION} \
--build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
--build-arg PYTHON_VERSION=${PYTHON_VERSION} \
--build-arg TORCH_VERSION=${TORCH_VERSION} \
Expand All @@ -98,7 +124,7 @@ jobs:
--tag ${CI_IMAGE_URI} \
--push \
--target runtime \
-f docker/pytorch/Dockerfile.cpu .
-f docker/pytorch/${VERSION}/Dockerfile.cpu .

echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT

Expand Down Expand Up @@ -142,7 +168,7 @@ jobs:
container-type: ${{ needs.load-config.outputs.container-type }}

unit-test:
needs: [build-image]
needs: [load-config, build-image]
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:default-runner
Expand All @@ -159,10 +185,11 @@ jobs:

- name: Run unit tests
run: |
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
IMAGE="${{ needs.build-image.outputs.ci-image }}"
docker pull ${IMAGE}
CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-e DLC_WORKDIR=/workdir \
-e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
-v $(pwd):/workdir --workdir /workdir \
${IMAGE} -c 'sleep infinity')
docker exec ${CONTAINER_ID} pip install pytest -q
Expand Down
58 changes: 44 additions & 14 deletions .github/workflows/autorelease-pytorch-ec2-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ name: Auto Release - PyTorch EC2 CUDA

on:
schedule:
# Runs at 9AM/10AM PST/PDT on Mondays and Wednesdays
- cron: '00 17 * * 1,3'

- cron: '00 17 * * 1,3'  # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
# - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 9:10 AM PST (future)
workflow_dispatch:
inputs:
config-file:
description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cuda.yml)"
required: true
type: string

concurrency:
group: ${{ github.workflow }}
Expand All @@ -16,10 +20,29 @@ permissions:

env:
FORCE_COLOR: "1"
CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml"

jobs:
determine-config:
  # Resolves which image config file this run uses and exposes it as the
  # `config-file` job output: the caller-supplied path for workflow_dispatch,
  # otherwise the path mapped to the cron expression that fired.
  runs-on: ubuntu-latest
  outputs:
    config-file: ${{ steps.config.outputs.config-file }}
  steps:
    - name: Determine config file
      id: config
      # Context values are handed to the script through environment variables
      # instead of being expanded into the script text, so a crafted
      # `config-file` input cannot inject shell commands.
      env:
        EVENT_NAME: ${{ github.event_name }}
        DISPATCH_CONFIG_FILE: ${{ inputs.config-file }}
        CRON: ${{ github.event.schedule }}
      run: |
        if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
          CONFIG_FILE="$DISPATCH_CONFIG_FILE"
        else
          # Each schedule entry in `on.schedule` must have a matching arm here.
          case "$CRON" in
            "00 17 * * 1,3") CONFIG_FILE=".github/config/image/pytorch-2.11-ec2-cuda.yml" ;;
            # "10 17 * * 1,3") CONFIG_FILE=".github/config/image/pytorch-2.12-ec2-cuda.yml" ;;
            *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
          esac
        fi
        echo "config-file=${CONFIG_FILE}" >> "$GITHUB_OUTPUT"

load-config:
needs: [determine-config]
runs-on: ubuntu-latest
outputs:
config: ${{ steps.load.outputs.config }}
Expand All @@ -42,7 +65,7 @@ jobs:
id: load
uses: ./.github/actions/load-config
with:
config-file: ${{ env.CONFIG_FILE }}
config-file: ${{ needs.determine-config.outputs.config-file }}

- name: Parse configuration
id: parse
Expand Down Expand Up @@ -85,17 +108,19 @@ jobs:
- name: Source versions
id: versions
run: |
source docker/pytorch/versions-cuda.env
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
source docker/pytorch/${VERSION}/versions-cuda.env
echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT
echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT
echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT

- name: Fetch cached wheels
run: |
source docker/pytorch/versions-cuda.env
mkdir -p docker/pytorch/wheels
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
source docker/pytorch/${VERSION}/versions-cuda.env
mkdir -p docker/pytorch/${VERSION}/wheels
bash scripts/pytorch/fetch_cached_wheels.sh \
docker/pytorch/wheels \
docker/pytorch/${VERSION}/wheels \
"${{ vars.WHEEL_CACHE_BUCKET }}" \
"${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
"flash-attn:${FLASH_ATTN_VERSION}" \
Expand All @@ -105,10 +130,12 @@ jobs:
- name: Build runtime image
id: build-runtime
run: |
source docker/pytorch/versions-cuda.env
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
source docker/pytorch/${VERSION}/versions-cuda.env
CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}"

docker buildx build --progress plain \
--build-arg DLC_PYTORCH_VERSION=${VERSION} \
--build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
--build-arg CUDA_VERSION=${CUDA_VERSION} \
--build-arg PYTHON_VERSION=${PYTHON_VERSION} \
Expand All @@ -125,17 +152,19 @@ jobs:
--tag ${CI_IMAGE_URI} \
--push \
--target runtime \
-f docker/pytorch/Dockerfile.cuda .
-f docker/pytorch/${VERSION}/Dockerfile.cuda .

echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT

- name: Upload built wheels to cache
run: |
source docker/pytorch/versions-cuda.env
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
source docker/pytorch/${VERSION}/versions-cuda.env
bash scripts/pytorch/upload_cached_wheels.sh \
"${{ vars.WHEEL_CACHE_BUCKET }}" \
"${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
"${{ steps.build-runtime.outputs.image-uri }}" \
"docker/pytorch/${VERSION}/Dockerfile.cuda" \
"flash-attn:${FLASH_ATTN_VERSION}" \
"transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \
continue-on-error: true
Expand Down Expand Up @@ -180,7 +209,7 @@ jobs:
container-type: ${{ needs.load-config.outputs.container-type }}

unit-test:
needs: [build-image]
needs: [load-config, build-image]
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:default-runner
Expand All @@ -197,10 +226,11 @@ jobs:

- name: Run unit tests
run: |
VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
IMAGE="${{ needs.build-image.outputs.ci-image }}"
docker pull ${IMAGE}
CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-e DLC_WORKDIR=/workdir \
-e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
-v $(pwd):/workdir --workdir /workdir \
${IMAGE} -c 'sleep infinity')
docker exec ${CONTAINER_ID} pip install pytest -q
Expand Down
Loading
Loading