diff --git a/.github/config/image/pytorch-ec2-cpu.yml b/.github/config/image/pytorch-2.11-ec2-cpu.yml
similarity index 100%
rename from .github/config/image/pytorch-ec2-cpu.yml
rename to .github/config/image/pytorch-2.11-ec2-cpu.yml
diff --git a/.github/config/image/pytorch-ec2-cuda.yml b/.github/config/image/pytorch-2.11-ec2-cuda.yml
similarity index 100%
rename from .github/config/image/pytorch-ec2-cuda.yml
rename to .github/config/image/pytorch-2.11-ec2-cuda.yml
diff --git a/.github/config/image/pytorch-sagemaker-cpu.yml b/.github/config/image/pytorch-2.11-sagemaker-cpu.yml
similarity index 100%
rename from .github/config/image/pytorch-sagemaker-cpu.yml
rename to .github/config/image/pytorch-2.11-sagemaker-cpu.yml
diff --git a/.github/config/image/pytorch-sagemaker-cuda.yml b/.github/config/image/pytorch-2.11-sagemaker-cuda.yml
similarity index 100%
rename from .github/config/image/pytorch-sagemaker-cuda.yml
rename to .github/config/image/pytorch-2.11-sagemaker-cuda.yml
diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py
index 0cba98608ab5..85de0136eaf5 100644
--- a/.github/scripts/efa/ec2_helpers.py
+++ b/.github/scripts/efa/ec2_helpers.py
@@ -386,6 +386,44 @@ def release_eip(aws_session, alloc_id):
         LOGGER.warning(f"Failed to release EIP {alloc_id}: {e}")
 
 
+def cleanup_stale_efa_instances(aws_session, max_age_hours=4):
+    """Terminate EFA test instances older than max_age_hours and release unassociated EIPs.
+
+    Best-effort guard against leaks from cancelled/crashed runs: errors are logged, never raised.
+    """
+    from datetime import datetime, timezone
+
+    cutoff = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600)
+    filters = [
+        {"Name": "tag:Name", "Values": ["CI-CD EFA efa-test"]},
+        {"Name": "instance-state-name", "Values": ["running", "stopped"]},
+    ]
+    try:
+        reservations = aws_session.ec2.describe_instances(Filters=filters).get("Reservations", [])
+        for reservation in reservations:
+            for instance in reservation.get("Instances", []):
+                if instance["LaunchTime"].timestamp() < cutoff:
+                    LOGGER.warning(
+                        f"Terminating stale EFA instance {instance['InstanceId']} (launched {instance['LaunchTime']})"
+                    )
+                    try:  # one failed termination must not abort the rest of the sweep
+                        aws_session.ec2.terminate_instances(InstanceIds=[instance["InstanceId"]])
+                    except Exception as e:
+                        LOGGER.warning(f"Failed to terminate {instance['InstanceId']} (non-fatal): {e}")
+    except Exception as e:
+        LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}")
+
+    try:  # NOTE(review): releases ANY unassociated EIP the session can see, not only EFA-test ones
+        for addr in aws_session.ec2.describe_addresses().get("Addresses", []):
+            if not addr.get("AssociationId") and addr.get("AllocationId"):
+                LOGGER.warning(
+                    f"Releasing orphaned EIP {addr['AllocationId']} ({addr.get('PublicIp', '')})"
+                )
+                release_eip(aws_session, addr["AllocationId"])
+    except Exception as e:
+        LOGGER.warning(f"Stale resource cleanup failed (non-fatal): {e}")
+
+
 @contextmanager
 def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION):
     """Context manager that launches 2 EFA instances, sets up containers + SSH, and cleans up.
@@ -396,6 +434,9 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION
     ami_id = aws_session.get_latest_ami()
     sg_id = get_efa_security_group_id(aws_session)
 
+    # Clean up leaked resources from previous cancelled/crashed runs
+    cleanup_stale_efa_instances(aws_session)
+
     key_name = None
     key_path = None
     runner_ip = None
diff --git a/.github/workflows/autorelease-pytorch-ec2-cpu.yml b/.github/workflows/autorelease-pytorch-ec2-cpu.yml
index cd41b1e713b8..028ba63c9cae 100644
--- a/.github/workflows/autorelease-pytorch-ec2-cpu.yml
+++ b/.github/workflows/autorelease-pytorch-ec2-cpu.yml
@@ -2,9 +2,14 @@ name: Auto Release - PyTorch EC2 CPU
 
 on:
   schedule:
-    - cron: '00 17 * * 1,3'
-
+    - cron: '00 17 * * 1,3'   # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
+    # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 17:10 UTC (9:10 AM PST / 10:10 AM PDT) (future)
   workflow_dispatch:
+    inputs:
+      config-file:
+        description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cpu.yml)"
+        required: true
+        type: string
 
 concurrency:
   group: ${{ github.workflow }}
@@ -15,10 +20,29 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml"
 
 jobs:
+  determine-config:
+    runs-on: ubuntu-latest
+    outputs:
+      config-file: ${{ steps.config.outputs.config-file }}
+    steps:
+      - name: Determine config file
+        id: config
+        env: { INPUT_CONFIG: "${{ inputs.config-file }}", CRON: "${{ github.event.schedule }}" }  # env keeps event text out of the shell source
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "config-file=${INPUT_CONFIG}" >> "$GITHUB_OUTPUT"
+          else
+            case "$CRON" in
+              "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cpu.yml" >> "$GITHUB_OUTPUT" ;;
+              # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cpu.yml" >> "$GITHUB_OUTPUT" ;;
+              *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
+            esac
+          fi
+
   load-config:
+    needs: [determine-config]
     runs-on: ubuntu-latest
     outputs:
       config: ${{ steps.load.outputs.config }}
@@ -41,7 +65,7 @@ jobs:
         id: load
         uses: ./.github/actions/load-config
         with:
-          config-file: ${{ env.CONFIG_FILE }}
+          config-file: ${{ needs.determine-config.outputs.config-file }}
 
       - name: Parse configuration
         id: parse
@@ -83,10 +107,12 @@ jobs:
       - name: Build runtime image
         id: build-runtime
         run: |
-          source docker/pytorch/versions-cpu.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cpu.env
           CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}"
 
           docker buildx build --progress plain \
+            --build-arg DLC_PYTORCH_VERSION=${VERSION} \
             --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
@@ -98,7 +124,7 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target runtime \
-            -f docker/pytorch/Dockerfile.cpu .
+            -f docker/pytorch/${VERSION}/Dockerfile.cpu .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
@@ -142,7 +168,7 @@ jobs:
       container-type: ${{ needs.load-config.outputs.container-type }}
 
   unit-test:
-    needs: [build-image]
+    needs: [load-config, build-image]
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
@@ -159,10 +185,11 @@ jobs:
 
       - name: Run unit tests
         run: |
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
           IMAGE="${{ needs.build-image.outputs.ci-image }}"
           docker pull ${IMAGE}
           CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
             -v $(pwd):/workdir --workdir /workdir \
             ${IMAGE} -c 'sleep infinity')
           docker exec ${CONTAINER_ID} pip install pytest -q
diff --git a/.github/workflows/autorelease-pytorch-ec2-cuda.yml b/.github/workflows/autorelease-pytorch-ec2-cuda.yml
index f17a7fd13cb3..730b3875609f 100644
--- a/.github/workflows/autorelease-pytorch-ec2-cuda.yml
+++ b/.github/workflows/autorelease-pytorch-ec2-cuda.yml
@@ -2,10 +2,14 @@ name: Auto Release - PyTorch EC2 CUDA
 
 on:
   schedule:
-    # Runs at 9AM/10AM PST/PDT on Mondays and Wednesdays
-    - cron: '00 17 * * 1,3'
-
+    - cron: '00 17 * * 1,3'   # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
+    # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 17:10 UTC (9:10 AM PST / 10:10 AM PDT) (future)
   workflow_dispatch:
+    inputs:
+      config-file:
+        description: "Config file path (e.g., .github/config/image/pytorch-2.11-ec2-cuda.yml)"
+        required: true
+        type: string
 
 concurrency:
   group: ${{ github.workflow }}
@@ -16,10 +20,29 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml"
 
 jobs:
+  determine-config:
+    runs-on: ubuntu-latest
+    outputs:
+      config-file: ${{ steps.config.outputs.config-file }}
+    steps:
+      - name: Determine config file
+        id: config
+        env: { INPUT_CONFIG: "${{ inputs.config-file }}", CRON: "${{ github.event.schedule }}" }  # env keeps event text out of the shell source
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "config-file=${INPUT_CONFIG}" >> "$GITHUB_OUTPUT"
+          else
+            case "$CRON" in
+              "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-ec2-cuda.yml" >> "$GITHUB_OUTPUT" ;;
+              # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-ec2-cuda.yml" >> "$GITHUB_OUTPUT" ;;
+              *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
+            esac
+          fi
+
   load-config:
+    needs: [determine-config]
     runs-on: ubuntu-latest
     outputs:
       config: ${{ steps.load.outputs.config }}
@@ -42,7 +65,7 @@ jobs:
         id: load
         uses: ./.github/actions/load-config
         with:
-          config-file: ${{ env.CONFIG_FILE }}
+          config-file: ${{ needs.determine-config.outputs.config-file }}
 
       - name: Parse configuration
         id: parse
@@ -85,17 +108,19 @@ jobs:
       - name: Source versions
         id: versions
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
           echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT
           echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT
           echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT
 
       - name: Fetch cached wheels
         run: |
-          source docker/pytorch/versions-cuda.env
-          mkdir -p docker/pytorch/wheels
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          mkdir -p docker/pytorch/${VERSION}/wheels
           bash scripts/pytorch/fetch_cached_wheels.sh \
-            docker/pytorch/wheels \
+            docker/pytorch/${VERSION}/wheels \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
@@ -105,10 +130,12 @@ jobs:
       - name: Build runtime image
         id: build-runtime
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
           CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-${{ github.run_id }}"
 
           docker buildx build --progress plain \
+            --build-arg DLC_PYTORCH_VERSION=${VERSION} \
             --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
             --build-arg CUDA_VERSION=${CUDA_VERSION} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
@@ -125,17 +152,19 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target runtime \
-            -f docker/pytorch/Dockerfile.cuda .
+            -f docker/pytorch/${VERSION}/Dockerfile.cuda .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
       - name: Upload built wheels to cache
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
           bash scripts/pytorch/upload_cached_wheels.sh \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "${{ steps.build-runtime.outputs.image-uri }}" \
+            "docker/pytorch/${VERSION}/Dockerfile.cuda" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
             "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \
         continue-on-error: true
@@ -180,7 +209,7 @@ jobs:
       container-type: ${{ needs.load-config.outputs.container-type }}
 
   unit-test:
-    needs: [build-image]
+    needs: [load-config, build-image]
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
@@ -197,10 +226,11 @@ jobs:
 
       - name: Run unit tests
         run: |
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
           IMAGE="${{ needs.build-image.outputs.ci-image }}"
           docker pull ${IMAGE}
           CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
             -v $(pwd):/workdir --workdir /workdir \
             ${IMAGE} -c 'sleep infinity')
           docker exec ${CONTAINER_ID} pip install pytest -q
diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml
index 95c8780f3277..e733adea4679 100644
--- a/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml
+++ b/.github/workflows/autorelease-pytorch-sagemaker-cpu.yml
@@ -2,9 +2,14 @@ name: Auto Release - PyTorch SageMaker CPU
 
 on:
   schedule:
-    - cron: '00 17 * * 1,3'
-
+    - cron: '00 17 * * 1,3'   # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
+    # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 17:10 UTC (9:10 AM PST / 10:10 AM PDT) (future)
   workflow_dispatch:
+    inputs:
+      config-file:
+        description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cpu.yml)"
+        required: true
+        type: string
 
 concurrency:
   group: ${{ github.workflow }}
@@ -15,10 +20,29 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml"
 
 jobs:
+  determine-config:
+    runs-on: ubuntu-latest
+    outputs:
+      config-file: ${{ steps.config.outputs.config-file }}
+    steps:
+      - name: Determine config file
+        id: config
+        env: { INPUT_CONFIG: "${{ inputs.config-file }}", CRON: "${{ github.event.schedule }}" }  # env keeps event text out of the shell source
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "config-file=${INPUT_CONFIG}" >> "$GITHUB_OUTPUT"
+          else
+            case "$CRON" in
+              "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cpu.yml" >> "$GITHUB_OUTPUT" ;;
+              # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cpu.yml" >> "$GITHUB_OUTPUT" ;;
+              *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
+            esac
+          fi
+
   load-config:
+    needs: [determine-config]
     runs-on: ubuntu-latest
     outputs:
       config: ${{ steps.load.outputs.config }}
@@ -41,7 +65,7 @@ jobs:
         id: load
         uses: ./.github/actions/load-config
         with:
-          config-file: ${{ env.CONFIG_FILE }}
+          config-file: ${{ needs.determine-config.outputs.config-file }}
 
       - name: Parse configuration
         id: parse
@@ -83,7 +107,8 @@ jobs:
       - name: Build sagemaker image
         id: build-sagemaker
         run: |
-          source docker/pytorch/versions-cpu.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cpu.env
           CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-cpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}"
 
           # Derive label values to match check_labels.py expectations
@@ -92,6 +117,7 @@ jobs:
           OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-')
 
           docker buildx build --progress plain \
+            --build-arg DLC_PYTORCH_VERSION=${VERSION} \
             --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
@@ -110,7 +136,7 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target sagemaker \
-            -f docker/pytorch/Dockerfile.cpu .
+            -f docker/pytorch/${VERSION}/Dockerfile.cpu .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
@@ -154,7 +180,7 @@ jobs:
       container-type: ${{ needs.load-config.outputs.container-type }}
 
   unit-test:
-    needs: [build-image]
+    needs: [load-config, build-image]
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
@@ -171,10 +197,11 @@ jobs:
 
       - name: Run unit tests
         run: |
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
           IMAGE="${{ needs.build-image.outputs.ci-image }}"
           docker pull ${IMAGE}
           CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
             -v $(pwd):/workdir --workdir /workdir \
             ${IMAGE} -c 'sleep infinity')
           docker exec ${CONTAINER_ID} pip install pytest -q
diff --git a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml
index c40d70c44bfd..691236d8a0ac 100644
--- a/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml
+++ b/.github/workflows/autorelease-pytorch-sagemaker-cuda.yml
@@ -2,9 +2,14 @@ name: Auto Release - PyTorch SageMaker CUDA
 
 on:
   schedule:
-    - cron: '00 17 * * 1,3'
-
+    - cron: '00 17 * * 1,3'   # PyTorch 2.11 — Mon/Wed 17:00 UTC (9:00 AM PST / 10:00 AM PDT)
+    # - cron: '10 17 * * 1,3' # PyTorch 2.12 — Mon/Wed 17:10 UTC (9:10 AM PST / 10:10 AM PDT) (future)
   workflow_dispatch:
+    inputs:
+      config-file:
+        description: "Config file path (e.g., .github/config/image/pytorch-2.11-sagemaker-cuda.yml)"
+        required: true
+        type: string
 
 concurrency:
   group: ${{ github.workflow }}
@@ -15,10 +20,29 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cuda.yml"
 
 jobs:
+  determine-config:
+    runs-on: ubuntu-latest
+    outputs:
+      config-file: ${{ steps.config.outputs.config-file }}
+    steps:
+      - name: Determine config file
+        id: config
+        env: { INPUT_CONFIG: "${{ inputs.config-file }}", CRON: "${{ github.event.schedule }}" }  # env keeps event text out of the shell source
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "config-file=${INPUT_CONFIG}" >> "$GITHUB_OUTPUT"
+          else
+            case "$CRON" in
+              "00 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.11-sagemaker-cuda.yml" >> "$GITHUB_OUTPUT" ;;
+              # "10 17 * * 1,3") echo "config-file=.github/config/image/pytorch-2.12-sagemaker-cuda.yml" >> "$GITHUB_OUTPUT" ;;
+              *) echo "::error::Unknown cron: $CRON"; exit 1 ;;
+            esac
+          fi
+
   load-config:
+    needs: [determine-config]
     runs-on: ubuntu-latest
     outputs:
       config: ${{ steps.load.outputs.config }}
@@ -41,7 +65,7 @@ jobs:
         id: load
         uses: ./.github/actions/load-config
         with:
-          config-file: ${{ env.CONFIG_FILE }}
+          config-file: ${{ needs.determine-config.outputs.config-file }}
 
       - name: Parse configuration
         id: parse
@@ -83,10 +107,11 @@ jobs:
 
       - name: Fetch cached wheels
         run: |
-          source docker/pytorch/versions-cuda.env
-          mkdir -p docker/pytorch/wheels
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          mkdir -p docker/pytorch/${VERSION}/wheels
           bash scripts/pytorch/fetch_cached_wheels.sh \
-            docker/pytorch/wheels \
+            docker/pytorch/${VERSION}/wheels \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
@@ -96,7 +121,8 @@ jobs:
       - name: Build sagemaker image
         id: build-sagemaker
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
           CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-training-${TORCH_VERSION}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-${{ github.run_id }}"
 
           # Derive label values to match check_labels.py expectations
@@ -106,6 +132,7 @@ jobs:
           OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-')
 
           docker buildx build --progress plain \
+            --build-arg DLC_PYTORCH_VERSION=${VERSION} \
             --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
             --build-arg CUDA_VERSION=${CUDA_VERSION} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
@@ -129,17 +156,19 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target sagemaker \
-            -f docker/pytorch/Dockerfile.cuda .
+            -f docker/pytorch/${VERSION}/Dockerfile.cuda .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
       - name: Upload built wheels to cache
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
+          source docker/pytorch/${VERSION}/versions-cuda.env
           bash scripts/pytorch/upload_cached_wheels.sh \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "${{ steps.build-sagemaker.outputs.image-uri }}" \
+            "docker/pytorch/${VERSION}/Dockerfile.cuda" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
             "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \
         continue-on-error: true
@@ -184,7 +213,7 @@ jobs:
       container-type: ${{ needs.load-config.outputs.container-type }}
 
   unit-test:
-    needs: [build-image]
+    needs: [load-config, build-image]
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
@@ -201,10 +230,11 @@ jobs:
 
       - name: Run unit tests
         run: |
+          VERSION=$(echo "${{ needs.load-config.outputs.framework-version }}" | cut -d. -f1,2)
           IMAGE="${{ needs.build-image.outputs.ci-image }}"
           docker pull ${IMAGE}
           CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
             -v $(pwd):/workdir --workdir /workdir \
             ${IMAGE} -c 'sleep infinity')
           docker exec ${CONTAINER_ID} pip install pytest -q
diff --git a/.github/workflows/pr-pytorch-ec2-cpu.yml b/.github/workflows/pr-pytorch-ec2-cpu.yml
index a6264f2df988..4fba570ea808 100644
--- a/.github/workflows/pr-pytorch-ec2-cpu.yml
+++ b/.github/workflows/pr-pytorch-ec2-cpu.yml
@@ -5,9 +5,11 @@ on:
     branches: [main]
     types: [opened, reopened, synchronize]
     paths:
-      - ".github/config/image/pytorch-ec2-cpu.yml"
+      - ".github/config/image/pytorch-*-ec2-cpu.yml"
       - ".github/workflows/pr-pytorch-ec2-cpu.yml"
-      - "docker/pytorch/**"
+      - "docker/pytorch/*/Dockerfile.cpu"
+      - "docker/pytorch/*/cpu/**"
+      - "docker/pytorch/*/versions-cpu.env"
       - "scripts/common/**"
       - "scripts/pytorch/**"
       - "scripts/telemetry/**"
@@ -22,7 +24,7 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-ec2-cpu.yml"
+  LATEST_PYTORCH_VERSION: "2.11"
 
 jobs:
   # ============================================================
@@ -44,61 +46,17 @@ jobs:
         uses: ./.github/actions/pr-permission-gate
 
   # ============================================================
-  # Load configuration from YAML
+  # Detect all changed PyTorch versions + file changes
   # ============================================================
-  load-config:
-    needs: [gatekeeper]
-    if: success()
-    runs-on: ubuntu-latest
-    outputs:
-      framework: ${{ steps.parse.outputs.framework }}
-      framework-version: ${{ steps.parse.outputs.framework-version }}
-      python-version: ${{ steps.parse.outputs.python-version }}
-      cuda-version: ${{ steps.parse.outputs.cuda-version }}
-      os-version: ${{ steps.parse.outputs.os-version }}
-      container-type: ${{ steps.parse.outputs.container-type }}
-      device-type: ${{ steps.parse.outputs.device-type }}
-      arch-type: ${{ steps.parse.outputs.arch-type }}
-      contributor: ${{ steps.parse.outputs.contributor }}
-      customer-type: ${{ steps.parse.outputs.customer-type }}
-      prod-image: ${{ steps.parse.outputs.prod-image }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: Load configuration
-        id: load
-        uses: ./.github/actions/load-config
-        with:
-          config-file: ${{ env.CONFIG_FILE }}
-
-      - name: Parse configuration
-        id: parse
-        run: |
-          echo '${{ steps.load.outputs.config }}' > config.json
-          echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT
-          echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT
-          echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT
-          echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT
-          echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT
-          echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT
-          echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT
-          echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT
-          echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT
-          echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT
-          echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT
-
-  # ============================================================
-  # Pre-commit + change detection
-  # ============================================================
-  check-changes:
+  detect-versions:
     needs: [gatekeeper]
     if: success()
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
+      versions: ${{ steps.versions.outputs.versions }}
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
@@ -116,15 +74,33 @@ jobs:
         with:
           extra_args: --all-files
 
+      - name: Detect PyTorch versions
+        id: versions
+        run: |
+          VERSIONS=$(git diff --name-only origin/main...HEAD \
+            | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \
+            | sort -u)
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS=$(git diff --name-only origin/main...HEAD \
+              | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \
+              | sort -u)
+          fi
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS="$LATEST_PYTORCH_VERSION"
+          fi
+          JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "versions=${JSON}" >> $GITHUB_OUTPUT
+          echo "Detected versions: ${JSON}"
+
       - name: Detect file changes
         id: changes
         uses: dorny/paths-filter@v4
         with:
           filters: |
             build-change:
-              - ".github/config/image/pytorch-ec2-cpu.yml"
-              - "docker/pytorch/Dockerfile.cpu"
-              - "docker/pytorch/cpu/**"
+              - ".github/config/image/pytorch-*-ec2-cpu.yml"
+              - "docker/pytorch/*/Dockerfile.cpu"
+              - "docker/pytorch/*/cpu/**"
               - "scripts/common/setup_oss_compliance.sh"
               - "scripts/pytorch/configure_ssh.sh"
               - "scripts/telemetry/bash_telemetry.sh.template"
@@ -134,20 +110,35 @@ jobs:
               - "test/telemetry/**"
 
   # ============================================================
-  # Build CPU runtime image
+  # Build CPU images (matrix over detected versions)
   # ============================================================
-  build-image:
-    needs: [check-changes, load-config]
-    if: needs.check-changes.outputs.build-change == 'true'
+  build-images:
+    needs: [detect-versions]
+    if: needs.detect-versions.outputs.build-change == 'true'
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
       runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }}
+      framework: ${{ steps.config.outputs.framework }}
+      framework-version: ${{ steps.config.outputs.framework-version }}
+      python-version: ${{ steps.config.outputs.python-version }}
+      cuda-version: ${{ steps.config.outputs.cuda-version }}
+      os-version: ${{ steps.config.outputs.os-version }}
+      container-type: ${{ steps.config.outputs.container-type }}
+      device-type: ${{ steps.config.outputs.device-type }}
+      arch-type: ${{ steps.config.outputs.arch-type }}
+      contributor: ${{ steps.config.outputs.contributor }}
+      customer-type: ${{ steps.config.outputs.customer-type }}
+      prod-image: ${{ steps.config.outputs.prod-image }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v5
@@ -161,14 +152,39 @@ jobs:
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
 
+      - name: Install yq
+        run: |
+          if ! command -v yq &> /dev/null; then
+            sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
+            sudo chmod +x /usr/local/bin/yq
+          fi
+
+      - name: Load and parse config
+        id: config
+        run: |
+          CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cpu.yml"
+          echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "device-type=$(yq '.common.device_type // "cpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+
       - name: Build runtime image
         id: build-runtime
         run: |
-          source docker/pytorch/versions-cpu.env
-          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-pr-${{ github.event.pull_request.number }}"
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cpu.env
+          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}"
 
           docker buildx build --progress plain \
-            --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
+            --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \
+            --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
             --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \
@@ -179,99 +195,86 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target runtime \
-            -f docker/pytorch/Dockerfile.cpu .
+            -f docker/pytorch/${VERSION}/Dockerfile.cpu .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
+      - name: Run unit tests
+        run: |
+          VERSION="${{ matrix.version }}"
+          IMAGE="${{ steps.build-runtime.outputs.image-uri }}"
+          docker pull ${IMAGE}
+          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
+            -v $(pwd):/workdir --workdir /workdir \
+            ${IMAGE} -c 'sleep infinity')
+          docker exec ${CONTAINER_ID} pip install pytest -q
+          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
+          docker kill ${CONTAINER_ID}
+
   # ============================================================
   # Sanity tests
   # ============================================================
   sanity-test:
-    needs: [check-changes, build-image, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true')
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-sanity-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      python-version: ${{ needs.load-config.outputs.python-version }}
-      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
-      os-version: ${{ needs.load-config.outputs.os-version }}
-      customer-type: ${{ needs.load-config.outputs.customer-type }}
-      arch-type: ${{ needs.load-config.outputs.arch-type }}
-      device-type: ${{ needs.load-config.outputs.device-type }}
-      contributor: ${{ needs.load-config.outputs.contributor }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      python-version: ${{ needs.build-images.outputs.python-version }}
+      cuda-version: ${{ needs.build-images.outputs.cuda-version }}
+      os-version: ${{ needs.build-images.outputs.os-version }}
+      customer-type: ${{ needs.build-images.outputs.customer-type }}
+      arch-type: ${{ needs.build-images.outputs.arch-type }}
+      device-type: ${{ needs.build-images.outputs.device-type }}
+      contributor: ${{ needs.build-images.outputs.contributor }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # Security tests
   # ============================================================
   security-test:
-    needs: [build-image, load-config]
+    needs: [detect-versions, build-images]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-security-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
 
   # ============================================================
   # Telemetry tests
   # ============================================================
   telemetry-test:
-    needs: [check-changes, build-image, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
-    concurrency:
-      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
-      cancel-in-progress: false
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-telemetry-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
-
-  # ============================================================
-  # Unit tests
-  # ============================================================
-  unit-test:
-    needs: [build-image]
-    if: success()
-    runs-on:
-      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
-        fleet:default-runner
-        buildspec-override:true
-    concurrency:
-      group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
-        with:
-          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Run unit tests
-        run: |
-          IMAGE="${{ needs.build-image.outputs.runtime-image-uri }}"
-          docker pull ${IMAGE}
-          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
-            -v $(pwd):/workdir --workdir /workdir \
-            ${IMAGE} -c 'sleep infinity')
-          docker exec ${CONTAINER_ID} pip install pytest -q
-          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
-          docker kill ${CONTAINER_ID}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml
index 09d07e1b68ae..4d0d3925ce74 100644
--- a/.github/workflows/pr-pytorch-ec2-cuda.yml
+++ b/.github/workflows/pr-pytorch-ec2-cuda.yml
@@ -5,9 +5,11 @@ on:
     branches: [main]
     types: [opened, reopened, synchronize]
     paths:
-      - ".github/config/image/pytorch-ec2-cuda.yml"
+      - ".github/config/image/pytorch-*-ec2-cuda.yml"
       - ".github/workflows/pr-pytorch-ec2-cuda.yml"
-      - "docker/pytorch/**"
+      - "docker/pytorch/*/Dockerfile.cuda"
+      - "docker/pytorch/*/cuda/**"
+      - "docker/pytorch/*/versions-cuda.env"
       - "scripts/common/**"
       - "scripts/pytorch/**"
       - "scripts/telemetry/**"
@@ -23,9 +25,7 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-
-  # Config file path
-  CONFIG_FILE: ".github/config/image/pytorch-ec2-cuda.yml"
+  LATEST_PYTORCH_VERSION: "2.11"
 
 jobs:
   # ============================================================
@@ -47,61 +47,17 @@ jobs:
         uses: ./.github/actions/pr-permission-gate
 
   # ============================================================
-  # Load configuration from YAML
+  # Detect all changed PyTorch versions + file changes
   # ============================================================
-  load-config:
-    needs: [gatekeeper]
-    if: success()
-    runs-on: ubuntu-latest
-    outputs:
-      framework: ${{ steps.parse.outputs.framework }}
-      framework-version: ${{ steps.parse.outputs.framework-version }}
-      python-version: ${{ steps.parse.outputs.python-version }}
-      cuda-version: ${{ steps.parse.outputs.cuda-version }}
-      os-version: ${{ steps.parse.outputs.os-version }}
-      container-type: ${{ steps.parse.outputs.container-type }}
-      device-type: ${{ steps.parse.outputs.device-type }}
-      arch-type: ${{ steps.parse.outputs.arch-type }}
-      contributor: ${{ steps.parse.outputs.contributor }}
-      customer-type: ${{ steps.parse.outputs.customer-type }}
-      prod-image: ${{ steps.parse.outputs.prod-image }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: Load configuration
-        id: load
-        uses: ./.github/actions/load-config
-        with:
-          config-file: ${{ env.CONFIG_FILE }}
-
-      - name: Parse configuration
-        id: parse
-        run: |
-          echo '${{ steps.load.outputs.config }}' > config.json
-          echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT
-          echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT
-          echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT
-          echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT
-          echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT
-          echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT
-          echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT
-          echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT
-          echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT
-          echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT
-          echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT
-
-  # ============================================================
-  # Pre-commit + change detection
-  # ============================================================
-  check-changes:
+  detect-versions:
     needs: [gatekeeper]
     if: success()
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
+      versions: ${{ steps.versions.outputs.versions }}
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
@@ -119,15 +75,33 @@ jobs:
         with:
           extra_args: --all-files
 
+      - name: Detect PyTorch versions
+        id: versions
+        run: |
+          VERSIONS=$(git diff --name-only origin/main...HEAD \
+            | grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' \
+            | sort -u)
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS=$(git diff --name-only origin/main...HEAD \
+              | grep -oP 'pytorch-\K[0-9]+\.[0-9]+' \
+              | sort -u)
+          fi
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS="$LATEST_PYTORCH_VERSION"
+          fi
+          JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "versions=${JSON}" >> $GITHUB_OUTPUT
+          echo "Detected versions: ${JSON}"
+
       - name: Detect file changes
         id: changes
         uses: dorny/paths-filter@v4
         with:
           filters: |
             build-change:
-              - ".github/config/image/pytorch-ec2-cuda.yml"
-              - "docker/pytorch/Dockerfile.cuda"
-              - "docker/pytorch/cuda/**"
+              - ".github/config/image/pytorch-*-ec2-cuda.yml"
+              - "docker/pytorch/*/Dockerfile.cuda"
+              - "docker/pytorch/*/cuda/**"
               - "scripts/common/setup_oss_compliance.sh"
               - "scripts/pytorch/*"
               - "scripts/telemetry/bash_telemetry.sh.template"
@@ -137,20 +111,35 @@ jobs:
               - "test/telemetry/**"
 
   # ============================================================
-  # Build runtime image
+  # Build images (matrix over detected versions)
   # ============================================================
   build-images:
-    needs: [check-changes, load-config]
-    if: needs.check-changes.outputs.build-change == 'true'
+    needs: [detect-versions]
+    if: needs.detect-versions.outputs.build-change == 'true'
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:x86-build-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
       runtime-image-uri: ${{ steps.build-runtime.outputs.image-uri }}
+      framework: ${{ steps.config.outputs.framework }}
+      framework-version: ${{ steps.config.outputs.framework-version }}
+      python-version: ${{ steps.config.outputs.python-version }}
+      cuda-version: ${{ steps.config.outputs.cuda-version }}
+      os-version: ${{ steps.config.outputs.os-version }}
+      container-type: ${{ steps.config.outputs.container-type }}
+      device-type: ${{ steps.config.outputs.device-type }}
+      arch-type: ${{ steps.config.outputs.arch-type }}
+      contributor: ${{ steps.config.outputs.contributor }}
+      customer-type: ${{ steps.config.outputs.customer-type }}
+      prod-image: ${{ steps.config.outputs.prod-image }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v5
@@ -164,20 +153,39 @@ jobs:
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
 
-      - name: Source versions
-        id: versions
+      - name: Install yq
+        run: |
+          if ! command -v yq &> /dev/null; then
+            sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
+            sudo chmod +x /usr/local/bin/yq
+          fi
+
+      - name: Load and parse config
+        id: config
         run: |
-          source docker/pytorch/versions-cuda.env
-          echo "torch-version=${TORCH_VERSION}" >> $GITHUB_OUTPUT
-          echo "cuda-version=${CUDA_VERSION}" >> $GITHUB_OUTPUT
-          echo "python-version=${PYTHON_VERSION}" >> $GITHUB_OUTPUT
+          CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-ec2-cuda.yml"
+          echo "Loading config from: ${CONFIG_FILE}"
+          cat "${CONFIG_FILE}"
+          echo "---"
+          echo "framework=$(yq '.common.framework' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "framework-version=$(yq '.common.framework_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "python-version=$(yq '.common.python_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "cuda-version=$(yq '.common.cuda_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "os-version=$(yq '.common.os_version' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "container-type=$(yq '.common.job_type' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "device-type=$(yq '.common.device_type // "gpu"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "arch-type=$(yq '.common.arch_type // "x86"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "contributor=$(yq '.common.contributor // "None"' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "customer-type=$(yq '.common.customer_type // ""' $CONFIG_FILE)" >> $GITHUB_OUTPUT
+          echo "prod-image=$(yq '.common.prod_image' $CONFIG_FILE)" >> $GITHUB_OUTPUT
 
       - name: Fetch cached wheels
         run: |
-          source docker/pytorch/versions-cuda.env
-          mkdir -p docker/pytorch/wheels
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          mkdir -p docker/pytorch/${VERSION}/wheels
           bash scripts/pytorch/fetch_cached_wheels.sh \
-            docker/pytorch/wheels \
+            docker/pytorch/${VERSION}/wheels \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
@@ -187,11 +195,13 @@ jobs:
       - name: Build runtime image
         id: build-runtime
         run: |
-          source docker/pytorch/versions-cuda.env
-          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-pr-${{ github.event.pull_request.number }}"
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}"
 
           docker buildx build --progress plain \
-            --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
+            --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \
+            --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \
             --build-arg CUDA_VERSION=${CUDA_VERSION} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
@@ -207,126 +217,119 @@ jobs:
             --tag ${CI_IMAGE_URI} \
             --push \
             --target runtime \
-            -f docker/pytorch/Dockerfile.cuda .
+            -f docker/pytorch/${VERSION}/Dockerfile.cuda .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
       - name: Upload built wheels to cache
         run: |
-          source docker/pytorch/versions-cuda.env
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
           bash scripts/pytorch/upload_cached_wheels.sh \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "${{ steps.build-runtime.outputs.image-uri }}" \
+            "docker/pytorch/${VERSION}/Dockerfile.cuda" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
             "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}" \
         continue-on-error: true
 
+      - name: Run unit tests
+        run: |
+          VERSION="${{ matrix.version }}"
+          IMAGE="${{ steps.build-runtime.outputs.image-uri }}"
+          docker pull ${IMAGE}
+          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION=${VERSION} \
+            -v $(pwd):/workdir --workdir /workdir \
+            ${IMAGE} -c 'sleep infinity')
+          docker exec ${CONTAINER_ID} pip install pytest -q
+          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
+          docker kill ${CONTAINER_ID}
+
   # ============================================================
   # Sanity tests (labels, filesystem, OSS compliance)
   # ============================================================
   sanity-test:
-    needs: [check-changes, build-images, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true')
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-sanity-tests.yml
     with:
-      image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      python-version: ${{ needs.load-config.outputs.python-version }}
-      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
-      os-version: ${{ needs.load-config.outputs.os-version }}
-      customer-type: ${{ needs.load-config.outputs.customer-type }}
-      arch-type: ${{ needs.load-config.outputs.arch-type }}
-      device-type: ${{ needs.load-config.outputs.device-type }}
-      contributor: ${{ needs.load-config.outputs.contributor }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      python-version: ${{ needs.build-images.outputs.python-version }}
+      cuda-version: ${{ needs.build-images.outputs.cuda-version }}
+      os-version: ${{ needs.build-images.outputs.os-version }}
+      customer-type: ${{ needs.build-images.outputs.customer-type }}
+      arch-type: ${{ needs.build-images.outputs.arch-type }}
+      device-type: ${{ needs.build-images.outputs.device-type }}
+      contributor: ${{ needs.build-images.outputs.contributor }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # Security tests (ECR scan, CVE allowlist)
   # ============================================================
   security-test:
-    needs: [build-images, load-config]
+    needs: [detect-versions, build-images]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-security-tests.yml
     with:
-      image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
 
   # ============================================================
   # Telemetry tests (opt-out, environment variables)
   # ============================================================
   telemetry-test:
-    needs: [check-changes, build-images, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
-    concurrency:
-      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
-      cancel-in-progress: false
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-telemetry-tests.yml
     with:
-      image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
-
-  # ============================================================
-  # Unit tests (CPU-only, no GPU needed)
-  # ============================================================
-  unit-test:
-    needs: [build-images]
-    if: success()
-    runs-on:
-      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
-        fleet:default-runner
-        buildspec-override:true
-    concurrency:
-      group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
-        with:
-          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Run unit tests
-        run: |
-          IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}"
-          docker pull ${IMAGE}
-          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
-            -v $(pwd):/workdir --workdir /workdir \
-            ${IMAGE} -c 'sleep infinity')
-          docker exec ${CONTAINER_ID} pip install pytest -q
-          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
-          docker kill ${CONTAINER_ID}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # Single-GPU tests
   # ============================================================
   single-gpu-test:
-    needs: [build-images, sanity-test, security-test, unit-test]
+    needs: [detect-versions, build-images, sanity-test, security-test]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:x86-g6xl-runner
         buildspec-override:true
     concurrency:
-      group: ${{ github.workflow }}-single-gpu-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-single-gpu-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     steps:
       - name: Checkout code
@@ -340,7 +343,7 @@ jobs:
 
       - name: Run single-GPU tests
         run: |
-          IMAGE="${{ needs.build-images.outputs.runtime-image-uri }}"
+          IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}"
           docker pull ${IMAGE}
           CONTAINER_ID=$(docker run -d --rm --gpus all --shm-size=2g \
             --entrypoint /bin/bash \
@@ -355,11 +358,15 @@ jobs:
   # EFA integration test (2x p4d.24xlarge, NCCL over EFA)
   # ============================================================
   efa-test:
-    needs: [build-images, sanity-test, security-test, unit-test]
+    needs: [detect-versions, build-images, sanity-test, security-test]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-efa-tests.yml
     with:
-      image-uri: ${{ needs.build-images.outputs.runtime-image-uri }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
       aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
 
@@ -407,8 +414,6 @@ jobs:
   #
   #         docker kill ${CONTAINER_ID}
 
-  # ============================================================
-  # Multi-node tests (need 2+ containers on Docker network)
   # ============================================================
   # Multi-node tests (need 2+ containers on Docker network)
   # TODO: Re-enable when GPU capacity is available
diff --git a/.github/workflows/pr-pytorch-sagemaker-cpu.yml b/.github/workflows/pr-pytorch-sagemaker-cpu.yml
index 5cbb926d7917..7bb491b2510a 100644
--- a/.github/workflows/pr-pytorch-sagemaker-cpu.yml
+++ b/.github/workflows/pr-pytorch-sagemaker-cpu.yml
@@ -5,9 +5,11 @@ on:
     branches: [main]
     types: [opened, reopened, synchronize]
     paths:
-      - ".github/config/image/pytorch-sagemaker-cpu.yml"
+      - ".github/config/image/pytorch-*-sagemaker-cpu.yml"
       - ".github/workflows/pr-pytorch-sagemaker-cpu.yml"
-      - "docker/pytorch/**"
+      - "docker/pytorch/*/Dockerfile.cpu"
+      - "docker/pytorch/*/cpu/**"
+      - "docker/pytorch/*/versions-cpu.env"
       - "scripts/common/**"
       - "scripts/pytorch/**"
       - "scripts/telemetry/**"
@@ -22,7 +24,7 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-  CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cpu.yml"
+  LATEST_PYTORCH_VERSION: "2.11"
 
 jobs:
   # ============================================================
@@ -44,61 +46,17 @@ jobs:
         uses: ./.github/actions/pr-permission-gate
 
   # ============================================================
-  # Load configuration from YAML
+  # Detect all changed PyTorch versions + file changes
   # ============================================================
-  load-config:
-    needs: [gatekeeper]
-    if: success()
-    runs-on: ubuntu-latest
-    outputs:
-      framework: ${{ steps.parse.outputs.framework }}
-      framework-version: ${{ steps.parse.outputs.framework-version }}
-      python-version: ${{ steps.parse.outputs.python-version }}
-      cuda-version: ${{ steps.parse.outputs.cuda-version }}
-      os-version: ${{ steps.parse.outputs.os-version }}
-      container-type: ${{ steps.parse.outputs.container-type }}
-      device-type: ${{ steps.parse.outputs.device-type }}
-      arch-type: ${{ steps.parse.outputs.arch-type }}
-      contributor: ${{ steps.parse.outputs.contributor }}
-      customer-type: ${{ steps.parse.outputs.customer-type }}
-      prod-image: ${{ steps.parse.outputs.prod-image }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: Load configuration
-        id: load
-        uses: ./.github/actions/load-config
-        with:
-          config-file: ${{ env.CONFIG_FILE }}
-
-      - name: Parse configuration
-        id: parse
-        run: |
-          echo '${{ steps.load.outputs.config }}' > config.json
-          echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT
-          echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT
-          echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT
-          echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT
-          echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT
-          echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT
-          echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)" >> $GITHUB_OUTPUT
-          echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT
-          echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT
-          echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT
-          echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT
-
-  # ============================================================
-  # Pre-commit + change detection
-  # ============================================================
-  check-changes:
+  detect-versions:
     needs: [gatekeeper]
     if: success()
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
+      versions: ${{ steps.versions.outputs.versions }}
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
@@ -116,15 +74,33 @@ jobs:
         with:
           extra_args: --all-files
 
+      - name: Detect PyTorch versions
+        id: versions
+        run: |
+          # Version sources, in priority order: docker/pytorch/<ver>/ paths, then
+          # pytorch-<ver>-* file names, then LATEST_PYTORCH_VERSION. The "|| true"
+          # guards keep an empty grep from aborting the step under errexit/pipefail.
+          CHANGED=$(git diff --name-only origin/main...HEAD || true)
+          VERSIONS=$(grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' <<< "$CHANGED" \
+            | sort -u || true)
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS=$(grep -oP 'pytorch-\K[0-9]+\.[0-9]+' <<< "$CHANGED" \
+              | sort -u || true)
+          fi
+          VERSIONS="${VERSIONS:-$LATEST_PYTORCH_VERSION}"
+          JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "versions=${JSON}" >> "$GITHUB_OUTPUT"
+          echo "Detected versions: ${JSON}"
+
       - name: Detect file changes
         id: changes
         uses: dorny/paths-filter@v4
         with:
           filters: |
             build-change:
-              - ".github/config/image/pytorch-sagemaker-cpu.yml"
-              - "docker/pytorch/Dockerfile.cpu"
-              - "docker/pytorch/cpu/**"
+              - ".github/config/image/pytorch-*-sagemaker-cpu.yml"
+              - "docker/pytorch/*/Dockerfile.cpu"
+              - "docker/pytorch/*/cpu/**"
               - "scripts/common/setup_oss_compliance.sh"
               - "scripts/pytorch/configure_ssh.sh"
               - "scripts/pytorch/changehostname.c"
@@ -136,20 +112,35 @@ jobs:
               - "test/telemetry/**"
 
   # ============================================================
-  # Build CPU SageMaker image
+  # Build CPU SageMaker images (matrix over detected versions)
   # ============================================================
-  build-image:
-    needs: [check-changes, load-config]
-    if: needs.check-changes.outputs.build-change == 'true'
+  build-images:
+    needs: [detect-versions]
+    if: needs.detect-versions.outputs.build-change == 'true'
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
       sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }}
+      framework: ${{ steps.config.outputs.framework }}  # NOTE(review): build-images is a matrix job; per GitHub docs the last leg to finish overwrites same-named outputs, so per-version consumers may read another version's config - confirm
+      framework-version: ${{ steps.config.outputs.framework-version }}
+      python-version: ${{ steps.config.outputs.python-version }}
+      cuda-version: ${{ steps.config.outputs.cuda-version }}
+      os-version: ${{ steps.config.outputs.os-version }}
+      container-type: ${{ steps.config.outputs.container-type }}
+      device-type: ${{ steps.config.outputs.device-type }}
+      arch-type: ${{ steps.config.outputs.arch-type }}
+      contributor: ${{ steps.config.outputs.contributor }}
+      customer-type: ${{ steps.config.outputs.customer-type }}
+      prod-image: ${{ steps.config.outputs.prod-image }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v5
@@ -163,19 +154,44 @@ jobs:
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
 
+      - name: Install yq
+        run: |
+          if ! command -v yq &> /dev/null; then  # pinned release: "latest" is unreproducible and installs an unvetted binary as root
+            sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
+            sudo chmod +x /usr/local/bin/yq
+          fi
+
+      - name: Load and parse config
+        id: config
+        run: |
+          CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cpu.yml"
+          # A yq failure inside "echo $(...)" does not fail the step, so a missing file
+          # would silently produce empty outputs; check for it explicitly.
+          [ -f "$CONFIG_FILE" ] || { echo "::error::Config file not found: $CONFIG_FILE"; exit 1; }
+          # Keys without a default, written as <output-name>:<key under .common>.
+          for pair in framework:framework framework-version:framework_version python-version:python_version cuda-version:cuda_version os-version:os_version container-type:job_type prod-image:prod_image; do
+            echo "${pair%%:*}=$(yq ".common.${pair#*:}" "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          done
+          echo "device-type=$(yq '.common.device_type // "cpu"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "arch-type=$(yq '.common.arch_type // "x86"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "contributor=$(yq '.common.contributor // "None"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "customer-type=$(yq '.common.customer_type // ""' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+
       - name: Build sagemaker image
         id: build-sagemaker
         run: |
-          source docker/pytorch/versions-cpu.env
-          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-pr-${{ github.event.pull_request.number }}"
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cpu.env
+          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}"
 
           # Derive label values to match check_labels.py expectations
-          FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-')
-          FWK_VER_LABEL=$(echo "${{ needs.load-config.outputs.framework-version }}" | tr '.' '-')
-          OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-')
+          FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-')
+          FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-')
+          OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-')
 
           docker buildx build --progress plain \
-            --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
+            --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \
+            --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
             --build-arg DLC_MAJOR_VERSION=${DLC_MAJOR_VERSION} \
@@ -183,105 +199,95 @@ jobs:
             --build-arg OPEN_MPI_VERSION=${OPEN_MPI_VERSION} \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.device.cpu=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type }}=true" \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \
             --cache-to=type=inline \
             --cache-from=type=registry,ref=${CI_IMAGE_URI} \
             --tag ${CI_IMAGE_URI} \
             --push \
             --target sagemaker \
-            -f docker/pytorch/Dockerfile.cpu .
+            -f docker/pytorch/${VERSION}/Dockerfile.cpu .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
+      - name: Run unit tests
+        run: |
+          VERSION="${{ matrix.version }}"
+          IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}"
+          docker pull "${IMAGE}"
+          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION="${VERSION}" \
+            -v "$(pwd)":/workdir --workdir /workdir \
+            "${IMAGE}" -c 'sleep infinity')
+          trap 'docker kill "${CONTAINER_ID}"' EXIT  # stop the container even when pip/pytest fails
+          docker exec "${CONTAINER_ID}" pip install pytest -q
+          docker exec "${CONTAINER_ID}" pytest /workdir/test/pytorch/unit/ -v
+
   # ============================================================
   # Sanity tests
   # ============================================================
   sanity-test:
-    needs: [check-changes, build-image, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true')
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-sanity-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      python-version: ${{ needs.load-config.outputs.python-version }}
-      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
-      os-version: ${{ needs.load-config.outputs.os-version }}
-      customer-type: ${{ needs.load-config.outputs.customer-type }}
-      arch-type: ${{ needs.load-config.outputs.arch-type }}
-      device-type: ${{ needs.load-config.outputs.device-type }}
-      contributor: ${{ needs.load-config.outputs.contributor }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      python-version: ${{ needs.build-images.outputs.python-version }}
+      cuda-version: ${{ needs.build-images.outputs.cuda-version }}
+      os-version: ${{ needs.build-images.outputs.os-version }}
+      customer-type: ${{ needs.build-images.outputs.customer-type }}
+      arch-type: ${{ needs.build-images.outputs.arch-type }}
+      device-type: ${{ needs.build-images.outputs.device-type }}
+      contributor: ${{ needs.build-images.outputs.contributor }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # Security tests
   # ============================================================
   security-test:
-    needs: [build-image, load-config]
+    needs: [detect-versions, build-images]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-security-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-
-  # ============================================================
-  # Unit tests
-  # ============================================================
-  unit-test:
-    needs: [build-image]
-    if: success()
-    runs-on:
-      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
-        fleet:default-runner
-        buildspec-override:true
-    concurrency:
-      group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
-        with:
-          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Run unit tests
-        run: |
-          IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}"
-          docker pull ${IMAGE}
-          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
-            -v $(pwd):/workdir --workdir /workdir \
-            ${IMAGE} -c 'sleep infinity')
-          docker exec ${CONTAINER_ID} pip install pytest -q
-          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
-          docker kill ${CONTAINER_ID}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
 
   # ============================================================
-  # SageMaker integration tests (CPU — gloo backend)
+  # SageMaker integration tests (CPU -- gloo backend)
   # ============================================================
   sagemaker-test:
-    needs: [build-image, sanity-test, security-test, unit-test]
+    needs: [detect-versions, build-images, sanity-test, security-test]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     steps:
       - name: Checkout code
@@ -300,7 +306,7 @@ jobs:
       - name: Run SageMaker CPU training tests
         env:
           PYTHONPATH: ${{ github.workspace }}/test
-          TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }}
+          TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-cpu-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
           SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole
         run: |
           pytest test/pytorch/integration/sagemaker/test_sm_training_cpu.py -v
diff --git a/.github/workflows/pr-pytorch-sagemaker-cuda.yml b/.github/workflows/pr-pytorch-sagemaker-cuda.yml
index e8a6249d4559..2bcafa1a65d7 100644
--- a/.github/workflows/pr-pytorch-sagemaker-cuda.yml
+++ b/.github/workflows/pr-pytorch-sagemaker-cuda.yml
@@ -5,9 +5,11 @@ on:
     branches: [main]
     types: [opened, reopened, synchronize]
     paths:
-      - ".github/config/image/pytorch-sagemaker-cuda.yml"
+      - ".github/config/image/pytorch-*-sagemaker-cuda.yml"
       - ".github/workflows/pr-pytorch-sagemaker-cuda.yml"
-      - "docker/pytorch/**"
+      - "docker/pytorch/*/Dockerfile.cuda"
+      - "docker/pytorch/*/cuda/**"
+      - "docker/pytorch/*/versions-cuda.env"
       - "scripts/common/**"
       - "scripts/pytorch/**"
       - "scripts/telemetry/**"
@@ -22,9 +24,7 @@ permissions:
 
 env:
   FORCE_COLOR: "1"
-
-  # Config file path
-  CONFIG_FILE: ".github/config/image/pytorch-sagemaker-cuda.yml"
+  LATEST_PYTORCH_VERSION: "2.11"
 
 jobs:
   # ============================================================
@@ -46,61 +46,17 @@ jobs:
         uses: ./.github/actions/pr-permission-gate
 
   # ============================================================
-  # Load configuration from YAML
+  # Detect all changed PyTorch versions + file changes
   # ============================================================
-  load-config:
-    needs: [gatekeeper]
-    if: success()
-    runs-on: ubuntu-latest
-    outputs:
-      framework: ${{ steps.parse.outputs.framework }}
-      framework-version: ${{ steps.parse.outputs.framework-version }}
-      python-version: ${{ steps.parse.outputs.python-version }}
-      cuda-version: ${{ steps.parse.outputs.cuda-version }}
-      os-version: ${{ steps.parse.outputs.os-version }}
-      container-type: ${{ steps.parse.outputs.container-type }}
-      device-type: ${{ steps.parse.outputs.device-type }}
-      arch-type: ${{ steps.parse.outputs.arch-type }}
-      contributor: ${{ steps.parse.outputs.contributor }}
-      customer-type: ${{ steps.parse.outputs.customer-type }}
-      prod-image: ${{ steps.parse.outputs.prod-image }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: Load configuration
-        id: load
-        uses: ./.github/actions/load-config
-        with:
-          config-file: ${{ env.CONFIG_FILE }}
-
-      - name: Parse configuration
-        id: parse
-        run: |
-          echo '${{ steps.load.outputs.config }}' > config.json
-          echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT
-          echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT
-          echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT
-          echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT
-          echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT
-          echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT
-          echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT
-          echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT
-          echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT
-          echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT
-          echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT
-
-  # ============================================================
-  # Pre-commit + change detection
-  # ============================================================
-  check-changes:
+  detect-versions:
     needs: [gatekeeper]
     if: success()
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-detect-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
+      versions: ${{ steps.versions.outputs.versions }}
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
@@ -118,15 +74,33 @@ jobs:
         with:
           extra_args: --all-files
 
+      - name: Detect PyTorch versions
+        id: versions
+        run: |
+          # Version sources, in priority order: docker/pytorch/<ver>/ paths, then
+          # pytorch-<ver>-* file names, then LATEST_PYTORCH_VERSION. The "|| true"
+          # guards keep an empty grep from aborting the step under errexit/pipefail.
+          CHANGED=$(git diff --name-only origin/main...HEAD || true)
+          VERSIONS=$(grep -oP 'docker/pytorch/\K[0-9]+\.[0-9]+' <<< "$CHANGED" \
+            | sort -u || true)
+          if [ -z "$VERSIONS" ]; then
+            VERSIONS=$(grep -oP 'pytorch-\K[0-9]+\.[0-9]+' <<< "$CHANGED" \
+              | sort -u || true)
+          fi
+          VERSIONS="${VERSIONS:-$LATEST_PYTORCH_VERSION}"
+          JSON=$(echo "$VERSIONS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "versions=${JSON}" >> "$GITHUB_OUTPUT"
+          echo "Detected versions: ${JSON}"
+
       - name: Detect file changes
         id: changes
         uses: dorny/paths-filter@v4
         with:
           filters: |
             build-change:
-              - ".github/config/image/pytorch-sagemaker-cuda.yml"
-              - "docker/pytorch/Dockerfile.cuda"
-              - "docker/pytorch/cuda/**"
+              - ".github/config/image/pytorch-*-sagemaker-cuda.yml"
+              - "docker/pytorch/*/Dockerfile.cuda"
+              - "docker/pytorch/*/cuda/**"
               - "scripts/common/setup_oss_compliance.sh"
               - "scripts/pytorch/*"
               - "scripts/telemetry/bash_telemetry.sh.template"
@@ -136,20 +110,35 @@ jobs:
               - "test/telemetry/**"
 
   # ============================================================
-  # Build SageMaker image
+  # Build SageMaker images (matrix over detected versions)
   # ============================================================
-  build-image:
-    needs: [check-changes, load-config]
-    if: needs.check-changes.outputs.build-change == 'true'
+  build-images:
+    needs: [detect-versions]
+    if: needs.detect-versions.outputs.build-change == 'true'
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:x86-build-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-build-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
       sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }}
+      framework: ${{ steps.config.outputs.framework }}  # NOTE(review): build-images is a matrix job; per GitHub docs the last leg to finish overwrites same-named outputs, so per-version consumers may read another version's config - confirm
+      framework-version: ${{ steps.config.outputs.framework-version }}
+      python-version: ${{ steps.config.outputs.python-version }}
+      cuda-version: ${{ steps.config.outputs.cuda-version }}
+      os-version: ${{ steps.config.outputs.os-version }}
+      container-type: ${{ steps.config.outputs.container-type }}
+      device-type: ${{ steps.config.outputs.device-type }}
+      arch-type: ${{ steps.config.outputs.arch-type }}
+      contributor: ${{ steps.config.outputs.contributor }}
+      customer-type: ${{ steps.config.outputs.customer-type }}
+      prod-image: ${{ steps.config.outputs.prod-image }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v5
@@ -163,12 +152,36 @@ jobs:
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
 
+      - name: Install yq
+        run: |
+          if ! command -v yq &> /dev/null; then  # pinned release: "latest" is unreproducible and installs an unvetted binary as root
+            sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
+            sudo chmod +x /usr/local/bin/yq
+          fi
+
+      - name: Load and parse config
+        id: config
+        run: |
+          CONFIG_FILE=".github/config/image/pytorch-${{ matrix.version }}-sagemaker-cuda.yml"
+          # A yq failure inside "echo $(...)" does not fail the step, so a missing file
+          # would silently produce empty outputs; check for it explicitly.
+          [ -f "$CONFIG_FILE" ] || { echo "::error::Config file not found: $CONFIG_FILE"; exit 1; }
+          # Keys without a default, written as <output-name>:<key under .common>.
+          for pair in framework:framework framework-version:framework_version python-version:python_version cuda-version:cuda_version os-version:os_version container-type:job_type prod-image:prod_image; do
+            echo "${pair%%:*}=$(yq ".common.${pair#*:}" "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          done
+          echo "device-type=$(yq '.common.device_type // "gpu"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "arch-type=$(yq '.common.arch_type // "x86"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "contributor=$(yq '.common.contributor // "None"' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+          echo "customer-type=$(yq '.common.customer_type // ""' "$CONFIG_FILE")" >> "$GITHUB_OUTPUT"
+
       - name: Fetch cached wheels
         run: |
-          source docker/pytorch/versions-cuda.env
-          mkdir -p docker/pytorch/wheels
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          mkdir -p docker/pytorch/${VERSION}/wheels
           bash scripts/pytorch/fetch_cached_wheels.sh \
-            docker/pytorch/wheels \
+            docker/pytorch/${VERSION}/wheels \
             "${{ vars.WHEEL_CACHE_BUCKET }}" \
             "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
             "flash-attn:${FLASH_ATTN_VERSION}" \
@@ -178,17 +191,19 @@ jobs:
       - name: Build sagemaker image
         id: build-sagemaker
         run: |
-          source docker/pytorch/versions-cuda.env
-          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-pr-${{ github.event.pull_request.number }}"
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          CI_IMAGE_URI="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:pytorch-runtime-sagemaker-${{ matrix.version }}-pr-${{ github.event.pull_request.number }}"
 
           # Derive label values to match check_labels.py expectations
-          FRAMEWORK_LABEL=$(echo "${{ needs.load-config.outputs.framework }}" | tr '_' '-')
-          FWK_VER_LABEL=$(echo "${{ needs.load-config.outputs.framework-version }}" | tr '.' '-')
-          CUDA_LABEL="${{ needs.load-config.outputs.cuda-version }}"
-          OS_LABEL=$(echo "${{ needs.load-config.outputs.os-version }}" | tr '.' '-')
+          FRAMEWORK_LABEL=$(echo "${{ steps.config.outputs.framework }}" | tr '_' '-')
+          FWK_VER_LABEL=$(echo "${{ steps.config.outputs.framework-version }}" | tr '.' '-')
+          CUDA_LABEL="${{ steps.config.outputs.cuda-version }}"
+          OS_LABEL=$(echo "${{ steps.config.outputs.os-version }}" | tr '.' '-')
 
           docker buildx build --progress plain \
-            --build-arg FRAMEWORK=${{ needs.load-config.outputs.framework }} \
+            --build-arg DLC_PYTORCH_VERSION=${{ matrix.version }} \
+            --build-arg FRAMEWORK=${{ steps.config.outputs.framework }} \
             --build-arg CUDA_VERSION=${CUDA_VERSION} \
             --build-arg PYTHON_VERSION=${PYTHON_VERSION} \
             --build-arg TORCH_VERSION=${TORCH_VERSION} \
@@ -201,125 +216,129 @@ jobs:
             --build-arg MAX_JOBS=${MAX_JOBS} \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.device.gpu.${CUDA_LABEL}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ needs.load-config.outputs.container-type }}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ needs.load-config.outputs.arch-type }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${{ steps.config.outputs.container-type }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${{ steps.config.outputs.arch-type }}=true" \
             --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ needs.load-config.outputs.python-version }}=true" \
-            --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ needs.load-config.outputs.contributor }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${{ steps.config.outputs.python-version }}=true" \
+            --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${{ steps.config.outputs.contributor }}=true" \
             --cache-to=type=inline \
             --cache-from=type=registry,ref=${CI_IMAGE_URI} \
             --tag ${CI_IMAGE_URI} \
             --push \
             --target sagemaker \
-            -f docker/pytorch/Dockerfile.cuda .
+            -f docker/pytorch/${VERSION}/Dockerfile.cuda .
 
           echo "image-uri=${CI_IMAGE_URI}" >> $GITHUB_OUTPUT
 
+      - name: Upload built wheels to cache
+        run: |
+          VERSION="${{ matrix.version }}"
+          source docker/pytorch/${VERSION}/versions-cuda.env
+          bash scripts/pytorch/upload_cached_wheels.sh \
+            "${{ vars.WHEEL_CACHE_BUCKET }}" \
+            "${CUDA_VERSION}" "${TORCH_VERSION}" "${PYTHON_VERSION}" \
+            "${{ steps.build-sagemaker.outputs.image-uri }}" \
+            "docker/pytorch/${VERSION}/Dockerfile.cuda" \
+            "flash-attn:${FLASH_ATTN_VERSION}" \
+            "transformer-engine-torch:${TRANSFORMER_ENGINE_VERSION}"
+        continue-on-error: true  # wheel cache upload is best-effort; a failure here must not fail the job
+
+      - name: Run unit tests
+        run: |
+          VERSION="${{ matrix.version }}"
+          IMAGE="${{ steps.build-sagemaker.outputs.image-uri }}"
+          docker pull "${IMAGE}"
+          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
+            -e DLC_WORKDIR=/workdir -e DLC_PYTORCH_VERSION="${VERSION}" \
+            -v "$(pwd)":/workdir --workdir /workdir \
+            "${IMAGE}" -c 'sleep infinity')
+          trap 'docker kill "${CONTAINER_ID}" >/dev/null 2>&1 || true' EXIT  # stop the container even when pip/pytest fails; the step keeps the failing exit code
+          docker exec "${CONTAINER_ID}" pip install pytest -q
+          docker exec "${CONTAINER_ID}" pytest /workdir/test/pytorch/unit/ -v
+
   # ============================================================
   # Sanity tests (labels, filesystem, OSS compliance)
   # ============================================================
   sanity-test:
-    needs: [check-changes, build-image, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true')
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.sanity-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-sanity-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      python-version: ${{ needs.load-config.outputs.python-version }}
-      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
-      os-version: ${{ needs.load-config.outputs.os-version }}
-      customer-type: ${{ needs.load-config.outputs.customer-type }}
-      arch-type: ${{ needs.load-config.outputs.arch-type }}
-      device-type: ${{ needs.load-config.outputs.device-type }}
-      contributor: ${{ needs.load-config.outputs.contributor }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
+      framework: ${{ needs.build-images.outputs.framework }}  # NOTE(review): if build-images is a matrix job, its job outputs come from whichever leg wrote last; confirm these values match matrix.version
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      python-version: ${{ needs.build-images.outputs.python-version }}
+      cuda-version: ${{ needs.build-images.outputs.cuda-version }}
+      os-version: ${{ needs.build-images.outputs.os-version }}
+      customer-type: ${{ needs.build-images.outputs.customer-type }}
+      arch-type: ${{ needs.build-images.outputs.arch-type }}
+      device-type: ${{ needs.build-images.outputs.device-type }}
+      contributor: ${{ needs.build-images.outputs.contributor }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # Security tests (ECR scan, CVE allowlist)
   # ============================================================
   security-test:
-    needs: [build-image, load-config]
+    needs: [detect-versions, build-images]
     if: success()
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-security-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
 
   # ============================================================
   # Telemetry tests (opt-out, environment variables)
   # ============================================================
   telemetry-test:
-    needs: [check-changes, build-image, load-config]
+    needs: [detect-versions, build-images]
     if: |
       always() && !failure() && !cancelled() &&
-      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
-    concurrency:
-      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
-      cancel-in-progress: false
+      (needs.detect-versions.outputs.build-change == 'true' || needs.detect-versions.outputs.telemetry-test-change == 'true')
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     uses: ./.github/workflows/reusable-telemetry-tests.yml
     with:
-      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
-      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      image-uri: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
-      framework: ${{ needs.load-config.outputs.framework }}
-      framework-version: ${{ needs.load-config.outputs.framework-version }}
-      container-type: ${{ needs.load-config.outputs.container-type }}
-
-  # ============================================================
-  # Unit tests
-  # ============================================================
-  unit-test:
-    needs: [build-image]
-    if: success()
-    runs-on:
-      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
-        fleet:default-runner
-        buildspec-override:true
-    concurrency:
-      group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
-        with:
-          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Run unit tests
-        run: |
-          IMAGE="${{ needs.build-image.outputs.sagemaker-image-uri }}"
-          docker pull ${IMAGE}
-          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
-            -e DLC_WORKDIR=/workdir \
-            -v $(pwd):/workdir --workdir /workdir \
-            ${IMAGE} -c 'sleep infinity')
-          docker exec ${CONTAINER_ID} pip install pytest -q
-          docker exec ${CONTAINER_ID} pytest /workdir/test/pytorch/unit/ -v
-          docker kill ${CONTAINER_ID}
+      framework: ${{ needs.build-images.outputs.framework }}
+      framework-version: ${{ needs.build-images.outputs.framework-version }}
+      container-type: ${{ needs.build-images.outputs.container-type }}
 
   # ============================================================
   # SageMaker integration tests (launch real SM training jobs)
   # ============================================================
   sagemaker-test:
-    needs: [build-image, sanity-test, security-test, unit-test]
+    needs: [detect-versions, build-images, sanity-test, security-test]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
         fleet:default-runner
         buildspec-override:true
+    strategy:
+      matrix:
+        version: ${{ fromJson(needs.detect-versions.outputs.versions) }}
+      fail-fast: false
     concurrency:
-      group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }}
+      group: ${{ github.workflow }}-sagemaker-${{ matrix.version }}-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     steps:
       - name: Checkout code
@@ -338,7 +357,7 @@ jobs:
       - name: Run SageMaker training tests
         env:
           PYTHONPATH: ${{ github.workspace }}/test
-          TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }}
+          TEST_IMAGE_URI: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/ci:pytorch-runtime-sagemaker-{2}-pr-{3}', vars.CI_AWS_ACCOUNT_ID, vars.AWS_REGION, matrix.version, github.event.pull_request.number) }}
           SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole
         run: |
           pytest test/pytorch/integration/sagemaker/test_sm_training_cuda.py -v
diff --git a/docker/pytorch/Dockerfile.cpu b/docker/pytorch/2.11/Dockerfile.cpu
similarity index 96%
rename from docker/pytorch/Dockerfile.cpu
rename to docker/pytorch/2.11/Dockerfile.cpu
index 1a2d9a15288d..aaf705935925 100644
--- a/docker/pytorch/Dockerfile.cpu
+++ b/docker/pytorch/2.11/Dockerfile.cpu
@@ -12,6 +12,7 @@
 # ============================================================================
 
 # ── Global ARGs (available to all stages) ───────────────────────────────────
+ARG DLC_PYTORCH_VERSION=2.11
 ARG DLC_MAJOR_VERSION=1
 ARG DLC_MINOR_VERSION=0
 ARG PYTHON_VERSION=3.12
@@ -21,6 +22,7 @@ ARG OPEN_MPI_VERSION=4.1.7
 
 # ── Stage: builder-base (shared Python venv with lockfile deps) ─────────────
 FROM amazonlinux:2023 AS builder-base
+ARG DLC_PYTORCH_VERSION
 ARG PYTHON_VERSION
 
 RUN dnf install -y --allowerasing \
@@ -35,7 +37,7 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv"
 RUN python${PYTHON_VERSION} -m venv /opt/venv
 ENV PATH="/opt/venv/bin:${PATH}"
 
-COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/
 WORKDIR /tmp/build
 RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project
 
@@ -150,6 +152,7 @@ CMD ["bash"]
 
 # ── Stage: sagemaker (SageMaker Training) ────────────────────────────────────
 FROM runtime-base AS sagemaker
+ARG DLC_PYTORCH_VERSION
 ARG TORCH_VERSION
 
 # SageMaker BYOC paths
@@ -160,7 +163,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 # SageMaker packages (defined in cpu/pyproject.toml [project.optional-dependencies.sagemaker])
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 ENV UV_PROJECT_ENVIRONMENT="/opt/venv"
-COPY docker/pytorch/cpu/pyproject.toml docker/pytorch/cpu/uv.lock /tmp/build/
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cpu/uv.lock /tmp/build/
 RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \
   && rm -rf /tmp/build /tmp/uv-*
 
diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/2.11/Dockerfile.cuda
similarity index 95%
rename from docker/pytorch/Dockerfile.cuda
rename to docker/pytorch/2.11/Dockerfile.cuda
index 93effb4ff91d..ba34aa7247b0 100644
--- a/docker/pytorch/Dockerfile.cuda
+++ b/docker/pytorch/2.11/Dockerfile.cuda
@@ -16,6 +16,7 @@
 # ============================================================================
 
 # ── Global ARGs (available to all stages) ───────────────────────────────────
+ARG DLC_PYTORCH_VERSION=2.11
 ARG DLC_MAJOR_VERSION=1
 ARG DLC_MINOR_VERSION=0
 ARG CUDA_VERSION=13.0.2
@@ -32,6 +33,7 @@ ARG MAX_JOBS=8
 
 # ── Stage: builder-base (shared Python venv with lockfile deps) ─────────────
 FROM nvidia/cuda:${CUDA_VERSION}-devel-amzn2023 AS builder-base
+ARG DLC_PYTORCH_VERSION
 ARG PYTHON_VERSION
 
 RUN dnf install -y --allowerasing \
@@ -46,14 +48,14 @@ ENV UV_PROJECT_ENVIRONMENT="/opt/venv"
 RUN python${PYTHON_VERSION} -m venv /opt/venv
 ENV PATH="/opt/venv/bin:${PATH}"
 
-COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/
 WORKDIR /tmp/build
 RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev --no-install-project
 
 # transformer-engine requires torch + cudnn.h at build time; point it to the
 # cudnn headers shipped inside the nvidia-cudnn pip package.
 ARG TRANSFORMER_ENGINE_VERSION
-COPY docker/pytorch/wheel[s]/ /tmp/wheels/
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/
 RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0])") && \
   NCCL_HOME=$(python -c "import nvidia.nccl; print(nvidia.nccl.__path__[0])") && \
   cp ${CUDNN_HOME}/include/*.h /usr/local/cuda/include/ && \
@@ -76,13 +78,14 @@ RUN CUDNN_HOME=$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__path__[0])
 
 # ── Stage: builder-flash-attn (parallel — needs torch only) ─────────────────
 FROM builder-base AS builder-flash-attn
+ARG DLC_PYTORCH_VERSION
 ARG FLASH_ATTN_VERSION
 ARG MAX_JOBS
 
 # If a cached wheel exists in the build context, install it; otherwise build from source.
 # When building from source, the wheel is saved to /tmp/built_wheels/ for later S3 upload.
-# docker/pytorch/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally.
-COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/wheel[s]/ /tmp/wheels/
+# docker/pytorch/${DLC_PYTORCH_VERSION}/wheels/ is created by CI (fetch_cached_wheels.sh); may not exist locally.
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/wheel[s]/ /tmp/wheels/
 RUN --mount=type=cache,target=/root/.cache/uv \
   mkdir -p /tmp/built_wheels && \
   WHL=$(find /tmp/wheels -name "flash*attn*.whl" 2>/dev/null | head -1) && \
@@ -234,6 +237,7 @@ CMD ["bash"]
 
 # ── Stage: sagemaker (SageMaker Training) ────────────────────────────────────
 FROM runtime-base AS sagemaker
+ARG DLC_PYTORCH_VERSION
 ARG TORCH_VERSION
 
 # SageMaker BYOC paths
@@ -244,7 +248,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 # SageMaker packages (defined in cuda/pyproject.toml [project.optional-dependencies.sagemaker])
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 ENV UV_PROJECT_ENVIRONMENT="/opt/venv"
-COPY docker/pytorch/cuda/pyproject.toml docker/pytorch/cuda/uv.lock /tmp/build/
+COPY docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/pyproject.toml docker/pytorch/${DLC_PYTORCH_VERSION}/cuda/uv.lock /tmp/build/
 RUN --mount=type=cache,target=/root/.cache/uv cd /tmp/build && uv sync --frozen --no-dev --extra sagemaker --no-install-project --inexact \
   && rm -rf /tmp/build /tmp/uv-*
 
diff --git a/docker/pytorch/cpu/pyproject.toml b/docker/pytorch/2.11/cpu/pyproject.toml
similarity index 100%
rename from docker/pytorch/cpu/pyproject.toml
rename to docker/pytorch/2.11/cpu/pyproject.toml
diff --git a/docker/pytorch/cpu/uv.lock b/docker/pytorch/2.11/cpu/uv.lock
similarity index 100%
rename from docker/pytorch/cpu/uv.lock
rename to docker/pytorch/2.11/cpu/uv.lock
diff --git a/docker/pytorch/cuda/pyproject.toml b/docker/pytorch/2.11/cuda/pyproject.toml
similarity index 100%
rename from docker/pytorch/cuda/pyproject.toml
rename to docker/pytorch/2.11/cuda/pyproject.toml
diff --git a/docker/pytorch/cuda/uv.lock b/docker/pytorch/2.11/cuda/uv.lock
similarity index 100%
rename from docker/pytorch/cuda/uv.lock
rename to docker/pytorch/2.11/cuda/uv.lock
diff --git a/docker/pytorch/versions-cpu.env b/docker/pytorch/2.11/versions-cpu.env
similarity index 100%
rename from docker/pytorch/versions-cpu.env
rename to docker/pytorch/2.11/versions-cpu.env
diff --git a/docker/pytorch/versions-cuda.env b/docker/pytorch/2.11/versions-cuda.env
similarity index 100%
rename from docker/pytorch/versions-cuda.env
rename to docker/pytorch/2.11/versions-cuda.env
diff --git a/scripts/pytorch/upload_cached_wheels.sh b/scripts/pytorch/upload_cached_wheels.sh
index be4d2f8c5eb7..e2f59c486cf3 100755
--- a/scripts/pytorch/upload_cached_wheels.sh
+++ b/scripts/pytorch/upload_cached_wheels.sh
@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
 # upload_cached_wheels.sh — Extract built wheels from Docker wheel-export stage and upload to S3.
 #
-# Usage: upload_cached_wheels.sh <bucket> <cuda> <torch> <python> <image_uri> <pkg:ver> [...]
+# Usage: upload_cached_wheels.sh <bucket> <cuda> <torch> <python> <image_uri> <dockerfile> <pkg:ver> [...]
 set -euo pipefail
 
-BUCKET="$1"; CUDA="$2"; IMAGE="$5"
-shift 5
+BUCKET="$1"; CUDA="$2"; IMAGE="$5"; DOCKERFILE="$6"
+shift 6
 
 if [ -z "${BUCKET}" ]; then
   echo "⚠️  No wheel cache bucket configured — skipping upload"
@@ -15,7 +15,7 @@ fi
 # Build the wheel-export stage and extract to local dir
 EXPORT_DIR=$(mktemp -d)
 docker buildx build --progress=plain --target wheel-export --output "type=local,dest=${EXPORT_DIR}" \
-  -f docker/pytorch/Dockerfile . 2>/dev/null || {
+  -f "${DOCKERFILE}" . 2>/dev/null || {
   echo "⚠️  wheel-export stage not available — extracting from runtime image"
   CID=$(docker create "${IMAGE}" /bin/true)
   docker cp "${CID}:/tmp/built_wheels/" "${EXPORT_DIR}/wheels/" 2>/dev/null || true
diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh
index bd185bfb8169..5b01a996cc33 100755
--- a/test/efa/scripts/nccl_allreduce.sh
+++ b/test/efa/scripts/nccl_allreduce.sh
@@ -52,6 +52,15 @@ check_efa_nccl_all_reduce_performance(){
     fi
 }
 
+echo "=== Debug: Environment and library info ==="
+echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"  # ':-' keeps this diagnostic from aborting under 'set -u' when the variable is unset
+echo "CUDA_HOME=${CUDA_HOME:-}"
+ls -la /opt/amazon/ofi-nccl/lib64/ 2>/dev/null || echo "/opt/amazon/ofi-nccl/lib64/ NOT FOUND"
+ls -la /usr/local/bin/all_reduce_perf 2>/dev/null || echo "all_reduce_perf NOT FOUND"
+{ fi_info -p efa 2>&1 || echo "fi_info failed"; } | head -5 || true  # fallback sits inside the pipe: without pipefail the pipeline status is head's, not fi_info's
+echo "NCCL lib: $(ls /opt/venv/lib/python3*/site-packages/nvidia/nccl/lib/libnccl.so* 2>/dev/null || echo 'not found')"
+echo "=== End debug ==="
+
 echo "Running all_reduce_perf test"
 mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \
     -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
@@ -63,7 +72,10 @@ RETURN_VAL=${PIPESTATUS[0]}
 if [ ${RETURN_VAL} -eq 0 ]; then
     echo "check_efa_nccl_all_reduce passed"
 else
-    echo "check_efa_nccl_all_reduce failed"
+    echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})"
+    echo "=== Full NCCL log ==="
+    cat "${TRAINING_LOG}"
+    echo "=== End NCCL log ==="
 fi
 
 validate_all_reduce_performance_logs
diff --git a/test/pytorch/unit/test_versions.py b/test/pytorch/unit/test_versions.py
index ddefd85f2319..17ae1d61d9ea 100644
--- a/test/pytorch/unit/test_versions.py
+++ b/test/pytorch/unit/test_versions.py
@@ -6,11 +6,13 @@
 
 import pytest
 
-# Detect GPU vs CPU image by checking for CUDA, then pick the right versions file.
+# DLC_PYTORCH_VERSION selects which versioned directory to read (e.g., "2.11").
 _WORKDIR = os.environ.get("DLC_WORKDIR", "/workdir")
+_PT_VERSION = os.environ.get("DLC_PYTORCH_VERSION", "")
+assert _PT_VERSION, "DLC_PYTORCH_VERSION env var is required (e.g., '2.11')"
 IS_CUDA = os.path.isdir("/usr/local/cuda")
 _VERSIONS_FILE = "versions-cuda.env" if IS_CUDA else "versions-cpu.env"
-VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _VERSIONS_FILE)
+VERSIONS_ENV = os.path.join(_WORKDIR, "docker", "pytorch", _PT_VERSION, _VERSIONS_FILE)
 cuda_only = pytest.mark.skipif(not IS_CUDA, reason="CUDA-only test")