Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/config/image/tensorflow-sagemaker-cpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image definition for the TensorFlow CPU training container on SageMaker.
# .github/workflows/pr-tensorflow-sagemaker-cpu.yml loads this file and reads
# the keys under `common`.
image:
  name: "tensorflow-sagemaker-cpu"
  description: "TensorFlow CPU training for SageMaker"

# Build and test metadata.
common:
  framework: "tensorflow"
  framework_version: "2.21.0"
  job_type: "training"
  python_version: "py312"
  # Intentionally empty: this is the CPU variant.
  cuda_version: ""
  os_version: "amzn2023"
  customer_type: "sagemaker"
  platform: "sagemaker"
  arch_type: "x86"
  prod_image: "tensorflow:2.21-cpu-amzn2023-sagemaker"
  device_type: "cpu"
  contributor: "None"

# Release switches; every one of them is currently off.
release:
  release: false
  force_release: false
  public_registry: false
  private_registry: false
  enable_soci: false
  environment: "production"
23 changes: 23 additions & 0 deletions .github/config/image/tensorflow-sagemaker-cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image definition for the TensorFlow CUDA training container on SageMaker.
image:
  name: "tensorflow-sagemaker-cuda"
  description: "TensorFlow CUDA training for SageMaker"

# Build and test metadata.
common:
  framework: "tensorflow"
  framework_version: "2.21.0"
  job_type: "training"
  python_version: "py312"
  cuda_version: "cu129"
  os_version: "amzn2023"
  customer_type: "sagemaker"
  platform: "sagemaker"
  arch_type: "x86"
  prod_image: "tensorflow:2.21-cu129-amzn2023-sagemaker"
  device_type: "gpu"
  contributor: "None"

# Release switches; every one of them is currently off.
release:
  release: false
  force_release: false
  public_registry: false
  private_registry: false
  enable_soci: false
  environment: "production"
329 changes: 329 additions & 0 deletions .github/workflows/pr-tensorflow-sagemaker-cpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
# PR validation pipeline for the TensorFlow SageMaker CPU training image.
#
# Job graph:
#   gatekeeper
#     -> load-config, check-changes
#       -> build-image
#         -> sanity-test, security-test, telemetry-test, unit-test
#           -> sagemaker-test
name: PR - TensorFlow SageMaker CPU

on:
  pull_request:
    branches: [main]
    types: [opened, reopened, synchronize]
    paths:
      - ".github/config/image/tensorflow-sagemaker-cpu.yml"
      - ".github/workflows/pr-tensorflow-sagemaker-cpu.yml"
      - "docker/tensorflow/**"
      - "scripts/common/**"
      - "scripts/tensorflow/**"
      - "scripts/telemetry/**"
      - "test/tensorflow/**"
      - "test/sanity/**"
      - "test/telemetry/**"
      - "!docs/**"

permissions:
  contents: read
  pull-requests: read

env:
  FORCE_COLOR: "1"
  CONFIG_FILE: ".github/config/image/tensorflow-sagemaker-cpu.yml"

jobs:
  # ============================================================
  # Gate: permission check on base branch
  # ============================================================
  gatekeeper:
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-gate-${{ github.event.pull_request.number }}
      cancel-in-progress: true
    steps:
      # The base commit is checked out so that the local gate action used in
      # the next step is the base version, not the one from the pull request.
      - name: Checkout base branch (safe)
        uses: actions/checkout@v5
        with:
          ref: ${{ github.event.pull_request.base.sha }}
          fetch-depth: 1

      - name: Run permission gate (from base)
        uses: ./.github/actions/pr-permission-gate

  # ============================================================
  # Load configuration from YAML
  # ============================================================
  load-config:
    needs: [gatekeeper]
    if: success()
    runs-on: ubuntu-latest
    outputs:
      framework: ${{ steps.parse.outputs.framework }}
      framework-version: ${{ steps.parse.outputs.framework-version }}
      python-version: ${{ steps.parse.outputs.python-version }}
      cuda-version: ${{ steps.parse.outputs.cuda-version }}
      os-version: ${{ steps.parse.outputs.os-version }}
      container-type: ${{ steps.parse.outputs.container-type }}
      device-type: ${{ steps.parse.outputs.device-type }}
      arch-type: ${{ steps.parse.outputs.arch-type }}
      contributor: ${{ steps.parse.outputs.contributor }}
      customer-type: ${{ steps.parse.outputs.customer-type }}
      prod-image: ${{ steps.parse.outputs.prod-image }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Load configuration
        id: load
        uses: ./.github/actions/load-config
        with:
          config-file: ${{ env.CONFIG_FILE }}

      # The config JSON reaches the script through an environment variable
      # instead of being expanded by `${{ }}` into the shell text. Expanded
      # inline, an apostrophe or shell metacharacter in the config would end
      # the quoted string and be interpreted as shell.
      - name: Parse configuration
        id: parse
        env:
          CONFIG_JSON: ${{ steps.load.outputs.config }}
        run: |
          printf '%s\n' "${CONFIG_JSON}" > config.json
          {
            echo "framework=$(jq -r '.common.framework' config.json)"
            echo "framework-version=$(jq -r '.common.framework_version' config.json)"
            echo "python-version=$(jq -r '.common.python_version' config.json)"
            echo "cuda-version=$(jq -r '.common.cuda_version' config.json)"
            echo "os-version=$(jq -r '.common.os_version' config.json)"
            # `container-type` is sourced from the config's `job_type` key.
            echo "container-type=$(jq -r '.common.job_type' config.json)"
            # The remaining optional keys fall back to a default when absent.
            echo "device-type=$(jq -r '.common.device_type // "cpu"' config.json)"
            echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)"
            echo "contributor=$(jq -r '.common.contributor // "None"' config.json)"
            echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)"
            echo "prod-image=$(jq -r '.common.prod_image' config.json)"
          } >> "${GITHUB_OUTPUT}"

  # ============================================================
  # Pre-commit + change detection
  # ============================================================
  check-changes:
    needs: [gatekeeper]
    if: success()
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-check-${{ github.event.pull_request.number }}
      cancel-in-progress: true
    outputs:
      build-change: ${{ steps.changes.outputs.build-change }}
      sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
      telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Setup python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Run pre-commit
        uses: pre-commit/action@v3.0.1
        with:
          extra_args: --all-files

      # Each filter name becomes a step output that downstream jobs compare
      # against 'true' in their `if:` conditions.
      - name: Detect file changes
        id: changes
        uses: dorny/paths-filter@v4
        with:
          filters: |
            build-change:
              - ".github/config/image/tensorflow-sagemaker-cpu.yml"
              - "docker/tensorflow/Dockerfile.cpu"
              - "docker/tensorflow/cpu/**"
              - "docker/tensorflow/versions-cpu.env"
              - "scripts/common/setup_oss_compliance.sh"
              - "scripts/tensorflow/*"
              - "scripts/telemetry/bash_telemetry.sh.template"
            sanity-test-change:
              - "test/sanity/**"
            telemetry-test-change:
              - "test/telemetry/**"

  # ============================================================
  # Build CPU SageMaker image
  # ============================================================
  build-image:
    needs: [check-changes, load-config]
    if: needs.check-changes.outputs.build-change == 'true'
    # One list entry: the continuation lines fold into a single runner label.
    runs-on:
      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
        fleet:default-runner
        buildspec-override:true
    concurrency:
      group: ${{ github.workflow }}-build-${{ github.event.pull_request.number }}
      cancel-in-progress: true
    outputs:
      sagemaker-image-uri: ${{ steps.build-sagemaker.outputs.image-uri }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Setup buildkitd
        run: .github/scripts/buildkitd.sh

      - name: ECR login
        uses: ./.github/actions/ecr-authenticate
        with:
          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
          aws-region: ${{ vars.AWS_REGION }}

      # Config-derived values are passed through `env` and quoted in the
      # script so they are treated as data, never as shell text. The `CFG_`
      # prefix keeps them from being shadowed by variables defined in the
      # sourced versions-cpu.env.
      - name: Build sagemaker image
        id: build-sagemaker
        env:
          CI_ECR_REGISTRY: ${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com
          PR_NUMBER: ${{ github.event.pull_request.number }}
          CFG_FRAMEWORK: ${{ needs.load-config.outputs.framework }}
          CFG_FRAMEWORK_VERSION: ${{ needs.load-config.outputs.framework-version }}
          CFG_OS_VERSION: ${{ needs.load-config.outputs.os-version }}
          CFG_CONTAINER_TYPE: ${{ needs.load-config.outputs.container-type }}
          CFG_ARCH_TYPE: ${{ needs.load-config.outputs.arch-type }}
          CFG_PYTHON_VERSION: ${{ needs.load-config.outputs.python-version }}
          CFG_CONTRIBUTOR: ${{ needs.load-config.outputs.contributor }}
        run: |
          # Supplies TF_VERSION, PYTHON_VERSION, DLC_*_VERSION and
          # OPEN_MPI_VERSION used below.
          source docker/tensorflow/versions-cpu.env
          CI_IMAGE_URI="${CI_ECR_REGISTRY}/ci:tensorflow-training-${TF_VERSION}-cpu-py312-sagemaker-pr-${PR_NUMBER}"

          # Derive label values to match check_labels.py expectations
          FRAMEWORK_LABEL=$(echo "${CFG_FRAMEWORK}" | tr '_' '-')
          FWK_VER_LABEL=$(echo "${CFG_FRAMEWORK_VERSION}" | tr '.' '-')
          OS_LABEL=$(echo "${CFG_OS_VERSION}" | tr '.' '-')

          docker buildx build --progress plain \
            --build-arg FRAMEWORK="${CFG_FRAMEWORK}" \
            --build-arg PYTHON_VERSION="${PYTHON_VERSION}" \
            --build-arg TF_VERSION="${TF_VERSION}" \
            --build-arg DLC_MAJOR_VERSION="${DLC_MAJOR_VERSION}" \
            --build-arg DLC_MINOR_VERSION="${DLC_MINOR_VERSION}" \
            --build-arg OPEN_MPI_VERSION="${OPEN_MPI_VERSION}" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.framework.${FRAMEWORK_LABEL}.${FWK_VER_LABEL}=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.device.cpu=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.job.${CFG_CONTAINER_TYPE}=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.arch.${CFG_ARCH_TYPE}=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.os.${OS_LABEL}=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.python.${CFG_PYTHON_VERSION}=true" \
            --label "com.amazonaws.ml.engines.sagemaker.dlc.contributor.${CFG_CONTRIBUTOR}=true" \
            --cache-to=type=inline \
            --cache-from=type=registry,ref="${CI_IMAGE_URI}" \
            --tag "${CI_IMAGE_URI}" \
            --push \
            --target sagemaker \
            -f docker/tensorflow/Dockerfile.cpu .

          echo "image-uri=${CI_IMAGE_URI}" >> "${GITHUB_OUTPUT}"

  # ============================================================
  # Sanity tests
  # ============================================================
  sanity-test:
    needs: [check-changes, build-image, load-config]
    # Runs even when build-image was skipped, as long as nothing failed or
    # was cancelled and a build or sanity-test change was detected.
    if: |
      always() && !failure() && !cancelled() &&
      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.sanity-test-change == 'true')
    uses: ./.github/workflows/reusable-sanity-tests.yml
    with:
      # Freshly built CI image when build-image succeeded; otherwise the
      # production image named by `prod_image` in the config.
      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
      aws-region: ${{ vars.AWS_REGION }}
      framework: ${{ needs.load-config.outputs.framework }}
      framework-version: ${{ needs.load-config.outputs.framework-version }}
      python-version: ${{ needs.load-config.outputs.python-version }}
      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
      os-version: ${{ needs.load-config.outputs.os-version }}
      customer-type: ${{ needs.load-config.outputs.customer-type }}
      arch-type: ${{ needs.load-config.outputs.arch-type }}
      device-type: ${{ needs.load-config.outputs.device-type }}
      contributor: ${{ needs.load-config.outputs.contributor }}
      container-type: ${{ needs.load-config.outputs.container-type }}

  # ============================================================
  # Security tests
  # ============================================================
  security-test:
    needs: [build-image, load-config]
    if: success()
    uses: ./.github/workflows/reusable-security-tests.yml
    with:
      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
      aws-region: ${{ vars.AWS_REGION }}
      framework: ${{ needs.load-config.outputs.framework }}
      framework-version: ${{ needs.load-config.outputs.framework-version }}

  # ============================================================
  # Telemetry tests
  # ============================================================
  telemetry-test:
    needs: [check-changes, build-image, load-config]
    if: |
      always() && !failure() && !cancelled() &&
      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
    # Unlike the other jobs, an in-progress run here is not cancelled by a
    # newer one.
    concurrency:
      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
      cancel-in-progress: false
    uses: ./.github/workflows/reusable-telemetry-tests.yml
    with:
      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.sagemaker-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
      aws-region: ${{ vars.AWS_REGION }}
      framework: ${{ needs.load-config.outputs.framework }}
      framework-version: ${{ needs.load-config.outputs.framework-version }}
      container-type: ${{ needs.load-config.outputs.container-type }}

  # ============================================================
  # Unit tests
  # ============================================================
  unit-test:
    needs: [build-image]
    if: success()
    runs-on:
      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
        fleet:default-runner
        buildspec-override:true
    concurrency:
      group: ${{ github.workflow }}-unit-${{ github.event.pull_request.number }}
      cancel-in-progress: true
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: ECR login
        uses: ./.github/actions/ecr-authenticate
        with:
          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Run unit tests
        env:
          IMAGE: ${{ needs.build-image.outputs.sagemaker-image-uri }}
        run: |
          docker pull "${IMAGE}"
          # Keep the container alive in the background with the checkout
          # mounted at /workdir so the tests can be run via `docker exec`.
          CONTAINER_ID=$(docker run -d --rm --entrypoint /bin/bash \
            -e DLC_WORKDIR=/workdir \
            -v "$(pwd):/workdir" --workdir /workdir \
            "${IMAGE}" -c 'sleep infinity')
          # Stop the container on every exit path. Without the trap, a failing
          # pip/pytest aborts the script before the kill and leaves the
          # `sleep infinity` container running on the runner. The trap does not
          # alter the step's exit status.
          trap 'docker kill "${CONTAINER_ID}" >/dev/null 2>&1 || true' EXIT
          docker exec "${CONTAINER_ID}" pip install pytest -q
          docker exec "${CONTAINER_ID}" pytest /workdir/test/tensorflow/unit/ -v

  # ============================================================
  # SageMaker integration tests (CPU — MultiWorkerMirroredStrategy)
  # ============================================================
  sagemaker-test:
    needs: [build-image, sanity-test, security-test, unit-test]
    if: success()
    runs-on:
      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
        fleet:default-runner
        buildspec-override:true
    concurrency:
      group: ${{ github.workflow }}-sagemaker-${{ github.event.pull_request.number }}
      cancel-in-progress: true
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: ECR login
        uses: ./.github/actions/ecr-authenticate
        with:
          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Install test dependencies
        run: |
          pip install -r test/tensorflow/integration/sagemaker/requirements.txt

      - name: Run SageMaker CPU training tests
        env:
          PYTHONPATH: ${{ github.workspace }}/test
          TEST_IMAGE_URI: ${{ needs.build-image.outputs.sagemaker-image-uri }}
          SM_ROLE_ARN: arn:aws:iam::${{ vars.CI_AWS_ACCOUNT_ID }}:role/SageMakerRole
        run: |
          pytest \
            test/tensorflow/integration/sagemaker/test_mnist_cpu.py \
            test/tensorflow/integration/sagemaker/test_experiments_cpu.py \
            test/tensorflow/integration/sagemaker/test_tuning_cpu.py \
            -v
Loading
Loading