GPU Integ Tests #59
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: GPU Integ Tests

on:
  schedule:
    # Three scheduled triggers at 06:00, 09:00 and 12:00 UTC, which is
    # 10:00 PM / 1:00 AM / 4:00 AM US Pacific (PST, UTC-8).
    # All three land inside the same UTC day so that the run-level CloudWatch
    # metric (GpuIntegRunFailure) aggregates correctly per day.
    - cron: '0 6 * * *'
    - cron: '0 9 * * *'
    - cron: '0 12 * * *'
  # Manual runs can be started on demand.
  workflow_dispatch:

permissions:
  actions: read  # lets the gate job query prior runs of this workflow
  id-token: write  # required for requesting the JWT
jobs:
  # Gate: if an earlier scheduled run already succeeded today (UTC), skip the
  # rest of today's scheduled runs. Manual (workflow_dispatch) runs always
  # proceed.
  check-prior-success:
    runs-on: ubuntu-latest
    outputs:
      already_succeeded: ${{ steps.check.outputs.already_succeeded }}
    steps:
      - name: Check for a successful scheduled run earlier today
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
          # Workflow contexts are handed to the script through the environment
          # instead of being interpolated into the script text, so the shell
          # only ever sees them as data.
          EVENT_NAME: ${{ github.event_name }}
          REPOSITORY: ${{ github.repository }}
        run: |
          if [ "$EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run; proceeding."
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Count scheduled runs created since 00:00 UTC today that finished
          # successfully. NOTE: the workflow file name is hard-coded in the
          # URL below; keep it in sync if this file is ever renamed.
          today=$(date -u +%Y-%m-%d)
          count=$(gh api -X GET \
            "/repos/${REPOSITORY}/actions/workflows/gpu-integ-tests.yml/runs" \
            -f event=schedule \
            -f status=success \
            -f "created=>=${today}T00:00:00Z" \
            --jq '.workflow_runs | length')
          echo "Successful scheduled runs today: $count"
          if [ "$count" -gt 0 ]; then
            echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
          else
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
          fi

  # Runs the GPU integ CodeBuild project using the CI role in us-west-2.
  # Skipped when the gate reports that a scheduled run already succeeded today.
  gpu-integ-tests:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
          aws-region: us-west-2
          role-duration-seconds: 10800  # 3 hours
      - name: Run GPU Integ Tests
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # Same CodeBuild project, run with the us-east-1 role and region.
  # Skipped under the same gate condition as the us-west-2 job.
  gpu-integ-tests-us-east-1:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials (us-east-1)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_US_EAST_1_ARN }}
          aws-region: us-east-1
          role-duration-seconds: 10800  # 3 hours
      - name: Run GPU Integ Tests (us-east-1)
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # Run-level result: a run is successful only if BOTH region jobs succeeded.
  # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
  # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
  # cuts a daytime sev2 when all of the day's runs failed. Skipped when the
  # gate short-circuited today's run (an earlier run already succeeded).
  report-result:
    needs:
      - check-prior-success
      - gpu-integ-tests
      - gpu-integ-tests-us-east-1
    # Only emit the daily alarm metric for runs that actually executed the
    # test jobs:
    #   - check-prior-success.result == 'success': if the gate job itself
    #     failed, the test jobs are skipped; without this guard always() would
    #     still run report-result and read those skips as a (false) failure
    #     and emit 1.
    #   - already_succeeded != 'true': an earlier run today already passed, so
    #     the gate short-circuited this run; nothing to report.
    if: >-
      always() &&
      needs.check-prior-success.result == 'success' &&
      needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
          aws-region: us-west-2
      - name: Emit run-level pass/fail metric
        env:
          # Contexts are passed as environment variables rather than being
          # expanded inside the script body.
          EVENT_NAME: ${{ github.event_name }}
          US_WEST_2_RESULT: ${{ needs.gpu-integ-tests.result }}
          US_EAST_1_RESULT: ${{ needs.gpu-integ-tests-us-east-1.result }}
        run: |
          # Manual (workflow_dispatch) runs must not contribute to the daily
          # GpuIntegRunFailure count that drives GpuIntegRunAlarm; only
          # scheduled runs count toward the "all of today's scheduled runs
          # failed" alarm.
          if [ "$EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run ($EVENT_NAME); skipping metric emission."
            exit 0
          fi
          # Anything other than "success" for either region job (failure,
          # cancelled, skipped) is reported as a failed run.
          if [ "$US_WEST_2_RESULT" = "success" ] && \
             [ "$US_EAST_1_RESULT" = "success" ]; then
            value=0
            echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
          else
            value=1
            echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
          fi
          aws cloudwatch put-metric-data \
            --namespace GpuIntegRunMetrics \
            --metric-name GpuIntegRunFailure \
            --value "$value" \
            --unit Count