|
name: GPU Integ Tests
on:
  schedule:
    # Three scheduled attempts per UTC day, at 06:00, 09:00 and 12:00 UTC.
    # In US Pacific standard time (PST, UTC-8) that is 10:00 PM, 1:00 AM and
    # 4:00 AM. Keeping every attempt inside a single UTC day lets the run-level
    # CloudWatch metric (GpuIntegRunFailure) aggregate correctly per day.
    - cron: "0 6 * * *"
    - cron: "0 9 * * *"
    - cron: "0 12 * * *"
  # Also allow the workflow to be started by hand.
  workflow_dispatch:
6 | 11 |
|
permissions:
  # Required for requesting the JWT.
  id-token: write
  # Required for the gate job to query prior runs of this workflow.
  actions: read
9 | 15 |
|
10 | 16 | jobs: |
# Gate: if an earlier scheduled run already succeeded today (UTC), skip the
# rest of today's scheduled runs. Manual (workflow_dispatch) runs always
# proceed.
#
# Output:
#   already_succeeded - 'true' when at least one scheduled run created today
#                       has status "success"; 'false' otherwise.
#
# The gate fails open: if prior runs cannot be queried, or the answer is not a
# number, it reports 'false' so the tests still run. Failing the gate instead
# would skip every job that `needs` it.
check-prior-success:
  runs-on: ubuntu-latest
  outputs:
    already_succeeded: ${{ steps.check.outputs.already_succeeded }}
  steps:
    - name: Check for a successful scheduled run earlier today
      id: check
      env:
        # Token picked up by the gh CLI.
        GH_TOKEN: ${{ github.token }}
        # Context values are handed over as environment variables rather than
        # being interpolated into the script text.
        EVENT_NAME: ${{ github.event_name }}
        REPOSITORY: ${{ github.repository }}
      run: |
        # Record "no prior success" and end the step successfully.
        proceed() {
          echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
          exit 0
        }

        if [ "$EVENT_NAME" != "schedule" ]; then
          echo "Not a scheduled run; proceeding."
          proceed
        fi

        today=$(date -u +%Y-%m-%d)

        # Running the query as an `if` condition keeps a failed API call from
        # aborting the step under the runner's default `bash -e`.
        if ! count=$(gh api -X GET \
            "/repos/${REPOSITORY}/actions/workflows/gpu-integ-tests.yml/runs" \
            -f event=schedule \
            -f status=success \
            -f "created=>=${today}T00:00:00Z" \
            --jq '.workflow_runs | length'); then
          echo "::warning::Could not query prior runs; proceeding with the tests."
          proceed
        fi

        # Guard the numeric comparison below against an empty or malformed reply.
        case "$count" in
          '' | *[!0-9]*)
            echo "::warning::Unexpected run count '${count}'; proceeding with the tests."
            proceed
            ;;
        esac

        echo "Successful scheduled runs today: $count"
        if [ "$count" -gt 0 ]; then
          echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
        else
          echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
        fi
| 47 | +
|
11 | 48 | gpu-integ-tests: |
| 49 | + needs: check-prior-success |
| 50 | + if: needs.check-prior-success.outputs.already_succeeded != 'true' |
12 | 51 | runs-on: ubuntu-latest |
13 | 52 | steps: |
14 | 53 | - name: Configure AWS Credentials |
|
24 | 63 | source-version: refs/heads/master |
25 | 64 |
|
26 | 65 | gpu-integ-tests-us-east-1: |
| 66 | + needs: check-prior-success |
| 67 | + if: needs.check-prior-success.outputs.already_succeeded != 'true' |
27 | 68 | runs-on: ubuntu-latest |
28 | 69 | steps: |
29 | 70 | - name: Configure AWS Credentials (us-east-1) |
|
37 | 78 | with: |
38 | 79 | project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests |
39 | 80 | source-version: refs/heads/master |
| 81 | + |
# Run-level result: a run is successful only if BOTH region jobs succeeded.
# Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
# us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
# cuts a daytime sev2 when all of the day's runs failed.
report-result:
  needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
  # Only emit the daily alarm metric for scheduled runs that actually executed
  # the test jobs:
  #   - always(): run even when a region job failed, so the failure is reported.
  #   - github.event_name == 'schedule': manual (workflow_dispatch) runs must
  #     not contribute to the daily GpuIntegRunFailure count. Checking this at
  #     job level also avoids assuming the monitoring role when nothing will be
  #     published.
  #   - check-prior-success.result == 'success': if the gate job itself failed,
  #     the test jobs are skipped; without this guard those skips would be read
  #     as a (false) failure and emit 1.
  #   - already_succeeded != 'true': an earlier run today already passed, so the
  #     gate short-circuited this run; nothing to report.
  if: >-
    always() &&
    github.event_name == 'schedule' &&
    needs.check-prior-success.result == 'success' &&
    needs.check-prior-success.outputs.already_succeeded != 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
        aws-region: us-west-2
    - name: Emit run-level pass/fail metric
      env:
        # Job results are handed over as environment variables rather than
        # being interpolated into the script text.
        INTEG_RESULT: ${{ needs.gpu-integ-tests.result }}
        INTEG_US_EAST_1_RESULT: ${{ needs.gpu-integ-tests-us-east-1.result }}
      run: |
        # Anything other than "success" (failure, cancelled, skipped) in either
        # region counts as a failed run.
        if [ "$INTEG_RESULT" = "success" ] && \
           [ "$INTEG_US_EAST_1_RESULT" = "success" ]; then
          value=0
          echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
        else
          value=1
          echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
        fi
        aws cloudwatch put-metric-data \
          --namespace GpuIntegRunMetrics \
          --metric-name GpuIntegRunFailure \
          --value "$value" \
          --unit Count
0 commit comments