forked from aws/sagemaker-python-sdk
-
Notifications
You must be signed in to change notification settings - Fork 0
125 lines (120 loc) · 5.28 KB
/
Copy pathgpu-integ-tests.yml
File metadata and controls
125 lines (120 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Scheduled GPU integration tests.
#
# Flow:
#   1. check-prior-success  - gate: has a scheduled run of this workflow
#                             already succeeded today (UTC)?
#   2. gpu-integ-tests*     - run the CodeBuild GPU integ project in
#                             us-west-2 and us-east-1 (skipped when gated).
#   3. report-result        - publish a run-level pass/fail metric
#                             (GpuIntegRunFailure) to CloudWatch for
#                             scheduled runs.
name: GPU Integ Tests

on:
  schedule:
    # US Pacific (PST, UTC-8): 10:00 PM / 1:00 AM / 4:00 AM -> 06/09/12 UTC.
    # All three fire within the same UTC day so the run-level CloudWatch
    # metric (GpuIntegRunFailure) aggregates correctly per day.
    - cron: "0 6 * * *"
    - cron: "0 9 * * *"
    - cron: "0 12 * * *"
  workflow_dispatch:

permissions:
  id-token: write  # required for requesting the JWT used to assume the AWS roles
  actions: read  # required for the gate job to query prior runs of this workflow

jobs:
  # Gate: if an earlier scheduled run already succeeded today, skip the rest
  # of today's scheduled runs. Manual (workflow_dispatch) runs always proceed.
  #
  # Output:
  #   already_succeeded - 'true' when at least one scheduled run created since
  #                       00:00 UTC today finished with status=success,
  #                       otherwise 'false'.
  check-prior-success:
    runs-on: ubuntu-latest
    outputs:
      already_succeeded: ${{ steps.check.outputs.already_succeeded }}
    steps:
      - name: Check for a successful scheduled run earlier today
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
          # Expression values are handed to the script through the
          # environment instead of being spliced into the script text.
          EVENT_NAME: ${{ github.event_name }}
          REPOSITORY: ${{ github.repository }}
        run: |
          if [ "$EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run; proceeding."
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # "Today" is the current UTC calendar day, matching the cron
          # schedule, which fires all of a day's runs within one UTC day.
          today=$(date -u +%Y-%m-%d)
          # Count scheduled runs of this workflow that were created today and
          # completed successfully. The run in progress is not yet
          # status=success, so it does not count itself.
          count=$(gh api -X GET \
            "/repos/${REPOSITORY}/actions/workflows/gpu-integ-tests.yml/runs" \
            -f event=schedule \
            -f status=success \
            -f "created=>=${today}T00:00:00Z" \
            --jq '.workflow_runs | length')
          echo "Successful scheduled runs today: $count"
          if [ "$count" -gt 0 ]; then
            echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
          else
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
          fi

  # Runs the GPU integ CodeBuild project using the us-west-2 role.
  gpu-integ-tests:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
          aws-region: us-west-2
          # 10800 s = 3 h session.
          # NOTE(review): presumably sized to outlast the CodeBuild run - confirm.
          role-duration-seconds: 10800
      - name: Run GPU Integ Tests
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # Same CodeBuild project name, run with the us-east-1 role and region.
  gpu-integ-tests-us-east-1:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials (us-east-1)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_US_EAST_1_ARN }}
          aws-region: us-east-1
          # 10800 s = 3 h session.
          # NOTE(review): presumably sized to outlast the CodeBuild run - confirm.
          role-duration-seconds: 10800
      - name: Run GPU Integ Tests (us-east-1)
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # Run-level result: a run is successful only if BOTH region jobs succeeded.
  # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
  # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
  # cuts a daytime sev2 when all of the day's runs failed. Skipped when the
  # gate short-circuited today's run (an earlier run already succeeded).
  report-result:
    needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
    # Only emit the daily alarm metric for runs that actually executed the
    # test jobs:
    #   - check-prior-success.result == 'success': if the gate job itself
    #     failed, the test jobs are skipped; without this guard always() would
    #     still run report-result and read those skips as a (false) failure
    #     -> emit 1.
    #   - already_succeeded != 'true': an earlier run today already passed, so
    #     the gate short-circuited this run; nothing to report.
    # always() is needed so this job still runs when a region job failed.
    # The scheduled-run-only restriction is enforced inside the script below.
    if: >-
      always()
      && needs.check-prior-success.result == 'success'
      && needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
          aws-region: us-west-2
      - name: Emit run-level pass/fail metric
        env:
          # Expression values are handed to the script through the
          # environment instead of being spliced into the script text.
          EVENT_NAME: ${{ github.event_name }}
          WEST_RESULT: ${{ needs.gpu-integ-tests.result }}
          EAST_RESULT: ${{ needs.gpu-integ-tests-us-east-1.result }}
        run: |
          # Manual (workflow_dispatch) runs must not contribute to the daily
          # GpuIntegRunFailure count that drives GpuIntegRunAlarm; only
          # scheduled runs count toward the "all of today's scheduled runs
          # failed" alarm.
          if [ "$EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run ($EVENT_NAME); skipping metric emission."
            exit 0
          fi
          # Anything other than success in either region (failure, cancelled,
          # skipped) is reported as a failed run.
          if [ "$WEST_RESULT" == "success" ] && \
             [ "$EAST_RESULT" == "success" ]; then
            value=0
            echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
          else
            value=1
            echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
          fi
          aws cloudwatch put-metric-data \
            --namespace GpuIntegRunMetrics \
            --metric-name GpuIntegRunFailure \
            --value "$value" \
            --unit Count