-
Notifications
You must be signed in to change notification settings - Fork 226
212 lines (200 loc) · 8.87 KB
/
main.yml
File metadata and controls
212 lines (200 loc) · 8.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Workflow: start an on-demand GPU EC2 instance as a self-hosted runner, run
# the test suite on it, then stop the instance again.
name: Run all tests

on:
  # enable to manually trigger the tests
  workflow_dispatch:
  # re-enable if we want automatic CI again
  # pull_request:
  #   paths:
  #     - "**.py"

jobs:
  # GPU sizes and types that we could use:
  # g4dn.12xlarge  4x 16GB T4   (CC 7.5) (low availability)
  # p3.8xlarge     4x 16GB V100 (CC 7.0) (very low availability)
  # Unfit:
  # g3.16xlarge    4x 8GB Tesla M60 (CC 5.2) (not supported by cuda-11)
  # p2.8xlarge     8x 12GB K80      (CC 3.7) (not supported by cuda-11)

  # Job 1: launch the EC2 runner. Up to six attempts are made in order:
  # g4dn.12xlarge in each of three subnets, then p3.8xlarge in the same three
  # subnets. Every attempt only runs when the previous attempt failed.
  #
  # NOTE: the -1a/-1b/-1c suffix in the step ids only reflects the attempt
  # order; per the subnet comments below the attempts actually target
  # us-east-1c, us-east-1f and us-east-1a respectively.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      # label / instance id of whichever attempt succeeded (see last step)
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      # don't use the following subnets as p3.8xlarge is not supported there:
      # - subnet-06576a4b # us-east-1d
      # - subnet-859322b4 # us-east-1e
      # - subnet-47cfad21 # us-east-1b

      # --- attempts 1-3: g4dn.12xlarge -------------------------------------
      - name: Try to start EC2 runner (a)
        id: try-us-east-1a
        uses: machulav/ec2-github-runner@v2
        # a failure here must not fail the job: the next attempt takes over
        continue-on-error: true
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: g4dn.12xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-b7533b96  # us-east-1c
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      - name: Try to start EC2 runner (b)
        id: try-us-east-1b
        if: steps.try-us-east-1a.outcome == 'failure'
        uses: machulav/ec2-github-runner@v2
        continue-on-error: true
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: g4dn.12xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-a396b2ad  # us-east-1f
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      - name: Try to start EC2 runner (c)
        id: try-us-east-1c
        if: steps.try-us-east-1b.outcome == 'failure'
        uses: machulav/ec2-github-runner@v2
        continue-on-error: true
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: g4dn.12xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-df0f6180  # us-east-1a
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      # --- attempts 4-6: p3.8xlarge (fallback instance type) ----------------
      - name: Try to start EC2 runner (a-2)
        id: try-us-east-1a-2
        if: steps.try-us-east-1c.outcome == 'failure'
        uses: machulav/ec2-github-runner@v2
        continue-on-error: true
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: p3.8xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-b7533b96  # us-east-1c
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      - name: Try to start EC2 runner (b-2)
        id: try-us-east-1b-2
        if: steps.try-us-east-1a-2.outcome == 'failure'
        uses: machulav/ec2-github-runner@v2
        continue-on-error: true
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: p3.8xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-a396b2ad  # us-east-1f
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      # Last attempt: deliberately has no continue-on-error, so when this one
      # fails as well the whole job is reported as failed.
      - name: Try to start EC2 runner (c-2)
        id: try-us-east-1c-2
        if: steps.try-us-east-1b-2.outcome == 'failure'
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0ad997818d90480f2
          ec2-instance-type: p3.8xlarge
          security-group-id: sg-f2a4e2fc
          subnet-id: subnet-df0f6180  # us-east-1a
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      # Re-export the label / instance id of the attempt that succeeded under
      # one stable step id, so the job outputs do not need to know which
      # attempt it was. `outcome` (not `conclusion`) is checked because
      # continue-on-error turns the conclusion of a failed attempt into
      # "success".
      - name: See if any of 3 sub-regions had the resource
        id: start-ec2-runner
        run: |
          # Outputs are appended to the $GITHUB_OUTPUT file; the older
          # `::set-output` workflow command is deprecated.
          if [ "${{ steps.try-us-east-1a.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1a.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1a.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi
          if [ "${{ steps.try-us-east-1b.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1b.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1b.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi
          if [ "${{ steps.try-us-east-1c.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1c.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1c.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi
          if [ "${{ steps.try-us-east-1a-2.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1a-2.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1a-2.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi
          if [ "${{ steps.try-us-east-1b-2.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1b-2.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1b-2.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi
          if [ "${{ steps.try-us-east-1c-2.outcome }}" = "success" ]; then
            echo "label=${{ steps.try-us-east-1c-2.outputs.label }}" >> "$GITHUB_OUTPUT"
            echo "ec2-instance-id=${{ steps.try-us-east-1c-2.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT"
          fi

  # Job 2: run the test suite on the runner started by the previous job.
  do-the-job:
    name: Do the job on the runner
    needs: start-runner  # required to start the main job when the runner is ready
    # TODO: figure out how to cancel the previous build when a new push is
    # made while the old test run is still in progress
    # concurrency: # cancel previous build on a new push
    #   group: ${{ github.ref }} # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#github-context
    #   cancel-in-progress: true
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    steps:
      # print the GPU status of the runner before doing anything else
      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Checkout
        uses: actions/checkout@v2

      - name: Install Dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest-timeout

      # --timeout comes from the pytest-timeout plugin installed above
      - name: Run tests
        run: pytest --timeout=600 tests

  # Job 3: stop the EC2 runner again, whatever happened in the jobs before.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner  # required to get output from the start-runner job
      - do-the-job  # required to wait when the main job is done
    runs-on: ubuntu-latest
    if: ${{ always() }}  # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}