File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -156,9 +156,19 @@ jobs:
156156 # Replaces the container: directive so we can free disk space first.
157157 # Uses "docker run -d ... sleep infinity" + "docker exec" to preserve
158158 # installed packages and env vars across steps.
159+ # Retry the pull: nvcr.io intermittently times out ("context deadline
160+ # exceeded") under load, and ~30 matrix jobs hit it at once. Pulls
161+ # resume completed layers, so retries are cheap.
159162 - name : Pull NVHPC container
160163 if : matrix.nvhpc
161- run : docker pull "$NVHPC_IMAGE"
164+ run : |
165+ # Up to 5 attempts with a 30s/60s/90s/120s back-off in between; fail at once after the last attempt.
166+ for attempt in 1 2 3 4 5; do
167+   docker pull "$NVHPC_IMAGE" && exit 0
168+   if [ "$attempt" -eq 5 ]; then echo "::error::Failed to pull $NVHPC_IMAGE after 5 attempts"; exit 1; fi
169+   echo "docker pull failed (attempt $attempt/5); retrying in $((attempt * 30))s..."
170+   sleep $((attempt * 30))
171+ done
162172
163173 - name : Start NVHPC container
164174 if : matrix.nvhpc
You can’t perform that action at this time.
0 commit comments