Skip to content

Commit 528db0f

Browse files
AndrewJGaut and wwwjn authored
Fix GPU Discovery. (#4522)
* Fix gpu output. Already tested on slurm
* Add more logging

---------

Co-authored-by: Jiani Wang <40016222+wwwjn@users.noreply.github.com>
1 parent 2b2a31c commit 528db0f

2 files changed

Lines changed: 11 additions & 3 deletions

File tree

codalab/worker/docker_utils.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,11 @@ def get_nvidia_devices(self, use_docker=True):
152152
docker.errors.ImageNotFound if the CUDA image cannot be pulled
153153
docker.errors.APIError if another server error occurs
154154
"""
155-
cuda_image = 'sulfurheron/nvidia-cuda:9.0-cudnn7-devel-ubuntu16.04-2018-06-08'
155+
156+
# Note: Do NOT update the NVIDIA image to use a CUDA version higher than
157+
# that supported by the NLP machines. Otherwise, Slurm Batch Worker
158+
# Manager will no longer function.
159+
cuda_image = 'nvidia/cuda:11.5.2-base-ubuntu20.04'
156160
nvidia_command = 'nvidia-smi --query-gpu=index,uuid --format=csv,noheader'
157161
if use_docker:
158162
self.client.images.pull(cuda_image)

codalab/worker/main.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -453,13 +453,17 @@ def parse_gpuset_args(arg):
453453

454454
try:
455455
all_gpus = DockerRuntime().get_nvidia_devices() # Dict[GPU index: GPU UUID]
456-
except DockerException:
456+
except DockerException as e:
457+
logger.error(e)
458+
logger.error("Setting all_gpus to be empty...")
457459
all_gpus = {}
458460
# Docker socket can't be used
459461
except requests.exceptions.ConnectionError:
460462
try:
461463
all_gpus = DockerRuntime().get_nvidia_devices(use_docker=False)
462-
except SingularityError:
464+
except SingularityError as e:
465+
logger.error(e)
466+
logger.error("Setting all_gpus to be empty...")
463467
all_gpus = {}
464468

465469
if arg == 'ALL':

0 commit comments

Comments
 (0)