File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -152,7 +152,11 @@ def get_nvidia_devices(self, use_docker=True):
152152 docker.errors.ImageNotFound if the CUDA image cannot be pulled
153153 docker.errors.APIError if another server error occurs
154154 """
155- cuda_image = 'sulfurheron/nvidia-cuda:9.0-cudnn7-devel-ubuntu16.04-2018-06-08'
155+
156+ # Note: Do NOT update the NVIDIA image to use a CUDA version higher than
157+ # that supported by the NLP machines. Otherwise, Slurm Batch Worker
158+ # Manager will no longer function.
159+ cuda_image = 'nvidia/cuda:11.5.2-base-ubuntu20.04'
156160 nvidia_command = 'nvidia-smi --query-gpu=index,uuid --format=csv,noheader'
157161 if use_docker :
158162 self .client .images .pull (cuda_image )
Original file line number Diff line number Diff line change @@ -453,13 +453,17 @@ def parse_gpuset_args(arg):
453453
454454 try :
455455 all_gpus = DockerRuntime ().get_nvidia_devices () # Dict[GPU index: GPU UUID]
456- except DockerException :
456+ except DockerException as e :
457+ logger .error (e )
458+ logger .error ("Setting all_gpus to be empty..." )
457459 all_gpus = {}
458460 # Docker socket can't be used
459461 except requests .exceptions .ConnectionError :
460462 try :
461463 all_gpus = DockerRuntime ().get_nvidia_devices (use_docker = False )
462- except SingularityError :
464+ except SingularityError as e :
465+ logger .error (e )
466+ logger .error ("Setting all_gpus to be empty..." )
463467 all_gpus = {}
464468
465469 if arg == 'ALL' :
You can’t perform that action at this time.
0 commit comments