Skip to content

Commit 2229170

Browse files
author
Caspar van Leeuwen
committed
Change strategy in order to avoid an LmodError: we don't want to execute get_cuda_driver_version.sh twice, as it's costly. Instead, we adapt the script to always return a 0 exit code, and the calling Lmod hook then handles the case where EESSI_CUDA_DRIVER_VERSION is still NOT set after the script has been sourced
1 parent 08e0cbc commit 2229170

2 files changed

Lines changed: 9 additions & 22 deletions

File tree

create_lmodsitepackage.py

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -192,28 +192,13 @@
192192
if not cudaVersion or cudaVersion == "" then
193193
local eessi_prefix = os.getenv("EESSI_PREFIX")
194194
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
195-
-- We cannot immediately use source_sh, since lmod has no way of catching a potential error
196-
-- and we don't want this to raise an LmodError just because nvidia-smi doesn't exist or
197-
-- doesn't print the right output (happens on a node with nvidia-smi but no driver installed).
198-
-- The only way to catch this is to source the script first with os.execute and make sure it
199-
-- returns with a zero exit code. Unfortunately, this means we have to run nvidia-smi twice, which
200-
-- is a bit slow. Since the result is then cached in the EESSI_CUDA_DRIVER_VERSION environment
201-
-- variable, this is probably acceptable
202-
local r1, r2, r3 = os.execute("bash -c 'source " .. script .. "'")
203-
local exit_code = 0
204-
if type(r1) == "number" then
205-
-- Lua 5.1 or earlier, this is our exit code
206-
exit_code = r1
207-
else
208-
-- Lua 5.2 or later, r3 is our exit code
209-
exit_code = r3
210-
end
211-
if exit_code == 0 then
212-
source_sh("bash", script)
213-
end
195+
source_sh("bash", script)
214196
end
215197
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
216198
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
199+
-- Account for the fact that the script sourced above was designed to never return a non-zero exit
200+
-- even if it fails to set EESSI_CUDA_DRIVER_VERSION
201+
-- Essentially, we handle that case here by raising an error, which can be suppressed
217202
if not cudaVersion or cudaVersion == "" then
218203
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
219204
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# This can be leveraged by the source_sh() feature of Lmod
2-
set -o pipefail
3-
EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return $?
4-
export EESSI_CUDA_DRIVER_VERSION
2+
# Because we want to source this without immediately raising an LmodError upon failure, this script
3+
# is designed to ALWAYS return a 0 exit code
4+
EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return 0
5+
# The || return 0 shouldn't be needed, but just to be overly sure that this script always returns 0
6+
export EESSI_CUDA_DRIVER_VERSION || return 0

0 commit comments

Comments
 (0)