Skip to content

Commit a4cd7b3

Browse files
committed
Merge branch 'extrae_5.0.0' of github.com:bedroge/software-layer-scripts into extrae_5.0.0
2 parents de0398a + 21b6d7c commit a4cd7b3

3 files changed

Lines changed: 43 additions & 19 deletions

File tree

create_lmodsitepackage.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@
172172
else
173173
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
174174
end
175-
local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt"
176175
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
177176
local cudaDriverExists = isFile(cudaDriverFile)
178177
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
@@ -189,28 +188,46 @@
189188
else
190189
-- CUDA driver exists, now we check its version to see if an update is needed
191190
if cudaDriverExists then
192-
local cudaVersion = read_file(cudaVersionFile)
193-
if not cudaVersion then
194-
LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs)
191+
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
192+
if not cudaVersion or cudaVersion == "" then
193+
local eessi_prefix = os.getenv("EESSI_PREFIX")
194+
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
195+
source_sh("bash", script)
195196
end
197+
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
196198
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
197-
-- driver CUDA versions don't give a patch version for CUDA
198-
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
199-
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
200-
local driver_libs_need_update = false
201-
if major < major_req then
202-
driver_libs_need_update = true
203-
elseif major == major_req then
204-
if minor < minor_req then
199+
-- Account for the fact that the script sourced above was designed to never return a non-zero exit
200+
-- even if it fails to set EESSI_CUDA_DRIVER_VERSION
201+
-- Essentially, we handle that case here by emitting a warning (not an error), which can be suppressed
202+
if not cudaVersion or cudaVersion == "" then
203+
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
204+
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
205+
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
206+
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
207+
warn = warn .. "as expected. Export " .. suppress_var .. "=1 to suppress this warning."
208+
local suppress_warn = os.getenv(suppress_var)
209+
if suppress_warn ~= "1" then -- os.getenv returns a string or nil, so compare with "1" and not the number 1
210+
LmodWarning(warn)
211+
end
212+
else
213+
-- driver CUDA versions don't give a patch version for CUDA
214+
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
215+
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
216+
local driver_libs_need_update = false
217+
if tonumber(major) < tonumber(major_req) then
205218
driver_libs_need_update = true
219+
elseif tonumber(major) == tonumber(major_req) then
220+
if tonumber(minor) < tonumber(minor_req) then
221+
driver_libs_need_update = true
222+
end
223+
end
224+
if driver_libs_need_update == true then
225+
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
226+
advice = advice .. "Please update your CUDA driver libraries and then "
227+
advice = advice .. "let EESSI know about the update.\\n"
228+
advice = advice .. refer_to_docs
229+
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
206230
end
207-
end
208-
if driver_libs_need_update == true then
209-
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
210-
advice = advice .. "Please update your CUDA driver libraries and then "
211-
advice = advice .. "let EESSI know about the update.\\n"
212-
advice = advice .. refer_to_docs
213-
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
214231
end
215232
end
216233
end

install_scripts.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ nvidia_files=(
211211
install_cuda_and_libraries.sh
212212
install_cuda_host_injections.sh
213213
link_nvidia_host_libraries.sh
214+
get_cuda_driver_version.sh
214215
)
215216
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"
216217

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# This can be leveraged by the source_sh() feature of Lmod
2+
# Because we want to source this without immediately raising an LmodError upon failure, this script
3+
# is designed to ALWAYS return a 0 exit code
4+
EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return 0
5+
# The || return 0 shouldn't be needed, but just to be overly sure that this script always returns 0
6+
export EESSI_CUDA_DRIVER_VERSION || return 0

0 commit comments

Comments
 (0)