Skip to content

Commit c63e8c4

Browse files
authored
Merge pull request EESSI#962 from bedroge/lmod_hook_removed_modules
Add Lmod startup hook that prints an error when loading removed/relocated modules
2 parents d1b35bb + 6b88810 commit c63e8c4

1 file changed

Lines changed: 67 additions & 0 deletions

File tree

create_lmodsitepackage.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,73 @@
210210
end
211211
end
212212
213+
--- Report whether an EESSI version 2023.06 accelerator stack is in use.
--
-- Every colon-separated entry of $MODULEPATH is inspected; an entry counts when it
-- starts with /cvmfs/software.eessi.io/versions/2023.06 and contains accel/nvidia/ccNN
-- (NN being two digits).
--
-- @treturn boolean true when such an entry exists, false otherwise
--   (including when $MODULEPATH is unset or empty)
local function using_eessi_accel_stack ()
    local eessi_prefix = "/cvmfs/software.eessi.io/versions/2023.06"
    local modulepath = os.getenv("MODULEPATH") or ""

    -- Split on ':' with '[^:]+' instead of '(.-):'. The latter only yields entries that
    -- are followed by a colon, so the last (or only) entry would never be checked.
    for path in string.gmatch(modulepath, '[^:]+') do
        -- Take the length from the prefix itself so the comparison cannot drift from the literal
        if string.sub(path, 1, #eessi_prefix) == eessi_prefix then
            if string.find(path, "accel/nvidia/cc%d%d") then
                return true
            end
        end
    end
    return false
end
229+
230+
--- Raise an Lmod error when the user tries to load a removed or relocated module.
--
-- Only acts when the user command is 'load' and $EESSI_SKIP_REMOVED_MODULES_CHECK is
-- not set. The requested module names are looked up (exact match) in the tables below;
-- if any of them is listed, LmodError is called once with all collected messages.
--
-- @param usrCmd the user command passed to the startup hook (compared against 'load')
local function eessi_removed_module_warning_startup_hook(usrCmd)
    if usrCmd ~= 'load' or os.getenv("EESSI_SKIP_REMOVED_MODULES_CHECK") then
        return
    end

    local CUDA_RELOCATION_MSG = [[All CUDA installations and modules depending on CUDA have been relocated to GPU-specific stacks.
Please see https://www.eessi.io/docs/site_specific_config/gpu/ for more information.]]

    local RELOCATED_CUDA_MODULES = {
        ['NCCL'] = CUDA_RELOCATION_MSG,
        ['NCCL/2.18.3-GCCcore-12.3.0-CUDA-12.1.1'] = CUDA_RELOCATION_MSG,
        ['UCX-CUDA'] = CUDA_RELOCATION_MSG,
        ['UCX-CUDA/1.14.1-GCCcore-12.3.0-CUDA-12.1.1'] = CUDA_RELOCATION_MSG,
        -- we also have non-CUDA versions of OSU Micro Benchmarks, so only match the CUDA version
        ['OSU-Micro-Benchmarks/7.2-gompi-2023a-CUDA-12.1.1'] = CUDA_RELOCATION_MSG,
        ['UCC-CUDA'] = CUDA_RELOCATION_MSG,
        ['UCC-CUDA/1.2.0-GCCcore-12.3.0-CUDA-12.1.1'] = CUDA_RELOCATION_MSG,
        ['CUDA'] = CUDA_RELOCATION_MSG,
        ['CUDA/12.1.1'] = CUDA_RELOCATION_MSG,
        ['CUDA-Samples'] = CUDA_RELOCATION_MSG,
        ['CUDA-Samples/12.1-GCC-12.3.0-CUDA-12.1.1'] = CUDA_RELOCATION_MSG,
    }

    local REMOVED_MODULES = {
        ['ipympl/0.9.3-foss-2023a'] = 'This module has been replaced by ipympl/0.9.3-gfbf-2023a',
    }

    -- Use a differently named local so the global masterTbl() function is not shadowed.
    -- Fall back to an empty list so a missing pargs cannot raise a raw Lua error here.
    local master_tbl = masterTbl()
    local requested = master_tbl.pargs or {}
    local messages = {}

    -- Append "<name>: <message>" for every requested module that has an entry in `lookup`.
    -- NOTE(review): ipairs keeps command-line order and assumes pargs is a plain array
    -- of the positional arguments; confirm against Lmod.
    local function collect(lookup)
        for _, name in ipairs(requested) do
            local msg = lookup[name]
            if msg ~= nil then
                messages[#messages + 1] = name .. ': ' .. msg .. '\\n\\n'
            end
        end
    end

    -- The CUDA messages should only be shown if the accelerator stack is NOT being used
    if not using_eessi_accel_stack() then
        collect(RELOCATED_CUDA_MODULES)
    end
    collect(REMOVED_MODULES)

    if #messages > 0 then
        LmodError('\\n' .. table.concat(messages) .. 'If you know what you are doing and you want to ignore this check for removed/relocated modules, set $EESSI_SKIP_REMOVED_MODULES_CHECK to any value.')
    end
end
274+
275+
-- Handler registered for Lmod's "startup" hook: passes the user command on to the
-- check for removed/relocated modules.
function eessi_startup_hook(usrCmd)
    eessi_removed_module_warning_startup_hook(usrCmd)
end

-- Register the EESSI handlers for the "startup" and "load" hooks
hook.register("startup", eessi_startup_hook)
hook.register("load", eessi_load_hook)
214281
215282
"""

0 commit comments

Comments
 (0)