@@ -121,27 +121,38 @@ def parse_hook(ec, *args, **kwargs):
121121
def post_ready_hook(self, *args, **kwargs):
    """
    Post-ready hook: limit parallelism for selected builds based on software name and CPU target.

    Parallelism needs to be limited because some builds require a lot of memory per used core.
    The limit to apply is looked up in PARALLELISM_LIMITS under the software name: an entry for
    the current CPU target takes precedence over the generic '*' entry. If neither is present,
    the 'parallel' easyconfig parameter is left untouched.
    """
    # 'parallel' easyconfig parameter is set via EasyBlock.set_parallel in ready step based on available cores.
    parallel = self.cfg['parallel']
    if parallel == 1:
        return  # no need to limit if already using 1 core

    # get CPU target
    cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR')

    # check if we have limits defined for this software
    limits = PARALLELISM_LIMITS.get(self.name)
    if not limits:
        return

    # CPU-specific limit wins over the generic one (which applies to all CPU targets)
    limit = limits.get(cpu_target, limits.get('*'))
    if limit is None:
        return  # no applicable limits found

    operation_func, operation_args = limit
    # never go below a single core, whatever the configured operation returns
    new_parallel = max(1, operation_func(parallel, operation_args))

    # this hook only ever lowers parallelism; a result that is not lower is ignored
    if new_parallel < parallel:
        self.cfg['parallel'] = new_parallel
        msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing"
        print_msg(msg % (new_parallel, parallel, self.name, cpu_target), log=self.log)
145156
146157
147158def pre_prepare_hook (self , * args , ** kwargs ):
@@ -1249,3 +1260,38 @@ def post_module_hook(self, *args, **kwargs):
12491260}
12501261
12511262POST_MODULE_HOOKS = {}
1263+
1264+ # Define parallelism limit operations
def divide_by_factor(parallel, factor):
    """
    Return parallel divided by factor (integer division), but never less than 1.
    """
    reduced = parallel // factor
    if reduced > 1:
        return reduced
    return 1
1268+
def set_maximum(parallel, max_value):
    """
    Cap parallelism at max_value.

    Returns parallel unchanged when it does not exceed max_value, and max_value otherwise.
    The result is never lower than 1, so a zero or negative cap cannot produce an
    unusable parallelism setting.
    """
    return max(1, min(parallel, max_value))
1272+
# Parallelism limits per software name and CPU target.
# Layout: {software_name: {cpu_target: (operation_function, operation_args)}}
# The key '*' stands for "any CPU target".
# post_ready_hook consumes this table: it first looks for an entry matching the
# current CPU target and only falls back to the generic '*' entry when there is none.
PARALLELISM_LIMITS = {
    'libxc': {'*': (divide_by_factor, 2), CPU_TARGET_A64FX: (set_maximum, 12)},
    'MBX': {'*': (divide_by_factor, 2)},
    'TensorFlow': {'*': (divide_by_factor, 2), CPU_TARGET_A64FX: (set_maximum, 8)},
    'Qt5': {CPU_TARGET_A64FX: (divide_by_factor, 2)},
    'ROOT': {CPU_TARGET_A64FX: (divide_by_factor, 2)},
}
0 commit comments