@@ -31,55 +31,40 @@ def check_slurm_env(self) -> None:
3131
3232 Some HPC systems require this variable to be unset for correct CPU binding.
3333 """
34- os .environ .pop ("SLURM_CPU_BIND" , None )
34+ if "SLURM_CPU_BIND" in os .environ :
35+ os .environ .pop ("SLURM_CPU_BIND" )
3536
3637 def system_network_interface (self ) -> str :
3738 """
38- Select the most appropriate network interface for HPC communication .
39+ Get best candidate for HPC network interface from commonly known ones .
3940
40- If args.hpc_interface is provided, that value is used directly. Otherwise,
41- commonly used HPC interfaces are preferred. Loopback and container interfaces
42- are avoided because Dask workers on other nodes cannot connect to a scheduler
43- advertised on 127.0.0.1.
44-
45- Returns:
46- str: Name of selected network interface.
47-
48- Raises:
49- RuntimeError: If no suitable non-loopback interface can be found.
41+ This deliberately follows the WaterEntropy behaviour and only selects from
42+ known HPC-safe interfaces. It avoids selecting arbitrary interfaces such as
43+ eno1, which may exist on the master node but not on worker nodes.
5044 """
51- configured = getattr (self .args , "hpc_interface" , None )
52- if configured :
53- return configured
54-
55- preferred_nics = ["bond0" , "ib0" , "hsn0" , "eth0" ]
45+ hpc_nics = ["bond0" , "ib0" , "hsn0" , "eth0" ]
5646 interfaces = list (psutil .net_if_addrs ().keys ())
5747
58- for iface in preferred_nics :
48+ for iface in hpc_nics :
5949 if iface in interfaces :
6050 return iface
6151
62- for iface in interfaces :
63- if not iface .startswith (("lo" , "docker" , "veth" )):
64- return iface
65-
6652 raise RuntimeError (
67- "Could not find a non-loopback network interface for Dask workers. "
68- f"Available interfaces: { interfaces } . Set 'hpc_interface' in config.yaml."
53+ "Could not find a known HPC network interface. "
54+ f"Available interfaces: { interfaces } . "
55+ "Expected one of: bond0, ib0, hsn0, eth0."
6956 )
7057
7158 def slurm_directives (self ) -> tuple [list [str ], list [str ]]:
7259 """
73- Build SLURM job directives and skip list .
60+ Process extra SLURM directives and directives to be skipped .
7461
7562 Returns:
76- Tuple[List[str], List[str]]:
77- - Extra SLURM directives
78- - Directives to skip
63+ Tuple containing extra directives and skipped directives.
7964 """
8065 args = self .args
66+
8167 extra : list [str ] = []
82- skip : list [str ] = ["--mem" ]
8368
8469 if args .hpc_account :
8570 extra .append (f'--account="{ args .hpc_account } "' )
@@ -88,14 +73,16 @@ def slurm_directives(self) -> tuple[list[str], list[str]]:
8873 if args .hpc_constraint :
8974 extra .append (f'--constraint="{ args .hpc_constraint } "' )
9075
76+ skip = ["--mem" ]
77+
9178 return extra , skip
9279
9380 def slurm_prologues (self ) -> list [str ]:
9481 """
95- Build SLURM job prologue commands for environment setup .
82+ Process environment setup commands for the SLURM worker job script .
9683
9784 Returns:
98- List[str]: Shell commands executed before job start .
85+ List of shell commands executed before the Dask worker starts .
9986 """
10087 args = self .args
10188 prologue : list [str ] = []
@@ -115,10 +102,10 @@ def slurm_prologues(self) -> list[str]:
115102
116103 def configure_cluster (self ) -> Client :
117104 """
118- Configure and launch a SLURM-backed Dask cluster.
105+ Configure a SLURM-backed Dask cluster.
119106
120107 Returns:
121- Client: Dask distributed client connected to cluster .
108+ Dask distributed client connected to the SLURMCluster .
122109 """
123110 args = self .args
124111
@@ -140,7 +127,6 @@ def configure_cluster(self) -> Client:
140127 shebang = "#!/bin/bash --login" ,
141128 local_directory = "$PWD" ,
142129 interface = iface ,
143- scheduler_options = {"interface" : iface },
144130 job_script_prologue = prologue ,
145131 )
146132
@@ -155,9 +141,9 @@ def configure_cluster(self) -> Client:
155141
156142 def submit_master (self ) -> None :
157143 """
158- Submit a SLURM job that runs a master Dask orchestration process.
144+ Submit a SLURM job that runs the master CodeEntropy process.
159145
160- This generates a temporary SLURM script and submits it via ` sbatch` .
146+ This generates a temporary SLURM script and submits it via sbatch.
161147 """
162148 cli = list (sys .argv [1 :])
163149 if "--submit" in cli :
0 commit comments