Skip to content

Commit 7baaaff

Browse files
committed
fix(hpc): use explicit Dask scheduler interface
1 parent bf372a3 commit 7baaaff

1 file changed

Lines changed: 29 additions & 3 deletions

File tree

CodeEntropy/core/dask_clusters.py

Lines changed: 29 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88

99
import psutil
1010
from dask.distributed import Client
11-
from dask_jobqueue.slurm import SLURMCluster
11+
from dask_jobqueue import SLURMCluster
1212

1313

1414
class HPCDaskManager:
@@ -37,18 +37,36 @@ def system_network_interface(self) -> str:
3737
"""
3838
Select the most appropriate network interface for HPC communication.
3939
40+
If args.hpc_interface is provided, that value is used directly. Otherwise,
41+
commonly used HPC interfaces are preferred. Loopback and container interfaces
42+
are avoided because Dask workers on other nodes cannot connect to a scheduler
43+
advertised on 127.0.0.1.
44+
4045
Returns:
4146
str: Name of selected network interface.
47+
48+
Raises:
49+
RuntimeError: If no suitable non-loopback interface can be found.
4250
"""
51+
configured = getattr(self.args, "hpc_interface", None)
52+
if configured:
53+
return configured
54+
4355
preferred_nics = ["bond0", "ib0", "hsn0", "eth0"]
4456
interfaces = list(psutil.net_if_addrs().keys())
4557

4658
for iface in preferred_nics:
4759
if iface in interfaces:
4860
return iface
4961

50-
# fallback to first available interface
51-
return interfaces[0]
62+
for iface in interfaces:
63+
if not iface.startswith(("lo", "docker", "veth")):
64+
return iface
65+
66+
raise RuntimeError(
67+
"Could not find a non-loopback network interface for Dask workers. "
68+
f"Available interfaces: {interfaces}. Set 'hpc_interface' in config.yaml."
69+
)
5270

5371
def slurm_directives(self) -> tuple[list[str], list[str]]:
5472
"""
@@ -82,6 +100,9 @@ def slurm_prologues(self) -> list[str]:
82100
args = self.args
83101
prologue: list[str] = []
84102

103+
for module_name in getattr(args, "hpc_modules", None) or []:
104+
prologue.append(f"module load {module_name}")
105+
85106
prologue.append(f'eval "$({args.conda_path} shell.bash hook)"')
86107

87108
if args.conda_exec == "mamba":
@@ -119,6 +140,7 @@ def configure_cluster(self) -> Client:
119140
shebang="#!/bin/bash --login",
120141
local_directory="$PWD",
121142
interface=iface,
143+
scheduler_options={"interface": iface},
122144
job_script_prologue=prologue,
123145
)
124146

@@ -164,6 +186,10 @@ def submit_master(self) -> None:
164186
f.write(f"#SBATCH --qos={self.args.hpc_qos}\n")
165187

166188
f.write("\n")
189+
190+
for module_name in getattr(self.args, "hpc_modules", None) or []:
191+
f.write(f"module load {module_name}\n")
192+
167193
f.write(f'eval "$({self.args.conda_path} shell.bash hook)"\n')
168194

169195
if self.args.conda_exec == "mamba":

0 commit comments

Comments
 (0)