Skip to content

Commit 84e97a3

Browse files
authored
init fix example (#109)
* init fix example * remove back composable
1 parent a01b1ca commit 84e97a3

3 files changed

Lines changed: 73 additions & 29 deletions

File tree

dlslime/dlslime/csrc/topology.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,11 @@ json discoverTopology(const std::optional<std::string>& preferred_d
247247
devices = std::move(ordered);
248248
}
249249

250-
if (devices.empty()) {
251-
throw std::runtime_error("No RDMA devices available");
252-
}
253-
250+
// Empty `devices` is not an error: it means the host has no RDMA devices
251+
// visible to us (e.g. TCP-only deployment, or `SLIME_VISIBLE_DEVICES`
252+
// filtered everything out). Return an empty topology and let the caller
253+
// decide whether to fall back to TCP. Mirrors the early-return at the
254+
// top of listRdmaDevices() when the sysfs root is absent.
254255
json nics = json::array();
255256
for (const auto& device : devices) {
256257
json port = readSysfsPort(device, ib_port, sysfs_root);

dlslime/dlslime/peer_agent/_agent.py

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,13 @@
2525
"Install them with: pip install httpx redis"
2626
) from e
2727

28-
from dlslime import discover_topology, RDMAContext, RDMAEndpoint, RDMAMemoryPool
28+
from dlslime import (
29+
available_nic,
30+
discover_topology,
31+
RDMAContext,
32+
RDMAEndpoint,
33+
RDMAMemoryPool,
34+
)
2935

3036
try: # TCP support is a build-time option (BUILD_TCP). Tolerate its absence.
3137
from dlslime import TcpEndpoint as _TcpEndpoint
@@ -267,6 +273,10 @@ def __init__(
267273
device: RDMA device name (e.g., "mlx5_0"), if None, auto-select
268274
scope: Scope string for multi-tenant isolation (used as Redis key prefix).
269275
"""
276+
# Set first so __del__ doesn't crash with AttributeError if __init__
277+
# raises partway through (e.g. discover_topology on a TCP-only host
278+
# built against an older binary that throws).
279+
self._shutdown_called = False
270280
self.ctrl_url = ctrl_url
271281
self._redis_address: Optional[str] = None
272282
self.alias: str = alias or ""
@@ -347,7 +357,6 @@ def __init__(
347357
self._redis_client: Optional[redis.Redis] = None
348358

349359
self._stop_event = threading.Event()
350-
self._shutdown_called = False
351360

352361
# Event listener for cleanup only (legacy inbox)
353362
self._event_thread: Optional[threading.Thread] = None
@@ -432,11 +441,32 @@ def _first_usable_resource_key(
432441
) -> RdmaResourceKey:
433442
wanted_link = self._normalize_link_type(link_type) if link_type else None
434443
nics = resource.get("nics") or []
444+
# When inspecting our OWN topology, restrict to NICs that the local
445+
# userspace libibverbs can actually open. The topology may advertise
446+
# NICs that ibv_get_device_list() doesn't see (container missing
447+
# /dev/infiniband, missing rdma-core providers, perms, etc.); skipping
448+
# them lets _default_local_resource_key's existing
449+
# `except RuntimeError: return TcpResourceKey()` fall back transparently.
450+
# For a peer's resource we cannot judge openability, so skip the check.
451+
check_openable = resource is self._local_resource
452+
openable = set(available_nic()) if check_openable else set()
453+
skipped: List[str] = []
435454
for nic in nics:
436-
if device is not None and nic.get("name") != device:
455+
nic_name = str(nic.get("name", ""))
456+
if device is not None and nic_name != device:
437457
continue
438458
if nic.get("health", "AVAILABLE") == "UNAVAILABLE":
439459
continue
460+
if check_openable and nic_name and nic_name not in openable:
461+
skipped.append(nic_name)
462+
logger.warning(
463+
"RDMA device %r advertised in topology but not openable by "
464+
"libibverbs (userspace sees: %s); skipping. Will fall back "
465+
"to TCP if no other NIC is usable.",
466+
nic_name,
467+
sorted(openable),
468+
)
469+
continue
440470
for port in nic.get("ports") or []:
441471
port_num = int(port.get("port", 1))
442472
if ib_port is not None and port_num != int(ib_port):
@@ -451,9 +481,13 @@ def _first_usable_resource_key(
451481
raise RuntimeError(
452482
f"Cannot select RDMA port for {nic.get('name')}: unknown link_type"
453483
)
454-
return RdmaResourceKey(str(nic["name"]), port_num, port_link)
484+
return RdmaResourceKey(nic_name, port_num, port_link)
455485

456486
detail = f"device={device!r}, ib_port={ib_port!r}, link_type={link_type!r}"
487+
if check_openable and skipped:
488+
detail += (
489+
f", skipped_unopenable={skipped}, libibverbs_sees={sorted(openable)}"
490+
)
457491
raise RuntimeError(f"No usable RDMA resource found ({detail})")
458492

459493
def _get_context_and_pool(
@@ -551,18 +585,24 @@ def _register_peer_with_nanoctrl(self) -> Dict[str, Any]:
551585
kwargs["resource"] = self._local_resource
552586

553587
if {"device", "ib_port", "link_type", "name_prefix"} & set(params):
554-
key = self._first_usable_resource_key(
555-
self._local_resource,
556-
device=self._preferred_device,
557-
ib_port=1,
558-
link_type=None,
559-
)
560-
if "device" in params:
561-
kwargs["device"] = key.device
562-
if "ib_port" in params:
563-
kwargs["ib_port"] = key.ib_port
564-
if "link_type" in params:
565-
kwargs["link_type"] = key.link_type
588+
try:
589+
key = self._first_usable_resource_key(
590+
self._local_resource,
591+
device=self._preferred_device,
592+
ib_port=1,
593+
link_type=None,
594+
)
595+
except RuntimeError:
596+
# TCP-only deployment: no usable RDMA NIC. Omit the RDMA-specific
597+
# registration kwargs entirely; NanoCtrl treats them as optional.
598+
key = None
599+
if key is not None:
600+
if "device" in params:
601+
kwargs["device"] = key.device
602+
if "ib_port" in params:
603+
kwargs["ib_port"] = key.ib_port
604+
if "link_type" in params:
605+
kwargs["link_type"] = key.link_type
566606
if "name_prefix" in params:
567607
kwargs["name_prefix"] = "agent"
568608

dlslime/tests/python/test_peer_agent_topology_discovery.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -362,12 +362,15 @@ def test_peer_agent_scope_isolates_discovery_namespace():
362362
assert default_scope_target_with_same_alias.get_resource("dlslime1") is None
363363

364364

365-
def test_discover_topology_requires_at_least_one_nic(tmp_path):
366-
with pytest.raises(RuntimeError, match="No RDMA devices available"):
367-
discover_topology(
368-
preferred_device=None,
369-
ib_port=1,
370-
preferred_link_type=None,
371-
sysfs_root=str(tmp_path),
372-
devices=[],
373-
)
365+
def test_discover_topology_returns_empty_when_no_nics(tmp_path):
366+
"""No NICs is a valid result, not an error: TCP-only deployments rely on
367+
discover_topology returning an empty topology so PeerAgent can construct."""
368+
resource = discover_topology(
369+
preferred_device=None,
370+
ib_port=1,
371+
preferred_link_type=None,
372+
sysfs_root=str(tmp_path),
373+
devices=[],
374+
)
375+
assert resource["nics"] == []
376+
assert resource.get("schema_version") == 1

0 commit comments

Comments
 (0)