2525 "Install them with: pip install httpx redis"
2626 ) from e
2727
28- from dlslime import discover_topology , RDMAContext , RDMAEndpoint , RDMAMemoryPool
28+ from dlslime import (
29+ available_nic ,
30+ discover_topology ,
31+ RDMAContext ,
32+ RDMAEndpoint ,
33+ RDMAMemoryPool ,
34+ )
2935
3036try : # TCP support is a build-time option (BUILD_TCP). Tolerate its absence.
3137 from dlslime import TcpEndpoint as _TcpEndpoint
@@ -267,6 +273,10 @@ def __init__(
267273 device: RDMA device name (e.g., "mlx5_0"), if None, auto-select
268274 scope: Scope string for multi-tenant isolation (used as Redis key prefix).
269275 """
276+ # Set first so __del__ doesn't crash with AttributeError if __init__
277+ # raises partway through (e.g. discover_topology on a TCP-only host
278+ # built against an older binary that throws).
279+ self ._shutdown_called = False
270280 self .ctrl_url = ctrl_url
271281 self ._redis_address : Optional [str ] = None
272282 self .alias : str = alias or ""
@@ -347,7 +357,6 @@ def __init__(
347357 self ._redis_client : Optional [redis .Redis ] = None
348358
349359 self ._stop_event = threading .Event ()
350- self ._shutdown_called = False
351360
352361 # Event listener for cleanup only (legacy inbox)
353362 self ._event_thread : Optional [threading .Thread ] = None
@@ -432,11 +441,32 @@ def _first_usable_resource_key(
432441 ) -> RdmaResourceKey :
433442 wanted_link = self ._normalize_link_type (link_type ) if link_type else None
434443 nics = resource .get ("nics" ) or []
444+ # When inspecting our OWN topology, restrict to NICs that the local
445+ # userspace libibverbs can actually open. The topology may advertise
446+ # NICs that ibv_get_device_list() doesn't see (container missing
447+ # /dev/infiniband, missing rdma-core providers, perms, etc.); skipping
448+ # them lets _default_local_resource_key's existing
449+ # `except RuntimeError: return TcpResourceKey()` fall back transparently.
450+ # For a peer's resource we cannot judge openability, so skip the check.
451+ check_openable = resource is self ._local_resource
452+ openable = set (available_nic ()) if check_openable else set ()
453+ skipped : List [str ] = []
435454 for nic in nics :
436- if device is not None and nic .get ("name" ) != device :
455+ nic_name = str (nic .get ("name" , "" ))
456+ if device is not None and nic_name != device :
437457 continue
438458 if nic .get ("health" , "AVAILABLE" ) == "UNAVAILABLE" :
439459 continue
460+ if check_openable and nic_name and nic_name not in openable :
461+ skipped .append (nic_name )
462+ logger .warning (
463+ "RDMA device %r advertised in topology but not openable by "
464+ "libibverbs (userspace sees: %s); skipping. Will fall back "
465+ "to TCP if no other NIC is usable." ,
466+ nic_name ,
467+ sorted (openable ),
468+ )
469+ continue
440470 for port in nic .get ("ports" ) or []:
441471 port_num = int (port .get ("port" , 1 ))
442472 if ib_port is not None and port_num != int (ib_port ):
@@ -451,9 +481,13 @@ def _first_usable_resource_key(
451481 raise RuntimeError (
452482 f"Cannot select RDMA port for { nic .get ('name' )} : unknown link_type"
453483 )
454- return RdmaResourceKey (str ( nic [ "name" ]) , port_num , port_link )
484+ return RdmaResourceKey (nic_name , port_num , port_link )
455485
456486 detail = f"device={ device !r} , ib_port={ ib_port !r} , link_type={ link_type !r} "
487+ if check_openable and skipped :
488+ detail += (
489+ f", skipped_unopenable={ skipped } , libibverbs_sees={ sorted (openable )} "
490+ )
457491 raise RuntimeError (f"No usable RDMA resource found ({ detail } )" )
458492
459493 def _get_context_and_pool (
@@ -551,18 +585,24 @@ def _register_peer_with_nanoctrl(self) -> Dict[str, Any]:
551585 kwargs ["resource" ] = self ._local_resource
552586
553587 if {"device" , "ib_port" , "link_type" , "name_prefix" } & set (params ):
554- key = self ._first_usable_resource_key (
555- self ._local_resource ,
556- device = self ._preferred_device ,
557- ib_port = 1 ,
558- link_type = None ,
559- )
560- if "device" in params :
561- kwargs ["device" ] = key .device
562- if "ib_port" in params :
563- kwargs ["ib_port" ] = key .ib_port
564- if "link_type" in params :
565- kwargs ["link_type" ] = key .link_type
588+ try :
589+ key = self ._first_usable_resource_key (
590+ self ._local_resource ,
591+ device = self ._preferred_device ,
592+ ib_port = 1 ,
593+ link_type = None ,
594+ )
595+ except RuntimeError :
596+ # TCP-only deployment: no usable RDMA NIC. Omit the RDMA-specific
597+ # registration kwargs entirely; NanoCtrl treats them as optional.
598+ key = None
599+ if key is not None :
600+ if "device" in params :
601+ kwargs ["device" ] = key .device
602+ if "ib_port" in params :
603+ kwargs ["ib_port" ] = key .ib_port
604+ if "link_type" in params :
605+ kwargs ["link_type" ] = key .link_type
566606 if "name_prefix" in params :
567607 kwargs ["name_prefix" ] = "agent"
568608
0 commit comments