2222
2323from __future__ import annotations
2424
25+ import ipaddress
2526import os
2627import socket
2728import subprocess
@@ -67,19 +68,12 @@ def _tq(): # pragma: no cover - trivially exercised by smoke tests
6768def _get_local_node_ip () -> str :
6869 """Return THIS process's host IP, not the cluster head's.
6970
70- Each Ray actor process must use its own node's IP for Mooncake's
71- listener bind (multi-node correctness). If we used the head IP,
72- actors on worker nodes would announce a listener address that
73- only routes back to the head — peers fail with connection refused.
74-
75- Skips link-local APIPA addresses (RFC 3927 IPv4 169.254/16,
76- RFC 4291 IPv6 fe80::/10): on this cluster ``avahi-autoipd``
77- assigns 169.254.x to ``usb0``, and ``gethostbyname`` can resolve
78- to that non-routable address. The cluster wrapper's network-init
79- block strips usb0 in most cases, but the check is a defense in
80- depth (and free).
71+ Each Ray actor process must use its own node's IP so Mooncake's
72+ announce address (``MC_TCP_BIND_ADDRESS`` → ``desc.ip_or_host_name``
73+ in ``transfer_engine_impl.cpp``) is routable cross-node. Link-local
74+ (169.254/16, fe80::/10) is rejected — ``gethostbyname`` can resolve
75+ to APIPA on hosts where ``avahi-autoipd`` is active.
8176 """
82- import ipaddress
8377 try :
8478 ip = socket .gethostbyname (socket .gethostname ())
8579 if ipaddress .ip_address (ip ).is_link_local :
@@ -89,42 +83,6 @@ def _get_local_node_ip() -> str:
8983 return ""
9084
9185
92- def _usb0_down () -> None :
93- """Best-effort attempt to take down usb0 / strip 169.254.x APIPA.
94-
95- **DO NOT rely on this from Python.** Ray actors run unprivileged —
96- the ``ip``/``ifconfig`` calls here silently return "Operation not
97- permitted" without `CAP_NET_ADMIN`. Even when run as root, the fix
98- is too late: Mooncake's RPC listener has already scanned
99- ``getifaddrs()`` and bound to the first active interface (usually
100- ``usb0`` 169.254.3.1, the link-local APIPA address) before the
101- Python adapter is loaded. Background daemons (``avahi-autoipd``,
102- NetworkManager) also re-assign the APIPA address within seconds.
103-
104- The proven fix lives at the **Slurm container start-up** layer
105- (e.g. a ``NETWORK_INIT_CMDS`` block in the cluster wrapper that
106- kills ``avahi-autoipd``, sets ``nmcli device set usb0 managed no``,
107- flushes the address, and runs a 5 s relaunch loop as a failsafe).
108- See ``research/data_plane_mooncake_status.md`` and
109- ``data-plane-bench/DEBUG_TQ_BACKENDS.md`` (Issue 1).
110-
111- This function is kept for reference only; it is a no-op on the
112- workers where it matters.
113- """
114- cmds = [
115- "ifconfig usb0 0.0.0.0 2>/dev/null" ,
116- "ifconfig usb0 down 2>/dev/null" ,
117- "ip link set usb0 down 2>/dev/null" ,
118- "ip addr flush dev usb0 2>/dev/null" ,
119- ]
120- try :
121- subprocess .run (
122- ["sh" , "-c" , "; " .join (cmds )], check = False , capture_output = True
123- )
124- except Exception :
125- pass
126-
127-
12886def _mooncake_transport_config () -> dict :
12987 protocol = os .environ .get ("MC_MOONCAKE_PROTOCOL" , "tcp" )
13088 if protocol != "rdma" :
@@ -266,46 +224,26 @@ def _init_tq(cfg: DataPlaneConfig) -> None:
266224 },
267225 }
268226 elif backend == "mooncake_cpu" :
269- # Enable KV-path 1D→2D promotion (see codec._KV_PROMOTE_1D);
270- # mooncake_cpu goes through TQ's KVStorageManager which has the
271- # 1D schema/data mismatch. Idempotent with the per-process
272- # set_kv_promote_1d in TQDataPlaneClient.__init__; kept here
273- # so this branch is self-contained.
274- from nemo_rl .data_plane .codec import set_kv_promote_1d
275- set_kv_promote_1d (True )
276-
277227 # The mooncake-transfer-engine wheel ships `mooncake_master` at
278228 # <site-packages>/mooncake/, NOT on $PATH. TQ's
279229 # subprocess.Popen(["mooncake_master", ...]) fails with
280230 # FileNotFoundError unless we put the package dir on PATH first.
281- # The wheel is a base dep (TQ-tier), so the import should always
282- # succeed — fail loud otherwise.
283231 import mooncake # type: ignore[import-not-found]
284232
285233 _moon_pkg = os .path .dirname (mooncake .__file__ )
286234 _master = os .path .join (_moon_pkg , "mooncake_master" )
287- if os .path .exists (_master ) and not os .access (_master , os .X_OK ):
288- # Wheels can strip the +x bit on extract; restore it.
289- import stat as _stat
290- try :
291- os .chmod (
292- _master ,
293- os .stat (_master ).st_mode
294- | _stat .S_IXUSR | _stat .S_IXGRP | _stat .S_IXOTH ,
295- )
296- except OSError :
297- pass
235+ try :
236+ os .chmod (_master , 0o755 )
237+ except OSError :
238+ pass
298239 _existing_path = os .environ .get ("PATH" , "" )
299240 if _moon_pkg not in _existing_path .split (os .pathsep ):
300241 os .environ ["PATH" ] = _moon_pkg + os .pathsep + _existing_path
301- _usb0_down ()
242+ # Per-process MC_TCP_BIND_ADDRESS and KV-path promotion are already
243+ # set by TQDataPlaneClient.__init__ (which runs in every process,
244+ # including this driver). _init_tq only needs local_ip below
245+ # for the metadata/master server URLs (driver-bound).
302246 local_ip = _get_local_node_ip ()
303- if local_ip :
304- # Force-assign (NOT setdefault): Ray actors inherit env vars
305- # from the driver, so on multi-node runs every actor would
306- # otherwise carry the driver's IP and announce listeners at
307- # the wrong host. Each process must publish its OWN IP.
308- os .environ ["MC_TCP_BIND_ADDRESS" ] = local_ip
309247 overlay = {
310248 ** controller_overlay ,
311249 "backend" : {
@@ -418,16 +356,21 @@ def __init__(self, cfg: DataPlaneConfig, *, bootstrap: bool = True) -> None:
418356 cluster — ``cfg`` is then only consulted for client-side
419357 knobs (poll interval).
420358 """
421- # mooncake_cpu setup must run BEFORE _init_tq / _connect_existing,
422- # because Mooncake's getifaddrs() listener bind happens inside
423- # tq.init/connect — once it's bound to usb0 (169.254.3.1), no env
424- # var change rescues it. Three per-process knobs needed in EVERY
425- # process that builds a TQ client (driver, SyncRolloutActor, every
426- # MegatronPolicyWorker rank):
427- # 1. MC_TCP_BIND_ADDRESS — picked up by Mooncake engine.so for
428- # client registration so peers receive a routable address.
429- # 2. MC_STORE_MEMCPY=0 — bypasses Mooncake #1986 LOCAL_MEMCPY
430- # cross-process pointer-deref segfault (see comment below).
359+ # mooncake_cpu setup must run BEFORE _init_tq / _connect_existing:
360+ # Mooncake's engine.so reads these env vars during tq.init/connect,
361+ # so changing them afterwards has no effect. Three per-process knobs
362+ # are needed in EVERY process that builds a TQ client (driver,
363+ # SyncRolloutActor, every MegatronPolicyWorker rank):
364+ # 1. MC_TCP_BIND_ADDRESS — Mooncake engine.so writes this into
365+ # desc.ip_or_host_name, the address peers receive from the
366+ # metadata service. Without it, getifaddrs()[0] picks usb0
367+ # (169.254.x APIPA) and peers fail to connect.
368+ # 2. MC_STORE_MEMCPY=0 — Mooncake LOCAL_MEMCPY fast-path
369+ # reinterpret_casts cross-process pointers, segfaulting
370+ # MemcpyWorkerPool. PR #1995 (merged 2026-04-30) fixes the
371+ # root cause but isn't in any published wheel yet
372+ # (mooncake-transfer-engine 0.3.10.post2 was bumped before
373+ # that merge). Drop this once the wheel includes the fix.
431374 # 3. KV-path 1D promotion — works around TQ's
432375 # extract_field_schema schema/data mismatch for 1D fields.
433376 if cfg .get ("backend" ) == "mooncake_cpu" :
@@ -438,12 +381,6 @@ def __init__(self, cfg: DataPlaneConfig, *, bootstrap: bool = True) -> None:
438381 # be a no-op and the actor would announce the driver's
439382 # IP — peers fail with "connection refused".
440383 os .environ ["MC_TCP_BIND_ADDRESS" ] = local_ip
441- # Disable LOCAL_MEMCPY fast-path: with multiple Ray actors on
442- # the same host (driver + 8 policy workers + rollout actor),
443- # mooncake's isLocalTransfer() incorrectly compares IP-only
444- # and reinterpret_casts another process's virtual address,
445- # segfaulting MemcpyWorkerPool. See kvcache-ai/Mooncake#1986
446- # (PR #1995 is the upstream fix; not yet in our wheel).
447384 os .environ .setdefault ("MC_STORE_MEMCPY" , "0" )
448385 from nemo_rl .data_plane .codec import set_kv_promote_1d
449386 set_kv_promote_1d (True )
0 commit comments