From 2482ae4126d78950f45481eb98c69104ff81a4da Mon Sep 17 00:00:00 2001
From: lonexreb
Date: Fri, 5 Jun 2026 21:57:48 -0400
Subject: [PATCH] fix: prefer IPv4 non-link-local in _get_host_ip for Docker (#743)

In containers, hostname resolution often returns an IPv6 link-local
address (fe80::...) which c10d / TCPStore cannot use, producing:

  [c10d] The IPv4 network addresses of (fe80::..., 51391) cannot be
  retrieved (gai error: -9 - Address family for hostname not supported).

Replace the unguarded get_ipaddr(hostname, 0) call with an explicit
getaddrinfo scan that prefers IPv4 non-link-local, then IPv6
non-link-local, falling back to get_ipaddr only when nothing usable
surfaces or resolution fails outright. VLLM_HOST_IP override is
preserved.

Link-local detection parses each candidate with the stdlib ipaddress
module instead of matching string prefixes, so the whole fe80::/10 range
(fe80:: through febf::) and zone-suffixed addresses (fe80::1%eth0) are
recognised, not only addresses that literally start with "fe80:".

Test plan: tests/unit_tests/test_get_host_ip.py
  - VLLM_HOST_IP override honored
  - IPv4 wins over IPv6 link-local (regression for #743)
  - IPv4 link-local (169.254/16) skipped
  - IPv6 link-local skipped, global IPv6 preferred when IPv4 absent
  - IPv6 link-local detected across fe80::/10, including %zone suffixes
  - Falls back to Monarch resolver when only link-local addrs exist
  - Falls back to Monarch resolver on gaierror
---
 src/forge/actors/vllm/v1/monarch_executor.py |  28 ++++-
 tests/unit_tests/test_get_host_ip.py         | 124 +++++++++++++++++++
 2 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit_tests/test_get_host_ip.py

diff --git a/src/forge/actors/vllm/v1/monarch_executor.py b/src/forge/actors/vllm/v1/monarch_executor.py
index b42da2876..677cfed10 100644
--- a/src/forge/actors/vllm/v1/monarch_executor.py
+++ b/src/forge/actors/vllm/v1/monarch_executor.py
@@ -25,8 +25,11 @@
 def _get_host_ip() -> str:
     """Get this host's routable IP address using hostname resolution.
 
-    Uses socket.gethostname() + DNS resolution, which works on internal
-    networks where external IPs (like 8.8.8.8) are unreachable.
+    Prefers IPv4 non-link-local addresses, then IPv6 non-link-local; falls
+    back to Monarch's resolver when getaddrinfo fails or finds none usable.
+    Inside containers hostname resolution often returns an IPv6 link-local
+    address (fe80::...), which c10d / TCPStore cannot use — see #743.
     """
+    import ipaddress
     import socket
 
@@ -34,7 +37,26 @@ def _get_host_ip() -> str:
         return host_ip
 
     hostname = socket.gethostname()
-    # Use Monarch's get_ipaddr which resolves hostname via DNS
+
+    def _is_link_local(ip: str) -> bool:
+        # Parse rather than prefix-match so that all of fe80::/10 and
+        # 169.254.0.0/16 is covered; strip any "%zone" suffix first.
+        try:
+            return ipaddress.ip_address(ip.partition("%")[0]).is_link_local
+        except ValueError:
+            return False
+
+    try:
+        infos = socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM)
+    except socket.gaierror:
+        return get_ipaddr(hostname, 0)
+
+    for preferred_family in (socket.AF_INET, socket.AF_INET6):
+        for family, _, _, _, addr in infos:
+            ip = addr[0]
+            if family == preferred_family and not _is_link_local(ip):
+                return ip
+
     return get_ipaddr(hostname, 0)
 
 
diff --git a/tests/unit_tests/test_get_host_ip.py b/tests/unit_tests/test_get_host_ip.py
new file mode 100644
index 000000000..dacab8d01
--- /dev/null
+++ b/tests/unit_tests/test_get_host_ip.py
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for vllm.v1.monarch_executor._get_host_ip (regression for #743).
+
+Issue #743: inside containers, hostname resolution can return an IPv6
+link-local address (fe80::...) which c10d / TCPStore cannot bind, leading
+to "IPv4 network addresses ... cannot be retrieved" failures. _get_host_ip
+must prefer IPv4 non-link-local addresses before falling back to IPv6.
+"""
+
+import socket
+import sys
+from unittest import mock
+
+import pytest
+
+
+@pytest.fixture
+def get_host_ip():
+    """Import the function under test with monarch / vllm stubbed out so the
+    test does not require those packages to be installed."""
+    stubs = {
+        "monarch": mock.MagicMock(),
+        "monarch.actor": mock.MagicMock(),
+        "monarch.tools": mock.MagicMock(),
+        "monarch.tools.network": mock.MagicMock(),
+        "vllm": mock.MagicMock(),
+        "vllm.v1": mock.MagicMock(),
+        "vllm.v1.executor": mock.MagicMock(),
+        "vllm.v1.executor.abstract": mock.MagicMock(),
+        "vllm.v1.worker": mock.MagicMock(),
+        "vllm.v1.worker.worker_base": mock.MagicMock(),
+        "cloudpickle": mock.MagicMock(),
+    }
+    with mock.patch.dict(sys.modules, stubs):
+        # Force re-import so stubs are picked up
+        sys.modules.pop("forge.actors.vllm.v1.monarch_executor", None)
+        from forge.actors.vllm.v1.monarch_executor import _get_host_ip
+        yield _get_host_ip
+
+
+def _addrinfo(family: int, ip: str) -> tuple:
+    """Build a getaddrinfo tuple: (family, type, proto, canon, sockaddr)."""
+    sockaddr = (ip, 0) if family == socket.AF_INET else (ip, 0, 0, 0)
+    return (family, socket.SOCK_STREAM, 0, "", sockaddr)
+
+
+class TestGetHostIp:
+    def test_vllm_host_ip_env_override(self, get_host_ip, monkeypatch):
+        monkeypatch.setenv("VLLM_HOST_IP", "10.20.30.40")
+        assert get_host_ip() == "10.20.30.40"
+
+    def test_prefers_ipv4_over_ipv6_link_local(self, get_host_ip, monkeypatch):
+        """Regression for #743: IPv4 must win over IPv6 link-local."""
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        infos = [
+            _addrinfo(socket.AF_INET6, "fe80::222:48ff:fe49:ba90"),
+            _addrinfo(socket.AF_INET, "192.168.1.50"),
+        ]
+        with mock.patch("socket.getaddrinfo", return_value=infos):
+            assert get_host_ip() == "192.168.1.50"
+
+    def test_skips_ipv4_link_local(self, get_host_ip, monkeypatch):
+        """169.254.x.x is APIPA / link-local and not routable."""
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        infos = [
+            _addrinfo(socket.AF_INET, "169.254.88.125"),
+            _addrinfo(socket.AF_INET, "192.168.1.50"),
+        ]
+        with mock.patch("socket.getaddrinfo", return_value=infos):
+            assert get_host_ip() == "192.168.1.50"
+
+    def test_falls_back_to_ipv6_when_only_ipv6_available(
+        self, get_host_ip, monkeypatch
+    ):
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        infos = [
+            _addrinfo(socket.AF_INET6, "fe80::1"),  # link-local, must skip
+            _addrinfo(socket.AF_INET6, "2001:db8::1"),  # global, OK
+        ]
+        with mock.patch("socket.getaddrinfo", return_value=infos):
+            assert get_host_ip() == "2001:db8::1"
+
+    def test_skips_scoped_ipv6_link_local_across_fe80_slash_10(
+        self, get_host_ip, monkeypatch
+    ):
+        """fe80::/10 runs through febf::; a %zone suffix must not hide it."""
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        infos = [
+            _addrinfo(socket.AF_INET6, "febf::1%eth0"),  # link-local, scoped
+            _addrinfo(socket.AF_INET6, "2001:db8::1"),  # global, OK
+        ]
+        with mock.patch("socket.getaddrinfo", return_value=infos):
+            assert get_host_ip() == "2001:db8::1"
+
+    def test_falls_back_to_monarch_when_only_link_local(
+        self, get_host_ip, monkeypatch
+    ):
+        """If every candidate is link-local, fall back to Monarch's
+        resolver to preserve prior behavior in degenerate cases."""
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        infos = [
+            _addrinfo(socket.AF_INET, "169.254.88.125"),
+            _addrinfo(socket.AF_INET6, "fe80::1"),
+        ]
+        with mock.patch("socket.getaddrinfo", return_value=infos), mock.patch(
+            "forge.actors.vllm.v1.monarch_executor.get_ipaddr",
+            return_value="fallback-ip",
+        ):
+            assert get_host_ip() == "fallback-ip"
+
+    def test_falls_back_to_monarch_on_gaierror(self, get_host_ip, monkeypatch):
+        monkeypatch.delenv("VLLM_HOST_IP", raising=False)
+        with mock.patch(
+            "socket.getaddrinfo", side_effect=socket.gaierror("no such host")
+        ), mock.patch(
+            "forge.actors.vllm.v1.monarch_executor.get_ipaddr",
+            return_value="monarch-fallback",
+        ):
+            assert get_host_ip() == "monarch-fallback"