Skip to content

Commit 47fc927

Browse files
committed
Dispatch getsockname/sendto/recvfrom on netlink sockets
Go's route package (vishvananda/netlink, used by the Kubernetes node-IP detection path) talks to NETLINK_ROUTE via sendto(2)/recvfrom(2) and queries the bound port id with getsockname(2). Only sendmsg/recvmsg/read were dispatched to the netlink emulation, so these calls fell through to the host socket fd and failed with ENOTSOCK ("netlinkrib: socket operation on non-socket"), forcing callers onto a 127.0.0.1 fallback and breaking real interface and SAN detection.

Add three netlink entry points and dispatch FD_NETLINK to them from sys_getsockname/sys_sendto/sys_recvfrom:

- netlink_send: process a flat (non-msghdr) RTM_GET* request buffer.
- netlink_recv: drain whole buffered messages and write back sockaddr_nl.
- netlink_getsockname: report the bound or auto-assigned port id.

Also honor the RTM_GETLINK request filter: parse the ifinfomsg ifi_index and an optional IFLA_IFNAME attribute and emit only the matching link, so LinkByName/LinkByIndex see exactly one reply instead of erroring with "more than one link found". The sendmsg and send paths share a single nl_process_request dispatcher so both honor the filter.

Validated with make check and make test-matrix on Apple Silicon; the k0s controller now detects the real host IP instead of the loopback fallback.

(cherry picked from commit 31d29d184be83d4700f21d156b76739893fe1412)
1 parent 75fb59b commit 47fc927

3 files changed

Lines changed: 253 additions & 35 deletions

File tree

src/syscall/net.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,9 @@ int64_t sys_getsockname(guest_t *g,
509509
uint64_t addr_gva,
510510
uint64_t addrlen_gva)
511511
{
512+
if (fd_get_type(fd) == FD_NETLINK)
513+
return netlink_getsockname(fd, g, addr_gva, addrlen_gva);
514+
512515
host_fd_ref_t host_ref;
513516
if (host_fd_ref_open(fd, &host_ref) < 0)
514517
return -LINUX_EBADF;
@@ -639,6 +642,9 @@ int64_t sys_sendto(guest_t *g,
639642
uint64_t dest_gva,
640643
uint32_t addrlen)
641644
{
645+
if (fd_get_type(fd) == FD_NETLINK)
646+
return netlink_send(fd, g, buf_gva, len);
647+
642648
host_fd_ref_t host_ref;
643649
if (host_fd_ref_open(fd, &host_ref) < 0)
644650
return -LINUX_EBADF;
@@ -706,6 +712,9 @@ int64_t sys_recvfrom(guest_t *g,
706712
uint64_t src_gva,
707713
uint64_t addrlen_gva)
708714
{
715+
if (fd_get_type(fd) == FD_NETLINK)
716+
return netlink_recv(fd, g, buf_gva, len, src_gva, addrlen_gva);
717+
709718
host_fd_ref_t host_ref;
710719
if (host_fd_ref_open(fd, &host_ref) < 0)
711720
return -LINUX_EBADF;

src/syscall/net.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,25 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags);
190190
/* Netlink recvmsg: return buffered response data. */
191191
int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags);
192192

193-
/* Netlink read: return buffered response data without msghdr metadata. */
194193
int64_t netlink_read(int guest_fd,
195194
guest_t *g,
196195
uint64_t buf_gva,
197196
uint64_t count);
198197

198+
int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len);
199+
200+
int64_t netlink_recv(int guest_fd,
201+
guest_t *g,
202+
uint64_t buf_gva,
203+
uint64_t len,
204+
uint64_t src_gva,
205+
uint64_t addrlen_gva);
206+
207+
int64_t netlink_getsockname(int guest_fd,
208+
guest_t *g,
209+
uint64_t addr_gva,
210+
uint64_t addrlen_gva);
211+
199212
/* Clean up abstract socket filesystem entry for a fd being closed. */
200213
void absock_unregister_fd(int guest_fd);
201214

src/syscall/netlink.c

Lines changed: 230 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,12 @@ static size_t nl_put_attr(uint8_t *buf,
188188
return aligned;
189189
}
190190

191-
/* Build RTM_GETLINK response from host getifaddrs(). */
192-
static int nl_build_getlink(netlink_state_t *ns)
191+
/* Build RTM_GETLINK response from host getifaddrs(). A non-empty name_filter
192+
* or non-zero index_filter restricts the reply to one matching link.
193+
*/
194+
static int nl_build_getlink(netlink_state_t *ns,
195+
const char *name_filter,
196+
uint32_t index_filter)
193197
{
194198
struct ifaddrs *ifalist, *ifa;
195199
if (getifaddrs(&ifalist) < 0)
@@ -210,6 +214,11 @@ static int nl_build_getlink(netlink_state_t *ns)
210214
if (idx == 0)
211215
continue;
212216

217+
if (name_filter[0] && strcmp(ifa->ifa_name, name_filter) != 0)
218+
continue;
219+
if (index_filter != 0 && idx != index_filter)
220+
continue;
221+
213222
/* Check if already seen */
214223
bool found = false;
215224
for (int i = 0; i < nseen; i++) {
@@ -459,6 +468,96 @@ int64_t netlink_bind(int guest_fd,
459468
return 0;
460469
}
461470

471+
/* Extract the LinkByName/LinkByIndex filter (ifi_index plus an optional
472+
* IFLA_IFNAME) from a RTM_GETLINK request. Empty name / zero index = no filter.
473+
*/
474+
static void nl_parse_link_filter(const uint8_t *req,
475+
size_t reqlen,
476+
char *name_out,
477+
size_t name_cap,
478+
uint32_t *index_out)
479+
{
480+
name_out[0] = '\0';
481+
*index_out = 0;
482+
483+
if (reqlen < (size_t) NLMSG_HDRLEN + sizeof(ifinfomsg_t))
484+
return;
485+
486+
ifinfomsg_t ifi;
487+
memcpy(&ifi, req + NLMSG_HDRLEN, sizeof(ifi));
488+
if (ifi.ifi_index > 0)
489+
*index_out = (uint32_t) ifi.ifi_index;
490+
491+
uint32_t nlmsg_len;
492+
memcpy(&nlmsg_len, req, sizeof(nlmsg_len));
493+
size_t total = (nlmsg_len < reqlen) ? nlmsg_len : reqlen;
494+
495+
size_t off = NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(ifinfomsg_t));
496+
while (off + RTA_HDRLEN <= total) {
497+
rtattr_t rta;
498+
memcpy(&rta, req + off, sizeof(rta));
499+
if (rta.rta_len < RTA_HDRLEN || off + rta.rta_len > total)
500+
break;
501+
if (rta.rta_type == IFLA_IFNAME) {
502+
size_t dlen = rta.rta_len - RTA_HDRLEN;
503+
size_t i = 0;
504+
for (; i < dlen && i + 1 < name_cap && req[off + RTA_HDRLEN + i];
505+
i++)
506+
name_out[i] = (char) req[off + RTA_HDRLEN + i];
507+
name_out[i] = '\0';
508+
}
509+
off += RTA_ALIGN(rta.rta_len);
510+
}
511+
}
512+
513+
/* Build the reply for one rtnetlink request (already copied into req). Mutates
514+
* ns->buf/seq. Returns 0 on success (including a built NLMSG_ERROR reply for
515+
* unsupported types), or a negative LINUX_E* on a build failure. Caller holds
516+
* nl_lock. req is guaranteed to be at least NLMSG_HDRLEN bytes.
517+
*/
518+
static int nl_process_request(netlink_state_t *ns,
519+
const uint8_t *req,
520+
size_t reqlen)
521+
{
522+
nlmsghdr_t req_hdr;
523+
memcpy(&req_hdr, req, sizeof(req_hdr));
524+
ns->seq = req_hdr.nlmsg_seq;
525+
526+
int ret;
527+
switch (req_hdr.nlmsg_type) {
528+
case RTM_GETLINK: {
529+
char name[64];
530+
uint32_t index;
531+
nl_parse_link_filter(req, reqlen, name, sizeof(name), &index);
532+
ret = nl_build_getlink(ns, name, index);
533+
break;
534+
}
535+
case RTM_GETADDR:
536+
ret = nl_build_getaddr(ns);
537+
break;
538+
default:
539+
/* Unsupported request: return NLMSG_ERROR with EOPNOTSUPP */
540+
if ((size_t) NLMSG_HDRLEN + 4 <= NETLINK_BUF_SIZE) {
541+
size_t off = 0;
542+
nlmsghdr_t err_hdr = {
543+
.nlmsg_len = NLMSG_HDRLEN + 4,
544+
.nlmsg_type = NLMSG_ERROR,
545+
.nlmsg_seq = ns->seq,
546+
.nlmsg_pid = ns->pid,
547+
};
548+
memcpy(ns->buf + off, &err_hdr, sizeof(err_hdr));
549+
off += NLMSG_HDRLEN;
550+
int32_t errcode = -95; /* -EOPNOTSUPP */
551+
memcpy(ns->buf + off, &errcode, 4);
552+
ns->buf_len = off + 4;
553+
ns->buf_pos = 0;
554+
}
555+
return 0;
556+
}
557+
558+
return (ret < 0) ? -LINUX_EIO : 0;
559+
}
560+
462561
int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
463562
{
464563
(void) flags;
@@ -491,56 +590,153 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
491590
goto out;
492591
}
493592

494-
if (iov.iov_len < NLMSG_HDRLEN) {
593+
if (iov.iov_len < (uint64_t) NLMSG_HDRLEN) {
495594
result = -LINUX_EINVAL;
496595
goto out;
497596
}
498597

499-
nlmsghdr_t req_hdr;
500-
if (guest_read_small(g, iov.iov_base, &req_hdr, sizeof(req_hdr)) < 0) {
598+
/* Copy the whole request: the dispatcher inspects filter attributes past
599+
* the fixed nlmsghdr.
600+
*/
601+
uint8_t req[512];
602+
size_t rlen = (iov.iov_len < sizeof(req)) ? iov.iov_len : sizeof(req);
603+
if (guest_read(g, iov.iov_base, req, rlen) < 0) {
501604
result = -LINUX_EFAULT;
502605
goto out;
503606
}
504607

505-
ns->seq = req_hdr.nlmsg_seq;
608+
int ret = nl_process_request(ns, req, rlen);
609+
result = (ret < 0) ? ret : (int64_t) iov.iov_len;
506610

507-
/* Dispatch based on request type */
508-
int ret;
509-
switch (req_hdr.nlmsg_type) {
510-
case RTM_GETLINK:
511-
ret = nl_build_getlink(ns);
512-
break;
513-
case RTM_GETADDR:
514-
ret = nl_build_getaddr(ns);
515-
break;
516-
default:
517-
/* Unsupported request: return NLMSG_ERROR with EOPNOTSUPP */
518-
if (ns->buf_len + NLMSG_HDRLEN + 4 <= NETLINK_BUF_SIZE) {
519-
size_t off = 0;
520-
nlmsghdr_t err_hdr = {
521-
.nlmsg_len = NLMSG_HDRLEN + 4,
522-
.nlmsg_type = NLMSG_ERROR,
523-
.nlmsg_seq = ns->seq,
524-
.nlmsg_pid = ns->pid,
525-
};
526-
memcpy(ns->buf + off, &err_hdr, sizeof(err_hdr));
527-
off += NLMSG_HDRLEN;
528-
int32_t errcode = -95; /* -EOPNOTSUPP */
529-
memcpy(ns->buf + off, &errcode, 4);
530-
ns->buf_len = off + 4;
531-
ns->buf_pos = 0;
532-
}
533-
result = (int64_t) iov.iov_len;
611+
out:
612+
pthread_mutex_unlock(&nl_lock);
613+
return result;
614+
}
615+
616+
/* sendto(2) on a netlink socket: a flat request buffer (no msghdr). */
617+
int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len)
618+
{
619+
pthread_mutex_lock(&nl_lock);
620+
netlink_state_t *ns = nl_find(guest_fd);
621+
if (!ns) {
622+
pthread_mutex_unlock(&nl_lock);
623+
return -LINUX_EBADF;
624+
}
625+
626+
int64_t result;
627+
if (len < (uint64_t) NLMSG_HDRLEN) {
628+
result = -LINUX_EINVAL;
629+
goto out;
630+
}
631+
632+
uint8_t req[512];
633+
size_t rlen = (len < sizeof(req)) ? len : sizeof(req);
634+
if (guest_read(g, buf_gva, req, rlen) < 0) {
635+
result = -LINUX_EFAULT;
534636
goto out;
535637
}
536638

537-
result = (ret < 0) ? -LINUX_EIO : (int64_t) iov.iov_len;
639+
int ret = nl_process_request(ns, req, rlen);
640+
result = (ret < 0) ? ret : (int64_t) len;
538641

539642
out:
540643
pthread_mutex_unlock(&nl_lock);
541644
return result;
542645
}
543646

647+
/* recvfrom(2) on a netlink socket: drain whole messages; write back a kernel
648+
* sockaddr_nl (nl_pid 0) when src is requested.
649+
*/
650+
int64_t netlink_recv(int guest_fd,
651+
guest_t *g,
652+
uint64_t buf_gva,
653+
uint64_t len,
654+
uint64_t src_gva,
655+
uint64_t addrlen_gva)
656+
{
657+
pthread_mutex_lock(&nl_lock);
658+
netlink_state_t *ns = nl_find(guest_fd);
659+
if (!ns) {
660+
pthread_mutex_unlock(&nl_lock);
661+
return -LINUX_EBADF;
662+
}
663+
664+
if (ns->buf_pos >= ns->buf_len) {
665+
pthread_mutex_unlock(&nl_lock);
666+
return 0;
667+
}
668+
669+
size_t avail = ns->buf_len - ns->buf_pos;
670+
size_t to_copy = (avail < len) ? avail : len;
671+
672+
/* Return complete netlink messages only (same walk as netlink_recvmsg). */
673+
size_t msg_end = 0, pos = ns->buf_pos;
674+
while (pos < ns->buf_len && (pos - ns->buf_pos + NLMSG_HDRLEN) <= to_copy) {
675+
nlmsghdr_t *hdr = (nlmsghdr_t *) (ns->buf + pos);
676+
if (hdr->nlmsg_len < NLMSG_HDRLEN)
677+
break;
678+
size_t msg_bytes = pos - ns->buf_pos + NLMSG_ALIGN(hdr->nlmsg_len);
679+
if (msg_bytes > to_copy)
680+
break;
681+
pos += NLMSG_ALIGN(hdr->nlmsg_len);
682+
msg_end = pos - ns->buf_pos;
683+
}
684+
if (msg_end == 0)
685+
msg_end = to_copy;
686+
687+
if (guest_write(g, buf_gva, ns->buf + ns->buf_pos, msg_end) < 0) {
688+
pthread_mutex_unlock(&nl_lock);
689+
return -LINUX_EFAULT;
690+
}
691+
ns->buf_pos += msg_end;
692+
693+
if (src_gva && addrlen_gva) {
694+
sockaddr_nl_t snl = {
695+
.nl_family = LINUX_AF_NETLINK,
696+
.nl_pid = 0, /* From kernel */
697+
};
698+
guest_write_small(g, src_gva, &snl, sizeof(snl));
699+
uint32_t namelen = sizeof(sockaddr_nl_t);
700+
guest_write_small(g, addrlen_gva, &namelen, sizeof(namelen));
701+
}
702+
703+
pthread_mutex_unlock(&nl_lock);
704+
return (int64_t) msg_end;
705+
}
706+
707+
/* getsockname(2) on a netlink socket: returns the bound/auto-assigned pid. */
708+
int64_t netlink_getsockname(int guest_fd,
709+
guest_t *g,
710+
uint64_t addr_gva,
711+
uint64_t addrlen_gva)
712+
{
713+
pthread_mutex_lock(&nl_lock);
714+
netlink_state_t *ns = nl_find(guest_fd);
715+
if (!ns) {
716+
pthread_mutex_unlock(&nl_lock);
717+
return -LINUX_EBADF;
718+
}
719+
uint32_t pid = ns->pid;
720+
pthread_mutex_unlock(&nl_lock);
721+
722+
uint32_t cap = 0;
723+
if (guest_read_small(g, addrlen_gva, &cap, sizeof(cap)) < 0)
724+
return -LINUX_EFAULT;
725+
726+
sockaddr_nl_t snl = {
727+
.nl_family = LINUX_AF_NETLINK,
728+
.nl_pid = pid,
729+
};
730+
size_t n = (cap < sizeof(snl)) ? cap : sizeof(snl);
731+
if (n > 0 && guest_write(g, addr_gva, &snl, n) < 0)
732+
return -LINUX_EFAULT;
733+
734+
uint32_t actual = sizeof(snl);
735+
if (guest_write_small(g, addrlen_gva, &actual, sizeof(actual)) < 0)
736+
return -LINUX_EFAULT;
737+
return 0;
738+
}
739+
544740
int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
545741
{
546742
(void) flags;

0 commit comments

Comments
 (0)