Skip to content

Commit 23b8300

Browse files
authored
Merge pull request #72 from sysprog21/hotpath-cleanup
Switch ppoll/pselect6 to host_fd_ref_t and tighten
2 parents bde0b37 + d369caf commit 23b8300

12 files changed

Lines changed: 646 additions & 142 deletions

File tree

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
172172
@echo " CROSS $< (with -lpthread)"
173173
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
174174

175+
# test-scm-creds blocks in accept() on a pthread while the listener's socket option is changed.
176+
$(BUILD_DIR)/test-scm-creds: tests/test-scm-creds.c | $(BUILD_DIR)
177+
@echo " CROSS $< (with -lpthread)"
178+
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
179+
175180
# test-shim-cred-race spawns a pthread reader while the main thread
176181
# toggles setresuid; the reader spins on the identity fast path.
177182
$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)

src/syscall/abi.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,12 @@ enum {
682682
SOCK_OPT_TCP_KEEPINTVL,
683683
SOCK_OPT_IPV6_V6ONLY,
684684
SOCK_OPT_PASSCRED,
685+
SOCK_OPT_IP_TOS,
686+
SOCK_OPT_IP_TTL,
687+
SOCK_OPT_IP_HDRINCL,
688+
SOCK_OPT_IP_PKTINFO,
689+
SOCK_OPT_IP_RECVTTL,
690+
SOCK_OPT_IP_RECVTOS,
685691
/* IP_MTU_DISCOVER value stored verbatim so getsockopt round-trips the
686692
* Linux PMTUD mode the guest set. The host accepts the value but does
687693
* not honour every Linux mode; see sys_setsockopt for the IP_DONTFRAG
@@ -697,10 +703,11 @@ typedef struct {
697703
} sock_opt_cache_t;
698704

699705
typedef struct {
700-
int type; /* FD_CLOSED, FD_STDIO, FD_REGULAR, FD_DIR */
701-
int host_fd; /* Underlying macOS file descriptor */
702-
int linux_flags; /* Linux open flags (for CLOEXEC tracking) */
703-
void *dir; /* DIR* for FD_DIR entries (NULL otherwise) */
706+
int type; /* FD_CLOSED, FD_STDIO, FD_REGULAR, FD_DIR */
707+
int host_fd; /* Underlying macOS file descriptor */
708+
uint64_t generation; /* Bumped each time this guest fd slot is reused. */
709+
int linux_flags; /* Linux open flags (for CLOEXEC tracking) */
710+
void *dir; /* DIR* for FD_DIR entries (NULL otherwise) */
704711
char proc_path[FD_VIRTUAL_PATH_MAX]; /* Virtual /proc dir root for *at */
705712
int seals; /* F_SEAL_* bits (non-zero only for memfd_create fds) */
706713
sock_opt_cache_t sock; /* Socket option cache (zeroed for non-sockets) */

src/syscall/fdtable.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pthread_mutex_t fd_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 3 */
3131

3232
/* FD table. */
3333
fd_entry_t fd_table[FD_TABLE_SIZE];
34+
static uint64_t fd_next_generation = 1;
3435

3536
/* RLIMIT_NOFILE tracking. */
3637
/* Guest-side soft limit for RLIMIT_NOFILE. fd_alloc checks this.
@@ -77,6 +78,7 @@ static inline void fd_init_entry(int fd,
7778
fd_bitmap_set_used(fd);
7879
fd_table[fd].type = type;
7980
fd_table[fd].host_fd = host_fd;
81+
fd_table[fd].generation = fd_next_generation++;
8082
fd_table[fd].linux_flags = 0;
8183
fd_table[fd].dir = NULL;
8284
fd_table[fd].proc_path[0] = '\0';
@@ -154,9 +156,16 @@ void fdtable_init(void)
154156
memset(fd_free_bitmap, 0xFF, sizeof(fd_free_bitmap));
155157

156158
/* Pre-open stdin/stdout/stderr */
157-
fd_table[0] = (fd_entry_t) {.type = FD_STDIO, .host_fd = STDIN_FILENO};
158-
fd_table[1] = (fd_entry_t) {.type = FD_STDIO, .host_fd = STDOUT_FILENO};
159-
fd_table[2] = (fd_entry_t) {.type = FD_STDIO, .host_fd = STDERR_FILENO};
159+
fd_next_generation = 1;
160+
fd_table[0] = (fd_entry_t) {.type = FD_STDIO,
161+
.host_fd = STDIN_FILENO,
162+
.generation = fd_next_generation++};
163+
fd_table[1] = (fd_entry_t) {.type = FD_STDIO,
164+
.host_fd = STDOUT_FILENO,
165+
.generation = fd_next_generation++};
166+
fd_table[2] = (fd_entry_t) {.type = FD_STDIO,
167+
.host_fd = STDERR_FILENO,
168+
.generation = fd_next_generation++};
160169
fd_bitmap_set_used(0);
161170
fd_bitmap_set_used(1);
162171
fd_bitmap_set_used(2);

src/syscall/net-abi.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,11 @@ int socket_small_int_normalize(int level, int optname, int value)
1919
(optname == LINUX_SO_KEEPALIVE || optname == LINUX_SO_REUSEADDR ||
2020
optname == LINUX_SO_ACCEPTCONN || optname == LINUX_SO_REUSEPORT ||
2121
optname == LINUX_SO_BROADCAST || optname == LINUX_SO_DONTROUTE ||
22-
optname == LINUX_SO_OOBINLINE)) ||
22+
optname == LINUX_SO_OOBINLINE || optname == LINUX_SO_PASSCRED)) ||
2323
(level == LINUX_IPPROTO_TCP && optname == LINUX_TCP_NODELAY) ||
24+
(level == LINUX_IPPROTO_IP &&
25+
(optname == LINUX_IP_HDRINCL || optname == LINUX_IP_PKTINFO ||
26+
optname == LINUX_IP_RECVTTL || optname == LINUX_IP_RECVTOS)) ||
2427
(level == LINUX_IPPROTO_IPV6 && optname == LINUX_IPV6_V6ONLY))
2528
return value != 0;
2629

@@ -46,6 +49,7 @@ int socket_opt_uses_small_int(int level, int optname)
4649
case LINUX_SO_SNDBUF:
4750
case LINUX_SO_TYPE:
4851
case LINUX_SO_ERROR:
52+
case LINUX_SO_PASSCRED:
4953
return 1;
5054
default:
5155
return 0;
@@ -64,6 +68,20 @@ int socket_opt_uses_small_int(int level, int optname)
6468
}
6569
}
6670

71+
if (level == LINUX_IPPROTO_IP) {
72+
switch (optname) {
73+
case LINUX_IP_TOS:
74+
case LINUX_IP_TTL:
75+
case LINUX_IP_HDRINCL:
76+
case LINUX_IP_PKTINFO:
77+
case LINUX_IP_RECVTTL:
78+
case LINUX_IP_RECVTOS:
79+
return 1;
80+
default:
81+
return 0;
82+
}
83+
}
84+
6785
return level == LINUX_IPPROTO_IPV6 && optname == LINUX_IPV6_V6ONLY;
6886
}
6987

@@ -136,6 +154,12 @@ int translate_small_int_sockopt(int level,
136154
}
137155
}
138156

157+
if (level == LINUX_IPPROTO_IP) {
158+
*mac_level = IPPROTO_IP;
159+
*mac_optname = translate_ip_sockopt_to_mac(optname);
160+
return *mac_optname >= 0;
161+
}
162+
139163
if (level == LINUX_IPPROTO_IPV6 && optname == LINUX_IPV6_V6ONLY) {
140164
*mac_level = IPPROTO_IPV6;
141165
*mac_optname = IPV6_V6ONLY;

src/syscall/net-msg.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@
2828
/* Linux SCM_MAX_FD: maximum number of file descriptors in SCM_RIGHTS */
2929
#define LINUX_SCM_MAX_FD 253
3030

31+
/* Report whether host_fd refers to an AF_UNIX socket.
 *
 * Linux only delivers SCM_CREDENTIALS on AF_UNIX sockets even when
 * SO_PASSCRED is set, so PASSCRED toggled on AF_INET / AF_INET6 must
 * stay a no-op. sys_recvmsg uses this check to gate credential injection.
 *
 * Returns true only when getsockname() succeeds and reports AF_UNIX; any
 * failure (for example a descriptor that is not a socket) yields false.
 */
static bool host_socket_is_unix(int host_fd)
{
    /* Zero-initialise the buffer: the address returned for an unbound socket
     * is unspecified, so without this ss_family could be read as stack
     * garbage when getsockname() succeeds without filling it in.
     */
    struct sockaddr_storage ss = {0};
    socklen_t slen = sizeof(ss);

    if (getsockname(host_fd, (struct sockaddr *) &ss, &slen) != 0)
        return false;

    return ss.ss_family == AF_UNIX;
}
42+
3143
static int translate_scm_rights_fds(int *fds, size_t nfds)
3244
{
3345
if (nfds > LINUX_SCM_MAX_FD)
@@ -597,7 +609,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
597609
int passcred_val = 0;
598610
if (net_socket_cached_int_get(fd, LINUX_SOL_SOCKET, LINUX_SO_PASSCRED,
599611
&passcred_val) &&
600-
passcred_val) {
612+
passcred_val && host_socket_is_unix(host_ref.fd)) {
601613
linux_ucred_t cred = {
602614
.pid = (int32_t) proc_get_pid(),
603615
.uid = proc_get_uid(),
@@ -655,7 +667,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
655667
int injected = 0, passcred_val = 0;
656668
if (net_socket_cached_int_get(fd, LINUX_SOL_SOCKET, LINUX_SO_PASSCRED,
657669
&passcred_val) &&
658-
passcred_val) {
670+
passcred_val && host_socket_is_unix(host_ref.fd)) {
659671
linux_ucred_t cred = {
660672
.pid = (int32_t) proc_get_pid(),
661673
.uid = proc_get_uid(),

src/syscall/net-sockopt.c

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,26 @@ static int net_sock_opt_index_for(int level, int optname)
124124
}
125125
if (level == LINUX_IPPROTO_IPV6 && optname == LINUX_IPV6_V6ONLY)
126126
return SOCK_OPT_IPV6_V6ONLY;
127-
if (level == LINUX_IPPROTO_IP && optname == LINUX_IP_MTU_DISCOVER)
128-
return SOCK_OPT_IP_MTU_DISCOVER;
127+
if (level == LINUX_IPPROTO_IP) {
128+
switch (optname) {
129+
case LINUX_IP_TOS:
130+
return SOCK_OPT_IP_TOS;
131+
case LINUX_IP_TTL:
132+
return SOCK_OPT_IP_TTL;
133+
case LINUX_IP_HDRINCL:
134+
return SOCK_OPT_IP_HDRINCL;
135+
case LINUX_IP_PKTINFO:
136+
return SOCK_OPT_IP_PKTINFO;
137+
case LINUX_IP_RECVTTL:
138+
return SOCK_OPT_IP_RECVTTL;
139+
case LINUX_IP_RECVTOS:
140+
return SOCK_OPT_IP_RECVTOS;
141+
case LINUX_IP_MTU_DISCOVER:
142+
return SOCK_OPT_IP_MTU_DISCOVER;
143+
default:
144+
return -1;
145+
}
146+
}
129147
return -1;
130148
}
131149

@@ -140,6 +158,34 @@ int net_socket_cached_int_get(int guest_fd, int level, int optname, int *value)
140158
return net_sock_cache_get(guest_fd, idx, value);
141159
}
142160

161+
int net_socket_cached_int_get_if_generation(int guest_fd,
162+
uint64_t generation,
163+
int level,
164+
int optname,
165+
int *value)
166+
{
167+
if (level == LINUX_SOL_SOCKET && optname == LINUX_SO_ERROR)
168+
return 0;
169+
170+
int idx = net_sock_opt_index_for(level, optname);
171+
if (idx < 0 || !RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE) || !value)
172+
return 0;
173+
174+
if (thread_is_single_active()) {
175+
fd_entry_t *entry = &fd_table[guest_fd];
176+
if (entry->type == FD_SOCKET && entry->generation == generation)
177+
return sock_opt_get(entry, idx, value);
178+
return 0;
179+
}
180+
181+
pthread_mutex_lock(&fd_lock);
182+
fd_entry_t *entry = &fd_table[guest_fd];
183+
bool ok = entry->type == FD_SOCKET && entry->generation == generation &&
184+
sock_opt_get(entry, idx, value);
185+
pthread_mutex_unlock(&fd_lock);
186+
return ok;
187+
}
188+
143189
void net_socket_cached_int_set(int guest_fd, int level, int optname, int value)
144190
{
145191
if (level == LINUX_SOL_SOCKET && optname == LINUX_SO_ERROR)
@@ -155,7 +201,7 @@ void net_socket_cache_init_defaults(int guest_fd, int domain, int real_type)
155201
static const int zero_opts[] = {
156202
SOCK_OPT_KEEPALIVE, SOCK_OPT_REUSEADDR, SOCK_OPT_ACCEPTCONN,
157203
SOCK_OPT_REUSEPORT, SOCK_OPT_BROADCAST, SOCK_OPT_DONTROUTE,
158-
SOCK_OPT_OOBINLINE,
204+
SOCK_OPT_OOBINLINE, SOCK_OPT_PASSCRED,
159205
};
160206

161207
net_socket_cache_set_many_zero(guest_fd, zero_opts, ARRAY_SIZE(zero_opts));
@@ -168,12 +214,19 @@ void net_socket_cache_init_defaults(int guest_fd, int domain, int real_type)
168214
net_socket_cache_set_index(guest_fd, SOCK_OPT_IPV6_V6ONLY, 0);
169215
}
170216

171-
void net_socket_cache_init_accept(int guest_fd)
217+
void net_socket_cache_init_accept(int guest_fd, int inherit_passcred)
172218
{
173219
static const int zero_opts[] = {
174220
SOCK_OPT_ACCEPTCONN, SOCK_OPT_REUSEPORT, SOCK_OPT_BROADCAST,
175221
SOCK_OPT_DONTROUTE, SOCK_OPT_OOBINLINE,
176222
};
177223

178224
net_socket_cache_set_many_zero(guest_fd, zero_opts, ARRAY_SIZE(zero_opts));
225+
226+
/* AF_UNIX accept inherits SO_PASSCRED from the listener. For local
227+
* connects the accept path receives the value captured when the
228+
* connection was queued; otherwise it falls back to the listener value.
229+
*/
230+
net_socket_cache_set_index(guest_fd, SOCK_OPT_PASSCRED,
231+
inherit_passcred ? 1 : 0);
179232
}

src/syscall/net-sockopt.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66

77
#pragma once
88

9+
#include <stdint.h>
10+
911
int net_socket_fd_is_valid(int guest_fd);
1012
int net_socket_cached_int_get(int guest_fd, int level, int optname, int *value);
13+
int net_socket_cached_int_get_if_generation(int guest_fd,
14+
uint64_t generation,
15+
int level,
16+
int optname,
17+
int *value);
1118
void net_socket_cached_int_set(int guest_fd, int level, int optname, int value);
1219
void net_socket_cache_set_index(int guest_fd, int idx, int value);
1320
void net_socket_cache_init_defaults(int guest_fd, int domain, int real_type);
14-
void net_socket_cache_init_accept(int guest_fd);
21+
void net_socket_cache_init_accept(int guest_fd, int inherit_passcred);

0 commit comments

Comments
 (0)