Skip to content

Commit 697ed2d

Browse files
authored
hook write (#440)
2 parents 095427b + 2df729a commit 697ed2d

File tree

3 files changed: 87 additions & 8 deletions

3 files changed: 87 additions & 8 deletions

core/src/monitor.rs

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use crate::common::constants::{CoroutineState, MONITOR_BEAN};
33
use crate::common::{get_timeout_time, now, CondvarBlocker};
44
use crate::coroutine::listener::Listener;
55
use crate::coroutine::local::CoroutineLocal;
6-
use crate::scheduler::SchedulableSuspender;
6+
use crate::scheduler::{SchedulableCoroutine, SchedulableSuspender};
77
use crate::{catch, error, impl_current_for, impl_display_by_debug, info};
88
#[cfg(unix)]
99
use nix::sys::pthread::{pthread_kill, pthread_self, Pthread};
@@ -78,6 +78,24 @@ impl Monitor {
7878
set.remove(Signal::SIGURG);
7979
set.thread_set_mask()
8080
.expect("Failed to remove SIGURG signal mask!");
81+
//不抢占处于Syscall状态的协程。
82+
//MonitorListener的设计理念是不对Syscall状态的协程发送信号。
83+
//但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件,
84+
//SIGURG可能在协程刚进入Syscall状态时到达。
85+
//如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册),
86+
//导致死锁。
87+
// Skip preemption for coroutines in Syscall state.
88+
// MonitorListener's design is to NOT send signals to Syscall-state
89+
// coroutines. However, a race between NOTIFY_NODE removal and the
90+
// monitor's queue iteration can cause SIGURG to arrive just after
91+
// the coroutine entered Syscall state. If preempted here, the
92+
// coroutine lands in the syscall map with no io_uring/epoll/timer
93+
// registration to wake it, causing a deadlock.
94+
if let Some(co) = SchedulableCoroutine::current() {
95+
if matches!(co.state(), CoroutineState::Syscall((), _, _)) {
96+
return;
97+
}
98+
}
8199
if let Some(suspender) = SchedulableSuspender::current() {
82100
suspender.suspend();
83101
}
@@ -89,7 +107,7 @@ impl Monitor {
89107
// install panic hook
90108
std::panic::set_hook(Box::new(|panic_hook_info| {
91109
let syscall = crate::common::constants::SyscallName::panicking;
92-
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
110+
if let Some(co) = SchedulableCoroutine::current() {
93111
let new_state = crate::common::constants::SyscallState::Executing;
94112
if co.syscall((), syscall, new_state).is_err() {
95113
error!(
@@ -109,7 +127,7 @@ impl Monitor {
109127
"stack backtrace:\n{}",
110128
std::backtrace::Backtrace::force_capture()
111129
);
112-
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
130+
if let Some(co) = SchedulableCoroutine::current() {
113131
if co.running().is_err() {
114132
error!("{} change to running state failed !", co.name());
115133
}
@@ -523,6 +541,24 @@ extern "C" fn do_preempt() {
523541
// coroutine never yielded (no hooked syscalls) — it is truly CPU-bound.
524542
// Force immediate suspension.
525543
flag.set(false);
544+
//不抢占处于Syscall状态的协程。
545+
//MonitorListener的设计理念是不对Syscall状态的协程发送信号。
546+
//但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件,
547+
//SIGURG可能在协程刚进入Syscall状态时到达。
548+
//如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册),
549+
//导致死锁。
550+
// Skip preemption for coroutines in Syscall state.
551+
// MonitorListener's design is to NOT send signals to Syscall-state
552+
// coroutines. However, a race between NOTIFY_NODE removal and the
553+
// monitor's queue iteration can cause SIGURG to arrive just after
554+
// the coroutine entered Syscall state. If preempted here, the
555+
// coroutine lands in the syscall map with no io_uring/epoll/timer
556+
// registration to wake it, causing a deadlock.
557+
if let Some(co) = SchedulableCoroutine::current() {
558+
if matches!(co.state(), CoroutineState::Syscall((), _, _)) {
559+
return;
560+
}
561+
}
526562
if let Some(suspender) = SchedulableSuspender::current() {
527563
suspender.suspend();
528564
}

core/src/syscall/unix/write.rs

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,52 @@ impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWrite
1515
write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t
1616
);
1717

18-
impl_facade!(WriteSyscallFacade, WriteSyscall,
19-
write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t
20-
);
18+
//write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发,
19+
//必须跳过所有中间层(facade/io_uring/NIO)直接调用原始系统调用,否则:
20+
//1. facade内部的info!()会再次触发write导致stdout RefCell重复借用(无限递归)
21+
//2. io_uring层会提交写操作并阻塞在condvar等待完成,导致死锁
22+
// The write facade needs special handling: writes to stdout/stderr are
23+
// triggered by the logging framework (tracing). They must bypass ALL layers
24+
// (facade, io_uring, NIO) and call the raw syscall directly. Otherwise:
25+
// 1. The facade's info!() re-triggers write → stdout RefCell double-borrow
26+
// 2. The io_uring layer submits the write and blocks on condvar → deadlock
27+
#[repr(C)]
28+
#[derive(Debug, Default)]
29+
struct WriteSyscallFacade<I: WriteSyscall> {
30+
inner: I,
31+
}
32+
33+
impl<I: WriteSyscall> WriteSyscall for WriteSyscallFacade<I> {
34+
extern "C" fn write(
35+
&self,
36+
fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>,
37+
fd: c_int,
38+
buf: *const c_void,
39+
len: size_t,
40+
) -> ssize_t {
41+
if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO {
42+
return RawWriteSyscall::default().write(fn_ptr, fd, buf, len);
43+
}
44+
let syscall = crate::common::constants::SyscallName::write;
45+
crate::info!("enter syscall {}", syscall);
46+
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
47+
let new_state = crate::common::constants::SyscallState::Executing;
48+
if co.syscall((), syscall, new_state).is_err() {
49+
crate::error!("{} change to syscall {} {} failed !",
50+
co.name(), syscall, new_state
51+
);
52+
}
53+
}
54+
let r = self.inner.write(fn_ptr, fd, buf, len);
55+
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
56+
if co.running().is_err() {
57+
crate::error!("{} change to running state failed !", co.name());
58+
}
59+
}
60+
crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error());
61+
r
62+
}
63+
}
2164

2265
impl_io_uring_write!(IoUringWriteSyscall, WriteSyscall,
2366
write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t

hook/src/syscall/unix.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ impl_hook!(PREADV, preadv(fd: c_int, iov: *const iovec, iovcnt: c_int, offset: o
6262
impl_hook!(RECVMSG, recvmsg(fd: c_int, msg: *mut msghdr, flags: c_int) -> ssize_t);
6363
impl_hook!(SEND, send(fd: c_int, buf: *const c_void, len: size_t, flags: c_int) -> ssize_t);
6464
impl_hook!(SENDTO, sendto(fd: c_int, buf: *const c_void, len: size_t, flags: c_int, addr: *const sockaddr, addrlen: socklen_t) -> ssize_t);
65+
impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t);
6566
impl_hook!(PWRITE, pwrite(fd: c_int, buf: *const c_void, count: size_t, offset: off_t) -> ssize_t);
6667
impl_hook!(WRITEV, writev(fd: c_int, iov: *const iovec, iovcnt: c_int) -> ssize_t);
6768
impl_hook!(PWRITEV, pwritev(fd: c_int, iov: *const iovec, iovcnt: c_int, offset: off_t) -> ssize_t);
@@ -82,7 +83,6 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf
8283
// NOTE: unhook poll due to mio's poller
8384
// impl_hook!(POLL, poll(fds: *mut pollfd, nfds: nfds_t, timeout: c_int) -> c_int);
8485

85-
// NOTE: unhook write/pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug
86-
// impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t);
86+
// NOTE: unhook pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug
8787
// impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int);
8888
// impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int);

0 commit comments

Comments
 (0)