Skip to content

Commit 4675117

Browse files
fix: hook write with re-entrancy guard to prevent preemption during logging, revert change_state conditional reorder
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/c679f7fd-4c3d-4a1c-a07d-637ba7b9fc00 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com>
1 parent 726199b commit 4675117

3 files changed

Lines changed: 72 additions & 18 deletions

File tree

core/src/coroutine/state.rs

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,11 @@ where
1818
new_state: CoroutineState<Yield, Return>,
1919
) -> CoroutineState<Yield, Return> {
2020
let old_state = self.state.replace(new_state);
21-
if matches!(new_state, CoroutineState::Running) {
22-
//先打印日志再通知监听器,避免在QEMU等慢速平台上的活锁问题
23-
// Log before notifying: MonitorListener starts the 10ms preemption
24-
// timer on Running state. On QEMU, info!() takes >10ms, so logging
25-
// first ensures the timer starts after the slow I/O completes.
26-
info!("{} {:?}->{:?}", self.name(), old_state, new_state);
27-
self.on_state_changed(self, old_state, new_state);
21+
self.on_state_changed(self, old_state, new_state);
22+
if let CoroutineState::Error(_) = new_state {
23+
error!("{} {:?}->{:?}", self.name(), old_state, new_state);
2824
} else {
29-
//先通知监听器再打印日志,确保抢占定时器在日志I/O之前被移除
30-
// Notify before logging: MonitorListener removes the NOTIFY_NODE for
31-
// non-Running states. Removing it first prevents SIGURG from arriving
32-
// during the potentially slow log I/O.
33-
self.on_state_changed(self, old_state, new_state);
34-
if let CoroutineState::Error(_) = new_state {
35-
error!("{} {:?}->{:?}", self.name(), old_state, new_state);
36-
} else {
37-
info!("{} {:?}->{:?}", self.name(), old_state, new_state);
38-
}
25+
info!("{} {:?}->{:?}", self.name(), old_state, new_state);
3926
}
4027
old_state
4128
}

core/src/syscall/unix/mod.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,25 @@ use dashmap::DashMap;
22
use once_cell::sync::Lazy;
33
use std::ffi::c_int;
44

5+
//防止重入:info!()/error!()内部会调用write(),如果write被hook了,
6+
//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用内部系统调用跳过facade逻辑。
7+
// Re-entrancy guard: info!()/error!() internally call write(). If write is hooked,
8+
// this causes infinite recursion or nested state transitions that corrupt coroutine state.
9+
// When re-entrancy is detected, bypass the facade and call the inner syscall directly.
10+
thread_local! {
11+
static IN_FACADE: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
12+
}
13+
14+
#[inline]
15+
pub fn in_facade() -> bool {
16+
IN_FACADE.get()
17+
}
18+
19+
#[inline]
20+
pub fn set_in_facade(val: bool) {
21+
IN_FACADE.set(val);
22+
}
23+
524
macro_rules! impl_syscall {
625
(
726
$facade_struct_name:ident, $iocp_struct_name: ident, $nio_struct_name: ident, $raw_struct_name: ident,
@@ -97,7 +116,18 @@ macro_rules! impl_facade {
97116
fn_ptr: Option<&extern "C" fn($($arg_type),*) -> $result>,
98117
$($arg: $arg_type),*
99118
) -> $result {
119+
if $crate::syscall::in_facade() {
120+
return self.inner.$syscall(fn_ptr, $($arg, )*);
121+
}
100122
let syscall = $crate::common::constants::SyscallName::$syscall;
123+
//在日志和状态变更期间设置防重入标志,因为info!()/error!()内部会
124+
//调用write(),co.syscall()/co.running()内部会调用change_state()
125+
//再调用info!(),这些都可能触发hooked write导致无限递归
126+
// Set re-entrancy guard during logging and state changes because:
127+
// - info!()/error!() internally call write()
128+
// - co.syscall()/co.running() call change_state() which calls info!()
129+
// Both can trigger hooked write causing infinite recursion.
130+
$crate::syscall::set_in_facade(true);
101131
$crate::info!("enter syscall {}", syscall);
102132
if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() {
103133
let new_state = $crate::common::constants::SyscallState::Executing;
@@ -107,13 +137,16 @@ macro_rules! impl_facade {
107137
);
108138
}
109139
}
140+
$crate::syscall::set_in_facade(false);
110141
let r = self.inner.$syscall(fn_ptr, $($arg, )*);
142+
$crate::syscall::set_in_facade(true);
111143
if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() {
112144
if co.running().is_err() {
113145
$crate::error!("{} change to running state failed !", co.name());
114146
}
115147
}
116148
$crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error());
149+
$crate::syscall::set_in_facade(false);
117150
r
118151
}
119152
}

hook/src/syscall/unix.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,40 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf
8383
// impl_hook!(POLL, poll(fds: *mut pollfd, nfds: nfds_t, timeout: c_int) -> c_int);
8484

8585
// NOTE: unhook write/pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug
86-
// impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t);
8786
// impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int);
8887
// impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int);
88+
89+
//write需要特殊的hook实现:stdout/stderr的write由日志框架触发,
90+
//必须绕过facade直接调用原始write,否则facade内部的info!()会再次
91+
//触发write导致stdout RefCell重复借用。其他fd正常走facade。
92+
// write needs a custom hook: writes to stdout/stderr are triggered by
93+
// the logging framework. They must bypass the facade and call raw write
94+
// directly; otherwise the facade's info!() would re-trigger write,
95+
// causing stdout's RefCell to be double-borrowed. Other fds go through
96+
// the facade normally.
97+
#[no_mangle]
98+
pub extern "C" fn write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t {
99+
static WRITE: once_cell::sync::Lazy<extern "C" fn(c_int, *const c_void, size_t) -> ssize_t> =
100+
once_cell::sync::Lazy::new(|| unsafe {
101+
let symbol = std::ffi::CString::new("write")
102+
.unwrap_or_else(|_| panic!("can not transfer \"write\" to CString"));
103+
let ptr = libc::dlsym(libc::RTLD_NEXT, symbol.as_ptr());
104+
assert!(!ptr.is_null(), "syscall \"write\" not found !");
105+
std::mem::transmute(ptr)
106+
});
107+
let fn_ptr = once_cell::sync::Lazy::force(&WRITE);
108+
// stdout(1)/stderr(2)的write由日志框架触发,必须绕过facade
109+
// Bypass facade for stdout/stderr — these are logging fds
110+
if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO
111+
|| open_coroutine_core::syscall::in_facade()
112+
{
113+
return (fn_ptr)(fd, buf, count);
114+
}
115+
if crate::hook()
116+
|| open_coroutine_core::scheduler::SchedulableCoroutine::current().is_some()
117+
|| cfg!(feature = "ci")
118+
{
119+
return open_coroutine_core::syscall::write(Some(fn_ptr), fd, buf, count);
120+
}
121+
(fn_ptr)(fd, buf, count)
122+
}

0 commit comments

Comments
 (0)