Skip to content

Commit 2df729a

Browse files
committed
enhance stability
1 parent 11e8d80 commit 2df729a

1 file changed

Lines changed: 39 additions & 3 deletions

File tree

core/src/monitor.rs

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ use crate::common::constants::{CoroutineState, MONITOR_BEAN};
33
use crate::common::{get_timeout_time, now, CondvarBlocker};
44
use crate::coroutine::listener::Listener;
55
use crate::coroutine::local::CoroutineLocal;
6-
use crate::scheduler::SchedulableSuspender;
6+
use crate::scheduler::{SchedulableCoroutine, SchedulableSuspender};
77
use crate::{catch, error, impl_current_for, impl_display_by_debug, info};
88
#[cfg(unix)]
99
use nix::sys::pthread::{pthread_kill, pthread_self, Pthread};
@@ -78,6 +78,24 @@ impl Monitor {
7878
set.remove(Signal::SIGURG);
7979
set.thread_set_mask()
8080
.expect("Failed to remove SIGURG signal mask!");
81+
//不抢占处于Syscall状态的协程。
82+
//MonitorListener的设计理念是不对Syscall状态的协程发送信号。
83+
//但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件,
84+
//SIGURG可能在协程刚进入Syscall状态时到达。
85+
//如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册),
86+
//导致死锁。
87+
// Skip preemption for coroutines in Syscall state.
88+
// MonitorListener's design is to NOT send signals to Syscall-state
89+
// coroutines. However, a race between NOTIFY_NODE removal and the
90+
// monitor's queue iteration can cause SIGURG to arrive just after
91+
// the coroutine entered Syscall state. If preempted here, the
92+
// coroutine lands in the syscall map with no io_uring/epoll/timer
93+
// registration to wake it, causing a deadlock.
94+
if let Some(co) = SchedulableCoroutine::current() {
95+
if matches!(co.state(), CoroutineState::Syscall((), _, _)) {
96+
return;
97+
}
98+
}
8199
if let Some(suspender) = SchedulableSuspender::current() {
82100
suspender.suspend();
83101
}
@@ -89,7 +107,7 @@ impl Monitor {
89107
// install panic hook
90108
std::panic::set_hook(Box::new(|panic_hook_info| {
91109
let syscall = crate::common::constants::SyscallName::panicking;
92-
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
110+
if let Some(co) = SchedulableCoroutine::current() {
93111
let new_state = crate::common::constants::SyscallState::Executing;
94112
if co.syscall((), syscall, new_state).is_err() {
95113
error!(
@@ -109,7 +127,7 @@ impl Monitor {
109127
"stack backtrace:\n{}",
110128
std::backtrace::Backtrace::force_capture()
111129
);
112-
if let Some(co) = crate::scheduler::SchedulableCoroutine::current() {
130+
if let Some(co) = SchedulableCoroutine::current() {
113131
if co.running().is_err() {
114132
error!("{} change to running state failed !", co.name());
115133
}
@@ -523,6 +541,24 @@ extern "C" fn do_preempt() {
523541
// coroutine never yielded (no hooked syscalls) — it is truly CPU-bound.
524542
// Force immediate suspension.
525543
flag.set(false);
544+
//不抢占处于Syscall状态的协程。
545+
//MonitorListener的设计理念是不对Syscall状态的协程发送信号。
546+
//但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件,
547+
//SIGURG可能在协程刚进入Syscall状态时到达。
548+
//如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册),
549+
//导致死锁。
550+
// Skip preemption for coroutines in Syscall state.
551+
// MonitorListener's design is to NOT send signals to Syscall-state
552+
// coroutines. However, a race between NOTIFY_NODE removal and the
553+
// monitor's queue iteration can cause SIGURG to arrive just after
554+
// the coroutine entered Syscall state. If preempted here, the
555+
// coroutine lands in the syscall map with no io_uring/epoll/timer
556+
// registration to wake it, causing a deadlock.
557+
if let Some(co) = SchedulableCoroutine::current() {
558+
if matches!(co.state(), CoroutineState::Syscall((), _, _)) {
559+
return;
560+
}
561+
}
526562
if let Some(suspender) = SchedulableSuspender::current() {
527563
suspender.suspend();
528564
}

0 commit comments

Comments
 (0)