Skip to content

Commit 3481956

Browse files
authored
fix(mcp): clean up stdio test process trees (#368)
## Summary - Move stdio MCP connection-test timeout handling inside the protocol step so child cleanup always runs. - Add a shared runtime helper for process-tree cleanup across Unix and Windows. - Add a Unix regression test that verifies stdio timeouts do not leave the spawned process group alive. ## Test Plan - `just push -u origin fix/mcp-stdio-timeout-cleanup` - includes `cargo fix --allow-dirty --allow-staged` - includes `cargo clippy --fix --workspace --allow-dirty --allow-staged -- -D warnings` - includes `cargo fmt --all` - includes `cargo nextest run --workspace` (5708 passed, 18 skipped) Co-authored-by: zynx <>
1 parent 2974f47 commit 3481956

5 files changed

Lines changed: 163 additions & 39 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/aionui-mcp/src/connection_test/mod.rs

Lines changed: 88 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ use std::collections::HashMap;
44
use std::time::Duration;
55

66
use aionui_api_types::McpConnectionTestResult;
7-
use aionui_runtime::Builder as CmdBuilder;
7+
use aionui_runtime::{Builder as CmdBuilder, kill_process_tree};
88
use serde::Serialize;
99
use tokio::sync::mpsc;
10-
use tracing::debug;
10+
use tracing::{debug, warn};
1111

1212
use crate::types::McpServerTransport;
1313
use protocol::{
@@ -72,10 +72,7 @@ impl McpConnectionTestService {
7272
args: &[String],
7373
env: &HashMap<String, String>,
7474
) -> McpConnectionTestResult {
75-
match tokio::time::timeout(self.timeout, self.test_stdio_inner(command, args, env)).await {
76-
Ok(r) => r,
77-
Err(_) => timeout_result(self.timeout),
78-
}
75+
self.test_stdio_inner(command, args, env).await
7976
}
8077

8178
async fn test_stdio_inner(
@@ -98,8 +95,13 @@ impl McpConnectionTestService {
9895

9996
let stdin = child.stdin.take().expect("stdin was piped");
10097
let stdout = child.stdout.take().expect("stdout was piped");
101-
let result = run_stdio_protocol(stdin, stdout).await;
102-
kill_child_tree(&mut child).await;
98+
let result = match tokio::time::timeout(self.timeout, run_stdio_protocol(stdin, stdout)).await {
99+
Ok(r) => r,
100+
Err(_) => timeout_result(self.timeout),
101+
};
102+
if let Err(error) = kill_process_tree(&mut child).await {
103+
warn!(%error, "failed to clean up MCP stdio connection test process tree");
104+
}
103105
result
104106
}
105107

@@ -316,36 +318,6 @@ struct HttpMcpResponse {
316318
session_id: Option<String>,
317319
}
318320

319-
async fn kill_child_tree(child: &mut tokio::process::Child) {
320-
if let Some(pid) = child.id() {
321-
#[cfg(unix)]
322-
{
323-
let process_group = format!("-{pid}");
324-
let _ = tokio::process::Command::new("kill")
325-
.args(["-KILL", &process_group])
326-
.status()
327-
.await;
328-
}
329-
330-
#[cfg(windows)]
331-
{
332-
let _ = tokio::process::Command::new("taskkill")
333-
.args(["/PID", &pid.to_string(), "/T", "/F"])
334-
.status()
335-
.await;
336-
}
337-
338-
#[cfg(not(any(unix, windows)))]
339-
{
340-
let _ = child.kill().await;
341-
}
342-
} else {
343-
let _ = child.kill().await;
344-
}
345-
346-
let _ = child.wait().await;
347-
}
348-
349321
#[cfg(test)]
350322
mod tests {
351323
use super::*;
@@ -361,4 +333,82 @@ mod tests {
361333
let svc = McpConnectionTestService::new(reqwest::Client::new()).with_timeout(Duration::from_secs(5));
362334
assert_eq!(svc.timeout, Duration::from_secs(5));
363335
}
336+
337+
#[cfg(unix)]
338+
#[tokio::test]
339+
async fn stdio_timeout_cleans_up_process_group() {
340+
let marker_path = std::env::temp_dir().join(format!(
341+
"aionui-mcp-timeout-pid-{}-{}",
342+
std::process::id(),
343+
std::time::SystemTime::now()
344+
.duration_since(std::time::UNIX_EPOCH)
345+
.unwrap()
346+
.as_nanos()
347+
));
348+
let transport = McpServerTransport::Stdio {
349+
command: "sh".into(),
350+
args: vec![
351+
"-c".into(),
352+
"printf '%s\n' \"$$\" > \"$1\"; sleep 30".into(),
353+
"mcp-timeout-child".into(),
354+
marker_path.to_string_lossy().into_owned(),
355+
],
356+
env: HashMap::new(),
357+
};
358+
let svc = McpConnectionTestService::new(reqwest::Client::new()).with_timeout(Duration::from_millis(100));
359+
360+
let result = svc.test_connection("timeout-cleanup", &transport).await;
361+
assert!(!result.success);
362+
assert!(
363+
result.error.as_deref().unwrap_or_default().contains("timed out"),
364+
"expected timeout result, got {result:?}"
365+
);
366+
367+
let pid: i32 = std::fs::read_to_string(&marker_path)
368+
.expect("stdio child should write its pid")
369+
.trim()
370+
.parse()
371+
.expect("pid marker should be numeric");
372+
373+
let group_alive = wait_for_process_group_exit(pid, Duration::from_secs(1)).await;
374+
if group_alive {
375+
let _ = kill_process_group(pid, libc_sigkill());
376+
}
377+
let _ = std::fs::remove_file(marker_path);
378+
379+
assert!(
380+
!group_alive,
381+
"stdio timeout should terminate the spawned process group for pid={pid}"
382+
);
383+
}
384+
385+
#[cfg(unix)]
386+
async fn wait_for_process_group_exit(pid: i32, timeout: Duration) -> bool {
387+
let deadline = tokio::time::Instant::now() + timeout;
388+
while tokio::time::Instant::now() < deadline {
389+
if !is_process_group_alive(pid) {
390+
return false;
391+
}
392+
tokio::time::sleep(Duration::from_millis(50)).await;
393+
}
394+
is_process_group_alive(pid)
395+
}
396+
397+
#[cfg(unix)]
398+
fn is_process_group_alive(pid: i32) -> bool {
399+
kill_process_group(pid, 0)
400+
}
401+
402+
#[cfg(unix)]
403+
fn kill_process_group(pid: i32, signal: i32) -> bool {
404+
unsafe extern "C" {
405+
fn kill(pid: i32, sig: i32) -> i32;
406+
}
407+
unsafe { kill(-pid, signal) == 0 }
408+
}
409+
410+
#[cfg(unix)]
411+
fn libc_sigkill() -> i32 {
412+
9
413+
}
364414
}

crates/aionui-runtime/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,5 @@ sha2.workspace = true
3939
hex.workspace = true
4040

4141
[target.'cfg(unix)'.dependencies]
42+
libc.workspace = true
4243
wait-timeout = "0.2"

crates/aionui-runtime/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ pub use cache::init;
1515
pub use resolver::{ResolveError, bun_bin_dir, resolve_bun, resolve_command_in, resolve_command_path};
1616
pub use shell_env::enhance_process_path;
1717
mod spawn;
18-
pub use spawn::Builder;
18+
pub use spawn::{Builder, kill_process_tree};
1919

2020
#[cfg(test)]
2121
#[path = "../build_support.rs"]

crates/aionui-runtime/src/spawn.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,25 @@ pub struct Builder {
4444
mode: Mode,
4545
}
4646

47+
/// Force-kill a spawned child and wait for the direct child handle to exit.
48+
///
49+
/// On Unix, children spawned through [`Builder::new`] are process-group
50+
/// leaders, so this targets that group to clean up descendants as well. On
51+
/// Windows, this uses `taskkill /T` to terminate the process tree.
52+
pub async fn kill_process_tree(child: &mut Child) -> io::Result<()> {
53+
let Some(pid) = child.id() else {
54+
return child.kill().await;
55+
};
56+
57+
#[cfg(unix)]
58+
force_kill_process_tree(pid)?;
59+
#[cfg(windows)]
60+
kill_windows_process_tree(pid).await?;
61+
#[cfg(not(any(unix, windows)))]
62+
child.kill().await?;
63+
child.wait().await.map(|_| ())
64+
}
65+
4766
impl std::fmt::Debug for Builder {
4867
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
4968
f.debug_struct("Builder")
@@ -191,6 +210,59 @@ fn configure_platform_spawn(cmd: &mut Command) {
191210
#[cfg(not(unix))]
192211
fn configure_platform_spawn(_cmd: &mut Command) {}
193212

213+
#[cfg(unix)]
214+
fn force_kill_process_tree(pid: u32) -> io::Result<()> {
215+
let pgid = unsafe { libc::getpgid(pid as i32) };
216+
if pgid == -1 {
217+
let err = io::Error::last_os_error();
218+
if err.raw_os_error() == Some(libc::ESRCH) {
219+
return Ok(());
220+
}
221+
return kill_unix_target(pid as i32);
222+
}
223+
224+
if pgid > 1 && pgid as u32 == pid {
225+
kill_unix_target(-pgid)
226+
} else {
227+
kill_unix_target(pid as i32)
228+
}
229+
}
230+
231+
#[cfg(unix)]
232+
fn kill_unix_target(target: i32) -> io::Result<()> {
233+
let result = unsafe { libc::kill(target, libc::SIGKILL) };
234+
if result == 0 {
235+
return Ok(());
236+
}
237+
238+
let err = io::Error::last_os_error();
239+
if err.raw_os_error() == Some(libc::ESRCH) {
240+
Ok(())
241+
} else {
242+
Err(err)
243+
}
244+
}
245+
246+
#[cfg(windows)]
247+
async fn kill_windows_process_tree(pid: u32) -> io::Result<()> {
248+
let pid_arg = pid.to_string();
249+
let mut cmd = Builder::clean_cli("taskkill");
250+
cmd.args(["/F", "/T", "/PID", pid_arg.as_str()]);
251+
let output = cmd.output().await?;
252+
if output.status.success() || output.status.code() == Some(128) {
253+
return Ok(());
254+
}
255+
256+
Err(io::Error::new(
257+
io::ErrorKind::Other,
258+
format!(
259+
"taskkill failed for pid {pid} (exit {:?}): {}",
260+
output.status.code(),
261+
String::from_utf8_lossy(&output.stderr)
262+
),
263+
))
264+
}
265+
194266
/// Resolve `program` through `resolve_command_path` so callers don't have
195267
/// to. If the input already contains a path separator (relative or
196268
/// absolute) we leave it alone — only bare command names go through

0 commit comments

Comments
 (0)