Skip to content

Commit b4f9613

Browse files
rvqlkylewanginchina
authored andcommitted
agent: harden watchdog parent identity checks
1 parent 247d1b3 commit b4f9613

3 files changed

Lines changed: 82 additions & 26 deletions

File tree

agent/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ wasmtime = "12.0.1"
126126
wasmtime-wasi = "12.0.1"
127127
zstd = "0.13.2"
128128

129-
[target.'cfg(any(target_os = "linux", target_os = "android"))'.dependencies]
129+
[target.'cfg(unix)'.dependencies]
130130
cgroups-rs = "0.2.9"
131131
nix = "0.23"
132132
# As of procfs 0.16.0, Process::fd().iter() still not giving correct results on kernel 2.6.32

agent/src/main.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ struct Opts {
7979
#[clap(long, hide = true)]
8080
watchdog_parent_pid: Option<u32>,
8181

82+
#[cfg(unix)]
83+
#[clap(long, hide = true)]
84+
watchdog_parent_start_time: Option<u64>,
85+
8286
#[cfg(unix)]
8387
#[clap(long, hide = true)]
8488
watchdog_liveness_url: Option<String>,
@@ -128,6 +132,8 @@ fn main() -> Result<()> {
128132
if let Some(parent_pid) = opts.watchdog_parent_pid {
129133
return watchdog::run(
130134
parent_pid,
135+
opts.watchdog_parent_start_time
136+
.ok_or_else(|| anyhow::anyhow!("watchdog parent start time is required"))?,
131137
opts.watchdog_liveness_url
132138
.as_deref()
133139
.ok_or_else(|| anyhow::anyhow!("watchdog liveness url is required"))?,

agent/src/watchdog.rs

Lines changed: 75 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use nix::{
3030
sys::signal::{kill, Signal},
3131
unistd::Pid,
3232
};
33+
use procfs::process::Process;
3334

3435
use crate::utils::environment::get_executable_path;
3536

@@ -38,6 +39,12 @@ pub const WATCHDOG_PERIOD: Duration = Duration::from_secs(10);
3839
pub const WATCHDOG_HTTP_TIMEOUT: Duration = Duration::from_secs(3);
3940
pub const WATCHDOG_TERMINATION_GRACE: Duration = Duration::from_secs(10);
4041

42+
#[derive(Clone, Copy, Debug)]
43+
struct ParentIdentity {
44+
pid: Pid,
45+
start_time_ticks: u64,
46+
}
47+
4148
pub fn liveness_url(port: u16) -> String {
4249
format!("http://127.0.0.1:{port}/livez")
4350
}
@@ -79,20 +86,23 @@ impl ProbeTarget {
7986
}
8087
}
8188

82-
pub fn run(parent_pid: u32, liveness_url: &str) -> Result<()> {
83-
let parent_pid = Pid::from_raw(parent_pid as i32);
89+
pub fn run(parent_pid: u32, parent_start_time_ticks: u64, liveness_url: &str) -> Result<()> {
90+
let parent = ParentIdentity {
91+
pid: Pid::from_raw(parent_pid as i32),
92+
start_time_ticks: parent_start_time_ticks,
93+
};
8494
let probe_target = ProbeTarget::from_url(liveness_url)?;
8595
let mut consecutive_failures = 0;
8696

8797
eprintln!(
88-
"[watchdog] monitoring parent pid {} via {}",
89-
parent_pid, liveness_url
98+
"[watchdog] monitoring parent pid {} start_time={} via {}",
99+
parent.pid, parent.start_time_ticks, liveness_url
90100
);
91101
loop {
92-
if !process_exists(parent_pid)? {
102+
if !process_matches(parent)? {
93103
eprintln!(
94-
"[watchdog] parent pid {} exited, watchdog stopping",
95-
parent_pid
104+
"[watchdog] parent pid {} no longer matches original identity, watchdog stopping",
105+
parent.pid
96106
);
97107
return Ok(());
98108
}
@@ -105,50 +115,82 @@ pub fn run(parent_pid: u32, liveness_url: &str) -> Result<()> {
105115
consecutive_failures += 1;
106116
eprintln!(
107117
"[watchdog] liveness returned unhealthy status for parent pid {}, consecutive_failures={}",
108-
parent_pid, consecutive_failures
118+
parent.pid, consecutive_failures
109119
);
110120
}
111121
Err(e) => {
112122
consecutive_failures += 1;
113123
eprintln!(
114124
"[watchdog] liveness probe for parent pid {} failed: {}, consecutive_failures={}",
115-
parent_pid, e, consecutive_failures
125+
parent.pid, e, consecutive_failures
116126
);
117127
}
118128
}
119129

120130
if consecutive_failures >= WATCHDOG_FAILURE_THRESHOLD {
121131
eprintln!(
122132
"[watchdog] parent pid {} exceeded liveness failure threshold {}, restarting",
123-
parent_pid, WATCHDOG_FAILURE_THRESHOLD
133+
parent.pid, WATCHDOG_FAILURE_THRESHOLD
124134
);
125-
return terminate_parent(parent_pid);
135+
return terminate_parent(parent);
126136
}
127137

128138
std::thread::sleep(WATCHDOG_PERIOD);
129139
}
130140
}
131141

132142
pub fn spawn(parent_pid: u32, liveness_url: &str) -> Result<Child> {
143+
let parent = ParentIdentity {
144+
pid: Pid::from_raw(parent_pid as i32),
145+
start_time_ticks: read_process_start_time(Pid::from_raw(parent_pid as i32))?.ok_or_else(
146+
|| {
147+
anyhow!(
148+
"parent pid {} disappeared before watchdog spawn",
149+
parent_pid
150+
)
151+
},
152+
)?,
153+
};
133154
let binary = get_executable_path().context("get executable path for watchdog failed")?;
134155
Command::new(binary)
135156
.arg("--watchdog-parent-pid")
136-
.arg(parent_pid.to_string())
157+
.arg(parent.pid.as_raw().to_string())
158+
.arg("--watchdog-parent-start-time")
159+
.arg(parent.start_time_ticks.to_string())
137160
.arg("--watchdog-liveness-url")
138161
.arg(liveness_url)
139162
.spawn()
140163
.context("spawn watchdog failed")
141164
}
142165

143-
fn process_exists(pid: Pid) -> Result<bool> {
144-
match kill(pid, None) {
166+
fn process_matches(parent: ParentIdentity) -> Result<bool> {
167+
match kill(parent.pid, None) {
145168
Ok(_) => Ok(true),
146169
Err(Errno::EPERM) => Ok(true),
147170
Err(Errno::ESRCH) => Ok(false),
148-
Err(e) => Err(anyhow!("check parent pid {} failed: {}", pid, e)),
171+
Err(e) => Err(anyhow!("check parent pid {} failed: {}", parent.pid, e)),
172+
}?;
173+
174+
match read_process_start_time(parent.pid)? {
175+
Some(start_time_ticks) => Ok(start_time_ticks == parent.start_time_ticks),
176+
None => Ok(false),
149177
}
150178
}
151179

180+
fn read_process_start_time(pid: Pid) -> Result<Option<u64>> {
181+
let process = match Process::new(pid.as_raw()) {
182+
Ok(process) => process,
183+
Err(procfs::ProcError::NotFound(_)) => return Ok(None),
184+
Err(e) => {
185+
return Err(anyhow!("read /proc/{}/stat failed: {}", pid, e));
186+
}
187+
};
188+
let stat = process
189+
.stat()
190+
.with_context(|| format!("read /proc/{}/stat failed", pid))?;
191+
Ok(Some(stat.starttime))
192+
}
193+
152194
fn parse_addr(uri: &Uri) -> Result<SocketAddr> {
153195
let host = uri
154196
.host()
@@ -163,44 +205,52 @@ fn parse_addr(uri: &Uri) -> Result<SocketAddr> {
163205
.ok_or_else(|| anyhow!("resolve watchdog host {host}:{port} failed"))
164206
}
165207

166-
fn terminate_parent(parent_pid: Pid) -> Result<()> {
167-
match kill(parent_pid, Signal::SIGTERM) {
168-
Ok(_) => eprintln!("[watchdog] sent SIGTERM to parent pid {}", parent_pid),
208+
fn terminate_parent(parent: ParentIdentity) -> Result<()> {
209+
if !process_matches(parent)? {
210+
eprintln!(
211+
"[watchdog] parent pid {} no longer matches original identity, skip termination",
212+
parent.pid
213+
);
214+
return Ok(());
215+
}
216+
217+
match kill(parent.pid, Signal::SIGTERM) {
218+
Ok(_) => eprintln!("[watchdog] sent SIGTERM to parent pid {}", parent.pid),
169219
Err(Errno::ESRCH) => return Ok(()),
170220
Err(e) => {
171221
return Err(anyhow!(
172222
"send SIGTERM to parent pid {} failed: {}",
173-
parent_pid,
223+
parent.pid,
174224
e
175225
))
176226
}
177227
}
178228

179229
let deadline = Instant::now() + WATCHDOG_TERMINATION_GRACE;
180230
while Instant::now() < deadline {
181-
if !process_exists(parent_pid)? {
182-
eprintln!("[watchdog] parent pid {} exited after SIGTERM", parent_pid);
231+
if !process_matches(parent)? {
232+
eprintln!("[watchdog] parent pid {} exited after SIGTERM", parent.pid);
183233
return Ok(());
184234
}
185235
std::thread::sleep(Duration::from_millis(200));
186236
}
187237

188-
if !process_exists(parent_pid)? {
238+
if !process_matches(parent)? {
189239
return Ok(());
190240
}
191241

192-
match kill(parent_pid, Signal::SIGKILL) {
242+
match kill(parent.pid, Signal::SIGKILL) {
193243
Ok(_) => {
194244
eprintln!(
195245
"[watchdog] parent pid {} did not exit in {:?}, sent SIGKILL",
196-
parent_pid, WATCHDOG_TERMINATION_GRACE
246+
parent.pid, WATCHDOG_TERMINATION_GRACE
197247
);
198248
Ok(())
199249
}
200250
Err(Errno::ESRCH) => Ok(()),
201251
Err(e) => Err(anyhow!(
202252
"send SIGKILL to parent pid {} failed: {}",
203-
parent_pid,
253+
parent.pid,
204254
e
205255
)),
206256
}

0 commit comments

Comments
 (0)