@@ -30,6 +30,7 @@ use nix::{
3030 sys:: signal:: { kill, Signal } ,
3131 unistd:: Pid ,
3232} ;
33+ use procfs:: process:: Process ;
3334
3435use crate :: utils:: environment:: get_executable_path;
3536
@@ -38,6 +39,12 @@ pub const WATCHDOG_PERIOD: Duration = Duration::from_secs(10);
3839pub const WATCHDOG_HTTP_TIMEOUT : Duration = Duration :: from_secs ( 3 ) ;
3940pub const WATCHDOG_TERMINATION_GRACE : Duration = Duration :: from_secs ( 10 ) ;
4041
42+ #[ derive( Clone , Copy , Debug ) ]
43+ struct ParentIdentity {
44+ pid : Pid ,
45+ start_time_ticks : u64 ,
46+ }
47+
4148pub fn liveness_url ( port : u16 ) -> String {
4249 format ! ( "http://127.0.0.1:{port}/livez" )
4350}
@@ -79,20 +86,23 @@ impl ProbeTarget {
7986 }
8087}
8188
82- pub fn run ( parent_pid : u32 , liveness_url : & str ) -> Result < ( ) > {
83- let parent_pid = Pid :: from_raw ( parent_pid as i32 ) ;
89+ pub fn run ( parent_pid : u32 , parent_start_time_ticks : u64 , liveness_url : & str ) -> Result < ( ) > {
90+ let parent = ParentIdentity {
91+ pid : Pid :: from_raw ( parent_pid as i32 ) ,
92+ start_time_ticks : parent_start_time_ticks,
93+ } ;
8494 let probe_target = ProbeTarget :: from_url ( liveness_url) ?;
8595 let mut consecutive_failures = 0 ;
8696
8797 eprintln ! (
88- "[watchdog] monitoring parent pid {} via {}" ,
89- parent_pid , liveness_url
98+ "[watchdog] monitoring parent pid {} start_time={} via {}" ,
99+ parent . pid , parent . start_time_ticks , liveness_url
90100 ) ;
91101 loop {
92- if !process_exists ( parent_pid ) ? {
102+ if !process_matches ( parent ) ? {
93103 eprintln ! (
94- "[watchdog] parent pid {} exited , watchdog stopping" ,
95- parent_pid
104+ "[watchdog] parent pid {} no longer matches original identity , watchdog stopping" ,
105+ parent . pid
96106 ) ;
97107 return Ok ( ( ) ) ;
98108 }
@@ -105,50 +115,82 @@ pub fn run(parent_pid: u32, liveness_url: &str) -> Result<()> {
105115 consecutive_failures += 1 ;
106116 eprintln ! (
107117 "[watchdog] liveness returned unhealthy status for parent pid {}, consecutive_failures={}" ,
108- parent_pid , consecutive_failures
118+ parent . pid , consecutive_failures
109119 ) ;
110120 }
111121 Err ( e) => {
112122 consecutive_failures += 1 ;
113123 eprintln ! (
114124 "[watchdog] liveness probe for parent pid {} failed: {}, consecutive_failures={}" ,
115- parent_pid , e, consecutive_failures
125+ parent . pid , e, consecutive_failures
116126 ) ;
117127 }
118128 }
119129
120130 if consecutive_failures >= WATCHDOG_FAILURE_THRESHOLD {
121131 eprintln ! (
122132 "[watchdog] parent pid {} exceeded liveness failure threshold {}, restarting" ,
123- parent_pid , WATCHDOG_FAILURE_THRESHOLD
133+ parent . pid , WATCHDOG_FAILURE_THRESHOLD
124134 ) ;
125- return terminate_parent ( parent_pid ) ;
135+ return terminate_parent ( parent ) ;
126136 }
127137
128138 std:: thread:: sleep ( WATCHDOG_PERIOD ) ;
129139 }
130140}
131141
132142pub fn spawn ( parent_pid : u32 , liveness_url : & str ) -> Result < Child > {
143+ let parent = ParentIdentity {
144+ pid : Pid :: from_raw ( parent_pid as i32 ) ,
145+ start_time_ticks : read_process_start_time ( Pid :: from_raw ( parent_pid as i32 ) ) ?. ok_or_else (
146+ || {
147+ anyhow ! (
148+ "parent pid {} disappeared before watchdog spawn" ,
149+ parent_pid
150+ )
151+ } ,
152+ ) ?,
153+ } ;
133154 let binary = get_executable_path ( ) . context ( "get executable path for watchdog failed" ) ?;
134155 Command :: new ( binary)
135156 . arg ( "--watchdog-parent-pid" )
136- . arg ( parent_pid. to_string ( ) )
157+ . arg ( parent. pid . as_raw ( ) . to_string ( ) )
158+ . arg ( "--watchdog-parent-start-time" )
159+ . arg ( parent. start_time_ticks . to_string ( ) )
137160 . arg ( "--watchdog-liveness-url" )
138161 . arg ( liveness_url)
139162 . spawn ( )
140163 . context ( "spawn watchdog failed" )
141164}
142165
143- fn process_exists ( pid : Pid ) -> Result < bool > {
144- match kill ( pid, None ) {
166+ fn process_matches ( parent : ParentIdentity ) -> Result < bool > {
167+ match kill ( parent . pid , None ) {
145168 Ok ( _) => Ok ( true ) ,
146169 Err ( Errno :: EPERM ) => Ok ( true ) ,
147170 Err ( Errno :: ESRCH ) => Ok ( false ) ,
148- Err ( e) => Err ( anyhow ! ( "check parent pid {} failed: {}" , pid, e) ) ,
171+ Err ( e) => Err ( anyhow ! ( "check parent pid {} failed: {}" , parent. pid, e) ) ,
172+ } ?;
173+
174+ match read_process_start_time ( parent. pid ) ? {
175+ Some ( start_time_ticks) => Ok ( start_time_ticks == parent. start_time_ticks ) ,
176+ None => Ok ( false ) ,
149177 }
150178}
151179
180+ fn read_process_start_time ( pid : Pid ) -> Result < Option < u64 > > {
181+ let process = match Process :: new ( pid. as_raw ( ) ) {
182+ Ok ( process) => process,
183+ Err ( procfs:: ProcError :: NotFound ( _) ) => return Ok ( None ) ,
184+ Err ( e) => {
185+ return Err ( anyhow ! ( "read /proc/{}/stat failed: {}" , pid, e) ) ;
186+ }
187+ } ;
188+ let stat = process
189+ . stat ( )
190+ . with_context ( || format ! ( "read /proc/{}/stat failed" , pid) ) ?;
191+ Ok ( Some ( stat. starttime ) )
192+ }
193+
152194fn parse_addr ( uri : & Uri ) -> Result < SocketAddr > {
153195 let host = uri
154196 . host ( )
@@ -163,44 +205,52 @@ fn parse_addr(uri: &Uri) -> Result<SocketAddr> {
163205 . ok_or_else ( || anyhow ! ( "resolve watchdog host {host}:{port} failed" ) )
164206}
165207
166- fn terminate_parent ( parent_pid : Pid ) -> Result < ( ) > {
167- match kill ( parent_pid, Signal :: SIGTERM ) {
168- Ok ( _) => eprintln ! ( "[watchdog] sent SIGTERM to parent pid {}" , parent_pid) ,
208+ fn terminate_parent ( parent : ParentIdentity ) -> Result < ( ) > {
209+ if !process_matches ( parent) ? {
210+ eprintln ! (
211+ "[watchdog] parent pid {} no longer matches original identity, skip termination" ,
212+ parent. pid
213+ ) ;
214+ return Ok ( ( ) ) ;
215+ }
216+
217+ match kill ( parent. pid , Signal :: SIGTERM ) {
218+ Ok ( _) => eprintln ! ( "[watchdog] sent SIGTERM to parent pid {}" , parent. pid) ,
169219 Err ( Errno :: ESRCH ) => return Ok ( ( ) ) ,
170220 Err ( e) => {
171221 return Err ( anyhow ! (
172222 "send SIGTERM to parent pid {} failed: {}" ,
173- parent_pid ,
223+ parent . pid ,
174224 e
175225 ) )
176226 }
177227 }
178228
179229 let deadline = Instant :: now ( ) + WATCHDOG_TERMINATION_GRACE ;
180230 while Instant :: now ( ) < deadline {
181- if !process_exists ( parent_pid ) ? {
182- eprintln ! ( "[watchdog] parent pid {} exited after SIGTERM" , parent_pid ) ;
231+ if !process_matches ( parent ) ? {
232+ eprintln ! ( "[watchdog] parent pid {} exited after SIGTERM" , parent . pid ) ;
183233 return Ok ( ( ) ) ;
184234 }
185235 std:: thread:: sleep ( Duration :: from_millis ( 200 ) ) ;
186236 }
187237
188- if !process_exists ( parent_pid ) ? {
238+ if !process_matches ( parent ) ? {
189239 return Ok ( ( ) ) ;
190240 }
191241
192- match kill ( parent_pid , Signal :: SIGKILL ) {
242+ match kill ( parent . pid , Signal :: SIGKILL ) {
193243 Ok ( _) => {
194244 eprintln ! (
195245 "[watchdog] parent pid {} did not exit in {:?}, sent SIGKILL" ,
196- parent_pid , WATCHDOG_TERMINATION_GRACE
246+ parent . pid , WATCHDOG_TERMINATION_GRACE
197247 ) ;
198248 Ok ( ( ) )
199249 }
200250 Err ( Errno :: ESRCH ) => Ok ( ( ) ) ,
201251 Err ( e) => Err ( anyhow ! (
202252 "send SIGKILL to parent pid {} failed: {}" ,
203- parent_pid ,
253+ parent . pid ,
204254 e
205255 ) ) ,
206256 }
0 commit comments