Skip to content

Commit 8642007

Browse files
committed
Debug shutdown hangs and guard with timer watchdog
Signed-off-by: Joachim Wiberg <troglobit@gmail.com>
1 parent 96c74ed commit 8642007

3 files changed

Lines changed: 63 additions & 1 deletion

File tree

src/sig.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ void do_shutdown(shutop_t op)
303303
int in_cont = in_container();
304304
int signo = SIGTERM;
305305

306+
if (SHUTDOWN_DEBUG) cprintf("do_shutdown: entered, op=%d\n", op);
307+
306308
if (!in_cont) {
307309
/*
308310
* On a PREEMPT-RT system, Finit must run as the highest prioritized
@@ -322,13 +324,15 @@ void do_shutdown(shutop_t op)
322324
* Tell remaining non-monitored processes to exit, give them
323325
* time to exit gracefully, 2 sec was customary, we go for 1.
324326
*/
327+
if (SHUTDOWN_DEBUG) cprintf("do_shutdown: killing remaining processes ...\n");
325328
iterate_proc(kill_cb, &signo);
326329
if (do_wait(1)) {
327330
signo = SIGKILL;
328331
iterate_proc(kill_cb, &signo);
329332
}
330333

331334
/* Exit plugins gracefully */
335+
if (SHUTDOWN_DEBUG) cprintf("do_shutdown: plugin_exit() + cond_exit() ...\n");
332336
plugin_exit();
333337

334338
/* We can no longer act on conditions, activate script runner */
@@ -351,12 +355,14 @@ void do_shutdown(shutop_t op)
351355
for (int fd = 3; fd < 128; fd++)
352356
close(fd);
353357

358+
if (SHUTDOWN_DEBUG) cprintf("do_shutdown: vfork, child will reboot ...\n");
354359
if (vfork()) {
355360
/*
356361
* Put PID 1 aside and let child perform reboot/halt
357362
* kernel may exit child and we don't want to exit PID 1
358363
* ... causing "aiii killing init" during reboot ...
359364
*/
365+
if (SHUTDOWN_DEBUG) cprintf("do_shutdown: PID 1 returning from vfork\n");
360366
return;
361367
}
362368

src/sm.c

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,37 @@ static struct {
6363
int skip_bootstrap; /* Set to skip bootstrap waiting */
6464
} sm;
6565

66+
/* Shutdown trace: set SHUTDOWN_DEBUG to 1 in sm.h to enable */
67+
#define shutdbg(fmt, args...) do { \
68+
if (SHUTDOWN_DEBUG && (runlevel == 0 || runlevel == 6)) \
69+
cprintf("shutdown[%c]: " fmt "\n", \
70+
sm_rl2ch(runlevel), ##args); \
71+
} while (0)
72+
6673
static uev_t console_watcher;
6774
static int console_fd = -1;
6875

76+
/*
77+
* Shutdown watchdog: if graceful service stop takes too long,
78+
* force do_shutdown() directly from the event loop callback.
79+
* Default timeout is 30 seconds, adjustable at build time.
80+
*/
81+
#ifndef SHUTDOWN_WDT_TIMEOUT
82+
#define SHUTDOWN_WDT_TIMEOUT 30
83+
#endif
84+
static uev_t shutdown_wdt;
85+
86+
static void shutdown_wdt_cb(uev_t *w, void *arg, int events)
87+
{
88+
(void)w;
89+
(void)arg;
90+
(void)events;
91+
92+
logit(LOG_CONSOLE | LOG_CRIT, "Timed out (%d sec) waiting for services to stop, forcing shutdown.",
93+
SHUTDOWN_WDT_TIMEOUT);
94+
do_shutdown(halt);
95+
}
96+
6997
/*
7098
* Console input callback - handles Ctrl-C during bootstrap wait
7199
*/
@@ -350,6 +378,7 @@ void sm_step(void)
350378

351379
/* reload ? */
352380
if (sm.reload) {
381+
shutdbg("RUNNING got reload request!");
353382
sm.reload = 0;
354383
sm.state = SM_RELOAD_CHANGE_STATE;
355384
}
@@ -362,9 +391,15 @@ void sm_step(void)
362391

363392
/* Restore terse mode and run hooks before shutdown */
364393
if (runlevel == 0 || runlevel == 6) {
394+
shutdbg("entering CHANGE state, starting %d sec watchdog",
395+
SHUTDOWN_WDT_TIMEOUT);
396+
uev_timer_init(ctx, &shutdown_wdt, shutdown_wdt_cb,
397+
NULL, SHUTDOWN_WDT_TIMEOUT * 1000, 0);
365398
api_exit();
366399
log_exit();
400+
shutdbg("running HOOK_SHUTDOWN ...");
367401
plugin_run_hooks(HOOK_SHUTDOWN);
402+
shutdbg("HOOK_SHUTDOWN done");
368403
}
369404

370405
dbg("Setting new runlevel --> %c <-- previous %c", sm_rl2ch(runlevel), sm_rl2ch(prevlevel));
@@ -384,12 +419,14 @@ void sm_step(void)
384419
}
385420

386421
/* Reset once flag of runtasks */
422+
shutdbg("runtask_clean + step_all ...");
387423
service_runtask_clean();
388424

389425
dbg("Stopping services not allowed in new runlevel ...");
390426
sm.in_reload = 1;
391427
service_step_all(SVC_TYPE_ANY);
392428

429+
shutdbg("CHANGE done, entering WAIT");
393430
sm.state = SM_RUNLEVEL_WAIT_STATE;
394431
break;
395432

@@ -400,10 +437,13 @@ void sm_step(void)
400437
*/
401438
svc = svc_stop_completed();
402439
if (svc) {
440+
shutdbg("WAIT, collecting %s[%d] ...", svc_ident(svc, NULL, 0), svc->pid);
403441
dbg("Waiting to collect %s, cmd %s(%d) ...", svc_ident(svc, NULL, 0), svc->cmd, svc->pid);
404442
break;
405443
}
406444

445+
shutdbg("WAIT done, all services stopped");
446+
407447
/* Prev runlevel services stopped, call hooks before starting new runlevel ... */
408448
dbg("All services have been stopped, calling runlevel change hooks ...");
409449
plugin_run_hooks(HOOK_RUNLEVEL_CHANGE); /* Reconfigure HW/VLANs/etc here */
@@ -416,6 +456,7 @@ void sm_step(void)
416456
sm.in_reload = 0;
417457
service_step_all(SVC_TYPE_ANY);
418458

459+
shutdbg("entering CLEAN");
419460
sm.state = SM_RUNLEVEL_CLEAN_STATE;
420461
break;
421462

@@ -427,11 +468,15 @@ void sm_step(void)
427468
*/
428469
svc = svc_clean_completed();
429470
if (svc) {
471+
shutdbg("CLEAN, waiting for %s[%d] (%s) ...",
472+
svc_ident(svc, NULL, 0), svc->pid, svc_status(svc));
430473
dbg("Waiting to collect post/cleanup script for %s, cmd %s(%d) ...",
431474
svc_ident(svc, NULL, 0), svc->cmd, svc->pid);
432475
break;
433476
}
434477

478+
shutdbg("CLEAN done, all post/cleanup scripts collected");
479+
435480
/* Cleanup stale services */
436481
svc_clean_dynamic(service_unregister);
437482

@@ -447,9 +492,15 @@ void sm_step(void)
447492
* Tannhäuser Gate. All those .. moments .. will be lost in time, like
448493
* tears ... in ... rain."
449494
*/
450-
if (runlevel == 0 || runlevel == 6)
495+
if (runlevel == 0 || runlevel == 6) {
496+
shutdbg("calling do_shutdown()");
497+
uev_timer_stop(&shutdown_wdt);
451498
do_shutdown(halt);
499+
shutdbg("do_shutdown() returned to SM!");
500+
}
452501

502+
shutdbg("entering RUNNING (reload=%d, newlevel=%d)",
503+
sm.reload, sm.newlevel);
453504
sm.state = SM_RUNNING_STATE;
454505
break;
455506

src/sm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@
2424
#ifndef FINIT_SM_H_
2525
#define FINIT_SM_H_
2626

27+
/* Set to 1 to enable shutdown trace logging (independent of 'initctl debug') */
28+
#ifndef SHUTDOWN_DEBUG
29+
#define SHUTDOWN_DEBUG 1
30+
#endif
31+
2732
void sm_init (void);
2833
void sm_step (void);
2934

0 commit comments

Comments
 (0)