Skip to content

Commit f635e9e

Browse files
committed
Modified mailmanctl to respect runner backoff behavior
1 parent 203e582 commit f635e9e

1 file changed

Lines changed: 106 additions & 212 deletions

File tree

bin/mailmanctl

Lines changed: 106 additions & 212 deletions
Original file line numberDiff line numberDiff line change
@@ -552,228 +552,122 @@ def main():
552552
else:
553553
sys.exit(1)
554554
elif args.command == 'start':
555+
# Check if we're already running
556+
if os.path.exists(mm_cfg.PIDFILE):
557+
try:
558+
with open(mm_cfg.PIDFILE) as fp:
559+
pid = int(fp.read().strip())
560+
if check_pid(pid):
561+
print(C_('Mailman qrunner is already running (pid: %(pid)d)'), file=sys.stderr)
562+
sys.exit(1)
563+
except (ValueError, IOError):
564+
pass
565+
566+
# Try to acquire the lock
555567
try:
556-
# First, acquire the master mailmanctl lock
557568
lock = acquire_lock(args.stale_lock_cleanup)
558-
if not lock:
559-
return
560-
561-
# Daemon process startup according to Stevens, Advanced Programming in
562-
# the UNIX Environment, Chapter 13.
563-
pid = os.fork()
564-
if pid:
565-
# parent
566-
if not args.quiet:
567-
print(C_("Starting Mailman's master qrunner."))
568-
# Give up the lock "ownership". This just means the foreground
569-
# process won't close/unlock the lock when it finalizes this lock
570-
# instance. We'll let the master watcher subproc own the lock.
571-
lock._transfer_to(pid)
569+
except LockFile.TimeOutError:
570+
sys.exit(1)
571+
572+
# Start all runners
573+
kids = start_all_runners()
574+
if not kids:
575+
print(C_('No runners started'), file=sys.stderr)
576+
lock.unlock(unconditionally=1)
577+
sys.exit(1)
578+
579+
# Write our PID to the PID file
580+
try:
581+
with open(mm_cfg.PIDFILE, 'w') as fp:
582+
fp.write(str(os.getpid()))
583+
except IOError as e:
584+
print(C_('Failed to write PID file: %(error)s'), file=sys.stderr)
585+
lock.unlock(unconditionally=1)
586+
sys.exit(1)
587+
588+
# Now we're ready to simply do our wait/restart loop
589+
try:
590+
while True:
591+
try:
592+
pid, status = os.wait()
593+
except OSError as e:
594+
# No children? We're done
595+
if e.errno == errno.ECHILD:
596+
break
597+
# If the system call got interrupted, just restart it.
598+
elif e.errno != errno.EINTR:
599+
raise
600+
continue
601+
602+
killsig = exitstatus = None
603+
if os.WIFSIGNALED(status):
604+
killsig = os.WTERMSIG(status)
605+
if os.WIFEXITED(status):
606+
exitstatus = os.WEXITSTATUS(status)
607+
608+
restarting = ''
609+
if not args.no_restart:
610+
# Only restart if the runner exited with SIGINT (normal exit)
611+
# and not SIGTERM (error or forced stop)
612+
if exitstatus == signal.SIGINT:
613+
restarting = '[restarting]'
614+
615+
qrname, slice, count, restarts = kids[pid]
616+
del kids[pid]
572617

573-
# Wait briefly to ensure child process starts
574-
time.sleep(1)
618+
# Only log abnormal exits
619+
if killsig == signal.SIGTERM or \
620+
(exitstatus is not None and exitstatus != signal.SIGINT):
621+
syslog('qrunner', """\
622+
Master qrunner detected abnormal subprocess exit
623+
(pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
624+
pid, killsig, exitstatus, qrname,
625+
slice+1, count, restarting)
626+
627+
if restarting and check_global_circuit_breaker():
628+
syslog('error', 'Global circuit breaker triggered - stopping all runners')
629+
# Stop all processes and clean up
630+
stop_all_processes(kids, lock)
631+
# Exit the main loop
632+
break
575633

576-
# Verify the child process is running and PID file is correct
634+
if exitstatus != signal.SIGINT:
635+
restarts += 1
636+
if restarts > MAX_RESTARTS:
637+
syslog('qrunner', """\
638+
Qrunner %s reached maximum restart limit of %d, not restarting.""",
639+
qrname, MAX_RESTARTS)
640+
restarting = ''
641+
642+
# Now perhaps restart the process
643+
if restarting:
644+
newpid = start_runner(qrname, slice, count)
645+
kids[newpid] = (qrname, slice, count, restarts)
646+
647+
finally:
648+
# However we leave the main loop, make sure all of our children exit cleanly
649+
for pid in list(kids.keys()):
577650
try:
578-
os.kill(pid, 0) # Check if process exists
579-
580-
# Verify PID file exists and contains correct PID
581-
try:
582-
with open(mm_cfg.PIDFILE, 'r') as fp:
583-
content = fp.read().strip().split()
584-
if len(content) >= 2:
585-
child_pid = int(content[0])
586-
child_hostname = content[1]
587-
if child_pid != pid:
588-
print(C_('Error: PID file contains incorrect PID'), file=sys.stderr)
589-
return
590-
if child_hostname != socket.gethostname():
591-
print(C_('Error: PID file hostname mismatch'), file=sys.stderr)
592-
return
593-
else:
594-
print(C_('Error: Invalid PID file format'), file=sys.stderr)
595-
return
596-
597-
# Verify process is a Mailman process
598-
try:
599-
with open(f'/proc/{pid}/cmdline', 'r') as cmd_fp:
600-
cmdline = cmd_fp.read()
601-
if 'mailman' not in cmdline.lower():
602-
print(C_('Error: Process is not a Mailman process'), file=sys.stderr)
603-
return
604-
except (IOError, OSError) as e:
605-
print(C_('Warning: Could not verify process type: %s') % str(e), file=sys.stderr)
606-
607-
if not args.quiet:
608-
print(C_('Master qrunner started successfully (pid: %d)') % pid)
609-
syslog('qrunner', 'Master qrunner started successfully (pid: %d)', pid)
610-
611-
except (IOError, ValueError) as e:
612-
print(C_('Error reading PID file: %s') % str(e), file=sys.stderr)
613-
return
651+
os.kill(pid, signal.SIGTERM)
614652
except OSError as e:
615653
if e.errno == errno.ESRCH:
616-
print(C_('Error: Master process failed to start'), file=sys.stderr)
617-
return
618-
raise
619-
return
620-
621-
# child
622-
try:
623-
lock._take_possession()
624-
# First, save our pid in a file for "mailmanctl stop" rendezvous
625-
omask = os.umask(6)
626-
try:
627-
with open(mm_cfg.PIDFILE, 'w') as fp:
628-
print('%d %s' % (os.getpid(), socket.gethostname()), file=fp)
629-
finally:
630-
os.umask(omask)
631-
632-
# Create a new session and become the session leader
633-
os.setsid()
634-
635-
# Be sure to close any open std{in,out,err}
636-
devnull = os.open('/dev/null', 0)
637-
os.dup2(devnull, 0)
638-
os.dup2(devnull, 1)
639-
os.dup2(devnull, 2)
640-
641-
# Instead of cd'ing to root, cd to the Mailman installation home
642-
os.chdir(mm_cfg.PREFIX)
643-
# Set our file mode creation umask
644-
os.umask(0o07)
645-
646-
# Now start all the qrunners
647-
kids = start_all_runners()
648-
if not kids:
649-
syslog('error', 'No runners started successfully')
650-
sys.exit(1)
654+
syslog('qrunner', 'ESRCH on pid: %d', pid)
655+
del kids[pid]
651656

652-
# Set up a SIGALRM handler to refresh the lock once per day
653-
def sigalrm_handler(signum, frame, lock=lock):
654-
lock.refresh()
655-
signal.alarm(mm_cfg.days(1))
656-
signal.signal(signal.SIGALRM, sigalrm_handler)
657-
signal.alarm(mm_cfg.days(1))
658-
659-
# Set up a SIGHUP handler
660-
def sighup_handler(signum, frame, kids=kids):
661-
syslog.close()
662-
for pid in list(kids.keys()):
663-
os.kill(pid, signal.SIGHUP)
664-
syslog('qrunner',
665-
'Master watcher caught SIGHUP. Re-opening log files.')
666-
signal.signal(signal.SIGHUP, sighup_handler)
667-
668-
# Set up a SIGTERM handler
669-
def sigterm_handler(signum, frame, kids=kids):
670-
for pid in list(kids.keys()):
671-
try:
672-
os.kill(pid, signal.SIGTERM)
673-
except OSError as e:
674-
if e.errno != errno.ESRCH: raise
675-
syslog('qrunner', 'Master watcher caught SIGTERM. Exiting.')
676-
signal.signal(signal.SIGTERM, sigterm_handler)
677-
678-
# Set up a SIGINT handler
679-
def sigint_handler(signum, frame, kids=kids):
680-
for pid in list(kids.keys()):
681-
os.kill(pid, signal.SIGINT)
682-
syslog('qrunner', 'Master watcher caught SIGINT. Restarting.')
683-
signal.signal(signal.SIGINT, sigint_handler)
684-
685-
# Now we're ready to simply do our wait/restart loop
657+
# Wait for all the children to go away
658+
while True:
686659
try:
687-
while True:
688-
try:
689-
pid, status = os.wait()
690-
except OSError as e:
691-
# No children? We're done
692-
if e.errno == errno.ECHILD:
693-
break
694-
# If the system call got interrupted, just restart it.
695-
elif e.errno != errno.EINTR:
696-
raise
697-
continue
698-
699-
killsig = exitstatus = None
700-
if os.WIFSIGNALED(status):
701-
killsig = os.WTERMSIG(status)
702-
if os.WIFEXITED(status):
703-
exitstatus = os.WEXITSTATUS(status)
704-
705-
restarting = ''
706-
if not args.no_restart:
707-
if (exitstatus is None and killsig != signal.SIGTERM) or \
708-
(killsig is None and exitstatus != signal.SIGTERM):
709-
restarting = '[restarting]'
710-
711-
qrname, slice, count, restarts = kids[pid]
712-
del kids[pid]
713-
714-
# Only log abnormal exits
715-
if killsig == signal.SIGTERM or \
716-
(exitstatus is not None and exitstatus != 120 and exitstatus != signal.SIGINT):
717-
syslog('qrunner', """\
718-
Master qrunner detected abnormal subprocess exit
719-
(pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
720-
pid, killsig, exitstatus, qrname,
721-
slice+1, count, restarting)
722-
723-
# Check global circuit breaker before restarting
724-
if restarting and check_global_circuit_breaker():
725-
syslog('error', 'Global circuit breaker triggered - stopping all runners')
726-
# Stop all processes and clean up
727-
stop_all_processes(kids, lock)
728-
# Exit the main loop
729-
break
730-
731-
if exitstatus != signal.SIGINT:
732-
restarts += 1
733-
if restarts > MAX_RESTARTS:
734-
syslog('qrunner', """\
735-
Qrunner %s reached maximum restart limit of %d, not restarting.""",
736-
qrname, MAX_RESTARTS)
737-
restarting = ''
738-
739-
# Now perhaps restart the process
740-
if restarting:
741-
newpid = start_runner(qrname, slice, count)
742-
kids[newpid] = (qrname, slice, count, restarts)
743-
744-
finally:
745-
# Should we leave the main loop for any reason, we want to be sure
746-
# all of our children are exited cleanly
747-
for pid in list(kids.keys()):
748-
try:
749-
os.kill(pid, signal.SIGTERM)
750-
except OSError as e:
751-
if e.errno == errno.ESRCH:
752-
syslog('qrunner', 'ESRCH on pid: %d', pid)
753-
del kids[pid]
754-
755-
# Wait for all the children to go away
756-
while True:
757-
try:
758-
pid, status = os.wait()
759-
except OSError as e:
760-
if e.errno == errno.ECHILD:
761-
break
762-
elif e.errno != errno.EINTR:
763-
raise
764-
continue
765-
766-
# Finally, give up the lock
767-
lock.unlock(unconditionally=1)
768-
os._exit(0)
769-
except Exception as e:
770-
syslog('error', 'Child process error during startup: %s', str(e))
771-
os._exit(1)
660+
pid, status = os.wait()
661+
except OSError as e:
662+
if e.errno == errno.ECHILD:
663+
break
664+
elif e.errno != errno.EINTR:
665+
raise
666+
continue
772667

773-
except Exception as e:
774-
import traceback
775-
syslog('error', 'Error during startup: %s\nTraceback:\n%s', str(e), traceback.format_exc())
776-
sys.exit(1)
668+
# Finally, give up the lock
669+
lock.unlock(unconditionally=1)
670+
os._exit(0)
777671
elif args.command == 'stop':
778672
kill_watcher(signal.SIGTERM)
779673
try:

0 commit comments

Comments
 (0)