@@ -552,228 +552,122 @@ def main():
552552 else:
553553 sys.exit(1)
554554 elif args.command == 'start':
555+ # Check if we're already running
556+ if os.path.exists(mm_cfg.PIDFILE):
557+ try:
558+ with open(mm_cfg.PIDFILE) as fp:
559+ pid = int(fp.read().strip())
560+ if check_pid(pid):
561+ print(C_('Mailman qrunner is already running (pid: %(pid)d)'), file=sys.stderr)
562+ sys.exit(1)
563+ except (ValueError, IOError):
564+ pass
565+
566+ # Try to acquire the lock
555567 try:
556- # First, acquire the master mailmanctl lock
557568 lock = acquire_lock(args.stale_lock_cleanup)
558- if not lock:
559- return
560-
561- # Daemon process startup according to Stevens, Advanced Programming in
562- # the UNIX Environment, Chapter 13.
563- pid = os.fork()
564- if pid:
565- # parent
566- if not args.quiet:
567- print(C_("Starting Mailman's master qrunner."))
568- # Give up the lock "ownership". This just means the foreground
569- # process won't close/unlock the lock when it finalizes this lock
570- # instance. We'll let the master watcher subproc own the lock.
571- lock._transfer_to(pid)
569+ except LockFile.TimeOutError:
570+ sys.exit(1)
571+
572+ # Start all runners
573+ kids = start_all_runners()
574+ if not kids:
575+ print(C_('No runners started'), file=sys.stderr)
576+ lock.unlock(unconditionally=1)
577+ sys.exit(1)
578+
579+ # Write our PID to the PID file
580+ try:
581+ with open(mm_cfg.PIDFILE, 'w') as fp:
582+ fp.write(str(os.getpid()))
583+ except IOError as e:
584+ print(C_('Failed to write PID file: %(error)s'), file=sys.stderr)
585+ lock.unlock(unconditionally=1)
586+ sys.exit(1)
587+
588+ # Now we're ready to simply do our wait/restart loop
589+ try:
590+ while True:
591+ try:
592+ pid, status = os.wait()
593+ except OSError as e:
594+ # No children? We're done
595+ if e.errno == errno.ECHILD:
596+ break
597+ # If the system call got interrupted, just restart it.
598+ elif e.errno != errno.EINTR:
599+ raise
600+ continue
601+
602+ killsig = exitstatus = None
603+ if os.WIFSIGNALED(status):
604+ killsig = os.WTERMSIG(status)
605+ if os.WIFEXITED(status):
606+ exitstatus = os.WEXITSTATUS(status)
607+
608+ restarting = ''
609+ if not args.no_restart:
610+ # Only restart if the runner exited with SIGINT (normal exit)
611+ # and not SIGTERM (error or forced stop)
612+ if exitstatus == signal.SIGINT:
613+ restarting = '[restarting]'
614+
615+ qrname, slice, count, restarts = kids[pid]
616+ del kids[pid]
572617
573- # Wait briefly to ensure child process starts
574- time.sleep(1)
618+ # Only log abnormal exits
619+ if killsig == signal.SIGTERM or \
620+ (exitstatus is not None and exitstatus != signal.SIGINT):
621+ syslog('qrunner', """\
622+ Master qrunner detected abnormal subprocess exit
623+ (pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
624+ pid, killsig, exitstatus, qrname,
625+ slice+1, count, restarting)
626+
627+ if restarting and check_global_circuit_breaker():
628+ syslog('error', 'Global circuit breaker triggered - stopping all runners')
629+ # Stop all processes and clean up
630+ stop_all_processes(kids, lock)
631+ # Exit the main loop
632+ break
575633
576- # Verify the child process is running and PID file is correct
634+ if exitstatus != signal.SIGINT:
635+ restarts += 1
636+ if restarts > MAX_RESTARTS:
637+ syslog('qrunner', """\
638+ Qrunner %s reached maximum restart limit of %d, not restarting.""",
639+ qrname, MAX_RESTARTS)
640+ restarting = ''
641+
642+ # Now perhaps restart the process
643+ if restarting:
644+ newpid = start_runner(qrname, slice, count)
645+ kids[newpid] = (qrname, slice, count, restarts)
646+
647+ finally:
648+ # all of our children are exited cleanly
649+ for pid in list(kids.keys()):
577650 try:
578- os.kill(pid, 0) # Check if process exists
579-
580- # Verify PID file exists and contains correct PID
581- try:
582- with open(mm_cfg.PIDFILE, 'r') as fp:
583- content = fp.read().strip().split()
584- if len(content) >= 2:
585- child_pid = int(content[0])
586- child_hostname = content[1]
587- if child_pid != pid:
588- print(C_('Error: PID file contains incorrect PID'), file=sys.stderr)
589- return
590- if child_hostname != socket.gethostname():
591- print(C_('Error: PID file hostname mismatch'), file=sys.stderr)
592- return
593- else:
594- print(C_('Error: Invalid PID file format'), file=sys.stderr)
595- return
596-
597- # Verify process is a Mailman process
598- try:
599- with open(f'/proc/{pid}/cmdline', 'r') as cmd_fp:
600- cmdline = cmd_fp.read()
601- if 'mailman' not in cmdline.lower():
602- print(C_('Error: Process is not a Mailman process'), file=sys.stderr)
603- return
604- except (IOError, OSError) as e:
605- print(C_('Warning: Could not verify process type: %s') % str(e), file=sys.stderr)
606-
607- if not args.quiet:
608- print(C_('Master qrunner started successfully (pid: %d)') % pid)
609- syslog('qrunner', 'Master qrunner started successfully (pid: %d)', pid)
610-
611- except (IOError, ValueError) as e:
612- print(C_('Error reading PID file: %s') % str(e), file=sys.stderr)
613- return
651+ os.kill(pid, signal.SIGTERM)
614652 except OSError as e:
615653 if e.errno == errno.ESRCH:
616- print(C_('Error: Master process failed to start'), file=sys.stderr)
617- return
618- raise
619- return
620-
621- # child
622- try:
623- lock._take_possession()
624- # First, save our pid in a file for "mailmanctl stop" rendezvous
625- omask = os.umask(6)
626- try:
627- with open(mm_cfg.PIDFILE, 'w') as fp:
628- print('%d %s' % (os.getpid(), socket.gethostname()), file=fp)
629- finally:
630- os.umask(omask)
631-
632- # Create a new session and become the session leader
633- os.setsid()
634-
635- # Be sure to close any open std{in,out,err}
636- devnull = os.open('/dev/null', 0)
637- os.dup2(devnull, 0)
638- os.dup2(devnull, 1)
639- os.dup2(devnull, 2)
640-
641- # Instead of cd'ing to root, cd to the Mailman installation home
642- os.chdir(mm_cfg.PREFIX)
643- # Set our file mode creation umask
644- os.umask(0o07)
645-
646- # Now start all the qrunners
647- kids = start_all_runners()
648- if not kids:
649- syslog('error', 'No runners started successfully')
650- sys.exit(1)
654+ syslog('qrunner', 'ESRCH on pid: %d', pid)
655+ del kids[pid]
651656
652- # Set up a SIGALRM handler to refresh the lock once per day
653- def sigalrm_handler(signum, frame, lock=lock):
654- lock.refresh()
655- signal.alarm(mm_cfg.days(1))
656- signal.signal(signal.SIGALRM, sigalrm_handler)
657- signal.alarm(mm_cfg.days(1))
658-
659- # Set up a SIGHUP handler
660- def sighup_handler(signum, frame, kids=kids):
661- syslog.close()
662- for pid in list(kids.keys()):
663- os.kill(pid, signal.SIGHUP)
664- syslog('qrunner',
665- 'Master watcher caught SIGHUP. Re-opening log files.')
666- signal.signal(signal.SIGHUP, sighup_handler)
667-
668- # Set up a SIGTERM handler
669- def sigterm_handler(signum, frame, kids=kids):
670- for pid in list(kids.keys()):
671- try:
672- os.kill(pid, signal.SIGTERM)
673- except OSError as e:
674- if e.errno != errno.ESRCH: raise
675- syslog('qrunner', 'Master watcher caught SIGTERM. Exiting.')
676- signal.signal(signal.SIGTERM, sigterm_handler)
677-
678- # Set up a SIGINT handler
679- def sigint_handler(signum, frame, kids=kids):
680- for pid in list(kids.keys()):
681- os.kill(pid, signal.SIGINT)
682- syslog('qrunner', 'Master watcher caught SIGINT. Restarting.')
683- signal.signal(signal.SIGINT, sigint_handler)
684-
685- # Now we're ready to simply do our wait/restart loop
657+ # Wait for all the children to go away
658+ while True:
686659 try:
687- while True:
688- try:
689- pid, status = os.wait()
690- except OSError as e:
691- # No children? We're done
692- if e.errno == errno.ECHILD:
693- break
694- # If the system call got interrupted, just restart it.
695- elif e.errno != errno.EINTR:
696- raise
697- continue
698-
699- killsig = exitstatus = None
700- if os.WIFSIGNALED(status):
701- killsig = os.WTERMSIG(status)
702- if os.WIFEXITED(status):
703- exitstatus = os.WEXITSTATUS(status)
704-
705- restarting = ''
706- if not args.no_restart:
707- if (exitstatus is None and killsig != signal.SIGTERM) or \
708- (killsig is None and exitstatus != signal.SIGTERM):
709- restarting = '[restarting]'
710-
711- qrname, slice, count, restarts = kids[pid]
712- del kids[pid]
713-
714- # Only log abnormal exits
715- if killsig == signal.SIGTERM or \
716- (exitstatus is not None and exitstatus != 120 and exitstatus != signal.SIGINT):
717- syslog('qrunner', """\
718- Master qrunner detected abnormal subprocess exit
719- (pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
720- pid, killsig, exitstatus, qrname,
721- slice+1, count, restarting)
722-
723- # Check global circuit breaker before restarting
724- if restarting and check_global_circuit_breaker():
725- syslog('error', 'Global circuit breaker triggered - stopping all runners')
726- # Stop all processes and clean up
727- stop_all_processes(kids, lock)
728- # Exit the main loop
729- break
730-
731- if exitstatus != signal.SIGINT:
732- restarts += 1
733- if restarts > MAX_RESTARTS:
734- syslog('qrunner', """\
735- Qrunner %s reached maximum restart limit of %d, not restarting.""",
736- qrname, MAX_RESTARTS)
737- restarting = ''
738-
739- # Now perhaps restart the process
740- if restarting:
741- newpid = start_runner(qrname, slice, count)
742- kids[newpid] = (qrname, slice, count, restarts)
743-
744- finally:
745- # Should we leave the main loop for any reason, we want to be sure
746- # all of our children are exited cleanly
747- for pid in list(kids.keys()):
748- try:
749- os.kill(pid, signal.SIGTERM)
750- except OSError as e:
751- if e.errno == errno.ESRCH:
752- syslog('qrunner', 'ESRCH on pid: %d', pid)
753- del kids[pid]
754-
755- # Wait for all the children to go away
756- while True:
757- try:
758- pid, status = os.wait()
759- except OSError as e:
760- if e.errno == errno.ECHILD:
761- break
762- elif e.errno != errno.EINTR:
763- raise
764- continue
765-
766- # Finally, give up the lock
767- lock.unlock(unconditionally=1)
768- os._exit(0)
769- except Exception as e:
770- syslog('error', 'Child process error during startup: %s', str(e))
771- os._exit(1)
660+ pid, status = os.wait()
661+ except OSError as e:
662+ if e.errno == errno.ECHILD:
663+ break
664+ elif e.errno != errno.EINTR:
665+ raise
666+ continue
772667
773- except Exception as e:
774- import traceback
775- syslog('error', 'Error during startup: %s\nTraceback:\n%s', str(e), traceback.format_exc())
776- sys.exit(1)
668+ # Finally, give up the lock
669+ lock.unlock(unconditionally=1)
670+ os._exit(0)
777671 elif args.command == 'stop':
778672 kill_watcher(signal.SIGTERM)
779673 try:
0 commit comments