@@ -107,6 +107,37 @@ func runWatcher(ctx context.Context, cfg Config) error {
107107 var crashes []time.Time
108108 backoffIdx := 0
109109
110+ recordCrash := func (now time.Time ) (int , bool ) {
111+ crashes = append (crashes , now )
112+ cutoff := now .Add (- cfg .MaxCrashWindow ) // crashes older than now - cfg.MaxCrashWindow are considered expired
113+ i := 0
114+
115+ for i < len (crashes ) && crashes [i ].Before (cutoff ) {
116+ i ++
117+ }
118+ crashes = crashes [i :] // sliding-window
119+
120+ return len (crashes ), len (crashes ) >= cfg .MaxCrashes
121+ }
122+
123+ // sleepBackoff waits for the next backoff step. It returns false if
124+ // shutdown was requested (via ctx or sigCh) while waiting.
125+ sleepBackoff := func (crashCount int ) bool {
126+ wait := cfg .BackoffSchedule [min (backoffIdx , len (cfg .BackoffSchedule )- 1 )]
127+ backoffIdx ++
128+
129+ fmt .Fprintf (os .Stderr , "supervisor: backing off %s before restart (crash %d of max %d in %s window)\n " , wait , crashCount , cfg .MaxCrashes , cfg .MaxCrashWindow )
130+
131+ select {
132+ case <- time .After (wait ): // wait next backoff
133+ return true
134+ case <- ctx .Done (): // shutdown requested
135+ return false
136+ case <- sigCh : // shutdown requested
137+ return false
138+ }
139+ }
140+
110141 for {
111142 cmd := exec .Command (cfg .RunedBinary , cfg .RunedArgs ... )
112143 cmd .Stdin = nil
@@ -115,8 +146,20 @@ func runWatcher(ctx context.Context, cfg Config) error {
115146
116147 fmt .Fprintf (os .Stderr , "supervisor: starting %s %v\n " , cfg .RunedBinary , cfg .RunedArgs )
117148 started := time .Now ()
149+
150+ // A failed start counts against the shared crash budget and is retried after backoff, instead of ending supervision immediately.
118151 if err := cmd .Start (); err != nil {
119- return fmt .Errorf ("supervisor: start %s: %w" , cfg .RunedBinary , err )
152+ fmt .Fprintf (os .Stderr , "supervisor: start %s: %v\n " , cfg .RunedBinary , err )
153+
154+ count , giveUp := recordCrash (time .Now ())
155+ if giveUp {
156+ return fmt .Errorf ("supervisor: start %s: %w (%d failures within %s - giving up)" , cfg .RunedBinary , err , count , cfg .MaxCrashWindow )
157+ }
158+ if ! sleepBackoff (count ) {
159+ return nil
160+ }
161+
162+ continue
120163 }
121164
122165 done := make (chan error , 1 )
@@ -146,27 +189,11 @@ func runWatcher(ctx context.Context, cfg Config) error {
146189 backoffIdx = 0
147190 }
148191
149- crashes = append (crashes , now )
150- cutoff := now .Add (- cfg .MaxCrashWindow ) // crashes older than now - cfg.MaxCrashWindow are considered expired
151- i := 0
152- for i < len (crashes ) && crashes [i ].Before (cutoff ) {
153- i ++
192+ count , giveUp := recordCrash (now )
193+ if giveUp {
194+ return fmt .Errorf ("supervisor: %d crashes within %s - giving up" , count , cfg .MaxCrashWindow )
154195 }
155- crashes = crashes [i :] // sliding-window
156-
157- if len (crashes ) >= cfg .MaxCrashes {
158- return fmt .Errorf ("supervisor: %d crashes within %s - giving up" , len (crashes ), cfg .MaxCrashWindow )
159- }
160-
161- wait := cfg .BackoffSchedule [min (backoffIdx , len (cfg .BackoffSchedule )- 1 )]
162- backoffIdx ++
163- fmt .Fprintf (os .Stderr , "supervisor: backing off %s before restart (crash %d of max %d in %s window)\n " , wait , len (crashes ), cfg .MaxCrashes , cfg .MaxCrashWindow )
164- select {
165- case <- time .After (wait ):
166- continue
167- case <- ctx .Done ():
168- return nil
169- case <- sigCh :
196+ if ! sleepBackoff (count ) {
170197 return nil
171198 }
172199 }
@@ -184,8 +211,16 @@ func shutdownChild(cmd *exec.Cmd, grace time.Duration, done <-chan error) error
184211 return nil
185212 case <- time .After (grace ):
186213 fmt .Fprintf (os .Stderr , "supervisor: child didn't exit within %s, sending SIGKILL\n " , grace )
214+
187215 _ = cmd .Process .Kill ()
188- <- done
216+
217+ // Bound the wait after SIGKILL as well, so a child that never exits cannot block shutdown indefinitely.
218+ select {
219+ case <- done :
220+ case <- time .After (grace ):
221+ fmt .Fprintf (os .Stderr , "supervisor: child unresponsive to SIGKILL after %s, abandoning\n " , grace )
222+ }
223+
189224 return nil
190225 }
191226}
0 commit comments