@@ -13,32 +13,51 @@ import (
1313 "os/exec"
1414 "path/filepath"
1515 "sync"
16+ "time"
1617
1718 "github.com/minicodemonkey/chief/embed"
1819 "github.com/minicodemonkey/chief/internal/prd"
1920)
2021
// RetryConfig describes how the loop retries an iteration automatically after
// Claude crashes.
type RetryConfig struct {
	// MaxRetries is the maximum number of retry attempts (default: 3).
	MaxRetries int
	// RetryDelays holds the delays between retries (default: 0s, 5s, 15s).
	RetryDelays []time.Duration
	// Enabled reports whether retry is enabled (default: true).
	Enabled bool
}
28+
29+ // DefaultRetryConfig returns the default retry configuration.
30+ func DefaultRetryConfig () RetryConfig {
31+ return RetryConfig {
32+ MaxRetries : 3 ,
33+ RetryDelays : []time.Duration {0 , 5 * time .Second , 15 * time .Second },
34+ Enabled : true ,
35+ }
36+ }
37+
2138// Loop manages the core agent loop that invokes Claude repeatedly until all stories are complete.
2239type Loop struct {
23- prdPath string
24- prompt string
25- maxIter int
26- iteration int
27- events chan Event
28- claudeCmd * exec.Cmd
29- logFile * os.File
30- mu sync.Mutex
31- stopped bool
32- paused bool
40+ prdPath string
41+ prompt string
42+ maxIter int
43+ iteration int
44+ events chan Event
45+ claudeCmd * exec.Cmd
46+ logFile * os.File
47+ mu sync.Mutex
48+ stopped bool
49+ paused bool
50+ retryConfig RetryConfig
3351}
3452
3553// NewLoop creates a new Loop instance.
3654func NewLoop (prdPath , prompt string , maxIter int ) * Loop {
3755 return & Loop {
38- prdPath : prdPath ,
39- prompt : prompt ,
40- maxIter : maxIter ,
41- events : make (chan Event , 100 ),
56+ prdPath : prdPath ,
57+ prompt : prompt ,
58+ maxIter : maxIter ,
59+ events : make (chan Event , 100 ),
60+ retryConfig : DefaultRetryConfig (),
4261 }
4362}
4463
@@ -102,8 +121,8 @@ func (l *Loop) Run(ctx context.Context) error {
102121 Iteration : currentIter ,
103122 }
104123
105- // Run a single iteration
106- if err := l .runIteration (ctx ); err != nil {
124+ // Run a single iteration with retry logic
125+ if err := l .runIterationWithRetry (ctx ); err != nil {
107126 l .events <- Event {
108127 Type : EventError ,
109128 Err : err ,
@@ -146,6 +165,82 @@ func (l *Loop) Run(ctx context.Context) error {
146165 }
147166}
148167
168+ // runIterationWithRetry wraps runIteration with retry logic for crash recovery.
169+ func (l * Loop ) runIterationWithRetry (ctx context.Context ) error {
170+ l .mu .Lock ()
171+ config := l .retryConfig
172+ l .mu .Unlock ()
173+
174+ var lastErr error
175+ for attempt := 0 ; attempt <= config .MaxRetries ; attempt ++ {
176+ // Check if retry is enabled (except for first attempt)
177+ if attempt > 0 {
178+ if ! config .Enabled {
179+ return lastErr
180+ }
181+
182+ // Get delay for this retry
183+ delayIdx := attempt - 1
184+ if delayIdx >= len (config .RetryDelays ) {
185+ delayIdx = len (config .RetryDelays ) - 1
186+ }
187+ delay := config .RetryDelays [delayIdx ]
188+
189+ // Emit retry event
190+ l .mu .Lock ()
191+ iter := l .iteration
192+ l .mu .Unlock ()
193+ l .events <- Event {
194+ Type : EventRetrying ,
195+ Iteration : iter ,
196+ RetryCount : attempt ,
197+ RetryMax : config .MaxRetries ,
198+ Text : fmt .Sprintf ("Claude crashed, retrying (%d/%d)..." , attempt , config .MaxRetries ),
199+ }
200+
201+ // Wait before retry
202+ if delay > 0 {
203+ select {
204+ case <- time .After (delay ):
205+ case <- ctx .Done ():
206+ return ctx .Err ()
207+ }
208+ }
209+ }
210+
211+ // Check if stopped during delay
212+ l .mu .Lock ()
213+ if l .stopped {
214+ l .mu .Unlock ()
215+ return nil
216+ }
217+ l .mu .Unlock ()
218+
219+ // Run the iteration
220+ err := l .runIteration (ctx )
221+ if err == nil {
222+ return nil // Success
223+ }
224+
225+ // Check if this is a context cancellation (don't retry)
226+ if ctx .Err () != nil {
227+ return ctx .Err ()
228+ }
229+
230+ // Check if stopped intentionally
231+ l .mu .Lock ()
232+ stopped := l .stopped
233+ l .mu .Unlock ()
234+ if stopped {
235+ return nil
236+ }
237+
238+ lastErr = err
239+ }
240+
241+ return fmt .Errorf ("max retries (%d) exceeded: %w" , config .MaxRetries , lastErr )
242+ }
243+
149244// runIteration spawns Claude and processes its output.
150245func (l * Loop ) runIteration (ctx context.Context ) error {
151246 // Build Claude command with required flags
@@ -302,3 +397,31 @@ func (l *Loop) IsRunning() bool {
302397 defer l .mu .Unlock ()
303398 return l .claudeCmd != nil && l .claudeCmd .Process != nil
304399}
400+
401+ // SetMaxIterations updates the maximum iterations limit.
402+ func (l * Loop ) SetMaxIterations (maxIter int ) {
403+ l .mu .Lock ()
404+ defer l .mu .Unlock ()
405+ l .maxIter = maxIter
406+ }
407+
408+ // MaxIterations returns the current max iterations limit.
409+ func (l * Loop ) MaxIterations () int {
410+ l .mu .Lock ()
411+ defer l .mu .Unlock ()
412+ return l .maxIter
413+ }
414+
415+ // SetRetryConfig updates the retry configuration.
416+ func (l * Loop ) SetRetryConfig (config RetryConfig ) {
417+ l .mu .Lock ()
418+ defer l .mu .Unlock ()
419+ l .retryConfig = config
420+ }
421+
422+ // DisableRetry disables automatic retry on crash.
423+ func (l * Loop ) DisableRetry () {
424+ l .mu .Lock ()
425+ defer l .mu .Unlock ()
426+ l .retryConfig .Enabled = false
427+ }
0 commit comments