@@ -104,6 +104,42 @@ function parseMs(v, fallback) {
104104 return fallback ;
105105}
106106
/**
 * Interpret an environment-variable style value as a boolean.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 * Recognised false spellings: 'false', '0', 'off', 'no'.
 * Recognised true spellings:  'true', '1', 'on', 'yes'.
 *
 * @param {*} v - Raw value (typically a string from process.env, or undefined).
 * @param {*} fallback - Returned when v is null/undefined, blank, or unrecognised.
 * @returns {*} true, false, or the fallback unchanged.
 */
function parseBoolEnv(v, fallback) {
  if (v === null || v === undefined) {
    return fallback;
  }

  const normalized = String(v).toLowerCase().trim();

  switch (normalized) {
    case 'false':
    case '0':
    case 'off':
    case 'no':
      return false;
    case 'true':
    case '1':
    case 'on':
    case 'yes':
      return true;
    default:
      // Covers the empty string as well as any unknown spelling.
      return fallback;
  }
}
115+
/**
 * Error describing a cycle that exceeded its hard timeout.
 *
 * Instances carry `code === 'CYCLE_TIMEOUT'` plus the timeout, phase and
 * cycle number that were passed to the constructor.
 */
class CycleTimeoutError extends Error {
  /**
   * @param {number} timeoutMs - Timeout that was exceeded, in milliseconds.
   * @param {string} phase - Label of the phase that was running.
   * @param {number} cycleNum - Number of the cycle that timed out.
   */
  constructor(timeoutMs, phase, cycleNum) {
    // Built in two steps, left to right, so the text matches the one-line form.
    const elapsedPart = 'Cycle hard-timeout exceeded after ' + timeoutMs + 'ms ';
    const contextPart = '(cycle=' + cycleNum + ', phase=' + phase + ')';
    super(elapsedPart + contextPart);

    Object.assign(this, {
      name: 'CycleTimeoutError',
      code: 'CYCLE_TIMEOUT',
      timeoutMs: timeoutMs,
      phase: phase,
      cycleNum: cycleNum,
    });
  }
}
126+
/**
 * Atomic write of the cycle_progress.json file.
 *
 * The payload is a shallow copy of `fields` with `updated_at` set to
 * Date.now() (any caller-supplied `updated_at` is overwritten). It is written
 * to a sibling temp file `<progressPath>.tmp.<pid>` and then renamed over
 * `progressPath`, so a reader sees either the old or the new file.
 *
 * Context from the original author: the wrapper polls this file every 60s; if
 * updated_at goes stale beyond EVOLVE_INNER_STUCK_TIMEOUT_SEC the wrapper
 * treats the inner core as zombie and SIGKILLs it. See Issue #19 (the 22-day
 * stuck-cycle incident) and the cross-repo timeout plan.
 *
 * This is deliberately best-effort: failures are reported through the return
 * value and never thrown.
 *
 * @param {string} progressPath - Destination path of the progress file.
 * @param {Object} fields - Properties to record alongside `updated_at`.
 * @returns {boolean} true when the file was written and renamed, else false.
 */
function writeCycleProgressAtomic(progressPath, fields) {
  let tmp = null;
  try {
    const data = Object.assign({}, fields, { updated_at: Date.now() });
    tmp = progressPath + '.tmp.' + process.pid;
    fs.writeFileSync(tmp, JSON.stringify(data, null, 2) + '\n', 'utf8');
    fs.renameSync(tmp, progressPath);
    return true;
  } catch (e) {
    // Do not leave an orphaned temp file behind when the write or the rename
    // fails; the cleanup itself is best-effort as well.
    if (tmp !== null) {
      try {
        fs.unlinkSync(tmp);
      } catch (cleanupErr) {
        // Temp file was never created or is already gone.
      }
    }
    return false;
  }
}
142+
107143function getLastSignals ( statePath ) {
108144 try {
109145 const st = readJsonSafe ( statePath ) ;
@@ -255,6 +291,7 @@ async function main() {
255291
256292 const { getEvolutionDir, getEvolverLogPath } = require ( './src/gep/paths' ) ;
257293 const solidifyStatePath = path . join ( getEvolutionDir ( ) , 'evolution_solidify_state.json' ) ;
294+ const cycleProgressPath = path . join ( getEvolutionDir ( ) , 'cycle_progress.json' ) ;
258295
259296 const minSleepMs = parseMs ( process . env . EVOLVER_MIN_SLEEP_MS , 2000 ) ;
260297 const maxSleepMs = parseMs ( process . env . EVOLVER_MAX_SLEEP_MS , 300000 ) ;
@@ -270,6 +307,15 @@ async function main() {
270307 const maxRssMb = parseMs ( process . env . EVOLVER_MAX_RSS_MB , 500 ) || 500 ;
271308 const suicideEnabled = String ( process . env . EVOLVER_SUICIDE || '' ) . toLowerCase ( ) !== 'false' ;
272309
310+ // Issue #19: hard timeout around evolve.run() to break out of zombie
311+ // cycles (e.g. unclosed socket / stuck LLM call). On timeout we throw
312+ // CycleTimeoutError, log diagnostic stderr, and force suicide-respawn
313+ // so the wrapper sees a fresh PID + cycle. Also write cycle_progress
314+ // every progressUpdateMs so the wrapper has a true heartbeat to poll.
315+ const cycleTimeoutEnabled = parseBoolEnv ( process . env . EVOLVER_CYCLE_TIMEOUT_ENABLED , true ) ;
316+ const cycleTimeoutMs = parseMs ( process . env . EVOLVER_CYCLE_TIMEOUT_MS , 2700000 ) ; // 45 min default
317+ const progressUpdateMs = parseMs ( process . env . EVOLVER_PROGRESS_UPDATE_MS , 60000 ) ; // 1 min default
318+
273319 // Start hub heartbeat (keeps node alive independently of evolution cycles)
274320 try {
275321 if ( process . env . EVOMAP_PROXY === '1' || process . env . A2A_TRANSPORT === 'mailbox' ) {
@@ -388,8 +434,46 @@ async function main() {
388434
389435 const t0 = Date . now ( ) ;
390436 let ok = false ;
437+ // Issue #19: write progress at cycle start, refresh it every
438+ // progressUpdateMs (default 60s) while evolve.run() is active, and
439+ // wrap evolve.run() with Promise.race(timeout) so a hung internal
440+ // call cannot freeze the daemon for days.
441+ writeCycleProgressAtomic ( cycleProgressPath , {
442+ pid : process . pid ,
443+ outer_cycle : cycleCount ,
444+ inner_cycle : cycleCount ,
445+ started_at : t0 ,
446+ phase : 'evolve.run' ,
447+ } ) ;
448+ let progressTicker = null ;
449+ if ( progressUpdateMs > 0 ) {
450+ progressTicker = setInterval ( function ( ) {
451+ writeCycleProgressAtomic ( cycleProgressPath , {
452+ pid : process . pid ,
453+ outer_cycle : cycleCount ,
454+ inner_cycle : cycleCount ,
455+ started_at : t0 ,
456+ phase : 'evolve.run' ,
457+ } ) ;
458+ } , progressUpdateMs ) ;
459+ if ( typeof progressTicker . unref === 'function' ) progressTicker . unref ( ) ;
460+ }
461+ let cycleTimeoutHandle = null ;
462+ let cycleTimedOut = false ;
391463 try {
392- await evolve . run ( ) ;
464+ const evolvePromise = evolve . run ( ) ;
465+ if ( cycleTimeoutEnabled && cycleTimeoutMs > 0 ) {
466+ const timeoutPromise = new Promise ( function ( _ , reject ) {
467+ cycleTimeoutHandle = setTimeout ( function ( ) {
468+ cycleTimedOut = true ;
469+ reject ( new CycleTimeoutError ( cycleTimeoutMs , 'evolve.run' , cycleCount ) ) ;
470+ } , cycleTimeoutMs ) ;
471+ if ( cycleTimeoutHandle && typeof cycleTimeoutHandle . unref === 'function' ) cycleTimeoutHandle . unref ( ) ;
472+ } ) ;
473+ await Promise . race ( [ evolvePromise , timeoutPromise ] ) ;
474+ } else {
475+ await evolvePromise ;
476+ }
393477 ok = true ;
394478
395479 if ( String ( process . env . EVOLVE_BRIDGE || '' ) . toLowerCase ( ) === 'false' ) {
@@ -403,7 +487,37 @@ async function main() {
403487 }
404488 } catch ( error ) {
405489 const msg = error && error . message ? String ( error . message ) : String ( error ) ;
490+ if ( error && error . code === 'CYCLE_TIMEOUT' ) {
491+ console . error ( '[Daemon] ' + msg ) ;
492+ if ( progressTicker ) { clearInterval ( progressTicker ) ; progressTicker = null ; }
493+ if ( cycleTimeoutHandle ) { clearTimeout ( cycleTimeoutHandle ) ; cycleTimeoutHandle = null ; }
494+ writeCycleProgressAtomic ( cycleProgressPath , {
495+ pid : process . pid ,
496+ outer_cycle : cycleCount ,
497+ inner_cycle : cycleCount ,
498+ started_at : t0 ,
499+ phase : 'cycle_timeout_respawn' ,
500+ } ) ;
501+ try {
502+ const logFd = fs . openSync ( getEvolverLogPath ( ) , 'a' ) ;
503+ const spawnOpts = {
504+ detached : true ,
505+ stdio : [ 'ignore' , logFd , logFd ] ,
506+ env : process . env ,
507+ windowsHide : true ,
508+ } ;
509+ const child = spawn ( process . execPath , [ __filename , ...args ] , spawnOpts ) ;
510+ child . unref ( ) ;
511+ } catch ( spawnErr ) {
512+ console . error ( '[Daemon] Force-restart spawn after cycle timeout failed: ' + ( spawnErr && spawnErr . message || spawnErr ) ) ;
513+ }
514+ releaseLock ( ) ;
515+ process . exit ( 1 ) ;
516+ }
406517 console . error ( `Evolution cycle failed: ${ msg } ` ) ;
518+ } finally {
519+ if ( progressTicker ) { clearInterval ( progressTicker ) ; progressTicker = null ; }
520+ if ( cycleTimeoutHandle ) { clearTimeout ( cycleTimeoutHandle ) ; cycleTimeoutHandle = null ; }
407521 }
408522 const dt = Date . now ( ) - t0 ;
409523
@@ -496,6 +610,13 @@ async function main() {
496610 const signals = getLastSignals ( solidifyStatePath ) . join ( ',' ) ;
497611 console . log ( `[Verbose] cycle=${ cycleCount } ok=${ ok } dt=${ dt } ms sleep=${ totalSleepMs } ms (base=${ currentSleepMs } jitter=${ jitter } sat=${ saturationMultiplier } x) rss=${ memMb } MB signals=[${ signals } ]` ) ;
498612 }
613+ writeCycleProgressAtomic ( cycleProgressPath , {
614+ pid : process . pid ,
615+ outer_cycle : cycleCount ,
616+ inner_cycle : cycleCount ,
617+ started_at : t0 ,
618+ phase : 'sleep' ,
619+ } ) ;
499620 await sleepMs ( totalSleepMs ) ;
500621
501622 } catch ( loopErr ) {
@@ -1581,4 +1702,7 @@ module.exports = {
15811702 readJsonSafe,
15821703 rejectPendingRun,
15831704 isPendingSolidify,
1705+ parseBoolEnv,
1706+ CycleTimeoutError,
1707+ writeCycleProgressAtomic,
15841708} ;
0 commit comments