feat(recovery): improve crash recovery handling with auto mode support

moazbuilds · moazbuilds · commit 55efca79e0bc · 2025-12-31T09:04:54.000+02:00
- Replace 'workflow:skip' with 'workflow:stop' for proper workflow termination
- Add sendRecoveryPrompt callback to centralize recovery logic
- Sync paused state one-way from machine context to mode so a pause set by recovery is honored
- Enhance queue restoration to always attempt loading from agent config, resuming at index 0 when no chains have completed
- Implement separate recovery flows for auto and manual modes
diff --git a/src/agents/monitoring/cleanup.ts b/src/agents/monitoring/cleanup.ts
@@ -142,8 +142,8 @@ export class MonitoringCleanup {
     // Suppress all error/warn logs during graceful shutdown
     setShuttingDown(true);
 
-    // Emit workflow:skip to abort the currently running step
-    (process as NodeJS.EventEmitter).emit('workflow:skip');
+    // Emit workflow:stop to stop the workflow (not skip to next step)
+    (process as NodeJS.EventEmitter).emit('workflow:stop');
 
     // Save session state for active agents before cleanup (for resume on restart)
     if (this.workflowHandlers.onBeforeCleanup) {
diff --git a/src/workflows/recovery/index.ts b/src/workflows/recovery/index.ts
@@ -30,6 +30,23 @@ export type {
   CrashRestoreResult,
 } from './types.js';
 
+/**
+ * Options passed to a SendRecoveryPromptFn when sending a recovery prompt
+ */
+export interface SendRecoveryPromptOptions {
+  /** Prompt text to send to the agent */
+  resumePrompt: string;
+  /** Monitoring ID for the session (optional; may be undefined) */
+  resumeMonitoringId?: number;
+  /** Source of the prompt; 'controller' is the only value this type permits */
+  source: 'controller';
+}
+
+/**
+ * Callback to send recovery prompt to agent; handleCrashRecovery awaits the returned promise in auto mode
+ */
+export type SendRecoveryPromptFn = (options: SendRecoveryPromptOptions) => Promise<void>;
+
 /**
  * Options for handleCrashRecovery
  */
@@ -54,6 +71,8 @@ export interface HandleCrashRecoveryOptions {
   indexManager: StepIndexManager;
   /** Current session (optional) */
   session?: StepSession | null;
+  /** Callback to send recovery prompt (required for auto mode recovery) */
+  sendRecoveryPrompt?: SendRecoveryPromptFn;
 }
 
 /**
@@ -119,13 +138,46 @@ export async function handleCrashRecovery(
 
   const restoration = await restoreFromCrash(restoreCtx);
 
-  // 3. Transition state machine to awaiting
-  machine.send({
-    type: 'STEP_COMPLETE',
-    output: { output: '', monitoringId: stepData?.monitoringId },
-  });
+  // 3. Handle recovery based on mode
+  const isAutoMode = machine.context.autoMode;
+  const recoveryPrompt = 'Continue where you left off. Review what was accomplished and proceed with the next logical step.';
+
+  if (isAutoMode) {
+    // Auto mode: Send recovery prompt directly before transitioning
+    // This centralizes all recovery logic here instead of scattering to wait.ts/delegated.ts
+    if (!options.sendRecoveryPrompt) {
+      throw new Error('[recovery] Auto mode crash recovery requires sendRecoveryPrompt callback');
+    }
 
-  debug('[recovery] Crash recovery complete, transitioning to awaiting state');
+    debug('[recovery] Auto mode: sending recovery prompt to agent');
+    emitter.updateAgentStatus(uniqueAgentId, 'running');
+
+    // Transition to running state before sending prompt
+    machine.send({ type: 'RESUME' });
+
+    // Send recovery prompt and wait for agent response
+    await options.sendRecoveryPrompt({
+      resumePrompt: recoveryPrompt,
+      resumeMonitoringId: stepData?.monitoringId,
+      source: 'controller',
+    });
+
+    debug('[recovery] Recovery prompt sent, agent responded');
+
+    // After agent responds, the state machine will have transitioned to awaiting/delegated
+    // The normal flow will continue from there (chained prompts, etc.)
+    debug('[recovery] Crash recovery complete (auto mode)');
+  } else {
+    // Manual mode: Pause and wait for user input
+    machine.context.paused = true;
+
+    machine.send({
+      type: 'STEP_COMPLETE',
+      output: { output: '', monitoringId: stepData?.monitoringId },
+    });
+
+    debug('[recovery] Crash recovery complete, transitioning to awaiting state (manual mode, paused)');
+  }
 
   return { handled: true, detection, restoration };
 }
diff --git a/src/workflows/recovery/restore.ts b/src/workflows/recovery/restore.ts
@@ -58,42 +58,44 @@ export async function restoreFromCrash(ctx: CrashRestoreContext): Promise<CrashR
     monitoringId: stepData.monitoringId,
   };
 
-  // 4. Restore queue from completedChains
+  // 4. Restore queue from agent config (always try, not just when completedChains exists)
   let queueRestored = false;
   let promptCount = 0;
   let resumeIndex = 0;
 
-  if (stepData.completedChains && stepData.completedChains.length > 0) {
-    const agentConfig = await loadAgentConfig(step.agentId, cwd);
+  // Always try to load chained prompts from agent config
+  const agentConfig = await loadAgentConfig(step.agentId, cwd);
 
-    if (agentConfig?.chainedPromptsPath) {
-      const selectedConditions = await getSelectedConditions(cmRoot);
-      const chainedPrompts = await loadChainedPrompts(
-        agentConfig.chainedPromptsPath,
-        cwd,
-        selectedConditions
-      );
-
-      if (chainedPrompts.length > 0) {
-        // Use existing helper to calculate resume index
-        resumeIndex = getNextChainIndex(stepData);
-        promptCount = chainedPrompts.length;
+  if (agentConfig?.chainedPromptsPath) {
+    const selectedConditions = await getSelectedConditions(cmRoot);
+    const chainedPrompts = await loadChainedPrompts(
+      agentConfig.chainedPromptsPath,
+      cwd,
+      selectedConditions
+    );
 
-        debug(
-          '[recovery/restore] Restoring queue: %d prompts, resuming at index %d',
-          promptCount,
-          resumeIndex
-        );
+    if (chainedPrompts.length > 0) {
+      // Use completedChains to determine resume index (or 0 if none completed yet)
+      resumeIndex = stepData.completedChains && stepData.completedChains.length > 0
+        ? getNextChainIndex(stepData)
+        : 0;
+      promptCount = chainedPrompts.length;
 
-        // Use session if available, otherwise fall back to indexManager directly
-        if (session) {
-          session.initializeFromPersisted(chainedPrompts, resumeIndex);
-        } else {
-          indexManager.initQueue(chainedPrompts, resumeIndex);
-        }
+      debug(
+        '[recovery/restore] Restoring queue: %d prompts, resuming at index %d (completedChains=%d)',
+        promptCount,
+        resumeIndex,
+        stepData.completedChains?.length ?? 0
+      );
 
-        queueRestored = true;
+      // Use session if available, otherwise fall back to indexManager directly
+      if (session) {
+        session.initializeFromPersisted(chainedPrompts, resumeIndex);
+      } else {
+        indexManager.initQueue(chainedPrompts, resumeIndex);
       }
+
+      queueRestored = true;
     }
   }
 
diff --git a/src/workflows/runner/delegated.ts b/src/workflows/runner/delegated.ts
@@ -27,6 +27,12 @@ export interface DelegatedCallbacks {
 export async function handleDelegated(ctx: RunnerContext, callbacks: DelegatedCallbacks): Promise<void> {
   const machineCtx = ctx.machine.context;
 
+  // Sync paused state from machineCtx to mode (for crash recovery)
+  if (machineCtx.paused && !ctx.mode.paused) {
+    debug('[Runner:delegated] Syncing paused state from machineCtx to mode (recovery)');
+    ctx.mode.pause();
+  }
+
   debug('[Runner:delegated] Handling delegated state, promptQueue=%d items, queueIndex=%d, autoMode=%s',
     ctx.indexManager.promptQueue.length, ctx.indexManager.promptQueueIndex, ctx.mode.autoMode);
 
diff --git a/src/workflows/runner/wait.ts b/src/workflows/runner/wait.ts
@@ -31,19 +31,26 @@ export interface WaitCallbacks {
 export async function handleWaiting(ctx: RunnerContext, callbacks: WaitCallbacks): Promise<void> {
   const machineCtx = ctx.machine.context;
 
+  // Sync paused state from machineCtx to mode (for crash recovery)
+  // Recovery sets machineCtx.paused directly, mode needs to be synced
+  if (machineCtx.paused && !ctx.mode.paused) {
+    debug('[Runner] Syncing paused state from machineCtx to mode (recovery)');
+    ctx.mode.pause();
+  }
+
   debug('[Runner] Handling waiting state, autoMode=%s, paused=%s, promptQueue=%d items, queueIndex=%d',
     ctx.mode.autoMode, ctx.mode.paused, ctx.indexManager.promptQueue.length, ctx.indexManager.promptQueueIndex);
 
+  // Get current step info
+  const step = ctx.moduleSteps[machineCtx.currentStepIndex];
+  const stepUniqueAgentId = getUniqueAgentId(step, machineCtx.currentStepIndex);
+
   // Get queue state from session (uses indexManager as single source of truth)
   const session = ctx.getCurrentSession();
   const hasChainedPrompts = session
     ? !session.isQueueExhausted
     : !ctx.indexManager.isQueueExhausted();
 
-  // Get current step and resolve interactive behavior
-  const step = ctx.moduleSteps[machineCtx.currentStepIndex];
-  const stepUniqueAgentId = getUniqueAgentId(step, machineCtx.currentStepIndex);
-
   // Resolve interactive behavior using single source of truth
   const behavior = resolveInteractiveBehavior({
     step,
diff --git a/src/workflows/step/run.ts b/src/workflows/step/run.ts
@@ -74,6 +74,14 @@ export async function runStepFresh(ctx: RunnerContext): Promise<RunStepResult |
     machine: ctx.machine,
     indexManager: ctx.indexManager,
     session: ctx.getCurrentSession(),
+    // Callback to send recovery prompt - centralizes all recovery logic in recovery module
+    sendRecoveryPrompt: async (options) => {
+      await runStepResume(ctx, {
+        resumePrompt: options.resumePrompt,
+        resumeMonitoringId: options.resumeMonitoringId,
+        source: options.source,
+      });
+    },
   });
 
   if (recoveryResult.handled) {