Skip to content

Commit fc10a04

Browse files
authored
fix(cloud-agent-next): clean up workspace on failed cold-start resume (#944)
## Summary When a post-clone step (e.g. `kilo import`, snapshot fetch, diff apply) fails during cold-start resume, the workspace directory — including `.git` — was left behind. On the next retry, `handleColdStartResume` saw `isColdStart = false` (because `.git` existed) and skipped the full restore flow, leaving the session in a broken half-initialized state. This wraps the post-clone steps in `handleColdStartResume` in a try/catch that calls `cleanupWorkspace()` on failure, removing the workspace and session home directories so the next retry sees a true cold start and re-runs the full restore from scratch. This applies to all failure modes, including `SessionSnapshotRestoreError` (404). Two tests are added covering the workspace-cleanup behavior on import failure and on 404 snapshot response. ## Verification - [x] `pnpm vitest run` — all 641 tests pass - [x] `pnpm typecheck` — passes - [ ] _Additional verification details_ ## Visual Changes N/A ## Reviewer Notes - The `cleanupWorkspace` helper (from `workspace.ts`) removes both the workspace directory and the session home directory using `session.exec()`. This is best-effort — the original error is always re-thrown. - The initial commit excluded 404 from cleanup; the follow-up commit (`02d0959`) corrected this to also clean up on 404 since a stale `.git` causes the same warm-start misrouting regardless of the error type.
2 parents 1bb1e63 + 565b51c commit fc10a04

2 files changed

Lines changed: 271 additions & 47 deletions

File tree

cloud-agent-next/src/session-service.test.ts

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import {
3636
cloneGitHubRepo as mockCloneGitHubRepo,
3737
manageBranch as mockManageBranch,
3838
restoreWorkspace as mockRestoreWorkspace,
39+
cleanupWorkspace as mockCleanupWorkspace,
3940
} from './workspace.js';
4041
import { InvalidSessionMetadataError, SessionService } from './session-service.js';
4142
import type { SandboxInstance, SessionId, SessionContext, ExecutionSession } from './types.js';
@@ -930,6 +931,212 @@ describe('SessionService', () => {
930931

931932
expect(fetchMock).toHaveBeenCalled();
932933
});
934+
935+
it('removes workspace when kilo import fails during cold start so retry can reclone', async () => {
936+
const mockDOGetMetadata = vi.fn();
937+
const payload = JSON.stringify({ info: {}, messages: [{ info: {}, parts: [] }] });
938+
const fetchMock = vi.fn().mockResolvedValue(new Response(payload));
939+
const envWithIngest: PersistenceEnv = {
940+
...mockEnv,
941+
SESSION_INGEST: {
942+
fetch: fetchMock,
943+
} as unknown as PersistenceEnv['SESSION_INGEST'],
944+
CLOUD_AGENT_SESSION: {
945+
idFromName: vi.fn(() => 'mock-do-id' as unknown as DurableObjectId),
946+
get: vi.fn(() => ({
947+
getMetadata: mockDOGetMetadata,
948+
updateMetadata: vi.fn().mockResolvedValue(undefined),
949+
deleteSession: vi.fn().mockResolvedValue(undefined),
950+
})),
951+
} as unknown as PersistenceEnv['CLOUD_AGENT_SESSION'],
952+
};
953+
const fakeSession = {
954+
exec: vi.fn().mockImplementation((cmd: string) => {
955+
if (cmd.includes('test -d') && cmd.includes('.git')) {
956+
return Promise.resolve({ success: true, exitCode: 1, stdout: '', stderr: '' });
957+
}
958+
if (cmd.includes('kilo import')) {
959+
return Promise.resolve({
960+
success: false,
961+
exitCode: 1,
962+
stdout: '',
963+
stderr: 'import failed',
964+
});
965+
}
966+
return Promise.resolve({ success: true, exitCode: 0, stdout: '', stderr: '' });
967+
}),
968+
gitCheckout: vi.fn().mockResolvedValue({ success: true, exitCode: 0 }),
969+
writeFile: vi.fn().mockResolvedValue(undefined),
970+
deleteFile: vi.fn().mockResolvedValue(undefined),
971+
};
972+
const sandboxExec = vi.fn().mockResolvedValue({ exitCode: 0 });
973+
const sandbox = {
974+
createSession: vi.fn().mockResolvedValue(fakeSession),
975+
mkdir: vi.fn().mockResolvedValue(undefined),
976+
exec: sandboxExec,
977+
writeFile: vi.fn().mockResolvedValue(undefined),
978+
} as unknown as SandboxInstance;
979+
980+
const kiloSessionId = 'ses_test_kilo_session_id_0001';
981+
const metadata = {
982+
version: 123456789,
983+
sessionId,
984+
orgId,
985+
userId,
986+
timestamp: 123456789,
987+
githubRepo: 'facebook/react',
988+
githubToken: 'test-token',
989+
kiloSessionId,
990+
};
991+
mockDOGetMetadata.mockResolvedValue(metadata);
992+
993+
const service = new SessionService();
994+
await expect(
995+
service.resume({
996+
sandbox,
997+
sandboxId: `${orgId}__${userId}`,
998+
orgId,
999+
userId,
1000+
sessionId,
1001+
kilocodeToken: 'test-token',
1002+
kilocodeModel: 'test-model',
1003+
env: envWithIngest,
1004+
})
1005+
).rejects.toThrow('Session snapshot import failed');
1006+
1007+
const workspacePath = `/workspace/${orgId}/${userId}/sessions/${sessionId}`;
1008+
const sessionHome = `/home/${sessionId}`;
1009+
expect(mockCleanupWorkspace).toHaveBeenCalledWith(fakeSession, workspacePath, sessionHome);
1010+
});
1011+
1012+
it('removes workspace when SessionSnapshotRestoreError (404) is thrown', async () => {
1013+
const mockDOGetMetadata = vi.fn();
1014+
const fetchMock = vi.fn().mockResolvedValue(new Response(null, { status: 404 }));
1015+
const envWithIngest: PersistenceEnv = {
1016+
...mockEnv,
1017+
SESSION_INGEST: {
1018+
fetch: fetchMock,
1019+
} as unknown as PersistenceEnv['SESSION_INGEST'],
1020+
CLOUD_AGENT_SESSION: {
1021+
idFromName: vi.fn(() => 'mock-do-id' as unknown as DurableObjectId),
1022+
get: vi.fn(() => ({
1023+
getMetadata: mockDOGetMetadata,
1024+
updateMetadata: vi.fn().mockResolvedValue(undefined),
1025+
deleteSession: vi.fn().mockResolvedValue(undefined),
1026+
})),
1027+
} as unknown as PersistenceEnv['CLOUD_AGENT_SESSION'],
1028+
};
1029+
const fakeSession = {
1030+
exec: vi
1031+
.fn()
1032+
.mockResolvedValueOnce({ success: true, exitCode: 1, stdout: '', stderr: '' })
1033+
.mockResolvedValue({ success: true, exitCode: 0, stdout: '', stderr: '' }),
1034+
gitCheckout: vi.fn().mockResolvedValue({ success: true, exitCode: 0 }),
1035+
writeFile: vi.fn().mockResolvedValue(undefined),
1036+
deleteFile: vi.fn().mockResolvedValue(undefined),
1037+
};
1038+
const sandboxExec = vi.fn().mockResolvedValue({ exitCode: 0 });
1039+
const sandbox = {
1040+
createSession: vi.fn().mockResolvedValue(fakeSession),
1041+
mkdir: vi.fn().mockResolvedValue(undefined),
1042+
exec: sandboxExec,
1043+
writeFile: vi.fn().mockResolvedValue(undefined),
1044+
} as unknown as SandboxInstance;
1045+
1046+
const kiloSessionId = 'ses_test_kilo_session_id_0001';
1047+
const metadata = {
1048+
version: 123456789,
1049+
sessionId,
1050+
orgId,
1051+
userId,
1052+
timestamp: 123456789,
1053+
githubRepo: 'facebook/react',
1054+
githubToken: 'test-token',
1055+
kiloSessionId,
1056+
};
1057+
mockDOGetMetadata.mockResolvedValue(metadata);
1058+
1059+
const service = new SessionService();
1060+
await expect(
1061+
service.resume({
1062+
sandbox,
1063+
sandboxId: `${orgId}__${userId}`,
1064+
orgId,
1065+
userId,
1066+
sessionId,
1067+
kilocodeToken: 'test-token',
1068+
kilocodeModel: 'test-model',
1069+
env: envWithIngest,
1070+
})
1071+
).rejects.toThrow('session not found');
1072+
1073+
const workspacePath = `/workspace/${orgId}/${userId}/sessions/${sessionId}`;
1074+
const sessionHome = `/home/${sessionId}`;
1075+
expect(mockCleanupWorkspace).toHaveBeenCalledWith(fakeSession, workspacePath, sessionHome);
1076+
});
1077+
1078+
it('removes workspace when restoreWorkspace (clone/branch) fails during cold start', async () => {
1079+
mockedRestoreWorkspace.mockRejectedValueOnce(new Error('branch checkout failed'));
1080+
1081+
const mockDOGetMetadata = vi.fn();
1082+
const envWithIngest: PersistenceEnv = {
1083+
...mockEnv,
1084+
CLOUD_AGENT_SESSION: {
1085+
idFromName: vi.fn(() => 'mock-do-id' as unknown as DurableObjectId),
1086+
get: vi.fn(() => ({
1087+
getMetadata: mockDOGetMetadata,
1088+
updateMetadata: vi.fn().mockResolvedValue(undefined),
1089+
deleteSession: vi.fn().mockResolvedValue(undefined),
1090+
})),
1091+
} as unknown as PersistenceEnv['CLOUD_AGENT_SESSION'],
1092+
};
1093+
const fakeSession = {
1094+
exec: vi
1095+
.fn()
1096+
.mockResolvedValueOnce({ success: true, exitCode: 1, stdout: '', stderr: '' })
1097+
.mockResolvedValue({ success: true, exitCode: 0, stdout: '', stderr: '' }),
1098+
gitCheckout: vi.fn().mockResolvedValue({ success: true, exitCode: 0 }),
1099+
writeFile: vi.fn().mockResolvedValue(undefined),
1100+
deleteFile: vi.fn().mockResolvedValue(undefined),
1101+
};
1102+
const sandbox = {
1103+
createSession: vi.fn().mockResolvedValue(fakeSession),
1104+
mkdir: vi.fn().mockResolvedValue(undefined),
1105+
exec: vi.fn().mockResolvedValue({ exitCode: 0 }),
1106+
writeFile: vi.fn().mockResolvedValue(undefined),
1107+
} as unknown as SandboxInstance;
1108+
1109+
const kiloSessionId = 'ses_test_kilo_session_id_0001';
1110+
const metadata = {
1111+
version: 123456789,
1112+
sessionId,
1113+
orgId,
1114+
userId,
1115+
timestamp: 123456789,
1116+
githubRepo: 'facebook/react',
1117+
githubToken: 'test-token',
1118+
kiloSessionId,
1119+
};
1120+
mockDOGetMetadata.mockResolvedValue(metadata);
1121+
1122+
const service = new SessionService();
1123+
await expect(
1124+
service.resume({
1125+
sandbox,
1126+
sandboxId: `${orgId}__${userId}`,
1127+
orgId,
1128+
userId,
1129+
sessionId,
1130+
kilocodeToken: 'test-token',
1131+
kilocodeModel: 'test-model',
1132+
env: envWithIngest,
1133+
})
1134+
).rejects.toThrow('branch checkout failed');
1135+
1136+
const workspacePath = `/workspace/${orgId}/${userId}/sessions/${sessionId}`;
1137+
const sessionHome = `/home/${sessionId}`;
1138+
expect(mockCleanupWorkspace).toHaveBeenCalledWith(fakeSession, workspacePath, sessionHome);
1139+
});
9331140
});
9341141

9351142
describe('Environment Variable Injection', () => {

cloud-agent-next/src/session-service.ts

Lines changed: 64 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,60 +1454,77 @@ export class SessionService {
14541454
`Session ${sessionId} has no kiloSessionId in metadata. Cannot restore snapshot.`
14551455
);
14561456
}
1457-
// Clone first so .git exists when `kilo import` runs — the CLI derives the
1458-
// project ID from the repo's root commit hash; without a repo the FK on
1459-
// session.project_id fails.
1460-
await restoreWorkspace(session, context.workspacePath, context.branchName, {
1461-
githubRepo: metadata.githubRepo,
1462-
githubToken: freshGithubToken ?? metadata.githubToken,
1463-
gitUrl: metadata.gitUrl,
1464-
gitToken: freshGitToken ?? metadata.gitToken,
1465-
gitAuthorEnv: getGitAuthorEnv(env, metadata.githubAppType),
1466-
lastSeenBranch: metadata.upstreamBranch,
1467-
platform: context.platform,
1468-
});
1469-
1470-
// Write auth file BEFORE kilo import so KiloSessions.bootstrap() can authenticate
1471-
await writeAuthFile(sandbox, context.sessionHome, kilocodeToken);
1472-
await writeGlobalRules(sandbox, context.sessionHome, sessionId);
1457+
// Wrap clone and all post-clone steps so that any failure removes the
1458+
// workspace directory. Without this, `.git` survives and the next retry
1459+
// sees `isColdStart = false`, skipping the full restore flow — leaving
1460+
// the session in a broken half-initialized state.
1461+
try {
1462+
// Clone first so .git exists when `kilo import` runs — the CLI derives
1463+
// the project ID from the repo's root commit hash; without a repo the
1464+
// FK on session.project_id fails.
1465+
await restoreWorkspace(session, context.workspacePath, context.branchName, {
1466+
githubRepo: metadata.githubRepo,
1467+
githubToken: freshGithubToken ?? metadata.githubToken,
1468+
gitUrl: metadata.gitUrl,
1469+
gitToken: freshGitToken ?? metadata.gitToken,
1470+
gitAuthorEnv: getGitAuthorEnv(env, metadata.githubAppType),
1471+
lastSeenBranch: metadata.upstreamBranch,
1472+
platform: context.platform,
1473+
});
1474+
// Write auth file BEFORE kilo import so KiloSessions.bootstrap() can authenticate
1475+
await writeAuthFile(sandbox, context.sessionHome, kilocodeToken);
1476+
await writeGlobalRules(sandbox, context.sessionHome, sessionId);
1477+
1478+
// Fetch snapshot from session-ingest DO and buffer it for sandbox writeFile (string-only API).
1479+
const internalSecret = await env.INTERNAL_API_SECRET_PROD.get();
1480+
const response = await env.SESSION_INGEST.fetch(
1481+
new Request(`https://session-ingest/internal/session/${metadata.kiloSessionId}/export`, {
1482+
headers: {
1483+
'X-Internal-Secret': internalSecret,
1484+
'X-Kilo-User-Id': userId,
1485+
},
1486+
})
1487+
);
14731488

1474-
// Fetch snapshot from session-ingest DO and buffer it for sandbox writeFile (string-only API).
1475-
const internalSecret = await env.INTERNAL_API_SECRET_PROD.get();
1476-
const response = await env.SESSION_INGEST.fetch(
1477-
new Request(`https://session-ingest/internal/session/${metadata.kiloSessionId}/export`, {
1478-
headers: {
1479-
'X-Internal-Secret': internalSecret,
1480-
'X-Kilo-User-Id': userId,
1481-
},
1482-
})
1483-
);
1489+
if (response.status === 404) {
1490+
throw new SessionSnapshotRestoreError(
1491+
`Session snapshot restore failed: session not found`,
1492+
404
1493+
);
1494+
}
1495+
if (!response.ok) {
1496+
throw new Error(`Session export failed: ${response.status}`);
1497+
}
14841498

1485-
if (response.status === 404) {
1486-
throw new SessionSnapshotRestoreError(
1487-
`Session snapshot restore failed: session not found`,
1488-
404
1489-
);
1490-
}
1491-
if (!response.ok) {
1492-
throw new Error(`Session export failed: ${response.status}`);
1493-
}
1499+
const snapshotPayload = await response.text();
14941500

1495-
const snapshotPayload = await response.text();
1501+
// Extract diffs client-side from the streamed snapshot messages.
1502+
const diffs = SessionService.extractDiffsFromMessages(snapshotPayload);
14961503

1497-
// Extract diffs client-side from the streamed snapshot messages.
1498-
const diffs = SessionService.extractDiffsFromMessages(snapshotPayload);
1504+
await this.restoreSessionSnapshot(session, sessionId, userId, snapshotPayload);
14991505

1500-
await this.restoreSessionSnapshot(session, sessionId, userId, snapshotPayload);
1506+
// Apply file-level changes from the previous session on top of the fresh clone.
1507+
// This runs after kilo import (conversation restore) since the CLI doesn't need
1508+
// the file state during import — it only restores its internal session DB.
1509+
await this.applySessionDiff(session, sessionId, userId, context.workspacePath, diffs);
15011510

1502-
// Apply file-level changes from the previous session on top of the fresh clone.
1503-
// This runs after kilo import (conversation restore) since the CLI doesn't need
1504-
// the file state during import — it only restores its internal session DB.
1505-
await this.applySessionDiff(session, sessionId, userId, context.workspacePath, diffs);
1511+
// Re-run setup commands (fresh clone, need to reinstall)
1512+
if (metadata.setupCommands && metadata.setupCommands.length > 0) {
1513+
logger.info('Re-running setup commands after fresh clone');
1514+
await runSetupCommands(session, context, metadata.setupCommands, false); // lenient
1515+
}
1516+
} catch (error) {
1517+
// Remove the workspace and sessionHome so the next retry sees a true
1518+
// cold start and re-runs the full restore from scratch.
1519+
logger
1520+
.withFields({
1521+
sessionId,
1522+
error: error instanceof Error ? error.message : String(error),
1523+
})
1524+
.warn('Cold-start resume step failed; removing workspace for clean retry');
1525+
await cleanupWorkspace(session, context.workspacePath, context.sessionHome);
15061526

1507-
// Re-run setup commands (fresh clone, need to reinstall)
1508-
if (metadata.setupCommands && metadata.setupCommands.length > 0) {
1509-
logger.info('Re-running setup commands after fresh clone');
1510-
await runSetupCommands(session, context, metadata.setupCommands, false); // lenient
1527+
throw error;
15111528
}
15121529
}
15131530

0 commit comments

Comments
 (0)