Skip to content

Commit 0ccb095

Browse files
authored
fix(cloud-agent-next): avoid OOM by downloading snapshot in sandbox (#908)
## Summary

- Move snapshot download from worker memory to sandbox via `curl` — the full JSON never materializes in the 128MB worker
- Replace worker-side diff extraction (`extractDiffsFromMessages`) and application (`applySessionDiff`) with a sandbox-side Node script that reads, deduplicates, and applies diffs directly on disk
- Refactor `restoreSessionSnapshot` to accept a file path (curl writes directly) instead of a string payload
- Remove `resolve` and `zod` imports that were only used by deleted methods

Depends on PR #884, which adds the streaming `/api/session/:id/export` endpoint.

## Test plan

- [x] All 601 existing tests pass (updated 4 cold-start tests for new exec-based flow)
- [x] Typecheck passes
- [ ] Integration test: cold-start resume with real session to verify curl download + kilo import + diff application
2 parents df72682 + b8aa586 commit 0ccb095

8 files changed

Lines changed: 964 additions & 381 deletions

File tree

cloud-agent-next/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ RUN mkdir -p -m 755 /etc/apt/keyrings \
1212
&& mkdir -p -m 755 /etc/apt/sources.list.d \
1313
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
1414
&& apt update \
15-
&& apt install gh -y
15+
&& apt install gh jq -y
1616

1717
# Install GitLab CLI (glab) - download official .deb from GitLab releases
1818
RUN GLAB_VERSION="1.80.4" \
@@ -39,9 +39,10 @@ RUN npm install -g pnpm @kilocode/cli@${KILOCODE_CLI_VERSION}
3939
COPY wrapper /tmp/wrapper-build/wrapper
4040
COPY src/shared /tmp/wrapper-build/src/shared
4141

42-
# Build the wrapper bundle
42+
# Build the wrapper bundle and restore-session script
4343
RUN cd /tmp/wrapper-build/wrapper && \
4444
bun build src/main.ts --outfile=/usr/local/bin/kilocode-wrapper.js --target=bun --minify && \
45+
bun build src/restore-session.ts --outfile=/usr/local/bin/kilo-restore-session.js --target=bun --minify && \
4546
rm -rf /tmp/wrapper-build
4647

4748
# DO NOT override USER, WORKDIR, or ENTRYPOINT from the base image

cloud-agent-next/Dockerfile.dev

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ ARG KILOCODE_CLI_VERSION="next"
1212
# This builds kilo-cli from source and copies the linux-x64 binary here.
1313

1414
# Install ripgrep and Generate locales to suppress setlocale warnings
15-
RUN apt-get update && apt-get install -y --no-install-recommends ripgrep locales && \
15+
RUN apt-get update && apt-get install -y --no-install-recommends ripgrep jq locales && \
1616
sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
1717
locale-gen && \
1818
apt-get clean && \
@@ -34,14 +34,15 @@ RUN npm install -g pnpm @kilocode/cli@${KILOCODE_CLI_VERSION}
3434
#COPY kilo /usr/local/bin/kilo
3535
#RUN chmod +x /usr/local/bin/kilo
3636

37-
# === Build wrapper bundle inside container ===
38-
# This ensures the wrapper is built with the same Bun version that will run it,
37+
# === Build wrapper bundle and restore-session script inside container ===
38+
# This ensures both scripts are built with the same Bun version that will run them,
3939
# avoiding any compatibility issues between build and runtime environments.
4040
COPY wrapper /tmp/wrapper-build/wrapper
4141
COPY src/shared /tmp/wrapper-build/src/shared
4242

4343
RUN cd /tmp/wrapper-build/wrapper && \
4444
bun build src/main.ts --outfile=/usr/local/bin/kilocode-wrapper.js --target=bun --minify && \
45+
bun build src/restore-session.ts --outfile=/usr/local/bin/kilo-restore-session.js --target=bun --minify && \
4546
rm -rf /tmp/wrapper-build
4647

4748
# DO NOT override USER, WORKDIR, or ENTRYPOINT from the base image

cloud-agent-next/src/session-service.test.ts

Lines changed: 154 additions & 167 deletions
Large diffs are not rendered by default.

cloud-agent-next/src/session-service.ts

Lines changed: 50 additions & 207 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
import { resolve } from 'node:path';
2-
import { z } from 'zod';
31
import type {
42
ExecutionSession,
53
SandboxInstance,
@@ -933,185 +931,6 @@ export class SessionService {
933931
};
934932
}
935933

936-
/**
937-
* Write a snapshot payload to a temp file, run `kilo import`, then clean up.
938-
*
939-
* @param snapshotPayload - Pre-fetched JSON string of the session snapshot.
940-
*/
941-
private async restoreSessionSnapshot(
942-
session: ExecutionSession,
943-
sessionId: string,
944-
userId: string,
945-
snapshotPayload: string
946-
): Promise<void> {
947-
const tmpPath = `/tmp/kilo-session-export-${sessionId}.json`;
948-
let wroteSnapshot = false;
949-
try {
950-
await session.writeFile(tmpPath, snapshotPayload);
951-
wroteSnapshot = true;
952-
953-
const importResult = await session.exec(`kilo import "${tmpPath}"`);
954-
if (importResult.exitCode !== 0) {
955-
logger
956-
.withFields({
957-
sessionId,
958-
userId,
959-
exitCode: importResult.exitCode,
960-
stderr: importResult.stderr,
961-
stdout: importResult.stdout,
962-
})
963-
.error('Session snapshot import failed');
964-
throw new Error(`Session snapshot import failed with exit code ${importResult.exitCode}`);
965-
}
966-
} catch (error) {
967-
logger
968-
.withFields({
969-
sessionId,
970-
userId,
971-
error: error instanceof Error ? error.message : String(error),
972-
})
973-
.error('Session snapshot restore failed');
974-
throw error instanceof Error ? error : new Error(String(error));
975-
} finally {
976-
if (wroteSnapshot) {
977-
try {
978-
await session.deleteFile(tmpPath);
979-
} catch (error) {
980-
logger
981-
.withFields({
982-
sessionId,
983-
userId,
984-
error: error instanceof Error ? error.message : String(error),
985-
})
986-
.debug('Failed to delete session snapshot temp file');
987-
}
988-
}
989-
}
990-
}
991-
992-
/**
993-
* Apply file-level changes from a pre-parsed diff array on top of the freshly
994-
* cloned repo. Called during cold-start resume after `restoreSessionSnapshot`.
995-
*
996-
* Each diff entry contains the full `after` content, so we simply write (or
997-
* delete) files to recreate the workspace state from the previous session.
998-
*
999-
* @param diffs - Pre-parsed `FileDiff[]` array (or `null` when no diff exists).
1000-
*/
1001-
private async applySessionDiff(
1002-
session: ExecutionSession,
1003-
sessionId: string,
1004-
userId: string,
1005-
workspacePath: string,
1006-
diffs: Array<{
1007-
file: string;
1008-
after: string;
1009-
status?: 'added' | 'deleted' | 'modified';
1010-
}> | null
1011-
): Promise<void> {
1012-
if (!Array.isArray(diffs) || diffs.length === 0) {
1013-
return;
1014-
}
1015-
1016-
let applied = 0;
1017-
let skipped = 0;
1018-
1019-
for (const diff of diffs) {
1020-
if (!diff.file) {
1021-
skipped++;
1022-
continue;
1023-
}
1024-
1025-
// Skip binary files — they have empty after content
1026-
if (diff.status !== 'deleted' && !diff.after) {
1027-
skipped++;
1028-
continue;
1029-
}
1030-
1031-
const filePath = resolve(workspacePath, diff.file);
1032-
if (!filePath.startsWith(workspacePath + '/')) {
1033-
logger
1034-
.withFields({ sessionId, userId, file: diff.file })
1035-
.warn('Skipping diff entry with path outside workspace');
1036-
skipped++;
1037-
continue;
1038-
}
1039-
1040-
try {
1041-
if (diff.status === 'deleted') {
1042-
await session.deleteFile(filePath);
1043-
applied++;
1044-
} else {
1045-
// Ensure parent directory exists.
1046-
// Use single-quoted path to prevent shell metacharacter injection.
1047-
const lastSlash = filePath.lastIndexOf('/');
1048-
if (lastSlash > 0) {
1049-
const parentDir = filePath.substring(0, lastSlash);
1050-
const escaped = parentDir.replaceAll("'", "'\\''");
1051-
await session.exec(`mkdir -p '${escaped}'`);
1052-
}
1053-
await session.writeFile(filePath, diff.after);
1054-
applied++;
1055-
}
1056-
} catch (error) {
1057-
logger
1058-
.withFields({
1059-
sessionId,
1060-
userId,
1061-
file: diff.file,
1062-
status: diff.status,
1063-
error: error instanceof Error ? error.message : String(error),
1064-
})
1065-
.warn('Failed to apply file diff (non-fatal, continuing)');
1066-
skipped++;
1067-
}
1068-
}
1069-
1070-
logger
1071-
.withFields({ sessionId, userId, applied, skipped, total: diffs.length })
1072-
.info('Applied session diff');
1073-
}
1074-
1075-
/**
1076-
* Extract last-write-wins file diffs from streamed snapshot messages.
1077-
* The snapshot format is `{ info, messages: [{ info: { summary: { diffs } }, parts }] }`.
1078-
* Returns null when no diffs exist.
1079-
*/
1080-
private static extractDiffsFromMessages(
1081-
payload: string
1082-
): Array<{ file: string; after: string; status?: 'added' | 'deleted' | 'modified' }> | null {
1083-
const fileDiffSchema = z.object({
1084-
file: z.string(),
1085-
after: z.string().default(''),
1086-
status: z.enum(['added', 'deleted', 'modified']).default('modified'),
1087-
});
1088-
1089-
try {
1090-
const parsed = JSON.parse(payload) as {
1091-
messages?: Array<{ info?: { summary?: { diffs?: unknown[] } } }>;
1092-
};
1093-
if (!parsed.messages) return null;
1094-
1095-
const byFile = new Map<string, z.infer<typeof fileDiffSchema>>();
1096-
1097-
for (const msg of parsed.messages) {
1098-
const diffs = msg.info?.summary?.diffs;
1099-
if (!Array.isArray(diffs)) continue;
1100-
1101-
for (const d of diffs) {
1102-
const result = fileDiffSchema.safeParse(d);
1103-
if (!result.success) continue;
1104-
byFile.set(result.data.file, result.data);
1105-
}
1106-
}
1107-
1108-
if (byFile.size === 0) return null;
1109-
return [...byFile.values()];
1110-
} catch {
1111-
return null;
1112-
}
1113-
}
1114-
1115934
/**
1116935
* Initialize a cloud-agent session by resuming an existing kilo session.
1117936
*
@@ -1475,38 +1294,62 @@ export class SessionService {
14751294
await writeAuthFile(sandbox, context.sessionHome, kilocodeToken);
14761295
await writeGlobalRules(sandbox, context.sessionHome, sessionId);
14771296

1478-
// Fetch snapshot from session-ingest DO and buffer it for sandbox writeFile (string-only API).
1479-
const internalSecret = await env.INTERNAL_API_SECRET_PROD.get();
1480-
const response = await env.SESSION_INGEST.fetch(
1481-
new Request(`https://session-ingest/internal/session/${metadata.kiloSessionId}/export`, {
1482-
headers: {
1483-
'X-Internal-Secret': internalSecret,
1484-
'X-Kilo-User-Id': userId,
1485-
},
1486-
})
1297+
// Single restore script handles download, import, and diff application inside
1298+
// the sandbox — the snapshot never enters worker memory.
1299+
logger.info('Starting cold-start session restore');
1300+
1301+
const escapedId = metadata.kiloSessionId.replaceAll("'", "'\\''");
1302+
const escapedWorkspace = context.workspacePath.replaceAll("'", "'\\''");
1303+
const restoreResult = await session.exec(
1304+
`bun /usr/local/bin/kilo-restore-session.js '${escapedId}' '${escapedWorkspace}'`
14871305
);
14881306

1489-
if (response.status === 404) {
1307+
if (restoreResult.exitCode !== 0) {
1308+
logger
1309+
.withFields({
1310+
sessionId,
1311+
userId,
1312+
exitCode: restoreResult.exitCode,
1313+
stderr: restoreResult.stderr,
1314+
stdout: restoreResult.stdout,
1315+
})
1316+
.error('Cold-start session restore failed');
1317+
1318+
// Parse stdout JSON for structured error info
1319+
let code: number | undefined;
1320+
try {
1321+
const parsed = JSON.parse(restoreResult.stdout?.trim() ?? '{}') as Record<
1322+
string,
1323+
unknown
1324+
>;
1325+
if (typeof parsed.code === 'number') {
1326+
code = parsed.code;
1327+
}
1328+
} catch {
1329+
// non-JSON stdout, ignore
1330+
}
1331+
1332+
if (code === 404) {
1333+
throw new SessionSnapshotRestoreError(
1334+
'Session snapshot restore failed: session not found',
1335+
404
1336+
);
1337+
}
14901338
throw new SessionSnapshotRestoreError(
1491-
`Session snapshot restore failed: session not found`,
1492-
404
1339+
`Cold-start session restore failed: exit ${restoreResult.exitCode}`,
1340+
code
14931341
);
14941342
}
1495-
if (!response.ok) {
1496-
throw new Error(`Session export failed: ${response.status}`);
1497-
}
14981343

1499-
const snapshotPayload = await response.text();
1500-
1501-
// Extract diffs client-side from the streamed snapshot messages.
1502-
const diffs = SessionService.extractDiffsFromMessages(snapshotPayload);
1503-
1504-
await this.restoreSessionSnapshot(session, sessionId, userId, snapshotPayload);
1505-
1506-
// Apply file-level changes from the previous session on top of the fresh clone.
1507-
// This runs after kilo import (conversation restore) since the CLI doesn't need
1508-
// the file state during import — it only restores its internal session DB.
1509-
await this.applySessionDiff(session, sessionId, userId, context.workspacePath, diffs);
1344+
// Log structured summary from restore script
1345+
try {
1346+
const summary = JSON.parse(restoreResult.stdout?.trim() ?? '{}') as Record<string, unknown>;
1347+
logger
1348+
.withFields({ sessionId, userId, ...summary })
1349+
.info('Cold-start session restore completed');
1350+
} catch {
1351+
// non-JSON stdout, non-fatal
1352+
}
15101353

15111354
// Re-run setup commands (fresh clone, need to reinstall)
15121355
if (metadata.setupCommands && metadata.setupCommands.length > 0) {

cloud-agent-next/wrapper/build.ts

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,4 +7,12 @@ await Bun.build({
77
sourcemap: 'external',
88
});
99

10-
console.log('Build complete: dist/wrapper.js');
10+
await Bun.build({
11+
entrypoints: ['./src/restore-session.ts'],
12+
outdir: './dist',
13+
naming: 'restore-session.js',
14+
target: 'bun',
15+
minify: true,
16+
});
17+
18+
console.log('Build complete: dist/wrapper.js, dist/restore-session.js');

0 commit comments

Comments
 (0)