Skip to content

Commit f5438f7

Browse files
feat: implement auto-summarization of chat history and block inline code execution in agent mode
1 parent f2a0612 commit f5438f7

8 files changed

Lines changed: 188 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,30 @@ For releases before v1.3.35, see [GitHub Releases](https://github.com/VladoIvank
1111
> as the social-share summary (IFTTT → X/Bluesky), capped at 220 chars.
1212
> If omitted, the feed falls back to the first paragraph.
1313
14+
## [2.1.4] — 2026-05-22
15+
16+
> Long agent runs no longer silently forget how they started — when prior chat history overflows the context budget, the dropped older messages are summarized instead of just truncated. Plus a command-whitelist hardening.
17+
18+
### Security
19+
20+
- **Inline code execution is blocked in agent mode.** The command whitelist
21+
allowed interpreters like `node`/`python`/`php`, but their eval flags
22+
(`node -e`, `python -c`, `php -r`, `deno eval`, …) turned a whitelisted
23+
runtime into arbitrary code execution. Those flags are now rejected (including
24+
combined short clusters like `-pe`). Running a *file* (`node app.js`,
25+
`python script.py`) is unaffected. Defense-in-depth — the manual-mode
26+
permission prompt is still the primary gate.
27+
28+
### Added
29+
30+
- **Auto-summarized history.** When the prior conversation exceeds the agent's
31+
context budget, Codeep now condenses the dropped (oldest) messages into a
32+
short recap — preserving early decisions, constraints, and unfinished threads
33+
— and injects it before the recent verbatim history. Previously those older
34+
messages were silently truncated. The recap is one cheap LLM call, made only
35+
on overflow and cached per session. Opt out with
36+
`autoSummarizeHistory: false` (falls back to plain truncation, no extra call).
37+
1438
## [2.1.3] — 2026-05-22
1539

1640
> Security hardening: project hooks now require trust before they run, the web-fetch tool blocks internal/metadata addresses, and usage stats are sent with your sync token.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ Codeep works as a **full AI coding agent** that autonomously:
181181
### Context Persistence
182182
- **Save conversations** - Continue where you left off
183183
- **Per-project context** - Each project maintains its own history
184-
- **Automatic summarization** - Old messages are summarized to save space
184+
- **Automatic summarization** - When prior history overflows the agent's context budget, the dropped (oldest) messages are condensed into a short recap (decisions, constraints, unfinished threads) instead of being silently truncated — so long sessions don't forget how they started. One cheap LLM call, made only on overflow and cached per session; opt out with `autoSummarizeHistory: false` (`/settings`)
185185

186186
### Web & MCP Tools
187187
- Agent can fetch documentation and web content

src/config/index.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ interface ConfigSchema {
5656
* small background API call (uses the active model) once per session.
5757
* Default true; set false to avoid any unsolicited API calls. */
5858
autoSessionTitle: boolean;
59+
/** When prior chat history overflows the agent's context budget, summarize
60+
* the dropped (oldest) messages via one LLM call instead of silently
61+
* discarding them — so long sessions keep early decisions/constraints.
62+
* Default true; set false to fall back to plain truncation (no extra call). */
63+
autoSummarizeHistory: boolean;
5964
/** Absolute workspace roots whose project-local `.codeep/hooks/*` the user
6065
* has approved to run. Untrusted projects' hooks are skipped (a cloned repo
6166
* can't execute shell on first tool call). Granted via `/hooks trust`. */
@@ -281,6 +286,7 @@ function createConfig(): Conf<ConfigSchema> {
281286
language: 'en',
282287
autoSave: true,
283288
autoSessionTitle: true,
289+
autoSummarizeHistory: true,
284290
trustedHookProjects: [],
285291
currentSessionId: '',
286292
temperature: 0.7,

src/utils/agent.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import {
2424
loadProgressLog,
2525
writeProgressLog,
2626
formatChatHistoryForAgent,
27+
summarizeEarlierHistory,
2728
} from './agentChat';
2829
import { ApiError } from '../api/index';
2930
import type { AgentChatResponse } from './agentChat';
@@ -346,7 +347,13 @@ export async function runAgent(
346347
systemPrompt += taskCtx;
347348
}
348349

349-
// Inject prior chat session context
350+
// Inject prior chat session context. When the history overflows the budget,
351+
// prepend an LLM recap of the dropped (oldest) messages so long sessions
352+
// keep early decisions/constraints, then the recent messages verbatim.
353+
const earlierSummary = await summarizeEarlierHistory(opts.chatHistory);
354+
if (earlierSummary) {
355+
systemPrompt += earlierSummary;
356+
}
350357
const chatHistoryStr = formatChatHistoryForAgent(opts.chatHistory);
351358
if (chatHistoryStr) {
352359
systemPrompt += chatHistoryStr;

src/utils/agentChat.test.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,23 @@ vi.mock('./agentStream', () => ({
3232
AgentChatResponse: {},
3333
}));
3434

35-
import { loadProjectRules, formatChatHistoryForAgent, TimeoutError } from './agentChat';
35+
import { loadProjectRules, formatChatHistoryForAgent, summarizeEarlierHistory, TimeoutError } from './agentChat';
36+
37+
// Covers only the paths of summarizeEarlierHistory that return before any LLM
// call would be made: absent/empty history, and history that fits the budget.
describe('summarizeEarlierHistory', () => {
  type HistoryEntry = { role: 'user' | 'assistant'; content: string };

  it('returns empty for missing/empty history', async () => {
    const forMissing = await summarizeEarlierHistory();
    expect(forMissing).toBe('');

    const forEmpty = await summarizeEarlierHistory([]);
    expect(forEmpty).toBe('');
  });

  it('returns empty when nothing overflows the budget (no LLM call)', async () => {
    const shortExchange: HistoryEntry[] = [
      { role: 'user', content: 'hi' },
      { role: 'assistant', content: 'hello' },
    ];
    // Both messages fit in the budget, so nothing is dropped and the function
    // returns '' without ever reaching chat().
    const recap = await summarizeEarlierHistory(shortExchange, 16000);
    expect(recap).toBe('');
  });
});
3652

3753
describe('loadProjectRules', () => {
3854
beforeEach(() => {

src/utils/agentChat.ts

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import { existsSync, readFileSync, writeFileSync } from 'fs';
1616
import { join } from 'path';
17+
import { createHash } from 'crypto';
1718
import { ProjectContext } from './project';
1819
import { config, getApiKey, Message, resolveBaseUrl } from '../config/index';
1920
import { loadProjectIntelligence, generateContextFromIntelligence } from './projectIntelligence';
@@ -201,6 +202,83 @@ export function formatChatHistoryForAgent(
201202
return `\n\n## Prior Conversation Context\nThe following is the recent chat history from this session. Use it as background context to understand the user's intent, but focus on completing the current task.\n\n${lines}`;
202203
}
203204

205+
/**
 * Drop agent status/bookkeeping messages from a chat history, keeping only the
 * "real" conversation turns. Intended to apply the same noise filter as
 * formatChatHistoryForAgent so the two agree on what counts as conversation.
 *
 * A message is treated as noise when its content, ignoring leading whitespace,
 * starts with one of the marker prefixes below (case-sensitive).
 *
 * @param history Messages to filter; the array is not mutated.
 * @returns A new array holding the non-noise messages in their original order.
 */
function filterAgentHistory<T extends { role: string; content: string }>(history: T[]): T[] {
  const noisePrefixes = ['[AGENT]', '[DRY RUN]', 'Agent completed', 'Agent failed', 'Agent stopped'];
  const conversation: T[] = [];
  for (const message of history) {
    const text = message.content.trimStart();
    const isNoise = noisePrefixes.some(prefix => text.startsWith(prefix));
    if (!isNoise) {
      conversation.push(message);
    }
  }
  return conversation;
}
215+
216+
// Cache summaries by a hash of the dropped messages, so re-running the agent in
// the same session (same overflow) doesn't re-summarize on every task.
// The cache is bounded: as a session grows, the set of dropped messages changes
// and older keys can never be hit again, so the oldest entries are evicted
// instead of accumulating for the lifetime of the process.
const EARLIER_SUMMARY_CACHE_MAX = 32;
const earlierSummaryCache = new Map<string, string>();

/**
 * Summarize the OVERFLOW that `formatChatHistoryForAgent` drops. When prior
 * history exceeds `maxChars`, that function keeps only the most recent messages
 * and discards the older ones — losing early decisions/constraints on long
 * sessions. This condenses those dropped messages into a short recap that the
 * caller prepends *before* the recent verbatim history.
 *
 * @param history Prior chat messages, oldest first. Agent status noise is
 *   removed via `filterAgentHistory` before budgeting.
 * @param maxChars Character budget for the verbatim history; messages that do
 *   not fit (walking newest → oldest) are the ones summarized.
 * @returns A markdown block beginning with "## Earlier Conversation
 *   (summarized)", or '' when: opted out (`autoSummarizeHistory === false`),
 *   nothing overflows, the model returns an empty summary, or the
 *   summarization call fails (graceful fallback — the recent history still
 *   goes in, there is just no recap). Never rejects because of the chat call.
 */
export async function summarizeEarlierHistory(
  history?: Array<{ role: 'user' | 'assistant'; content: string }>,
  maxChars: number = 16000,
): Promise<string> {
  if (config.get('autoSummarizeHistory') === false) return '';
  if (!history || history.length === 0) return '';

  const filtered = filterAgentHistory(history);
  if (filtered.length === 0) return '';

  // Newest→oldest budget walk (intended to mirror formatChatHistoryForAgent) to
  // find which messages are KEPT; everything older than the oldest kept message
  // is dropped. `firstKept === filtered.length` means "nothing kept yet".
  let totalChars = 0;
  let firstKept = filtered.length;
  for (let i = filtered.length - 1; i >= 0; i--) {
    const entry = `${filtered[i].role === 'user' ? 'User' : 'Assistant'}: ${filtered[i].content}`;
    // Budget exhausted and at least one message already kept → stop here.
    if (totalChars + entry.length > maxChars && firstKept < filtered.length) break;
    // The newest message alone exceeds the budget: it still counts as kept,
    // and everything before it is dropped.
    if (entry.length > maxChars) { firstKept = i; break; }
    firstKept = i;
    totalChars += entry.length;
  }
  const dropped = filtered.slice(0, firstKept);
  if (dropped.length === 0) return '';

  // JSON-encode [role, content] pairs so message boundaries are unambiguous;
  // a plain concatenation would let different message sequences share a key.
  const key = createHash('sha256')
    .update(JSON.stringify(dropped.map(m => [m.role, m.content])))
    .digest('hex');
  const cached = earlierSummaryCache.get(key);
  if (cached !== undefined) return cached;

  // Compact transcript of the dropped messages, capped (600 chars per message,
  // 24000 overall) so the summarization prompt stays cheap even when a lot has
  // overflowed.
  const transcript = dropped
    .map(m => `${m.role === 'user' ? 'User' : 'Assistant'}: ${m.content.replace(/\s+/g, ' ').slice(0, 600)}`)
    .join('\n')
    .slice(0, 24000);

  const system = 'You are condensing the EARLIER part of an ongoing coding session that no longer fits the context window. Summarize what happened in 3-6 sentences: concrete decisions made, constraints/requirements stated, files or APIs involved, and anything still unfinished. Past tense, no preamble, no bullet headers — just the recap.';

  try {
    const { chat } = await import('../api/index.js');
    const summary = (await chat(transcript, [{ role: 'system', content: system }])).trim();
    if (!summary) return '';
    const block = `\n\n## Earlier Conversation (summarized)\nThe earlier part of this session was condensed to fit context. Treat it as established background:\n\n${summary}`;
    // Map iterates in insertion order, so the first key is the oldest entry.
    if (earlierSummaryCache.size >= EARLIER_SUMMARY_CACHE_MAX) {
      const oldestKey = earlierSummaryCache.keys().next().value;
      if (oldestKey !== undefined) earlierSummaryCache.delete(oldestKey);
    }
    earlierSummaryCache.set(key, block);
    return block;
  } catch {
    // Deliberate best-effort: a failed recap must not block the agent run —
    // the recent verbatim history still gets injected by the caller.
    return '';
  }
}
281+
204282
export function getAgentSystemPrompt(projectContext: ProjectContext): string {
205283
const root = projectContext.root || process.cwd();
206284
// State the real underlying model/provider so "which model are you"

src/utils/shell.test.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,25 @@ describe('validateCommand', () => {
6161
expect(validateCommand('node', ['index.js']).valid).toBe(true);
6262
});
6363

64+
// Each row is [command, args]; every row must be rejected by validateCommand.
it('blocks inline code execution (interpreter eval flags)', () => {
  const rejected: Array<[string, string[]]> = [
    ['node', ['-e', 'process.exit(1)']],
    ['node', ['--eval', 'x']],
    ['node', ['-p', 'x']],
    ['node', ['-pe', 'x']], // combined short cluster
    ['python', ['-c', 'import os']],
    ['python3', ['-c', 'x']],
    ['php', ['-r', 'x']],
    ['deno', ['eval', 'x']], // bare subcommand
  ];
  for (const [command, args] of rejected) {
    expect(validateCommand(command, args).valid).toBe(false);
  }
});

// Each row is [command, args]; every row must still be accepted.
it('still allows interpreters running a file (not inline eval)', () => {
  const accepted: Array<[string, string[]]> = [
    ['node', ['app.js', '--port', '3000']],
    ['python', ['script.py']],
    ['deno', ['run', 'main.ts']],
    // -e on a NON-interpreter command is unaffected (not in the eval map).
    ['npx', ['some-tool', '-e', 'config']],
  ];
  for (const [command, args] of accepted) {
    expect(validateCommand(command, args).valid).toBe(true);
  }
});
82+
6483
it('blocks rm -rf / pattern', () => {
6584
const result = validateCommand('rm', ['-rf', '/']);
6685
expect(result.valid).toBe(false);

src/utils/shell.ts

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,34 @@ const ALLOWED_COMMANDS = new Set([
106106
'http', 'https',
107107
]);
108108

109+
// Interpreter flags that execute inline code straight from the command line.
// Without this check, a whitelisted runtime (`node`, `python`, …) becomes
// arbitrary code execution — `node -e "<anything>"`, `python -c "<anything>"` —
// bypassing the command whitelist entirely. File execution (`node app.js`)
// stays allowed; only the eval flags are blocked.
//
// SHORT holds single-letter flags (matched anywhere inside a `-xyz` cluster);
// LONG holds `--flag` names and bare subcommands (e.g. `deno eval`).
const INLINE_EVAL_SHORT: Record<string, string[]> = {
  node: ['e', 'p'],
  bun: ['e', 'p'],
  python: ['c'],
  python3: ['c'],
  // -r runs code; -B/-R/-E run code before/for-each/after stdin lines.
  php: ['r', 'B', 'R', 'E'],
  ruby: ['e'],
  perl: ['e', 'E'],
};
const INLINE_EVAL_LONG: Record<string, string[]> = {
  node: ['--eval', '--print'],
  deno: ['eval'],
  bun: ['--eval', '--print'],
  // Long spellings of php's -r / -B / -R / -E.
  php: ['--run', '--process-begin', '--process-code', '--process-end'],
};

// Own-property lookup: a plain object also answers for inherited keys such as
// `constructor`, which would hand back a non-array and break `.includes`.
function inlineEvalFlagsFor(table: Record<string, string[]>, command: string): string[] {
  return Object.prototype.hasOwnProperty.call(table, command) ? (table[command] ?? []) : [];
}

/**
 * Report whether `args` asks the interpreter `command` to execute code given
 * inline on the command line (rather than from a file).
 *
 * Matching rules, applied to every argument:
 *  - `--name` or `--name=value`: blocked when `--name` is a listed long flag.
 *  - `-abc` (single dash, at least one letter): blocked when ANY letter in the
 *    cluster is a listed short flag, so `-pe` and attached values like
 *    `-eCODE` are caught.
 *  - anything else: blocked when it equals a listed bare subcommand (`eval`).
 *
 * Arguments are checked regardless of position, so a script's own options are
 * scanned too (`node app.js -p 3000` is rejected). This errs on the side of
 * blocking: telling script options apart would require knowing which
 * interpreter flags consume a value.
 *
 * @param command Executable name, matched exactly against the flag tables.
 * @param args Arguments as separate argv entries (not a shell string).
 * @returns true when an inline-eval flag is present; false otherwise, including
 *   for commands that have no entry in either table.
 */
function hasInlineEval(command: string, args: string[]): boolean {
  const short = inlineEvalFlagsFor(INLINE_EVAL_SHORT, command);
  const long = inlineEvalFlagsFor(INLINE_EVAL_LONG, command);
  if (short.length === 0 && long.length === 0) return false;
  for (const arg of args) {
    if (arg.startsWith('--')) {
      // Compare only the flag name so `--eval=code` matches `--eval`.
      const eq = arg.indexOf('=');
      const flagName = eq === -1 ? arg : arg.slice(0, eq);
      if (long.includes(flagName)) return true; // --eval / --print(=...)
    } else if (arg.length > 1 && arg.startsWith('-')) {
      if (arg.slice(1).split('').some((letter) => short.includes(letter))) return true; // -e, -c, -pe …
    } else if (long.includes(arg)) {
      return true; // bare subcommand, e.g. `deno eval`
    }
  }
  return false;
}
136+
109137
/**
110138
* Validate if a command is safe to execute
111139
*/
@@ -123,7 +151,13 @@ export function validateCommand(
123151
if (!ALLOWED_COMMANDS.has(command)) {
124152
return { valid: false, reason: `Command '${command}' is not in the allowed list` };
125153
}
126-
154+
155+
// Block inline-code execution that would turn a whitelisted interpreter into
156+
// arbitrary code execution (the whitelist alone doesn't stop `node -e "…"`).
157+
if (hasInlineEval(command, args)) {
158+
return { valid: false, reason: `Inline code execution via '${command}' (e.g. -e/-c/--eval) is not allowed in agent mode — put the code in a file and run that, or run it yourself.` };
159+
}
160+
127161
// Check full command string against dangerous patterns
128162
const fullCommand = `${command} ${args.join(' ')}`;
129163
for (const pattern of BLOCKED_PATTERNS) {

0 commit comments

Comments
 (0)