feat: implement auto-summarization of chat history and block inline code execution in agent mode

VladoIvankovic · VladoIvankovic · commit f5438f76e58a · 2026-05-22T12:06:59.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,30 @@ For releases before v1.3.35, see [GitHub Releases](https://github.com/VladoIvank
 > as the social-share summary (IFTTT → X/Bluesky), capped at 220 chars.
 > If omitted, the feed falls back to the first paragraph.
 
+## [2.1.4] — 2026-05-22
+
+> Long agent runs no longer silently forget how they started — when prior chat history overflows the context budget, the dropped older messages are summarized instead of just truncated. Plus a command-whitelist hardening.
+
+### Security
+
+- **Inline code execution is blocked in agent mode.** The command whitelist
+  allowed interpreters like `node`/`python`/`php`, but their eval flags
+  (`node -e`, `python -c`, `php -r`, `deno eval`, …) turned a whitelisted
+  runtime into arbitrary code execution. Those flags are now rejected (including
+  combined short clusters like `-pe`). Running a *file* (`node app.js`,
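+  `python script.py`) is unaffected — though every argument is scanned, so a script's own single-dash flag containing an eval letter (e.g. `node app.js -p 3000`) is rejected too; use the long form.
+  Defense-in-depth — the manual-mode permission prompt is still the primary gate.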
+
+### Added
+
+- **Auto-summarized history.** When the prior conversation exceeds the agent's
+  context budget, Codeep now condenses the dropped (oldest) messages into a
+  short recap — preserving early decisions, constraints, and unfinished threads
+  — and injects it before the recent verbatim history. Previously those older
+  messages were silently truncated. The recap is one cheap LLM call, made only
+  on overflow and cached per session. Opt out with
+  `autoSummarizeHistory: false` (falls back to plain truncation, no extra call).
+
 ## [2.1.3] — 2026-05-22
 
 > Security hardening: project hooks now require trust before they run, the web-fetch tool blocks internal/metadata addresses, and usage stats are sent with your sync token.
diff --git a/README.md b/README.md
@@ -181,7 +181,7 @@ Codeep works as a **full AI coding agent** that autonomously:
 ### Context Persistence
 - **Save conversations** - Continue where you left off
 - **Per-project context** - Each project maintains its own history
-- **Automatic summarization** - Old messages are summarized to save space
+- **Automatic summarization** - When prior history overflows the agent's context budget, the dropped (oldest) messages are condensed into a short recap (decisions, constraints, unfinished threads) instead of being silently truncated — so long sessions don't forget how they started. One cheap LLM call, made only on overflow and cached per session; opt out with `autoSummarizeHistory: false` (`/settings`)
 
 ### Web & MCP Tools
 - Agent can fetch documentation and web content
diff --git a/src/config/index.ts b/src/config/index.ts
@@ -56,6 +56,11 @@ interface ConfigSchema {
    *  small background API call (uses the active model) once per session.
    *  Default true; set false to avoid any unsolicited API calls. */
   autoSessionTitle: boolean;
+  /** When prior chat history overflows the agent's context budget, summarize
+   *  the dropped (oldest) messages via one LLM call instead of silently
+   *  discarding them — so long sessions keep early decisions/constraints.
+   *  Default true; set false to fall back to plain truncation (no extra call). */
+  autoSummarizeHistory: boolean;
   /** Absolute workspace roots whose project-local `.codeep/hooks/*` the user
    *  has approved to run. Untrusted projects' hooks are skipped (a cloned repo
    *  can't execute shell on first tool call). Granted via `/hooks trust`. */
@@ -281,6 +286,7 @@ function createConfig(): Conf<ConfigSchema> {
     language: 'en',
     autoSave: true,
     autoSessionTitle: true,
+    autoSummarizeHistory: true,
     trustedHookProjects: [],
     currentSessionId: '',
     temperature: 0.7,
diff --git a/src/utils/agent.ts b/src/utils/agent.ts
@@ -24,6 +24,7 @@ import {
   loadProgressLog,
   writeProgressLog,
   formatChatHistoryForAgent,
+  summarizeEarlierHistory,
 } from './agentChat';
 import { ApiError } from '../api/index';
 import type { AgentChatResponse } from './agentChat';
@@ -346,7 +347,13 @@ export async function runAgent(
     systemPrompt += taskCtx;
   }
 
-  // Inject prior chat session context
+  // Inject prior chat session context. When the history overflows the budget,
+  // prepend an LLM recap of the dropped (oldest) messages so long sessions
+  // keep early decisions/constraints, then the recent messages verbatim.
+  const earlierSummary = await summarizeEarlierHistory(opts.chatHistory);
+  if (earlierSummary) {
+    systemPrompt += earlierSummary;
+  }
   const chatHistoryStr = formatChatHistoryForAgent(opts.chatHistory);
   if (chatHistoryStr) {
     systemPrompt += chatHistoryStr;
diff --git a/src/utils/agentChat.test.ts b/src/utils/agentChat.test.ts
@@ -32,7 +32,23 @@ vi.mock('./agentStream', () => ({
   AgentChatResponse: {},
 }));
 
-import { loadProjectRules, formatChatHistoryForAgent, TimeoutError } from './agentChat';
+import { loadProjectRules, formatChatHistoryForAgent, summarizeEarlierHistory, TimeoutError } from './agentChat';
+
+describe('summarizeEarlierHistory', () => { // covers only the paths that return '' before any summarization call
+  it('returns empty for missing/empty history', async () => {
+    expect(await summarizeEarlierHistory()).toBe(''); // history argument omitted
+    expect(await summarizeEarlierHistory([])).toBe(''); // history present but empty
+  });
+
+  it('returns empty when nothing overflows the budget (no LLM call)', async () => {
+    const history: Array<{ role: 'user' | 'assistant'; content: string }> = [
+      { role: 'user', content: 'hi' },
+      { role: 'assistant', content: 'hello' },
+    ];
+    // Everything fits → no dropped messages → '' is returned before chat() is reached. NOTE(review): only the return value is asserted; nothing here verifies that chat() was never invoked.
+    expect(await summarizeEarlierHistory(history, 16000)).toBe('');
+  });
+});
 
 describe('loadProjectRules', () => {
   beforeEach(() => {
diff --git a/src/utils/agentChat.ts b/src/utils/agentChat.ts
@@ -14,6 +14,7 @@
 
 import { existsSync, readFileSync, writeFileSync } from 'fs';
 import { join } from 'path';
+import { createHash } from 'crypto';
 import { ProjectContext } from './project';
 import { config, getApiKey, Message, resolveBaseUrl } from '../config/index';
 import { loadProjectIntelligence, generateContextFromIntelligence } from './projectIntelligence';
@@ -201,6 +202,83 @@ export function formatChatHistoryForAgent(
   return `\n\n## Prior Conversation Context\nThe following is the recent chat history from this session. Use it as background context to understand the user's intent, but focus on completing the current task.\n\n${lines}`;
 }
 
+// Returns the messages that count as "real" conversation, dropping agent bookkeeping entries. Intended to mirror the noise
+// filter formatChatHistoryForAgent uses — NOTE(review): this is a separate copy, so confirm the two are actually kept in sync.
+function filterAgentHistory<T extends { role: string; content: string }>(history: T[]): T[] {
+  return history.filter(m => {
+    const content = m.content.trimStart(); // leading whitespace is ignored when matching the prefixes below
+    if (content.startsWith('[AGENT]') || content.startsWith('[DRY RUN]')) return false; // messages tagged [AGENT] / [DRY RUN]
+    if (content.startsWith('Agent completed') || content.startsWith('Agent failed') || content.startsWith('Agent stopped')) return false; // agent run-status lines
+    return true;
+  });
+}
+
+// Cache summaries by a hash of the dropped messages, so re-running the agent in the same session (same overflow)
+// doesn't re-summarize on every task. In-memory only; failed or blank summaries are never stored, so they are retried.
+const earlierSummaryCache = new Map<string, string>();
+
+/**
+ * Summarize the OVERFLOW that `formatChatHistoryForAgent` drops. When prior
+ * history exceeds `maxChars`, that function keeps only the most recent messages
+ * and silently discards the older ones — losing early decisions/constraints on
+ * long sessions. This condenses those dropped messages into a short recap that
+ * the caller prepends *before* the recent verbatim history.
+ *
+ * @param history  Prior chat messages, oldest first; agent status noise is filtered out before budgeting.
+ * @param maxChars Character budget used to decide which recent messages are kept verbatim (default 16000).
+ * @returns A markdown recap block, or '' when opted out (`autoSummarizeHistory === false`), nothing overflows, the model returns blank text, or the summarization call fails (graceful fallback — the recent history still goes in).
+ */
+export async function summarizeEarlierHistory(
+  history?: Array<{ role: 'user' | 'assistant'; content: string }>,
+  maxChars: number = 16000,
+): Promise<string> {
+  if (config.get('autoSummarizeHistory') === false) return '';
+  if (!history || history.length === 0) return '';
+
+  const filtered = filterAgentHistory(history);
+  if (filtered.length === 0) return '';
+
+  // Mirror formatChatHistoryForAgent's newest→oldest budget walk to find which
+  // messages it KEEPS; everything older than the oldest kept message is dropped.
+  let totalChars = 0;
+  let firstKept = filtered.length;
+  for (let i = filtered.length - 1; i >= 0; i--) {
+    const entry = `${filtered[i].role === 'user' ? 'User' : 'Assistant'}: ${filtered[i].content}`;
+    if (totalChars + entry.length > maxChars && firstKept < filtered.length) break; // budget exhausted and at least one message already kept
+    if (entry.length > maxChars) { firstKept = i; break; } // newest message alone exceeds the budget: it counts as kept, everything older is dropped
+    firstKept = i;
+    totalChars += entry.length;
+  }
+  const dropped = filtered.slice(0, firstKept);
+  if (dropped.length === 0) return '';
+
+  const key = createHash('sha256')
+    .update(JSON.stringify(dropped.map(m => [m.role, m.content]))) // JSON-encode (role, content) pairs: a plain space-join makes message boundaries ambiguous
+    .digest('hex');
+  const cached = earlierSummaryCache.get(key);
+  if (cached) return cached;
+
+  // Compact transcript of the dropped messages, capped so the summarization
+  // prompt stays cheap even when a lot has overflowed.
+  const transcript = dropped
+    .map(m => `${m.role === 'user' ? 'User' : 'Assistant'}: ${m.content.replace(/\s+/g, ' ').slice(0, 600)}`)
+    .join('\n')
+    .slice(0, 24000);
+
+  const system = 'You are condensing the EARLIER part of an ongoing coding session that no longer fits the context window. Summarize what happened in 3-6 sentences: concrete decisions made, constraints/requirements stated, files or APIs involved, and anything still unfinished. Past tense, no preamble, no bullet headers — just the recap.';
+
+  try {
+    const { chat } = await import('../api/index.js');
+    const summary = (await chat(transcript, [{ role: 'system', content: system }])).trim();
+    if (!summary) return '';
+    const block = `\n\n## Earlier Conversation (summarized)\nThe earlier part of this session was condensed to fit context. Treat it as established background:\n\n${summary}`;
+    earlierSummaryCache.set(key, block);
+    return block;
+  } catch {
+    return ''; // graceful — recent verbatim history still gets injected
+  }
+}
+
 export function getAgentSystemPrompt(projectContext: ProjectContext): string {
   const root = projectContext.root || process.cwd();
   // State the real underlying model/provider so "which model are you"
diff --git a/src/utils/shell.test.ts b/src/utils/shell.test.ts
@@ -61,6 +61,25 @@ describe('validateCommand', () => {
     expect(validateCommand('node', ['index.js']).valid).toBe(true);
   });
 
+  it('blocks inline code execution (interpreter eval flags)', () => { // ruby, perl and bun entries of the eval tables are not exercised here
+    expect(validateCommand('node', ['-e', 'process.exit(1)']).valid).toBe(false); // short flag
+    expect(validateCommand('node', ['--eval', 'x']).valid).toBe(false); // long flag
+    expect(validateCommand('node', ['-p', 'x']).valid).toBe(false); // second short flag listed for node
+    expect(validateCommand('node', ['-pe', 'x']).valid).toBe(false); // combined short cluster
+    expect(validateCommand('python', ['-c', 'import os']).valid).toBe(false);
+    expect(validateCommand('python3', ['-c', 'x']).valid).toBe(false); // python3 has its own table entry
+    expect(validateCommand('php', ['-r', 'x']).valid).toBe(false);
+    expect(validateCommand('deno', ['eval', 'x']).valid).toBe(false); // bare subcommand
+  });
+
+  it('still allows interpreters running a file (not inline eval)', () => {
+    expect(validateCommand('node', ['app.js', '--port', '3000']).valid).toBe(true); // long `--port`: a short `-p` here would be rejected, since every argument is scanned
+    expect(validateCommand('python', ['script.py']).valid).toBe(true);
+    expect(validateCommand('deno', ['run', 'main.ts']).valid).toBe(true); // `run` is not a listed eval subcommand
+    // -e on a NON-interpreter command is unaffected (not in the eval map).
+    expect(validateCommand('npx', ['some-tool', '-e', 'config']).valid).toBe(true);
+  });
+
   it('blocks rm -rf / pattern', () => {
     const result = validateCommand('rm', ['-rf', '/']);
     expect(result.valid).toBe(false);
diff --git a/src/utils/shell.ts b/src/utils/shell.ts
@@ -106,6 +106,34 @@ const ALLOWED_COMMANDS = new Set([
   'http', 'https',
 ]);
 
+// Interpreter flags/subcommands that execute inline code straight from the command line.
+// Without this check, a whitelisted runtime (`node`, `python`, …) becomes
+// arbitrary code execution — `node -e "<anything>"`, `python -c "<anything>"` —
+// bypassing the command whitelist entirely. File execution (`node app.js`) stays allowed.
+// Keys are exact command names; short flags are stored WITHOUT the dash. PHP also runs code via -B/-R/-E and --run/--process-*; Bun via -p/--print.
+const INLINE_EVAL_SHORT: Record<string, string[]> = {
+  node: ['e', 'p'], bun: ['e', 'p'], python: ['c'], python3: ['c'], php: ['r', 'B', 'R', 'E'], ruby: ['e'], perl: ['e', 'E'],
+};
+const INLINE_EVAL_LONG: Record<string, string[]> = {
+  node: ['--eval', '--print'], deno: ['eval'], bun: ['--eval', '--print'], php: ['--run', '--process-begin', '--process-code', '--process-end'],
+};
+/** True when `args` carries an inline-eval flag or subcommand for `command`. Deliberately over-blocks: every argument is scanned (a script's own arguments included) and any listed letter anywhere in a single-dash argument matches. */
+function hasInlineEval(command: string, args: string[]): boolean {
+  const short = INLINE_EVAL_SHORT[command] ?? [];
+  const long = INLINE_EVAL_LONG[command] ?? [];
+  if (short.length === 0 && long.length === 0) return false;
+  for (const arg of args) {
+    if (arg.startsWith('--')) {
+      if (long.includes(arg.split('=')[0])) return true;            // --eval / --print(=...)
+    } else if (arg.length > 1 && arg.startsWith('-')) {
+      if (arg.slice(1).split('').some((l) => short.includes(l))) return true; // -e, -c, -pe …
+    } else if (long.includes(arg)) {
+      return true;                                                   // bare subcommand, e.g. `deno eval`
+    }
+  }
+  return false;
+}
+
 /**
  * Validate if a command is safe to execute
  */
@@ -123,7 +151,13 @@ export function validateCommand(
   if (!ALLOWED_COMMANDS.has(command)) {
     return { valid: false, reason: `Command '${command}' is not in the allowed list` };
   }
-  
+
+  // Block inline-code execution that would turn a whitelisted interpreter into
+  // arbitrary code execution (the whitelist alone doesn't stop `node -e "…"`).
+  if (hasInlineEval(command, args)) {
+    return { valid: false, reason: `Inline code execution via '${command}' (e.g. -e/-c/--eval) is not allowed in agent mode — put the code in a file and run that, or run it yourself.` };
+  }
+
   // Check full command string against dangerous patterns
   const fullCommand = `${command} ${args.join(' ')}`;
   for (const pattern of BLOCKED_PATTERNS) {