
Commit df018d6

Merge pull request #305 from GoDiao/feature/compact-plugin
Add compact plugin for auto context compression
2 parents: 3ccfab6 + 86d797f

9 files changed: 519 additions & 10 deletions


.github/workflows/test.yml

Lines changed: 2 additions & 2 deletions
@@ -84,7 +84,7 @@ jobs:
       - name: Start optillm server
         run: |
           echo "Starting optillm server for integration tests..."
-          OPTILLM_API_KEY=optillm python optillm.py --model google/gemma-3-270m-it --port 8000 &
+          OPTILLM_API_KEY=optillm python optillm.py --model Qwen/Qwen2.5-Coder-0.5B-Instruct --port 8000 &
           echo $! > server.pid

           # Wait for server to be ready
@@ -179,7 +179,7 @@ jobs:
           echo "Starting optillm server with conversation logging..."
           mkdir -p /tmp/optillm_conversations
           OPTILLM_API_KEY=optillm python optillm.py \
-            --model google/gemma-3-270m-it \
+            --model Qwen/Qwen2.5-Coder-0.5B-Instruct \
             --port 8000 \
             --log-conversations \
             --conversation-log-dir /tmp/optillm_conversations &

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.3.14"
+__version__ = "0.3.15"

 # Import from server module
 from .server import (

optillm/plugins/compact_plugin.py

Lines changed: 197 additions & 0 deletions
"""
Compact plugin for OptiLLM.

Automatically compresses conversation context when it exceeds a token budget,
preserving recent turns verbatim and generating a structured summary of older
content — inspired by Claude Code's compact mechanism.

Uses one LLM call to produce a structured summary with:
Scope, Key decisions, User preferences, Pending work, Key files referenced.
Recent turns are preserved verbatim.

Composable with other approaches via & operator: compact&moa, compact&bon, etc.

Configuration (env vars or request_config):
    COMPACT_CONTEXT_WINDOW / compact_context_window — max context tokens (default: 128000)
    COMPACT_THRESHOLD / compact_threshold — trigger ratio 0.0-1.0 (default: 0.75)
    COMPACT_KEEP_RECENT / compact_keep_recent — turns to preserve verbatim (default: 4)
"""

import os
import re
import logging
from typing import Tuple, List, Optional

logger = logging.getLogger(__name__)

SLUG = "compact"

DEFAULT_CONTEXT_WINDOW = 128000
DEFAULT_THRESHOLD = 0.75
DEFAULT_KEEP_RECENT = 4

COMPACT_SYSTEM_PROMPT = """You are a conversation summarizer. Given a conversation history, produce a structured summary.

Output ONLY this format, nothing else:

<summary>
Conversation summary:
- Scope: {N} earlier messages compacted (user={U}, assistant={A}).
- Key decisions: {list the main decisions or conclusions reached}
- User preferences: {any stated preferences or constraints}
- Pending work: {any remaining tasks or next steps mentioned}
- Key files referenced: {file paths mentioned, if any}
- Context: {a concise paragraph capturing the essential context needed to continue}
</summary>

Rules:
- Be specific: include actual values, names, and file paths — not vague references
- Be concise: each section should be 1-2 lines maximum
- Omit pleasantries, greetings, and filler
- The Context paragraph is the most important part — it should capture everything a new assistant would need to pick up where this left off"""


def _get_config(request_config: Optional[dict], key: str, env_var: str, default):
    """Resolve a setting from request_config, then the env var, then the default."""
    val = None
    if request_config:
        val = request_config.get(key)
    if val is None:
        env_val = os.environ.get(env_var)
        if env_val is not None:
            try:
                val = type(default)(env_val)
            except (ValueError, TypeError):
                logger.warning(f"Invalid value for {env_var}: {env_val!r}, using default {default}")
                val = default
    return val if val is not None else default


def _get_context_window(client, model: str, request_config: Optional[dict]) -> int:
    """Get context window size: try provider /models endpoint first, then config fallback."""
    try:
        model_info = client.models.retrieve(model)
        for attr in ("context_length", "max_context_length", "context_window",
                     "max_model_length", "max_position_embeddings"):
            val = getattr(model_info, attr, None)
            if val is not None:
                return int(val)
    except Exception:
        pass

    return _get_config(request_config, "compact_context_window", "COMPACT_CONTEXT_WINDOW", DEFAULT_CONTEXT_WINDOW)


def estimate_tokens(text: str) -> int:
    """Estimate token count with tiktoken when available, else roughly 4 chars per token."""
    try:
        import tiktoken
        enc = tiktoken.encoding_for_model("gpt-4")
        return len(enc.encode(text))
    except (ImportError, KeyError):
        return max(1, len(text) // 4)


def parse_tagged_conversation(text: str) -> List[Tuple[str, str]]:
    """Split a 'User:'/'Assistant:' tagged transcript into (role, content) turns."""
    turns = []
    for match in re.finditer(r'^(User:|Assistant:)\s*', text, re.MULTILINE):
        role = "user" if match.group(1) == "User:" else "assistant"
        start = match.end()
        next_match = re.search(r'^(User:|Assistant:)', text[start:], re.MULTILINE)
        if next_match:
            content = text[start:start + next_match.start()].strip()
        else:
            content = text[start:].strip()
        turns.append((role, content))
    return turns


def reconstruct_tagged(turns: List[Tuple[str, str]]) -> str:
    """Rebuild tagged transcript text from (role, content) turns."""
    lines = []
    for role, content in turns:
        tag = "User:" if role == "user" else "Assistant:"
        lines.append(f"{tag} {content}")
    return "\n".join(lines)


def compress_with_llm(
    older_turns: List[Tuple[str, str]],
    system_prompt: str,
    client,
    model: str,
) -> Tuple[Optional[str], int]:
    """Summarize older turns with a single LLM call; return (summary, completion_tokens)."""
    conversation_text = reconstruct_tagged(older_turns)

    system_content = COMPACT_SYSTEM_PROMPT
    if system_prompt:
        system_content += f"\n\nOriginal system context: {system_prompt}"

    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": conversation_text},
    ]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=2000,
            temperature=0.3,
        )
    except Exception as e:
        logger.error(f"Compact: LLM compression failed: {e}")
        return None, 0

    raw = response.choices[0].message.content.strip()
    tokens_used = response.usage.completion_tokens if response.usage else 0

    match = re.search(r'<summary>(.*?)</summary>', raw, re.DOTALL)
    if match:
        summary = match.group(1).strip()
    else:
        summary = raw

    return summary, tokens_used


def run(
    system_prompt: str,
    initial_query: str,
    client,
    model: str,
    request_config: Optional[dict] = None,
) -> Tuple[str, int]:
    """Plugin entry point: compact initial_query when it exceeds the token budget."""
    context_window = _get_context_window(client, model, request_config)
    threshold = _get_config(request_config, "compact_threshold", "COMPACT_THRESHOLD", DEFAULT_THRESHOLD)
    keep_recent = _get_config(request_config, "compact_keep_recent", "COMPACT_KEEP_RECENT", DEFAULT_KEEP_RECENT)

    token_count = estimate_tokens(initial_query)
    budget = int(context_window * threshold)

    if token_count < budget:
        logger.debug(f"Compact: passthrough ({token_count} tokens < {budget} budget)")
        return initial_query, 0

    turns = parse_tagged_conversation(initial_query)
    if len(turns) <= keep_recent:
        logger.debug(f"Compact: too few turns to compress ({len(turns)} <= {keep_recent})")
        return initial_query, 0

    split_idx = len(turns) - keep_recent
    older_turns = turns[:split_idx]
    recent_turns = turns[split_idx:]

    logger.info(f"Compact: compressing {len(older_turns)} older turns, keeping {len(recent_turns)} recent")

    summary, tokens_used = compress_with_llm(older_turns, system_prompt, client, model)

    if summary is None:
        logger.warning("Compact: compression failed, returning original query")
        return initial_query, 0

    compressed_turns = [("user", f"[Conversation summary]:\n{summary}")]
    compressed_turns.extend(recent_turns)

    result = reconstruct_tagged(compressed_turns)
    new_token_count = estimate_tokens(result)
    logger.info(f"Compact: {token_count} -> {new_token_count} tokens (used {tokens_used} for compression)")

    return result, tokens_used
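
The plugin operates on a flat tagged transcript. A minimal sketch of the round-trip through the two pure helpers above (the sample transcript is illustrative):

from optillm.plugins.compact_plugin import parse_tagged_conversation, reconstruct_tagged

text = (
    "User: Rename config.yaml to settings.yaml.\n"
    "Assistant: Done. I also updated the loader in app/config.py.\n"
    "User: Great, now add a regression test."
)

turns = parse_tagged_conversation(text)
# [('user', 'Rename config.yaml to settings.yaml.'),
#  ('assistant', 'Done. I also updated the loader in app/config.py.'),
#  ('user', 'Great, now add a regression test.')]

assert reconstruct_tagged(turns) == text  # lossless for single-line turns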

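A sketch of exercising the plugin end to end against a locally running optillm server (port and API key match the workflow above; the approach-prefix on the model name follows optillm's usual slug routing, and the model itself is a placeholder):

from openai import OpenAI

# Assumes an optillm server is already running locally, as in the CI workflow.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

long_history = "User: ...\nAssistant: ...\nUser: ..."  # a long tagged transcript

# "compact-<model>" routes through the plugin; per the docstring,
# "compact&moa-<model>" would chain it with another approach. The COMPACT_*
# environment variables tune the trigger budget before the server starts.
response = client.chat.completions.create(
    model="compact-Qwen/Qwen2.5-Coder-0.5B-Instruct",
    messages=[{"role": "user", "content": long_history}],
)
print(response.choices[0].message.content)
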
optillm/plugins/json_plugin.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def get_device(self):
         else:
             return torch.device("cpu")

-    def __init__(self, model_name: str = "google/gemma-3-270m-it"):
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"):
         """Initialize the JSON generator with a specific model."""
         self.device = self.get_device()
         logger.info(f"Using device: {self.device}")

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "optillm"
-version = "0.3.14"
+version = "0.3.15"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
