Skip to content

Commit 70cb0db

Browse files
Merge pull request lightspeed-core#1718 from max-svistunov/lcore-1569-1570-token-estimation-and-compaction-module
LCORE-1569 LCORE-1570: token estimation + compaction core modules
1 parent fd255ad commit 70cb0db

13 files changed

Lines changed: 1958 additions & 0 deletions

docs/openapi.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11898,6 +11898,46 @@
1189811898
"title": "ClientCredentialsOAuthFlow",
1189911899
"description": "Defines configuration details for the OAuth 2.0 Client Credentials flow."
1190011900
},
11901+
"CompactionConfiguration": {
11902+
"properties": {
11903+
"enabled": {
11904+
"type": "boolean",
11905+
"title": "Enable compaction",
11906+
"description": "When true, older conversation turns are summarized when estimated tokens approach the context window limit.",
11907+
"default": false
11908+
},
11909+
"threshold_ratio": {
11910+
"type": "number",
11911+
"title": "Threshold ratio",
11912+
"description": "Trigger compaction when estimated tokens exceed this fraction of the model's context window (0.0-1.0).",
11913+
"default": 0.7
11914+
},
11915+
"token_floor": {
11916+
"type": "integer",
11917+
"minimum": 0.0,
11918+
"title": "Token floor",
11919+
"description": "Minimum token count before compaction can trigger. Prevents triggering on very small context windows.",
11920+
"default": 4096
11921+
},
11922+
"buffer_turns": {
11923+
"type": "integer",
11924+
"minimum": 0.0,
11925+
"title": "Buffer turns",
11926+
"description": "Number of recent turns to keep verbatim.",
11927+
"default": 4
11928+
},
11929+
"buffer_max_ratio": {
11930+
"type": "number",
11931+
"title": "Buffer max ratio",
11932+
"description": "Maximum fraction of context window the buffer zone can occupy, regardless of buffer_turns.",
11933+
"default": 0.3
11934+
}
11935+
},
11936+
"additionalProperties": false,
11937+
"type": "object",
11938+
"title": "CompactionConfiguration",
11939+
"description": "Configuration for conversation history compaction.\n\nCompaction summarizes older conversation turns when their estimated\ntoken count approaches the context window limit, keeping the\nconversation usable instead of failing with HTTP 413. The\nconfiguration here controls when compaction triggers and how much\nrecent context is preserved verbatim.\n\nAttributes:\n enabled: Master switch. When False, compaction never triggers\n and other fields are inert.\n threshold_ratio: Trigger compaction when estimated input tokens\n exceed this fraction of the model's context window\n (clamped to 0.0..1.0).\n token_floor: Minimum estimated token count before compaction\n can trigger, regardless of threshold_ratio. Prevents\n triggering on very small context windows.\n buffer_turns: Initial number of recent turns to keep verbatim.\n The runtime applies a degrading guard \u2014 if these turns\n exceed the available budget, it reduces buffer_turns by\n one repeatedly until the budget fits, down to zero.\n buffer_max_ratio: Hard cap on the fraction of the context\n window the buffer zone may occupy, regardless of\n buffer_turns."
11940+
},
1190111941
"Configuration": {
1190211942
"properties": {
1190311943
"name": {
@@ -11971,6 +12011,11 @@
1197112011
"$ref": "#/components/schemas/ConversationHistoryConfiguration",
1197212012
"title": "Conversation history configuration"
1197312013
},
12014+
"compaction": {
12015+
"$ref": "#/components/schemas/CompactionConfiguration",
12016+
"title": "Conversation compaction configuration",
12017+
"description": "Controls when conversation history is summarized to keep the model's input below the context window limit. Disabled by default \u2014 when disabled, requests that exceed the window continue to surface as HTTP 413."
12018+
},
1197412019
"byok_rag": {
1197512020
"items": {
1197612021
"$ref": "#/components/schemas/ByokRag"
@@ -13391,6 +13436,15 @@
1339113436
],
1339213437
"title": "Default provider",
1339313438
"description": "Identification of default provider used when no other model is specified."
13439+
},
13440+
"context_windows": {
13441+
"additionalProperties": {
13442+
"type": "integer",
13443+
"exclusiveMinimum": 0.0
13444+
},
13445+
"type": "object",
13446+
"title": "Per-model context window sizes (tokens)",
13447+
"description": "Map of fully-qualified model identifier (e.g., \"openai/gpt-4o-mini\") to context window size in tokens. Used by the conversation compaction trigger to decide when older turns must be summarized before the input exceeds the window. Models absent from this map have no registered window \u2014 callers fall back to their own default or skip the token-based trigger."
1339413448
}
1339513449
},
1339613450
"additionalProperties": false,

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ dependencies = [
7676
# Used for error tracking and monitoring
7777
"sentry-sdk[fastapi]>=2.58.0",
7878
"python-dotenv>=1.2.2",
79+
# Used for token estimation before LLM calls (LCORE-1569 / conversation compaction)
80+
"tiktoken>=0.8.0",
7981
]
8082

8183

src/models/compaction.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Pydantic models for conversation compaction.
2+
3+
Defines ``ConversationSummary`` — one chunk produced each time
4+
compaction triggers. The compaction module (``src/utils/compaction.py``)
5+
creates instances of this model from raw Llama Stack conversation
6+
items; the conversation cache (LCORE-1571) is responsible for
7+
persisting them.
8+
9+
Each compaction run produces exactly one ``ConversationSummary``. The
10+
additive design (decision 2 of the spike) keeps every chunk's summary
11+
as a separate record — they are only re-summarized into a single
12+
record by the recursive fallback when the total summary token count
13+
itself approaches the context window.
14+
"""
15+
16+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
17+
18+
19+
class ConversationSummary(BaseModel):
    """Record of one summary chunk emitted by a compaction run.

    Attributes:
        summary_text: Natural-language output of the summarization LLM
            call. Fed back as context on subsequent requests, together
            with any later summary chunks and the verbatim buffer of
            recent turns.
        summarized_through_turn: Cumulative number of conversation
            items covered by this summary and every summary before it.
            The caller uses it to move the partition boundary forward
            so the next compaction only summarizes items that are not
            yet covered.
        token_count: Token length of ``summary_text``. Stored so the
            recursive-resummarize fallback can tell when the combined
            summaries approach the context limit without tokenizing
            them again.
        created_at: ISO 8601 timestamp of when the summary was
            produced. Held as a string rather than a datetime, in line
            with the cache schema convention used elsewhere in the
            codebase.
        model_used: Fully-qualified identifier of the model that ran
            the summarization call (e.g., ``"openai/gpt-4o-mini"``).
            Retained for auditing and for diagnosing summary-quality
            differences between models.
    """

    # Every field is required: ``default=...`` is pydantic's marker for
    # "no default value".
    summary_text: str = Field(
        default=...,
        description="Natural-language summary produced by the summarization LLM call.",
        title="Summary text",
    )
    summarized_through_turn: NonNegativeInt = Field(
        default=...,
        description=(
            "Running total of conversation items consumed by this "
            "and all preceding summaries."
        ),
        title="Summarized through turn",
    )
    token_count: PositiveInt = Field(
        default=...,
        description="Number of tokens in summary_text.",
        title="Token count",
    )
    created_at: str = Field(
        default=...,
        description="ISO 8601 timestamp recording when this summary was produced.",
        title="Created at",
    )
    model_used: str = Field(
        default=...,
        description="Fully-qualified model identifier used for the summarization call.",
        title="Model used",
    )

src/models/config.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,6 +1427,17 @@ class InferenceConfiguration(ConfigurationBase):
14271427
description="Identification of default provider used when no other model is specified.",
14281428
)
14291429

1430+
context_windows: dict[str, PositiveInt] = Field(
1431+
default_factory=dict,
1432+
title="Per-model context window sizes (tokens)",
1433+
description="Map of fully-qualified model identifier (e.g., "
1434+
'"openai/gpt-4o-mini") to context window size in tokens. Used by '
1435+
"the conversation compaction trigger to decide when older turns "
1436+
"must be summarized before the input exceeds the window. Models "
1437+
"absent from this map have no registered window — callers fall "
1438+
"back to their own default or skip the token-based trigger.",
1439+
)
1440+
14301441
@model_validator(mode="after")
14311442
def check_default_model_and_provider(self) -> Self:
14321443
"""
@@ -1449,6 +1460,80 @@ def check_default_model_and_provider(self) -> Self:
14491460
return self
14501461

14511462

1463+
class CompactionConfiguration(ConfigurationBase):
    """Configuration for conversation history compaction.

    Compaction summarizes older conversation turns when their estimated
    token count approaches the context window limit, keeping the
    conversation usable instead of failing with HTTP 413. The
    configuration here controls when compaction triggers and how much
    recent context is preserved verbatim.

    Attributes:
        enabled: Master switch. When False, compaction never triggers
            and other fields are inert.
        threshold_ratio: Trigger compaction when estimated input tokens
            exceed this fraction of the model's context window
            (values outside 0.0..1.0 are rejected).
        token_floor: Minimum estimated token count before compaction
            can trigger, regardless of threshold_ratio. Prevents
            triggering on very small context windows.
        buffer_turns: Initial number of recent turns to keep verbatim.
            The runtime applies a degrading guard — if these turns
            exceed the available budget, it reduces buffer_turns by
            one repeatedly until the budget fits, down to zero.
        buffer_max_ratio: Hard cap on the fraction of the context
            window the buffer zone may occupy, regardless of
            buffer_turns.
    """

    enabled: bool = Field(
        False,
        title="Enable compaction",
        description="When true, older conversation turns are summarized "
        "when estimated tokens approach the context window limit.",
    )
    threshold_ratio: float = Field(
        0.7,
        title="Threshold ratio",
        description="Trigger compaction when estimated tokens exceed "
        "this fraction of the model's context window (0.0-1.0).",
    )
    token_floor: NonNegativeInt = Field(
        4096,
        title="Token floor",
        description="Minimum token count before compaction can trigger. "
        "Prevents triggering on very small context windows.",
    )
    buffer_turns: NonNegativeInt = Field(
        4,
        title="Buffer turns",
        description="Number of recent turns to keep verbatim.",
    )
    buffer_max_ratio: float = Field(
        0.3,
        title="Buffer max ratio",
        description="Maximum fraction of context window the buffer zone "
        "can occupy, regardless of buffer_turns.",
    )

    # The two ratio fields are range-checked by explicit validators that
    # raise on out-of-range input; nothing is silently clamped.
    # NOTE(review): Field(ge=0.0, le=1.0) would additionally publish the
    # bounds as minimum/maximum in the generated schema — confirm whether
    # the custom error messages below are relied upon before switching.

    @field_validator("threshold_ratio")
    @classmethod
    def _validate_threshold_ratio(cls, value: float) -> float:
        """Reject threshold ratios outside the inclusive 0..1 range.

        Parameters:
            value: Candidate ``threshold_ratio``.

        Returns:
            The unchanged value when it lies within 0.0..1.0.

        Raises:
            ValueError: If the value is below 0.0 or above 1.0. NaN
                fails the chained comparison and is rejected as well.
        """
        if not 0.0 <= value <= 1.0:
            raise ValueError("threshold_ratio must be between 0.0 and 1.0 (inclusive)")
        return value

    @field_validator("buffer_max_ratio")
    @classmethod
    def _validate_buffer_max_ratio(cls, value: float) -> float:
        """Reject buffer-max ratios outside the inclusive 0..1 range.

        Parameters:
            value: Candidate ``buffer_max_ratio``.

        Returns:
            The unchanged value when it lies within 0.0..1.0.

        Raises:
            ValueError: If the value is below 0.0 or above 1.0. NaN
                fails the chained comparison and is rejected as well.
        """
        if not 0.0 <= value <= 1.0:
            raise ValueError("buffer_max_ratio must be between 0.0 and 1.0 (inclusive)")
        return value
1535+
1536+
14521537
class ConversationHistoryConfiguration(ConfigurationBase):
14531538
"""Conversation history configuration."""
14541539

@@ -1921,6 +2006,21 @@ class Configuration(ConfigurationBase):
19212006
description="Conversation history configuration.",
19222007
)
19232008

2009+
compaction: CompactionConfiguration = Field(
2010+
default_factory=lambda: CompactionConfiguration(
2011+
enabled=False,
2012+
threshold_ratio=0.7,
2013+
token_floor=4096,
2014+
buffer_turns=4,
2015+
buffer_max_ratio=0.3,
2016+
),
2017+
title="Conversation compaction configuration",
2018+
description="Controls when conversation history is summarized "
2019+
"to keep the model's input below the context window limit. "
2020+
"Disabled by default — when disabled, requests that exceed the "
2021+
"window continue to surface as HTTP 413.",
2022+
)
2023+
19242024
byok_rag: list[ByokRag] = Field(
19252025
default_factory=list,
19262026
title="BYOK RAG configuration",

0 commit comments

Comments
 (0)