tisnik
diff --git a/‎docs/openapi.json‎
Lines changed: 54 additions & 0 deletions b/‎docs/openapi.json‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/models/compaction.py‎
Lines changed: 70 additions & 0 deletions b/‎src/models/compaction.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎src/models/config.py‎
Lines changed: 100 additions & 0 deletions b/‎src/models/config.py‎
Lines changed: 100 additions & 0 deletions
@@ -11898,6 +11898,46 @@
                 "title": "ClientCredentialsOAuthFlow",
                 "description": "Defines configuration details for the OAuth 2.0 Client Credentials flow."
             },
+            "CompactionConfiguration": {
+                "properties": {
+                    "enabled": {
+                        "type": "boolean",
+                        "title": "Enable compaction",
+                        "description": "When true, older conversation turns are summarized when estimated tokens approach the context window limit.",
+                        "default": false
+                    },
+                    "threshold_ratio": {
+                        "type": "number",
+                        "title": "Threshold ratio",
+                        "description": "Trigger compaction when estimated tokens exceed this fraction of the model's context window (0.0-1.0).",
+                        "default": 0.7
+                    },
+                    "token_floor": {
+                        "type": "integer",
+                        "minimum": 0.0,
+                        "title": "Token floor",
+                        "description": "Minimum token count before compaction can trigger. Prevents triggering on very small context windows.",
+                        "default": 4096
+                    },
+                    "buffer_turns": {
+                        "type": "integer",
+                        "minimum": 0.0,
+                        "title": "Buffer turns",
+                        "description": "Number of recent turns to keep verbatim.",
+                        "default": 4
+                    },
+                    "buffer_max_ratio": {
+                        "type": "number",
+                        "title": "Buffer max ratio",
+                        "description": "Maximum fraction of context window the buffer zone can occupy, regardless of buffer_turns.",
+                        "default": 0.3
+                    }
+                },
+                "additionalProperties": false,
+                "type": "object",
+                "title": "CompactionConfiguration",
+                "description": "Configuration for conversation history compaction.\n\nCompaction summarizes older conversation turns when their estimated\ntoken count approaches the context window limit, keeping the\nconversation usable instead of failing with HTTP 413. The\nconfiguration here controls when compaction triggers and how much\nrecent context is preserved verbatim.\n\nAttributes:\n    enabled: Master switch. When False, compaction never triggers\n        and other fields are inert.\n    threshold_ratio: Trigger compaction when estimated input tokens\n        exceed this fraction of the model's context window\n        (must be within 0.0..1.0; values outside are rejected).\n    token_floor: Minimum estimated token count before compaction\n        can trigger, regardless of threshold_ratio. Prevents\n        triggering on very small context windows.\n    buffer_turns: Initial number of recent turns to keep verbatim.\n        The runtime applies a degrading guard \u2014 if these turns\n        exceed the available budget, it reduces buffer_turns by\n        one repeatedly until the budget fits, down to zero.\n    buffer_max_ratio: Hard cap on the fraction of the context\n        window the buffer zone may occupy, regardless of\n        buffer_turns (must be within 0.0..1.0)."
+            },
             "Configuration": {
                 "properties": {
                     "name": {
@@ -11971,6 +12011,11 @@
                         "$ref": "#/components/schemas/ConversationHistoryConfiguration",
                         "title": "Conversation history configuration"
                     },
+                    "compaction": {
+                        "$ref": "#/components/schemas/CompactionConfiguration",
+                        "title": "Conversation compaction configuration",
+                        "description": "Controls when conversation history is summarized to keep the model's input below the context window limit. Disabled by default \u2014 when disabled, requests that exceed the window continue to surface as HTTP 413."
+                    },
                     "byok_rag": {
                         "items": {
                             "$ref": "#/components/schemas/ByokRag"
@@ -13391,6 +13436,15 @@
                         ],
                         "title": "Default provider",
                         "description": "Identification of default provider used when no other model is specified."
+                    },
+                    "context_windows": {
+                        "additionalProperties": {
+                            "type": "integer",
+                            "exclusiveMinimum": 0.0
+                        },
+                        "type": "object",
+                        "title": "Per-model context window sizes (tokens)",
+                        "description": "Map of fully-qualified model identifier (e.g., \"openai/gpt-4o-mini\") to context window size in tokens. Used by the conversation compaction trigger to decide when older turns must be summarized before the input exceeds the window. Models absent from this map have no registered window \u2014 callers fall back to their own default or skip the token-based trigger."
                     }
                 },
                 "additionalProperties": false,
 
@@ -76,6 +76,8 @@ dependencies = [
     # Used for error tracking and monitoring
     "sentry-sdk[fastapi]>=2.58.0",
     "python-dotenv>=1.2.2",
+    # Used for token estimation before LLM calls (LCORE-1569 / conversation compaction)
+    "tiktoken>=0.8.0",
 ]
 
 
 
@@ -0,0 +1,70 @@
+"""Pydantic models for conversation compaction.
+
+Defines ``ConversationSummary`` — one chunk produced each time
+compaction triggers. The compaction module (``src/utils/compaction.py``)
+creates instances of this model from raw Llama Stack conversation
+items; the conversation cache (LCORE-1571) is responsible for
+persisting them.
+
+Each compaction run produces exactly one ``ConversationSummary``. The
+additive design (decision 2 of the spike) keeps every chunk's summary
+as a separate record — they are only re-summarized into a single
+record by the recursive fallback when the total summary token count
+itself approaches the context window.
+"""
+
+from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
+
+
+class ConversationSummary(BaseModel):
+    """A single compaction-produced summary chunk; every field is required.
+
+    Attributes:
+        summary_text: The natural-language summary produced by the
+            summarization LLM call. Used directly as context for
+            subsequent requests (alongside any later summary chunks
+            and the buffer of recent turns kept verbatim).
+        summarized_through_turn: Running total of conversation items
+            consumed by this and all preceding summaries (validated
+            as non-negative). Lets the caller advance the partition
+            boundary on the next compaction so the new summary only
+            covers items that have not yet been summarized.
+        token_count: Number of tokens in ``summary_text`` (validated
+            as strictly positive). Tracked so the recursive-resummarize
+            fallback can decide when the cumulative summary size
+            approaches the context limit without re-tokenizing.
+        created_at: ISO 8601 timestamp recording when this summary was
+            produced. Kept as a string (not datetime) to match the
+            cache schema convention used elsewhere in the codebase.
+        model_used: Fully-qualified model identifier used for the
+            summarization LLM call (e.g., ``"openai/gpt-4o-mini"``).
+            Preserved for audit and for diagnostics when summary
+            quality varies between models.
+    """
+
+    summary_text: str = Field(
+        ...,
+        title="Summary text",
+        description="Natural-language summary produced by the summarization LLM call.",
+    )
+    summarized_through_turn: NonNegativeInt = Field(
+        ...,
+        title="Summarized through turn",
+        description="Running total of conversation items consumed by "
+        "this and all preceding summaries.",
+    )
+    token_count: PositiveInt = Field(
+        ...,
+        title="Token count",
+        description="Number of tokens in summary_text.",
+    )
+    created_at: str = Field(
+        ...,
+        title="Created at",
+        description="ISO 8601 timestamp recording when this summary was produced.",
+    )
+    model_used: str = Field(
+        ...,
+        title="Model used",
+        description="Fully-qualified model identifier used for the summarization call.",
+    )
@@ -1427,6 +1427,17 @@ class InferenceConfiguration(ConfigurationBase):
         description="Identification of default provider used when no other model is specified.",
     )
 
+    context_windows: dict[str, PositiveInt] = Field(
+        default_factory=dict,
+        title="Per-model context window sizes (tokens)",
+        description="Map of fully-qualified model identifier (e.g., "
+        '"openai/gpt-4o-mini") to context window size in tokens. Used by '
+        "the conversation compaction trigger to decide when older turns "
+        "must be summarized before the input exceeds the window. Models "
+        "absent from this map have no registered window — callers fall "
+        "back to their own default or skip the token-based trigger.",
+    )
+
     @model_validator(mode="after")
     def check_default_model_and_provider(self) -> Self:
         """
@@ -1449,6 +1460,80 @@ def check_default_model_and_provider(self) -> Self:
         return self
 
 
+class CompactionConfiguration(ConfigurationBase):
+    """Configuration for conversation history compaction.
+
+    Compaction summarizes older conversation turns when their estimated
+    token count approaches the context window limit, keeping the
+    conversation usable instead of failing with HTTP 413. The
+    configuration here controls when compaction triggers and how much
+    recent context is preserved verbatim.
+
+    Attributes:
+        enabled: Master switch. When False, compaction never triggers
+            and other fields are inert.
+        threshold_ratio: Trigger compaction when estimated input tokens
+            exceed this fraction of the model's context window
+            (must be within 0.0..1.0; values outside are rejected).
+        token_floor: Minimum estimated token count before compaction
+            can trigger, regardless of threshold_ratio. Prevents
+            triggering on very small context windows.
+        buffer_turns: Initial number of recent turns to keep verbatim.
+            The runtime applies a degrading guard — if these turns
+            exceed the available budget, it reduces buffer_turns by
+            one repeatedly until the budget fits, down to zero.
+        buffer_max_ratio: Hard cap on the fraction of the context
+            window the buffer zone may occupy, regardless of
+            buffer_turns (must be within 0.0..1.0).
+    """
+
+    enabled: bool = Field(
+        False,
+        title="Enable compaction",
+        description="When true, older conversation turns are summarized "
+        "when estimated tokens approach the context window limit.",
+    )
+    threshold_ratio: float = Field(
+        0.7,
+        title="Threshold ratio",
+        description="Trigger compaction when estimated tokens exceed "
+        "this fraction of the model's context window (0.0-1.0).",
+    )
+    token_floor: NonNegativeInt = Field(
+        4096,
+        title="Token floor",
+        description="Minimum token count before compaction can trigger. "
+        "Prevents triggering on very small context windows.",
+    )
+    buffer_turns: NonNegativeInt = Field(
+        4,
+        title="Buffer turns",
+        description="Number of recent turns to keep verbatim.",
+    )
+    buffer_max_ratio: float = Field(
+        0.3,
+        title="Buffer max ratio",
+        description="Maximum fraction of context window the buffer zone "
+        "can occupy, regardless of buffer_turns.",
+    )
+
+    @field_validator("threshold_ratio")
+    @classmethod
+    def _validate_threshold_ratio(cls, value: float) -> float:
+        """Reject threshold ratios outside the inclusive 0..1 range."""
+        if not 0.0 <= value <= 1.0:
+            raise ValueError("threshold_ratio must be between 0.0 and 1.0 (inclusive)")
+        return value
+
+    @field_validator("buffer_max_ratio")
+    @classmethod
+    def _validate_buffer_max_ratio(cls, value: float) -> float:
+        """Reject buffer-max ratios outside the inclusive 0..1 range."""
+        if not 0.0 <= value <= 1.0:
+            raise ValueError("buffer_max_ratio must be between 0.0 and 1.0 (inclusive)")
+        return value
+
+
 class ConversationHistoryConfiguration(ConfigurationBase):
     """Conversation history configuration."""
 
@@ -1921,6 +2006,21 @@ class Configuration(ConfigurationBase):
         description="Conversation history configuration.",
     )
 
+    compaction: CompactionConfiguration = Field(
+        default_factory=lambda: CompactionConfiguration(
+            enabled=False,
+            threshold_ratio=0.7,
+            token_floor=4096,
+            buffer_turns=4,
+            buffer_max_ratio=0.3,
+        ),
+        title="Conversation compaction configuration",
+        description="Controls when conversation history is summarized "
+        "to keep the model's input below the context window limit. "
+        "Disabled by default — when disabled, requests that exceed the "
+        "window continue to surface as HTTP 413.",
+    )
+
     byok_rag: list[ByokRag] = Field(
         default_factory=list,
         title="BYOK RAG configuration",
Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,8 @@ dependencies = [`
`76`	`76`	`# Used for error tracking and monitoring`
`77`	`77`	`"sentry-sdk[fastapi]>=2.58.0",`
`78`	`78`	`"python-dotenv>=1.2.2",`
	`79`	`+ # Used for token estimation before LLM calls (LCORE-1569 / conversation compaction)`
	`80`	`+ "tiktoken>=0.8.0",`
`79`	`81`	`]`
`80`	`82`
`81`	`83`