1jehuang · sternelee · May 11, 2026
diff --git a/src/agent.rs b/src/agent.rs
@@ -25,6 +25,7 @@ use crate::build;
 use crate::bus::{Bus, BusEvent, SubagentStatus, ToolEvent, ToolStatus};
 use crate::cache_tracker::CacheTracker;
 use crate::compaction::CompactionEvent;
+use crate::prefix_cache_stable;
 use crate::id;
 use crate::logging;
 use crate::message::{
@@ -522,6 +523,19 @@ impl Agent {
                         self.note_compaction_applied();
                         self.persist_session_best_effort("compaction completion");
                     }
+                    let messages = if prefix_cache_stable::is_prefix_cache_stable_mode() {
+                        let (truncated, truncate_count) =
+                            prefix_cache_stable::truncate_tool_results_for_api(&messages);
+                        if truncate_count > 0 {
+                            logging::info(&format!(
+                                "Prefix-cache mode: truncated {} tool results for API",
+                                truncate_count
+                            ));
+                        }
+                        truncated
+                    } else {
+                        messages
+                    };
                     let user_count = messages
                         .iter()
                         .filter(|message| matches!(message.role, Role::User))
@@ -543,6 +557,19 @@ impl Agent {
 
         let all_messages = self.session.provider_messages();
         let messages = all_messages.to_vec();
+        let messages = if prefix_cache_stable::is_prefix_cache_stable_mode() {
+            let (truncated, truncate_count) =
+                prefix_cache_stable::truncate_tool_results_for_api(&messages);
+            if truncate_count > 0 {
+                logging::info(&format!(
+                    "Prefix-cache mode: truncated {} tool results for API (session path)",
+                    truncate_count
+                ));
+            }
+            truncated
+        } else {
+            messages
+        };
         let user_count = messages
             .iter()
             .filter(|message| matches!(message.role, Role::User))

diff --git a/src/agent/turn_loops.rs b/src/agent/turn_loops.rs
@@ -49,6 +49,29 @@ impl Agent {
             // false-positive violations every turn (prior turn's memory ≠ current history prefix).
             self.record_client_cache_request(&messages);
 
+            // Preflight check for DeepSeek prefix-cache stability mode.
+            // If the payload is near the context limit, warn early so compaction
+            // can fold history before the API returns a 400.
+            if prefix_cache_stable::is_prefix_cache_stable_mode() {
+                let preflight =
+                    prefix_cache_stable::preflight_check(&messages, &tools, &self.provider.model());
+                if preflight.needs_action {
+                    logging::warn(&format!(
+                        "Prefix-cache preflight: context at {:.1}% ({} / {} tokens) — emergency fold recommended",
+                        preflight.ratio * 100.0,
+                        preflight.estimate_tokens,
+                        preflight.ctx_max,
+                    ));
+                } else if preflight.ratio > 0.5 {
+                    logging::info(&format!(
+                        "Prefix-cache preflight: context at {:.1}% ({} / {} tokens)",
+                        preflight.ratio * 100.0,
+                        preflight.estimate_tokens,
+                        preflight.ctx_max,
+                    ));
+                }
+            }
+
             // Inject memory as a user message at the end (preserves cache prefix)
             let mut messages_with_memory: Vec<Message> = messages.iter().cloned().collect();
             if let Some(memory) = memory_pending.as_ref() {
@@ -503,6 +526,13 @@ impl Agent {
                     usage_cache_read,
                     usage_cache_creation,
                 );
+                // Record cache usage for prefix-cache hit-rate tracking
+                self.cache_tracker.record_usage(usage_cache_read, usage_input.unwrap_or(0));
+                if prefix_cache_stable::is_prefix_cache_stable_mode()
+                    && self.cache_tracker.usage_turn_count() % 5 == 0
+                {
+                    logging::info(&format!("Prefix-cache stats: {}", self.cache_tracker.cache_hit_summary()));
+                }
             }
 
             if print_output

diff --git a/src/cache_tracker.rs b/src/cache_tracker.rs
@@ -26,6 +26,12 @@ pub struct CacheTracker {
     hash_history: VecDeque<u64>,
     /// Whether append-only was violated on the last request
     last_violation: Option<CacheViolation>,
+    /// Cumulative cache hit tokens (from provider-reported usage)
+    cache_hit_tokens: u64,
+    /// Cumulative cache miss tokens (from provider-reported usage)
+    cache_miss_tokens: u64,
+    /// Number of turns with usage data recorded
+    usage_turns: u32,
 }
 
 /// Information about a cache violation
@@ -207,6 +213,59 @@ impl CacheTracker {
     pub fn had_violation(&self) -> bool {
         self.last_violation.is_some()
     }
+
+    /// Record provider-reported token usage for cache-hit-rate tracking.
+    ///
+    /// Call this after each successful API response when usage data is available.
+    ///
+    /// * `cache_read_input_tokens` – tokens the provider reports as read from its
+    ///   cache, or `None` when the provider reports no cache figure.
+    /// * `input_tokens` – input token count reported for the same request.
+    ///
+    /// Every call counts as one usage turn. All counters saturate rather than
+    /// overflow, so an implausibly large provider-reported value can neither panic
+    /// (debug builds) nor silently wrap around (release builds).
+    pub fn record_usage(&mut self, cache_read_input_tokens: Option<u64>, input_tokens: u64) {
+        // With no cache figure, every input token is counted as a miss.
+        let hit = cache_read_input_tokens.unwrap_or(0);
+        // Miss tokens = total input minus cache hits, clamped at zero.
+        // NOTE(review): assumes `input_tokens` already includes the cached tokens;
+        // confirm per provider — if it excludes them, misses are under-counted.
+        let miss = input_tokens.saturating_sub(hit);
+        self.cache_hit_tokens = self.cache_hit_tokens.saturating_add(hit);
+        self.cache_miss_tokens = self.cache_miss_tokens.saturating_add(miss);
+        self.usage_turns = self.usage_turns.saturating_add(1);
+    }
+
+    /// Running total of cache-hit tokens accumulated by `record_usage`.
+    pub fn cache_hit_tokens(&self) -> u64 {
+        self.cache_hit_tokens
+    }
+
+    /// Running total of cache-miss tokens accumulated by `record_usage`
+    /// (input tokens that were not reported as cache hits).
+    pub fn cache_miss_tokens(&self) -> u64 {
+        self.cache_miss_tokens
+    }
+
+    /// Number of `record_usage` calls made so far; the counter advances on every
+    /// call, whether or not the provider supplied a cache-read figure.
+    pub fn usage_turn_count(&self) -> u32 {
+        self.usage_turns
+    }
+
+    /// Fraction of all recorded input tokens that were cache hits.
+    ///
+    /// Returns `None` while the combined hit + miss total is still zero;
+    /// otherwise `Some(ratio)` with `ratio` between 0.0 and 1.0.
+    pub fn cache_hit_rate(&self) -> Option<f64> {
+        let hits = self.cache_hit_tokens;
+        match hits + self.cache_miss_tokens {
+            0 => None,
+            seen => Some(hits as f64 / seen as f64),
+        }
+    }
+
+    /// Render the cumulative cache statistics as a one-line, human-readable string.
+    ///
+    /// Yields a placeholder message while `cache_hit_rate` has nothing to report;
+    /// otherwise the hit percentage followed by the raw hit, miss and turn counters.
+    pub fn cache_hit_summary(&self) -> String {
+        self.cache_hit_rate().map_or_else(
+            || "no cache usage data yet".to_string(),
+            |ratio| {
+                format!(
+                    "cache hit: {:.1}% ({} hit / {} miss tokens over {} turns)",
+                    ratio * 100.0,
+                    self.cache_hit_tokens,
+                    self.cache_miss_tokens,
+                    self.usage_turns
+                )
+            },
+        )
+    }
 }
 
 #[cfg(test)]

diff --git a/src/lib.rs b/src/lib.rs
@@ -17,6 +17,7 @@ pub mod build;
 pub mod bus;
 pub mod cache_tracker;
 pub mod catchup;
+pub mod prefix_cache_stable;
 pub mod channel;
 pub mod cli;
 pub mod compaction;