Fix session-summary cost over-billing: discount cache reads, add cache-hit row

alexylon · alexylon · commit d33d0c48dc68 · 2026-05-02T22:41:48.000+03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,8 +4,16 @@ All notable changes to Sofos are documented in this file.
 
 ## [Unreleased]
 
+### Fixed
+
+- **Session summary "Estimated cost" now accounts for the cache discount.** `calculate_cost` was billing every input token at the full rate, ignoring the `cache_read_input_tokens` and `cache_creation_input_tokens` fields that the 0.2.6 fix had started collecting. With a high cache-hit rate, that overstated the bill by ~3× (e.g. for `gpt-5.5` at 75% hit, displayed `$0.50` vs. real `$0.16` per 100k input). The cost function now separates cache reads from the uncached portion, prices them at 10% of the base input rate (both providers), and bills Anthropic 5-min cache writes at 125% of base. Provider semantics are normalized inside `calculate_cost`: OpenAI's `input_tokens` already includes cached tokens, whereas Anthropic's excludes them.
+
 ### Added
 
+- **Cache-hit indicator in the session summary.** When a turn has any cache-read or cache-written tokens, the summary now shows `cache read: N (M% hit)` and (Anthropic only) `cache write: N` underneath the input row, and the displayed `Input tokens` row now reflects the total the model actually saw (cached + uncached) on both providers — previously Anthropic's row understated the total by the cached portion.
+
+- **"Finished in Xs" turn-completion marker.** A dimmed `Finished in 1m 34s` line prints after the assistant has fully finished a turn (last text reply, last tool call, last continuation) so the prompt-ready signal is unambiguous. Steer messages typed mid-turn don't reset the timer — they fold into the same turn and the marker still prints once at the end. Skipped on interrupt or error.
+
 - **Bare `"Bash"` entry in allow / deny acts as a blanket rule.** Adding `"Bash"` to `permissions.allow` in `~/.sofos/config.toml` or `.sofos/config.local.toml` auto-passes every bash command (no Yes/No/remember prompt) except those in the built-in forbidden set (`rm`, `chmod`, `sudo`, …) — useful when you've decided to trust sofos with shell access in a project. Symmetrically, `"Bash"` in `permissions.deny` auto-rejects every bash command. The blanket entry beats every more-specific rule (`Bash(cmd:*)` wildcards, exact-match entries, the built-in allow-list); when both lists contain `"Bash"`, deny wins. Structural safety (`>` redirection, `<<`, `git push` and friends, parent traversal, external-path prompts) still applies.
 
 ### Changed
diff --git a/README.md b/README.md
@@ -151,7 +151,7 @@ Analyze https://example.com/chart.png
 
 ### Cost Tracking
 
-Exit summary shows token usage and estimated cost (based on official API pricing).
+Exit summary shows token usage and estimated cost based on official API pricing. When the provider's prompt cache served any tokens during the session, a `cache read: N (M% hit)` row appears under the input total, and the estimated cost reflects the cache discount (cache reads are billed at 10% of the base input rate on both providers; Anthropic 5-min cache writes are billed at 125%).
 
 ### CLI Options
 
diff --git a/src/repl/mod.rs b/src/repl/mod.rs
@@ -22,7 +22,7 @@ use colored::Colorize;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use tokio::time::sleep;
 
 /// Shared buffer used by the TUI to inject user messages mid-turn. The UI
@@ -267,6 +267,13 @@ impl Repl {
         user_input: &str,
         pasted_images: Vec<crate::clipboard::PastedImage>,
     ) -> Result<()> {
+        // Record turn start so we can show "Finished in Xs" when the
+        // model is fully done (after every text reply, tool call, and
+        // continuation). Steer messages typed mid-turn don't reset
+        // this — they're folded into the same turn via `SteerQueue` and
+        // the same `process_message` call keeps running until the
+        // agent loop exits.
+        let turn_start = Instant::now();
         let (remaining_text, image_refs) = extract_image_references(user_input);
 
         let has_images = !image_refs.is_empty() || !pasted_images.is_empty();
@@ -584,8 +591,7 @@ impl Repl {
             }
         };
 
-        self.session_state
-            .add_tokens(response.usage.input_tokens, response.usage.output_tokens);
+        self.session_state.add_usage(&response.usage);
 
         let mut handler = ResponseHandler::new(
             self.client.clone(),
@@ -607,13 +613,21 @@ impl Repl {
             &mut self.session_state.display_messages,
             &mut self.session_state.total_input_tokens,
             &mut self.session_state.total_output_tokens,
+            &mut self.session_state.total_cache_read_tokens,
+            &mut self.session_state.total_cache_creation_tokens,
         ));
 
         // Always preserve conversation state so the AI retains context on retry
         self.session_state.conversation = handler.conversation().clone();
 
         match result {
-            Ok(_) => Ok(()),
+            Ok(_) => {
+                println!(
+                    "{}",
+                    UI::format_turn_finished(turn_start.elapsed()).dimmed()
+                );
+                Ok(())
+            }
             Err(SofosError::Interrupted) => Ok(()),
             Err(e) => {
                 // Add error context so the AI knows what happened on next turn.
@@ -672,6 +686,8 @@ impl Repl {
             &self.model_config.model,
             self.session_state.total_input_tokens,
             self.session_state.total_output_tokens,
+            self.session_state.total_cache_read_tokens,
+            self.session_state.total_cache_creation_tokens,
         );
 
         Ok(())
@@ -694,12 +710,14 @@ impl Repl {
         Ok(())
     }
 
-    pub fn get_session_summary(&self) -> (String, u32, u32) {
-        (
-            self.model_config.model.clone(),
-            self.session_state.total_input_tokens,
-            self.session_state.total_output_tokens,
-        )
+    pub fn get_session_summary(&self) -> tui::event::ExitSummary {
+        tui::event::ExitSummary {
+            model: self.model_config.model.clone(),
+            input_tokens: self.session_state.total_input_tokens,
+            output_tokens: self.session_state.total_output_tokens,
+            cache_read_tokens: self.session_state.total_cache_read_tokens,
+            cache_creation_tokens: self.session_state.total_cache_creation_tokens,
+        }
     }
 
     pub fn handle_clear_command(&mut self) -> Result<()> {
@@ -998,8 +1016,7 @@ impl Repl {
                     .conversation
                     .replace_with_summary(summary_text, split_point);
 
-                self.session_state
-                    .add_tokens(response.usage.input_tokens, response.usage.output_tokens);
+                self.session_state.add_usage(&response.usage);
 
                 let tokens_after = self.session_state.conversation.estimate_total_tokens();
                 println!(
diff --git a/src/repl/response_handler.rs b/src/repl/response_handler.rs
@@ -65,6 +65,24 @@ impl ResponseHandler {
         }
     }
 
+    /// Fold a `Usage` payload into the per-turn running totals carried
+    /// by `handle_response`. Centralised so the four-counter increment
+    /// stays consistent across the three sites that consume responses
+    /// (auto-continue after reasoning-only blocks, tool-result loop,
+    /// max-iterations summary).
+    fn accumulate_usage(
+        usage: &crate::api::Usage,
+        total_input: &mut u32,
+        total_output: &mut u32,
+        total_cache_read: &mut u32,
+        total_cache_creation: &mut u32,
+    ) {
+        *total_input += usage.input_tokens;
+        *total_output += usage.output_tokens;
+        *total_cache_read += usage.cache_read_input_tokens.unwrap_or(0);
+        *total_cache_creation += usage.cache_creation_input_tokens.unwrap_or(0);
+    }
+
     /// Atomically drain all pending steer messages the user typed while
     /// this turn was running. Returns `None` if the queue is empty, or
     /// `Some(text)` with the messages joined by blank lines (preserving
@@ -83,12 +101,15 @@ impl ResponseHandler {
         Some(messages.join("\n\n"))
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub async fn handle_response(
         &mut self,
         mut content_blocks: Vec<ContentBlock>,
         display_messages: &mut Vec<DisplayMessage>,
         total_input_tokens: &mut u32,
         total_output_tokens: &mut u32,
+        total_cache_read_tokens: &mut u32,
+        total_cache_creation_tokens: &mut u32,
     ) -> Result<()> {
         let mut iteration = 0;
 
@@ -108,6 +129,8 @@ impl ResponseHandler {
                     display_messages,
                     total_input_tokens,
                     total_output_tokens,
+                    total_cache_read_tokens,
+                    total_cache_creation_tokens,
                 )
                 .await?;
                 return Ok(());
@@ -154,8 +177,13 @@ impl ResponseHandler {
             {
                 let response = self.get_next_response(&[], display_messages).await?;
 
-                *total_input_tokens += response.usage.input_tokens;
-                *total_output_tokens += response.usage.output_tokens;
+                Self::accumulate_usage(
+                    &response.usage,
+                    total_input_tokens,
+                    total_output_tokens,
+                    total_cache_read_tokens,
+                    total_cache_creation_tokens,
+                );
 
                 if response.content.is_empty() {
                     println!(
@@ -223,8 +251,13 @@ impl ResponseHandler {
 
             let response = self.get_next_response(&tool_uses, display_messages).await?;
 
-            *total_input_tokens += response.usage.input_tokens;
-            *total_output_tokens += response.usage.output_tokens;
+            Self::accumulate_usage(
+                &response.usage,
+                total_input_tokens,
+                total_output_tokens,
+                total_cache_read_tokens,
+                total_cache_creation_tokens,
+            );
 
             if std::env::var("SOFOS_DEBUG").is_ok() {
                 eprintln!(
@@ -570,6 +603,8 @@ impl ResponseHandler {
         display_messages: &mut Vec<DisplayMessage>,
         total_input_tokens: &mut u32,
         total_output_tokens: &mut u32,
+        total_cache_read_tokens: &mut u32,
+        total_cache_creation_tokens: &mut u32,
     ) -> Result<()> {
         UI::print_warning("Maximum tool iterations reached. Stopping to prevent infinite loop.");
 
@@ -601,8 +636,13 @@ impl ResponseHandler {
 
         match response_result {
             Ok(response) => {
-                *total_input_tokens += response.usage.input_tokens;
-                *total_output_tokens += response.usage.output_tokens;
+                Self::accumulate_usage(
+                    &response.usage,
+                    total_input_tokens,
+                    total_output_tokens,
+                    total_cache_read_tokens,
+                    total_cache_creation_tokens,
+                );
 
                 for block in &response.content {
                     if let ContentBlock::Text { text } = block {
diff --git a/src/repl/tui/event.rs b/src/repl/tui/event.rs
@@ -12,6 +12,8 @@ pub struct ExitSummary {
     pub model: String,
     pub input_tokens: u32,
     pub output_tokens: u32,
+    pub cache_read_tokens: u32,
+    pub cache_creation_tokens: u32,
 }
 
 /// Tool access mode shown in the status line.
diff --git a/src/repl/tui/mod.rs b/src/repl/tui/mod.rs
@@ -222,6 +222,8 @@ pub fn run(mut repl: Repl) -> Result<()> {
             &summary.model,
             summary.input_tokens,
             summary.output_tokens,
+            summary.cache_read_tokens,
+            summary.cache_creation_tokens,
         );
         // The summary emits its own leading newline when it prints; if
         // it short-circuited, the cursor is still parked at the end of
diff --git a/src/repl/tui/worker.rs b/src/repl/tui/worker.rs
@@ -79,6 +79,8 @@ impl<'a> ShutdownSender<'a> {
             model: String::new(),
             input_tokens: 0,
             output_tokens: 0,
+            cache_read_tokens: 0,
+            cache_creation_tokens: 0,
         });
         let _ = self.ui_tx.send(UiEvent::WorkerShutdown(summary));
         self.sent = true;
@@ -167,16 +169,12 @@ fn run(
         }
     }
 
-    let (model, input_tokens, output_tokens) = repl.get_session_summary();
+    let summary = repl.get_session_summary();
     if let Err(e) = repl.save_current_session() {
         UI::print_warning(&format!("Failed to save session: {}", e));
     }
     flush_captured_streams();
-    shutdown.set_summary(ExitSummary {
-        model,
-        input_tokens,
-        output_tokens,
-    });
+    shutdown.set_summary(summary);
     shutdown.send_now();
 }
 
diff --git a/src/session/state.rs b/src/session/state.rs
@@ -10,10 +10,26 @@ pub struct SessionState {
     pub conversation: ConversationHistory,
     /// Display-friendly message history for UI
     pub display_messages: Vec<DisplayMessage>,
-    /// Total input tokens consumed in this session
+    /// Total input tokens consumed in this session.
+    /// Provider semantics differ:
+    ///
+    /// - OpenAI Responses API: this is the **total** count, of which
+    ///   `total_cache_read_tokens` is a subset.
+    /// - Anthropic Messages API: this is **uncached** new tokens only;
+    ///   cache read/creation are tracked separately and disjoint.
+    ///
+    /// `calculate_cost` normalizes this when computing the bill.
     pub total_input_tokens: u32,
     /// Total output tokens generated in this session
     pub total_output_tokens: u32,
+    /// Tokens served from the provider prompt cache (charged at a
+    /// reduced rate). Both providers report this; semantics relative to
+    /// `total_input_tokens` differ as documented above.
+    pub total_cache_read_tokens: u32,
+    /// Tokens written to the Anthropic prompt cache (charged at a
+    /// premium). OpenAI does not surface a creation counter and leaves
+    /// this at 0.
+    pub total_cache_creation_tokens: u32,
 }
 
 impl SessionState {
@@ -24,6 +40,8 @@ impl SessionState {
             display_messages: Vec::new(),
             total_input_tokens: 0,
             total_output_tokens: 0,
+            total_cache_read_tokens: 0,
+            total_cache_creation_tokens: 0,
         }
     }
 
@@ -33,10 +51,14 @@ impl SessionState {
         self.display_messages.clear();
         self.total_input_tokens = 0;
         self.total_output_tokens = 0;
+        self.total_cache_read_tokens = 0;
+        self.total_cache_creation_tokens = 0;
     }
 
-    pub fn add_tokens(&mut self, input_tokens: u32, output_tokens: u32) {
-        self.total_input_tokens += input_tokens;
-        self.total_output_tokens += output_tokens;
+    pub fn add_usage(&mut self, usage: &crate::api::Usage) {
+        self.total_input_tokens += usage.input_tokens;
+        self.total_output_tokens += usage.output_tokens;
+        self.total_cache_read_tokens += usage.cache_read_input_tokens.unwrap_or(0);
+        self.total_cache_creation_tokens += usage.cache_creation_input_tokens.unwrap_or(0);
     }
 }
diff --git a/src/ui/mod.rs b/src/ui/mod.rs

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,8 @@ pub struct ExitSummary {`
`12`	`12`	`pub model: String,`
`13`	`13`	`pub input_tokens: u32,`
`14`	`14`	`pub output_tokens: u32,`
	`15`	`+ pub cache_read_tokens: u32,`
	`16`	`+ pub cache_creation_tokens: u32,`
`15`	`17`	`}`
`16`	`18`
`17`	`19`	`/// Tool access mode shown in the status line.`