Sanitise provider errors, fix OpenAI refusals, cap streaming buffer

alexylon · alexylon · commit 56ff1e024662 · 2026-05-19T23:43:40.000+03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,16 @@ All notable changes to Sofos are documented in this file.
 
 ### Fixed
 
+- **Anthropic decode failures now name the provider and show what came back.** A non-streaming response that doesn't match the expected JSON shape used to surface as the generic "HTTP request failed: error decoding response body" with no provider context; sofos now reads the body as text first and includes a redacted preview in the error so a misconfigured proxy is obvious at a glance.
+- **Cache-cost numbers settle correctly on turns where server-side compaction lands late.** Anthropic emits the final `cache_read_input_tokens` and `cache_creation_input_tokens` only on the trailing `message_delta` event in those cases; sofos now refreshes both totals when they appear there, so the cost summary picks up the cache-creation premium instead of under-reporting.
+- **OpenAI refusals reach the conversation.** A `{type: "refusal", refusal: "..."}` block used to be dropped silently, surfacing as "Assistant returned an empty response"; sofos now lifts the refusal text into the visible response so both the user and the next turn see what the model said.
+- **OpenAI truncations without an `incomplete_details.reason` still trigger the token-limit warning.** When the provider sets `status: "incomplete"` but omits `incomplete_details.reason`, sofos now treats it as `max_tokens` so the existing "Response was cut off" warning fires instead of letting a half-formed tool call enter the conversation history.
+
+### Security
+
+- **API-key-shaped strings in provider error bodies are redacted before display.** Provider 401 responses sometimes echo a truncated key (`sk-ant-api03-…` or `Bearer …`); sofos now replaces every matching run with `sk-[redacted]` / `Bearer [redacted]` and caps the body at 4 KB, so a noisy proxy or moderation block can no longer flood the status line or leak key prefixes to transcripts and crash reports.
+- **The SSE re-assembly buffer is capped at 16 MB on both providers.** A server (or middlebox) that streams gigabytes without a newline used to grow the buffer until the 30-minute request timeout fired; sofos now aborts the stream cleanly with a clear error long before memory exhaustion becomes a real risk.
+
 - **Hitting the tool-iteration cap no longer corrupts the saved session.** The recovery turn used to ship with the tools list still attached, so the assistant could answer with a fresh `tool_use` block that was never executed; the next request 400'd and `--resume` was dead on arrival. Sofos now strips tools from that final summary request and removes any tool-related blocks it returns defensively, so the session resumes cleanly afterwards.
 - **`/clear` keeps safe mode visible to the model.** Clearing the history used to strip the "you are running in safe mode" preamble even when the executor was still in safe mode, so the model proposed writes and bash and was met with opaque denials. The preamble is re-added automatically when safe mode is still on.
 - **Switching reasoning effort mid-session refuses combinations the next request would reject.** On the Anthropic legacy thinking models, `/effort high` (or any enabled level) requires `--max-tokens` above 16 384; the gate that startup enforces now also fires from `/effort`, so the next turn doesn't 400 with no clear hint at the cause.
diff --git a/src/api/anthropic/client.rs b/src/api/anthropic/client.rs
@@ -72,7 +72,20 @@ impl AnthropicClient {
         )
         .await?;
 
-        let result = response.json::<CreateMessageResponse>().await?;
-        Ok(result)
+        // Read the body as text first so a JSON-shape mismatch surfaces
+        // with the provider name and a snippet of what we actually got
+        // — the bare `response.json::<…>().await?` path otherwise turns
+        // every decode failure into "HTTP request failed: error
+        // decoding response body" with no provider context to debug.
+        let body = response.text().await.map_err(|e| {
+            crate::error::SofosError::Api(format!("Failed to read Anthropic response body: {}", e))
+        })?;
+        serde_json::from_str::<CreateMessageResponse>(&body).map_err(|e| {
+            crate::error::SofosError::Api(format!(
+                "Failed to parse Anthropic response: {} (body preview: {})",
+                e,
+                utils::sanitize_provider_error_body(&body)
+            ))
+        })
     }
 }
diff --git a/src/api/anthropic/stream.rs b/src/api/anthropic/stream.rs
@@ -7,6 +7,7 @@ use crate::api::anthropic::client::AnthropicClient;
 use crate::api::anthropic::wire::{BETA_HEADER_NAME, anthropic_beta_for, prepare_request};
 use crate::api::types::*;
 use crate::api::utils;
+use crate::api::utils::MAX_SSE_BUFFER_BYTES;
 use crate::error::{Result, SofosError};
 use futures::stream::{Stream, StreamExt};
 use std::sync::Arc;
@@ -130,6 +131,13 @@ where
 
         let chunk = chunk_result?;
         buffer.extend_from_slice(chunk.as_ref());
+        if buffer.len() > MAX_SSE_BUFFER_BYTES {
+            return Err(SofosError::Api(format!(
+                "Anthropic SSE buffer exceeded {} MB without a line terminator; \
+                 likely a misbehaving server or middlebox",
+                MAX_SSE_BUFFER_BYTES / (1024 * 1024)
+            )));
+        }
 
         while let Some(pos) = buffer.iter().position(|b| *b == b'\n') {
             // The complete line is in-buffer, so codepoints aren't
@@ -371,6 +379,24 @@ where
                         output_tokens = saturate_u32(
                             u.get("output_tokens").and_then(|v| v.as_u64()).unwrap_or(0),
                         );
+                        // Server-side compaction can settle the cache-
+                        // related usage numbers only on the trailing
+                        // `message_delta`, not the opening
+                        // `message_start`. Refresh both totals when
+                        // present so the cost line picks up the cache-
+                        // creation premium on turns where compaction
+                        // landed late.
+                        if let Some(read) =
+                            u.get("cache_read_input_tokens").and_then(|v| v.as_u64())
+                        {
+                            cache_read_input_tokens = Some(saturate_u32(read));
+                        }
+                        if let Some(create) = u
+                            .get("cache_creation_input_tokens")
+                            .and_then(|v| v.as_u64())
+                        {
+                            cache_creation_input_tokens = Some(saturate_u32(create));
+                        }
                     }
                 }
                 "error" => {
diff --git a/src/api/openai/stream.rs b/src/api/openai/stream.rs
@@ -11,6 +11,7 @@ use crate::api::openai::client::OPENAI_API_BASE;
 use crate::api::openai::wire::{OpenAIResponse, build_response, build_responses_body};
 use crate::api::types::*;
 use crate::api::utils;
+use crate::api::utils::MAX_SSE_BUFFER_BYTES;
 use crate::error::{Result, SofosError};
 use futures::stream::{Stream, StreamExt};
 use serde_json::json;
@@ -89,6 +90,13 @@ where
 
         let chunk = chunk_result?;
         buffer.extend_from_slice(chunk.as_ref());
+        if buffer.len() > MAX_SSE_BUFFER_BYTES {
+            return Err(SofosError::Api(format!(
+                "OpenAI SSE buffer exceeded {} MB without a line terminator; \
+                 likely a misbehaving server or middlebox",
+                MAX_SSE_BUFFER_BYTES / (1024 * 1024)
+            )));
+        }
 
         while let Some(pos) = buffer.iter().position(|b| *b == b'\n') {
             // Re-check the interrupt flag between lines so a single
diff --git a/src/api/openai/wire.rs b/src/api/openai/wire.rs
@@ -306,6 +306,13 @@ pub(super) struct OpenAIOutputContent {
     pub(super) content_type: String,
     #[serde(default)]
     pub(super) text: String,
+    /// OpenAI emits a `refusal` block when the model declines the
+    /// request. The decline text lives under `refusal` instead of
+    /// `text`, so it has to be captured separately or it's silently
+    /// dropped, which surfaces to the user as the very confusing
+    /// "empty response" warning.
+    #[serde(default)]
+    pub(super) refusal: String,
 }
 
 #[derive(Debug, Deserialize)]
@@ -432,6 +439,16 @@ pub(super) fn build_response(response_parsed: OpenAIResponse) -> Result<CreateMe
                 for content in item.content {
                     if content.content_type == "output_text" && !content.text.trim().is_empty() {
                         content_blocks.push(ContentBlock::Text { text: content.text });
+                    } else if content.content_type == "refusal"
+                        && !content.refusal.trim().is_empty()
+                    {
+                        // Surface refusals as plain text so the model
+                        // sees its own decline next turn (instead of
+                        // looking like it returned an empty turn) and
+                        // the user gets the actual refusal text.
+                        content_blocks.push(ContentBlock::Text {
+                            text: content.refusal,
+                        });
                     }
                 }
 
@@ -517,6 +534,13 @@ pub(super) fn build_response(response_parsed: OpenAIResponse) -> Result<CreateMe
             Some("max_tokens".to_string())
         }
         (Some("incomplete"), Some(other)) => Some(other.to_string()),
+        // OpenAI sometimes reports `status: "incomplete"` without
+        // populating `incomplete_details.reason`. The truncation guard
+        // in the response handler looks for `stop_reason ==
+        // "max_tokens"`, so mapping the missing-reason case there
+        // keeps the warning firing instead of letting a half-formed
+        // tool call enter the conversation history.
+        (Some("incomplete"), None) => Some("max_tokens".to_string()),
         // Anthropic always sets `stop_reason` on a normal stop. Map the
         // OpenAI `status: "completed"` to the same `"end_turn"` value
         // so downstream `if let Some(stop_reason) = ...` branches treat
diff --git a/src/api/utils.rs b/src/api/utils.rs
@@ -346,6 +346,84 @@ pub fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> usize {
     i
 }
 
+/// Upper bound (4 KiB) on the provider-error body interpolated into a
+/// user-facing `SofosError::Api`. A misconfigured proxy that returns a
+/// multi-MB HTML page, or a moderation block that echoes the whole
+/// request, otherwise floods stderr and the status line. Longer bodies
+/// are cut by [`sanitize_provider_error_body`], which appends a
+/// ` […N more bytes elided]` marker so the message stays readable.
+pub const MAX_PROVIDER_ERROR_BODY_BYTES: usize = 4 * 1024;
+
+/// Upper bound (16 MiB) on the SSE re-assembly buffer used by both the
+/// Anthropic and OpenAI streaming parsers. A server (or a middlebox)
+/// that streams gigabytes without a newline would otherwise grow
+/// `buffer` until the 30-minute request timeout fires, exhausting
+/// memory long before the timeout helps. The limit is far above any
+/// legitimate single SSE line we have seen in practice.
+pub const MAX_SSE_BUFFER_BYTES: usize = 16 * 1024 * 1024;
+
+/// Best-effort redaction of API-key-shaped substrings inside a provider
+/// error body. Provider 401 responses sometimes echo the rejected key
+/// (truncated or otherwise), which would land verbatim in transcripts
+/// and crash reports. Rewrites each `sk-…` run as `sk-[redacted]` and
+/// the credential after `Bearer ` as `[redacted]`; Bearer runs use the
+/// RFC 6750 `b64token` alphabet so dotted JWTs are consumed whole. No
+/// length cap is applied here; see [`truncate_at_char_boundary`].
+pub fn redact_api_secrets(body: &str) -> String {
+    fn is_key_byte(b: u8) -> bool {
+        b.is_ascii_alphanumeric() || b == b'_' || b == b'-'
+    }
+    // Bearer credentials (JWTs, base64) also contain `.~+/=`.
+    fn is_token_byte(b: u8) -> bool {
+        is_key_byte(b) || matches!(b, b'.' | b'~' | b'+' | b'/' | b'=')
+    }
+    fn run_end(bytes: &[u8], start: usize, accept: fn(u8) -> bool) -> usize {
+        bytes[start..].iter().position(|&b| !accept(b)).map_or(bytes.len(), |n| start + n)
+    }
+
+    let bytes = body.as_bytes();
+    let mut out = String::with_capacity(body.len());
+    let mut i = 0;
+    while i < bytes.len() {
+        if bytes[i..].starts_with(b"sk-") {
+            let end = run_end(bytes, i + 3, is_key_byte);
+            if end - i >= 11 {
+                out.push_str("sk-[redacted]");
+                i = end;
+                continue;
+            }
+        }
+        if bytes[i..].starts_with(b"Bearer ") || bytes[i..].starts_with(b"bearer ") {
+            let prefix_len = 7;
+            let end = run_end(bytes, i + prefix_len, is_token_byte);
+            if end - i >= prefix_len + 8 {
+                out.push_str(&body[i..i + prefix_len]);
+                out.push_str("[redacted]");
+                i = end;
+                continue;
+            }
+        }
+        // Push one whole UTF-8 char so multi-byte text stays valid.
+        let ch = body[i..].chars().next().unwrap_or('\u{FFFD}');
+        out.push(ch);
+        i += ch.len_utf8();
+    }
+    out
+}
+
+/// Run [`redact_api_secrets`] over `body`, then cap the result at
+/// [`MAX_PROVIDER_ERROR_BODY_BYTES`]. Redaction goes first so a key
+/// straddling the cut cannot survive as a too-short, unredacted stub.
+pub fn sanitize_provider_error_body(body: &str) -> String {
+    let mut cleaned = redact_api_secrets(body);
+    let cut = truncate_at_char_boundary(&cleaned, MAX_PROVIDER_ERROR_BODY_BYTES);
+    if cleaned.len() > cut {
+        let extra = cleaned.split_off(cut).len();
+        cleaned.push_str(&format!(" […{} more bytes elided]", extra));
+    }
+    cleaned
+}
+
 /// Upper bound applied to the `Retry-After` value advertised by a 429
 /// response. 60 seconds is comfortably above the burst-limit windows
 /// the APIs we integrate with use in practice, and short enough that
@@ -468,7 +546,10 @@ fn api_call_error_to_sofos(service_name: &str, attempts: u32, e: ApiCallError) -
         ApiCallError::ServerError { status, body } | ApiCallError::ClientError { status, body } => {
             SofosError::Api(format!(
                 "{} request failed with status {} after {} attempt(s): {}",
-                service_name, status, attempts, body
+                service_name,
+                status,
+                attempts,
+                sanitize_provider_error_body(&body)
             ))
         }
         ApiCallError::RateLimited { retry_after, body } => SofosError::Api(format!(
@@ -479,7 +560,7 @@ fn api_call_error_to_sofos(service_name: &str, attempts: u32, e: ApiCallError) -
                 None => String::new(),
             },
             attempts,
-            body
+            sanitize_provider_error_body(&body)
         )),
     }
 }
@@ -604,6 +685,54 @@ pub(crate) mod sse_test_support {
 mod tests {
     use super::*;
 
+    #[test]
+    fn redact_strips_sk_keys_and_bearer_tokens() {
+        // A full `sk-` key is replaced; the text around it survives.
+        let cleaned = redact_api_secrets(
+            "Invalid x-api-key: sk-ant-api03-AAAAaaaa1111BBBBbbbb22 returned error",
+        );
+        assert!(
+            !cleaned.contains("sk-ant-api03"),
+            "key prefix must be removed, got: {cleaned}"
+        );
+        assert!(cleaned.contains("sk-[redacted]"));
+        assert!(cleaned.contains("returned error"));
+
+        // A `Bearer` credential is replaced; the keyword itself stays.
+        let cleaned = redact_api_secrets("Authorization: Bearer abcdefghijKLMN1234 expired");
+        assert!(
+            cleaned.contains("Bearer [redacted]"),
+            "bearer token must be redacted, got: {cleaned}"
+        );
+        assert!(cleaned.contains("expired"));
+
+        // Too few key characters after `sk-` means this is not treated
+        // as a key, so unrelated `sk-` substrings pass through as-is.
+        let short_fragment = "see sk-x for details";
+        assert_eq!(redact_api_secrets(short_fragment), short_fragment);
+    }
+
+    #[test]
+    fn sanitize_body_caps_long_payload_with_marker() {
+        let oversized = "Z".repeat(MAX_PROVIDER_ERROR_BODY_BYTES * 3);
+        let sanitized = sanitize_provider_error_body(oversized.as_str());
+        assert!(sanitized.len() < oversized.len());
+        assert!(sanitized.contains("more bytes elided"));
+    }
+
+    #[test]
+    fn sanitize_body_redacts_and_truncates_together() {
+        // A key near the start plus an over-long tail: the key must be
+        // redacted and the elision marker must still be appended.
+        let mut payload = String::from("key=sk-ant-api03-AAAAaaaa1111BBBB tail");
+        payload.push_str(&"Y".repeat(MAX_PROVIDER_ERROR_BODY_BYTES * 2));
+
+        let sanitized = sanitize_provider_error_body(&payload);
+
+        assert!(sanitized.contains("sk-[redacted]"));
+        assert!(sanitized.contains("more bytes elided"));
+    }
+
     #[test]
     fn api_call_error_is_retryable_for_server_error_and_rate_limited() {
         let server = ApiCallError::ServerError {