
Commit 58ddf4a

fix(inference): sanitize pytorch kv truncate io errors

Route PyTorch KV-cache truncate temp-file read and write failures through
the canonical pytorch_worker_kv_truncate_failed error path. Generate the
truncate request id before temp-file IO so local IO failures preserve
request correlation, and use the shared worker-message sanitizer instead
of ad hoc inference errors. Update the backend README and the inference
execution boundary plan with the KV truncate temp-file hygiene rule.

Validation:
- cargo fmt --all
- cargo test -p inference --features backend-pytorch pytorch_kv
- cargo test -p inference --features backend-pytorch test_pytorch_worker_kv_truncate
- cargo check -p inference --features backend-pytorch
- cargo check -p inference --no-default-features
- git diff --check
1 parent 3e974b4 · commit 58ddf4a

4 files changed · 49 additions & 12 deletions

crates/inference/src/backend/README.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -187,6 +187,9 @@ fn create_backend() {
   malformed loaded-model/live-KV metadata through the canonical
   `pytorch_worker_kv_*_failed` paths so request ids and bounded diagnostics are
   preserved even when Python returns an unexpected shape.
+- PyTorch KV-cache truncate temp-file read/write failures must use the
+  canonical `pytorch_worker_kv_truncate_failed` path so local temp paths are
+  sanitized before they can reach backend or workflow diagnostics.
 - Backend-native generation fields and kwargs must stay inside backend-local
   mapping helpers. PyTorch maps canonical generation options to
   Transformers-style kwargs, while llama.cpp maps them to bounded
```
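
This hygiene rule leans on the shared worker-message sanitizer. As a rough sketch of the behavior the rule requires (the helper below is hypothetical, not the crate's actual sanitizer), replacing absolute-path tokens with the `[local-path]` marker keeps temp-file locations out of diagnostics:

```rust
// Hypothetical sketch of the path scrubbing the hygiene rule requires; the
// crate's real shared sanitizer may differ in name and implementation.
fn sanitize_local_paths(message: &str) -> String {
    message
        .split_whitespace()
        .map(|token| {
            // Replace any token that looks like an absolute filesystem path
            // so raw temp-file locations never reach diagnostics.
            if token.starts_with('/') || token.contains(":\\") {
                "[local-path]"
            } else {
                token
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
```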

crates/inference/src/backend/pytorch.rs

Lines changed: 20 additions & 9 deletions
```diff
@@ -144,6 +144,10 @@ fn kv_worker_failure_from_message(
         .into_backend_error()
 }
 
+fn kv_truncate_worker_failure_from_message(request_id: &str, message: String) -> BackendError {
+    kv_worker_failure_from_message(request_id, "pytorch_worker_kv_truncate_failed", message)
+}
+
 fn kv_loaded_info_unavailable_error(request_id: &str) -> BackendError {
     kv_worker_failure_from_message(
         request_id,
@@ -2106,21 +2110,25 @@ impl InferenceBackend for PyTorchBackend {
         token_position: usize,
         _active_config: Option<&BackendConfig>,
     ) -> Result<Vec<u8>, BackendError> {
+        let request_id = format!("pytorch-kv-truncate-{}", Uuid::new_v4().simple());
         let temp_path = std::env::temp_dir().join(format!(
             "pantograph-pytorch-kv-truncate-{}.bin",
             uuid::Uuid::new_v4()
         ));
-        std::fs::write(&temp_path, data)
-            .map_err(|e| BackendError::Inference(format!("Failed to write KV temp file: {}", e)))?;
+        std::fs::write(&temp_path, data).map_err(|e| {
+            kv_truncate_worker_failure_from_message(
+                &request_id,
+                format!("Failed to write KV temp file: {e}"),
+            )
+        })?;
         let truncate_result = tokio::task::spawn_blocking({
             let temp_path = temp_path.clone();
-            let request_id = format!("pytorch-kv-truncate-{}", Uuid::new_v4().simple());
+            let request_id = request_id.clone();
             move || {
                 Python::with_gil(|py| -> Result<(), BackendError> {
                     let worker = pytorch_worker::worker_module(py).map_err(|e| {
-                        kv_worker_failure_from_message(
+                        kv_truncate_worker_failure_from_message(
                             &request_id,
-                            "pytorch_worker_kv_truncate_failed",
                             format!("Failed to get worker module: {}", e),
                         )
                     })?;
@@ -2130,9 +2138,8 @@ impl InferenceBackend for PyTorchBackend {
                         (temp_path.to_string_lossy().to_string(), token_position),
                     )
                     .map_err(|e| {
-                        kv_worker_failure_from_message(
+                        kv_truncate_worker_failure_from_message(
                             &request_id,
-                            "pytorch_worker_kv_truncate_failed",
                             format!("PyTorch KV truncate failed: {}", e),
                         )
                     })?;
@@ -2142,8 +2149,12 @@ impl InferenceBackend for PyTorchBackend {
         })
         .await
         .map_err(|e| BackendError::Inference(task_join_error_message(e)))?;
-        let read_result = std::fs::read(&temp_path)
-            .map_err(|e| BackendError::Inference(format!("Failed to read KV temp file: {}", e)));
+        let read_result = std::fs::read(&temp_path).map_err(|e| {
+            kv_truncate_worker_failure_from_message(
+                &request_id,
+                format!("Failed to read KV temp file: {e}"),
+            )
+        });
         let _ = std::fs::remove_file(&temp_path);
         truncate_result?;
         read_result
```
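
Building on the `sanitize_local_paths` sketch above, every failure in this method now funnels into one shape: a canonical code and a request id wrapped around a scrubbed message. The toy `BackendError` and the message format below are assumptions; the commit pins only the substrings asserted in the tests that follow.

```rust
// Sketch continued from above (reuses the hypothetical sanitize_local_paths).
// A toy stand-in for the crate's BackendError plus an assembly helper in the
// spirit of kv_truncate_worker_failure_from_message; the exact message
// format is an assumption.
#[derive(Debug)]
enum BackendError {
    Inference(String),
}

fn kv_truncate_failure_sketch(request_id: &str, message: &str) -> BackendError {
    // Canonical code first, then the request id for correlation, then the
    // sanitized detail, so raw local paths never enter the error string.
    BackendError::Inference(format!(
        "pytorch_worker_kv_truncate_failed (request_id={request_id}): {}",
        sanitize_local_paths(message)
    ))
}
```

Note the ordering in the diff: the request id now exists before the first `std::fs::write`, so even a purely local write failure carries the same id the worker call would have used.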

crates/inference/src/backend/pytorch_tests.rs

Lines changed: 18 additions & 2 deletions
```diff
@@ -2486,9 +2486,8 @@ fn test_pytorch_kv_live_info_malformed_result_normalizes_to_backend_error() {
 
 #[test]
 fn test_pytorch_worker_kv_truncate_transport_error_normalizes_to_backend_error() {
-    match kv_worker_failure_from_message(
+    match kv_truncate_worker_failure_from_message(
         "req-kv-truncate",
-        "pytorch_worker_kv_truncate_failed",
         "PyTorch KV truncate failed: invalid marker.".to_string(),
     ) {
         BackendError::Inference(message) => {
@@ -2500,6 +2499,23 @@ fn test_pytorch_worker_kv_truncate_transport_error_normalizes_to_backend_error()
         }
     }
 }
 
+#[test]
+fn test_pytorch_worker_kv_truncate_temp_file_errors_strip_local_paths() {
+    match kv_truncate_worker_failure_from_message(
+        "req-kv-truncate-temp",
+        "Failed to write KV temp file: Permission denied at /tmp/pantograph-pytorch-kv-truncate-private.bin".to_string(),
+    ) {
+        BackendError::Inference(message) => {
+            assert!(message.contains("pytorch_worker_kv_truncate_failed"));
+            assert!(message.contains("req-kv-truncate-temp"));
+            assert!(message.contains("Failed to write KV temp file"));
+            assert!(message.contains("[local-path]"));
+            assert!(!message.contains("/tmp/pantograph-pytorch-kv-truncate-private.bin"));
+        }
+        other => panic!("expected Inference error, got {other:?}"),
+    }
+}
+
 #[test]
 fn test_pytorch_worker_envelope_rejects_missing_required_fields() {
     let fixture = include_str!(
```
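
Fed the new test's input, the sketch above yields a message that passes the same assertions (the output format remains an assumption):

```rust
fn main() {
    let err = kv_truncate_failure_sketch(
        "req-kv-truncate-temp",
        "Failed to write KV temp file: Permission denied at \
         /tmp/pantograph-pytorch-kv-truncate-private.bin",
    );
    match err {
        // Single-variant toy enum, so one arm is exhaustive.
        BackendError::Inference(message) => {
            assert!(message.contains("pytorch_worker_kv_truncate_failed"));
            assert!(message.contains("req-kv-truncate-temp"));
            assert!(message.contains("[local-path]"));
            assert!(!message.contains("/tmp/"));
        }
    }
}
```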

docs/plans/inference-execution-boundary-contracts/plan.md

Lines changed: 8 additions & 1 deletion
```diff
@@ -1852,7 +1852,10 @@ using Python Transformers behind the boundary for broad HF-compatible support.
   codes without exposing cache bytes or file paths in the canonical code.
   PyTorch live-KV loaded-model and live-KV metadata extraction failures now
   use the same canonical KV worker failure shape when Python returns no active
-  model or malformed KV metadata.
+  model or malformed KV metadata. PyTorch KV-cache truncation temp-file
+  read/write failures now also use the canonical
+  `pytorch_worker_kv_truncate_failed` path so local temp paths are sanitized
+  before backend errors can become workflow diagnostics.
   PyTorch backend trait KV slot save/restore/clear/truncate worker failures
   now use the same canonical KV worker failure shape.
   Non-streaming PyTorch generate-text worker transport failures now also retain
@@ -3834,6 +3837,10 @@ Update during implementation:
   envelope and typed response decoder with request-id correlation,
   malformed-response rejection, and canonical sanitized unload errors before
   clearing Rust-side loaded-model state.
+- 2026-05-06: PyTorch KV-cache truncation temp-file read/write failures now
+  route through canonical `pytorch_worker_kv_truncate_failed` errors with a
+  generated request id and shared path sanitizer instead of ad hoc inference
+  errors that could expose local temp paths.
 - 2026-05-05: Added append-only `ChatChunk.cache_handle_id` stream metadata and
   threaded terminal text/chat cache-handle ids through typed gateway results,
   backend-execution lifecycle completion events, and `llm-inference.kv_cache_out`
```
