quiet-node
diff --git a/‎src-tauri/src/commands.rs‎
Lines changed: 172 additions & 1 deletion b/‎src-tauri/src/commands.rs‎
Lines changed: 172 additions & 1 deletion
diff --git a/‎src-tauri/src/warmup.rs‎
Lines changed: 77 additions & 0 deletions b/‎src-tauri/src/warmup.rs‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎src/App.tsx‎
Lines changed: 11 additions & 0 deletions b/‎src/App.tsx‎
Lines changed: 11 additions & 0 deletions
@@ -3,7 +3,7 @@ use std::sync::Mutex;
 
 use futures_util::StreamExt;
 use serde::{Deserialize, Serialize};
-use tauri::{ipc::Channel, State};
+use tauri::{ipc::Channel, Emitter, State};
 use tokio_util::sync::CancellationToken;
 
 use crate::config::defaults::STRIP_PATTERNS;
@@ -393,6 +393,16 @@ pub fn engine_start_error(detail: &str) -> EngineError {
 /// [`apply_capability_filter`] path and stderr notice the cache-driven filter
 /// uses, instead of letting the whole request fail.
 ///
+/// This request's own first content chunk (`Token`/`ThinkingToken`) is
+/// authoritative proof the model is warm, independent of the proactive
+/// warm-up prime (`crate::warmup::warm_builtin`), which can still be queued
+/// behind this same request at the engine's single execution slot. On that
+/// first chunk, `warm_state.mark_warmed_by_real_request` is consulted and
+/// `on_warmed` fires at most once, so a caller wired to emit
+/// `warmup:builtin-warmed` from it never leaves the Settings status stuck on
+/// "warming" for the duration of a response that raced ahead of its own
+/// prime.
+///
 /// Returns the accumulated assistant content (empty on the error paths) so
 /// the caller's persistence tail treats every route identically.
 #[allow(clippy::too_many_arguments)]
@@ -404,6 +414,8 @@ pub(crate) async fn stream_builtin_chat(
     mut messages: Vec<ChatMessage>,
     client: &reqwest::Client,
     cancel_token: CancellationToken,
+    warm_state: &crate::warmup::BuiltinWarmState,
+    on_warmed: impl Fn(),
     on_chunk: impl Fn(StreamChunk),
 ) -> String {
     engine.touch();
@@ -436,6 +448,18 @@ pub(crate) async fn stream_builtin_chat(
                     );
                 }
             }
+            // This request's own first content chunk proves the model is
+            // warm. `swap` claims the one-shot flag in a single atomic step,
+            // so the check and the set cannot interleave the way a separate
+            // load + store could. Non-content chunks never touch the flag.
+            let warmed_announced = std::sync::atomic::AtomicBool::new(false);
+            let on_chunk = |chunk: StreamChunk| {
+                let is_content =
+                    matches!(chunk, StreamChunk::Token(_) | StreamChunk::ThinkingToken(_));
+                if is_content
+                    && !warmed_announced.swap(true, std::sync::atomic::Ordering::Relaxed)
+                    && warm_state.mark_warmed_by_real_request(port)
+                {
+                    on_warmed();
+                }
+                // Always forward the chunk to the caller's original callback.
+                on_chunk(chunk);
+            };
             crate::openai::stream_openai_chat(
                 crate::openai::OpenAiChatParams {
                     base_url,
@@ -1152,6 +1176,8 @@ pub async fn ask_model(
     model_store: State<'_, crate::models::storage::ModelStore>,
     engine: State<'_, crate::engine::runner::EngineHandle>,
     secrets: State<'_, crate::keychain::Secrets>,
+    app: tauri::AppHandle,
+    warm_state: State<'_, crate::warmup::BuiltinWarmState>,
 ) -> Result<(), String> {
     // Snapshot the config once so all downstream reads (endpoint, prompt, model)
     // see a consistent view even if the user edits Settings mid-stream.
@@ -1367,6 +1393,10 @@ pub async fn ask_model(
                         messages,
                         &client,
                         cancel_token.clone(),
+                        &warm_state,
+                        || {
+                            let _ = app.emit("warmup:builtin-warmed", ());
+                        },
                         builtin_pump,
                     )
                     .await;
@@ -1521,6 +1551,28 @@ mod tests {
         (chunks, callback)
     }
 
+    /// No-op `on_warmed` callback for `stream_builtin_chat` tests whose
+    /// scenario never streams a real content token (the ensure fails or is
+    /// cancelled, or the mocked response has no content chunk). Every such
+    /// call site shares this one closure, so its body lives at a single
+    /// source location; the `stream_builtin_chat_announces_warmed_*` test
+    /// invoking the equivalent counting closure below is enough to prove
+    /// this callback shape is reachable, without any of these call sites
+    /// having to invoke it themselves for coverage.
+    fn noop_on_warmed() -> impl Fn() {
+        || ()
+    }
+
+    /// Test helper returning a shared counter paired with an `on_warmed`
+    /// callback that bumps it by one on every call, so a test can assert
+    /// exactly how many warm-up announcements `stream_builtin_chat` made.
+    fn warmed_counter() -> (Arc<AtomicU64>, impl Fn()) {
+        let announcements = Arc::new(AtomicU64::new(0));
+        let bump = {
+            // The callback owns its own handle; the caller keeps the other.
+            let announcements = Arc::clone(&announcements);
+            move || {
+                announcements.fetch_add(1, Ordering::Relaxed);
+            }
+        };
+        (announcements, bump)
+    }
+
     /// Helper: builds a `/api/chat` response line from content + done flag.
     fn chat_line(content: &str, done: bool) -> String {
         format!(
@@ -3750,6 +3802,8 @@ mod tests {
             vec![],
             &client,
             CancellationToken::new(),
+            &crate::warmup::BuiltinWarmState::default(),
+            noop_on_warmed(),
             callback,
         )
         .await;
@@ -3765,6 +3819,115 @@ mod tests {
         engine.shutdown().await;
     }
 
+    #[tokio::test]
+    async fn stream_builtin_chat_announces_warmed_exactly_once_on_first_token() {
+        // Two content deltas followed by the terminator: only the first
+        // delta may trigger the warm-up announcement.
+        let sse_body = concat!(
+            "data: {\"choices\":[{\"delta\":{\"content\":\"Hi\"}}]}\n\n",
+            "data: {\"choices\":[{\"delta\":{\"content\":\" there\"}}]}\n\n",
+            "data: [DONE]\n",
+        );
+        let mut server = mockito::Server::new_async().await;
+        let server_url = server.url();
+        let port: u16 = server_url
+            .rsplit(':')
+            .next()
+            .unwrap()
+            .parse()
+            .expect("mockito url ends in a port");
+        let mock = server
+            .mock("POST", "/v1/chat/completions")
+            .with_header("content-type", "text/event-stream")
+            .with_body(sse_body)
+            .create_async()
+            .await;
+
+        // The scripted engine is configured with the mock server's port.
+        let engine = spawn_engine(ScriptedEngineProcess {
+            healthy: true,
+            spawn_error: None,
+            port,
+        });
+        let client = reqwest::Client::new();
+        let (_chunks, callback) = collect_chunks();
+        let warm_state = crate::warmup::BuiltinWarmState::default();
+        let (warmed_count, on_warmed) = warmed_counter();
+        stream_builtin_chat(
+            &engine,
+            engine_target(),
+            "org/repo:m.gguf".to_string(),
+            false,
+            vec![],
+            &client,
+            CancellationToken::new(),
+            &warm_state,
+            on_warmed,
+            callback,
+        )
+        .await;
+
+        mock.assert_async().await;
+        let announcements = warmed_count.load(Ordering::Relaxed);
+        assert_eq!(
+            announcements, 1,
+            "two tokens stream but on_warmed fires only for the first"
+        );
+        // A port already recorded as warmed refuses a fresh prime.
+        let prime_accepted = warm_state.try_begin(port);
+        assert!(
+            !prime_accepted,
+            "the real request's first token marked this port as warmed"
+        );
+        engine.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn stream_builtin_chat_skips_on_warmed_when_the_port_is_already_marked() {
+        // A single content delta is enough to reach the announcement check.
+        let sse_body = "data: {\"choices\":[{\"delta\":{\"content\":\"Hi\"}}]}\n\ndata: [DONE]\n";
+        let mut server = mockito::Server::new_async().await;
+        let server_url = server.url();
+        let port: u16 = server_url
+            .rsplit(':')
+            .next()
+            .unwrap()
+            .parse()
+            .expect("mockito url ends in a port");
+        let mock = server
+            .mock("POST", "/v1/chat/completions")
+            .with_header("content-type", "text/event-stream")
+            .with_body(sse_body)
+            .create_async()
+            .await;
+
+        // The scripted engine is configured with the mock server's port.
+        let engine = spawn_engine(ScriptedEngineProcess {
+            healthy: true,
+            spawn_error: None,
+            port,
+        });
+        let client = reqwest::Client::new();
+        let (_chunks, callback) = collect_chunks();
+        let warm_state = crate::warmup::BuiltinWarmState::default();
+        // Pre-mark the port as warmed, standing in for an announcement that
+        // already happened before this request's first token arrives.
+        let pre_marked = warm_state.mark_warmed_by_real_request(port);
+        assert!(pre_marked);
+        let (warmed_count, on_warmed) = warmed_counter();
+        stream_builtin_chat(
+            &engine,
+            engine_target(),
+            "org/repo:m.gguf".to_string(),
+            false,
+            vec![],
+            &client,
+            CancellationToken::new(),
+            &warm_state,
+            on_warmed,
+            callback,
+        )
+        .await;
+
+        mock.assert_async().await;
+        let announcements = warmed_count.load(Ordering::Relaxed);
+        assert_eq!(
+            announcements, 0,
+            "the port was already announced warmed; no redundant emit"
+        );
+        engine.shutdown().await;
+    }
+
     #[tokio::test]
     async fn superseded_ensure_emits_cancelled() {
         // Health probes hang, so the ensure stays in flight until the
@@ -3788,6 +3951,8 @@ mod tests {
                     vec![],
                     &client,
                     CancellationToken::new(),
+                    &crate::warmup::BuiltinWarmState::default(),
+                    noop_on_warmed(),
                     callback,
                 )
                 .await
@@ -3842,6 +4007,8 @@ mod tests {
                     vec![],
                     &client,
                     cancel_token,
+                    &crate::warmup::BuiltinWarmState::default(),
+                    noop_on_warmed(),
                     callback,
                 )
                 .await
@@ -3887,6 +4054,8 @@ mod tests {
             vec![],
             &client,
             CancellationToken::new(),
+            &crate::warmup::BuiltinWarmState::default(),
+            noop_on_warmed(),
             callback,
         )
         .await;
@@ -4028,6 +4197,8 @@ mod tests {
             image_message(),
             &client,
             CancellationToken::new(),
+            &crate::warmup::BuiltinWarmState::default(),
+            noop_on_warmed(),
             callback,
         )
         .await;
 
@@ -201,6 +201,32 @@ impl BuiltinWarmState {
         }
     }
 
+    /// Records that `port` is warm on the strength of a real chat request
+    /// whose first token has already streamed. That token is authoritative
+    /// proof the prefill is done, whether or not a proactive prime
+    /// (`try_begin`/`finish`) is still queued behind the same request at the
+    /// engine's single execution slot. Without this path, a real request
+    /// that overtakes its own proactive prime would leave
+    /// `warmup:builtin-warmed` unfired - and the Settings status stuck on
+    /// "warming" - until the stale prime finally completes, possibly well
+    /// after the real response has finished.
+    ///
+    /// Returns `true` when this call is the one that records `port` as
+    /// primed, in which case the caller should emit `warmup:builtin-warmed`.
+    /// Returns `false` when `port` is already recorded as primed - by an
+    /// earlier call or by the queued prime's own `finish` - so the frontend
+    /// never receives a redundant second emit.
+    pub fn mark_warmed_by_real_request(&self, port: u16) -> bool {
+        let mut state = self.inner.lock().unwrap();
+        let first_announcement = state.primed_port != Some(port);
+        if first_announcement {
+            state.primed_port = Some(port);
+            // Release the in-flight slot only when it belongs to this port;
+            // a prime in flight for a different port is left alone.
+            if state.in_flight == Some(port) {
+                state.in_flight = None;
+            }
+        }
+        first_announcement
+    }
+
     /// Whether a prime is currently in flight. Seeds the Settings keep-warm
     /// status when the panel mounts during a cold prime (it otherwise learns
     /// the state only from the `warmup:builtin-warming`/`-warmed` events).
@@ -1809,6 +1835,57 @@ mod tests {
         assert!(!s.is_warming(), "a finished prime is no longer warming");
     }
 
+    #[test]
+    fn warm_state_real_request_marks_warmed_and_reports_true_once() {
+        let port: u16 = 40000;
+        let state = BuiltinWarmState::default();
+        assert!(state.try_begin(port), "a proactive prime is in flight");
+
+        // The first real-request mark announces; a repeat must stay silent.
+        let first = state.mark_warmed_by_real_request(port);
+        assert!(
+            first,
+            "the real request's first token is authoritative proof of warm"
+        );
+        let second = state.mark_warmed_by_real_request(port);
+        assert!(
+            !second,
+            "a second call for the same port must not re-announce"
+        );
+    }
+
+    #[test]
+    fn warm_state_real_request_clears_the_in_flight_slot() {
+        let port: u16 = 40000;
+        let state = BuiltinWarmState::default();
+        assert!(state.try_begin(port));
+        assert!(state.is_warming(), "the proactive prime is in flight");
+
+        // The return value is irrelevant here; only the slot state matters.
+        state.mark_warmed_by_real_request(port);
+        let still_warming = state.is_warming();
+        assert!(
+            !still_warming,
+            "the real request proves warm even though the redundant prime is still queued"
+        );
+    }
+
+    #[test]
+    fn warm_state_real_request_dedups_against_an_already_finished_prime() {
+        let port: u16 = 40000;
+        let state = BuiltinWarmState::default();
+        // Run a full prime cycle before the real request reports in.
+        assert!(state.try_begin(port));
+        state.finish(port, true);
+
+        let announced_again = state.mark_warmed_by_real_request(port);
+        assert!(
+            !announced_again,
+            "the prime already announced warmed for this port; no second emit"
+        );
+    }
+
+    #[test]
+    fn warm_state_real_request_does_not_disturb_a_different_ports_in_flight_slot() {
+        let priming_port: u16 = 40000;
+        let request_port: u16 = 40001;
+        let state = BuiltinWarmState::default();
+        assert!(state.try_begin(priming_port), "port 40000's prime is in flight");
+
+        // A real request on another port must announce without touching the
+        // slot held by the first port's prime.
+        let announced = state.mark_warmed_by_real_request(request_port);
+        assert!(
+            announced,
+            "a real request on a different (newer) port still reports true"
+        );
+        let still_warming = state.is_warming();
+        assert!(
+            still_warming,
+            "port 40000's own in-flight slot is untouched"
+        );
+    }
+
     #[test]
     fn builtin_loaded_model_names_the_resident_blob_not_the_selection() {
         use std::path::PathBuf;
 
@@ -22,6 +22,7 @@ import type { Message } from './hooks/useModel';
 import { useConversationHistory } from './hooks/useConversationHistory';
 import { useModelSelection } from './hooks/useModelSelection';
 import { useModelCapabilities } from './hooks/useModelCapabilities';
+import { useEngineWarmupStatus } from './hooks/useEngineWarmupStatus';
 import { useDownloadCtx } from './contexts/DownloadContext';
 import {
   downloadFailureMessage,
@@ -456,6 +457,12 @@ function App() {
   const { capabilities: modelCapabilities, refresh: refreshModelCapabilities } =
     useModelCapabilities();
 
+  // Mounted here (app root, alive for the app's lifetime) rather than inside
+  // ConversationView (which only mounts once chat starts) so the warming
+  // state is already current the moment a turn needs it - see the hook's
+  // doc comment for why a late-mounting subscriber would risk missing it.
+  const { warming: builtinEngineWarming } = useEngineWarmupStatus();
+
   /** Capability flags for the currently active model, or undefined if not loaded yet. */
   const activeModelCapabilities = activeModel
     ? modelCapabilities[activeModel]
@@ -3583,6 +3590,10 @@ function App() {
                                     : undefined
                                 }
                                 isExportOpen={isExportOpen}
+                                providerKind={
+                                  config.inference.activeProviderKind
+                                }
+                                engineWarming={builtinEngineWarming}
                               />
                             ) : null}
                           </AnimatePresence>