Skip to content

Commit bf75c42

Browse files
authored
fix: show cold-start loading label during engine warmup (#287)
Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
1 parent 5d61272 commit bf75c42

10 files changed

Lines changed: 918 additions & 10 deletions

src-tauri/src/commands.rs

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::sync::Mutex;
33

44
use futures_util::StreamExt;
55
use serde::{Deserialize, Serialize};
6-
use tauri::{ipc::Channel, State};
6+
use tauri::{ipc::Channel, Emitter, State};
77
use tokio_util::sync::CancellationToken;
88

99
use crate::config::defaults::STRIP_PATTERNS;
@@ -393,6 +393,16 @@ pub fn engine_start_error(detail: &str) -> EngineError {
393393
/// [`apply_capability_filter`] path and stderr notice the cache-driven filter
394394
/// uses, instead of letting the whole request fail.
395395
///
396+
/// This request's own first content chunk (`Token`/`ThinkingToken`) is
397+
/// authoritative proof the model is warm, independent of the proactive
398+
/// warm-up prime (`crate::warmup::warm_builtin`), which can still be queued
399+
/// behind this same request at the engine's single execution slot. On that
400+
/// first chunk, `warm_state.mark_warmed_by_real_request` is consulted and
401+
/// `on_warmed` fires at most once, so a caller wired to emit
402+
/// `warmup:builtin-warmed` from it never leaves the Settings status stuck on
403+
/// "warming" for the duration of a response that raced ahead of its own
404+
/// prime.
405+
///
396406
/// Returns the accumulated assistant content (empty on the error paths) so
397407
/// the caller's persistence tail treats every route identically.
398408
#[allow(clippy::too_many_arguments)]
@@ -404,6 +414,8 @@ pub(crate) async fn stream_builtin_chat(
404414
mut messages: Vec<ChatMessage>,
405415
client: &reqwest::Client,
406416
cancel_token: CancellationToken,
417+
warm_state: &crate::warmup::BuiltinWarmState,
418+
on_warmed: impl Fn(),
407419
on_chunk: impl Fn(StreamChunk),
408420
) -> String {
409421
engine.touch();
@@ -436,6 +448,18 @@ pub(crate) async fn stream_builtin_chat(
436448
);
437449
}
438450
}
451+
let warmed_announced = std::sync::atomic::AtomicBool::new(false);
452+
let on_chunk = |chunk: StreamChunk| {
453+
if !warmed_announced.load(std::sync::atomic::Ordering::Relaxed)
454+
&& matches!(chunk, StreamChunk::Token(_) | StreamChunk::ThinkingToken(_))
455+
{
456+
warmed_announced.store(true, std::sync::atomic::Ordering::Relaxed);
457+
if warm_state.mark_warmed_by_real_request(port) {
458+
on_warmed();
459+
}
460+
}
461+
on_chunk(chunk);
462+
};
439463
crate::openai::stream_openai_chat(
440464
crate::openai::OpenAiChatParams {
441465
base_url,
@@ -1152,6 +1176,8 @@ pub async fn ask_model(
11521176
model_store: State<'_, crate::models::storage::ModelStore>,
11531177
engine: State<'_, crate::engine::runner::EngineHandle>,
11541178
secrets: State<'_, crate::keychain::Secrets>,
1179+
app: tauri::AppHandle,
1180+
warm_state: State<'_, crate::warmup::BuiltinWarmState>,
11551181
) -> Result<(), String> {
11561182
// Snapshot the config once so all downstream reads (endpoint, prompt, model)
11571183
// see a consistent view even if the user edits Settings mid-stream.
@@ -1367,6 +1393,10 @@ pub async fn ask_model(
13671393
messages,
13681394
&client,
13691395
cancel_token.clone(),
1396+
&warm_state,
1397+
|| {
1398+
let _ = app.emit("warmup:builtin-warmed", ());
1399+
},
13701400
builtin_pump,
13711401
)
13721402
.await;
@@ -1521,6 +1551,28 @@ mod tests {
15211551
(chunks, callback)
15221552
}
15231553

1554+
/// Shared `stream_builtin_chat` `on_warmed` no-op for tests that never
1555+
/// reach a real streamed token (ensure fails/cancels, or the mocked
1556+
/// response has no content chunk). One source location shared across
1557+
/// every such call site, so `stream_builtin_chat_announces_warmed_*`
1558+
/// invoking the equivalent counting closure below is enough to prove
1559+
/// this shape is reachable - none of these individual call sites need to
1560+
/// invoke it themselves for coverage.
1561+
fn noop_on_warmed() -> impl Fn() {
1562+
|| {}
1563+
}
1564+
1565+
/// Builds an `on_warmed` counter for tests: the returned closure
1566+
/// increments a shared count so a test can assert exactly how many times
1567+
/// `stream_builtin_chat` announced a warm-up.
1568+
fn warmed_counter() -> (Arc<AtomicU64>, impl Fn()) {
1569+
let count = Arc::new(AtomicU64::new(0));
1570+
let count_cb = Arc::clone(&count);
1571+
(count, move || {
1572+
count_cb.fetch_add(1, Ordering::Relaxed);
1573+
})
1574+
}
1575+
15241576
/// Helper: builds a `/api/chat` response line from content + done flag.
15251577
fn chat_line(content: &str, done: bool) -> String {
15261578
format!(
@@ -3750,6 +3802,8 @@ mod tests {
37503802
vec![],
37513803
&client,
37523804
CancellationToken::new(),
3805+
&crate::warmup::BuiltinWarmState::default(),
3806+
noop_on_warmed(),
37533807
callback,
37543808
)
37553809
.await;
@@ -3765,6 +3819,115 @@ mod tests {
37653819
engine.shutdown().await;
37663820
}
37673821

3822+
#[tokio::test]
3823+
async fn stream_builtin_chat_announces_warmed_exactly_once_on_first_token() {
3824+
let mut server = mockito::Server::new_async().await;
3825+
let port: u16 = server
3826+
.url()
3827+
.rsplit(':')
3828+
.next()
3829+
.unwrap()
3830+
.parse()
3831+
.expect("mockito url ends in a port");
3832+
let mock = server
3833+
.mock("POST", "/v1/chat/completions")
3834+
.with_header("content-type", "text/event-stream")
3835+
.with_body(
3836+
"data: {\"choices\":[{\"delta\":{\"content\":\"Hi\"}}]}\n\n\
3837+
data: {\"choices\":[{\"delta\":{\"content\":\" there\"}}]}\n\n\
3838+
data: [DONE]\n",
3839+
)
3840+
.create_async()
3841+
.await;
3842+
3843+
let engine = spawn_engine(ScriptedEngineProcess {
3844+
port,
3845+
spawn_error: None,
3846+
healthy: true,
3847+
});
3848+
let client = reqwest::Client::new();
3849+
let (_chunks, callback) = collect_chunks();
3850+
let warm_state = crate::warmup::BuiltinWarmState::default();
3851+
let (warmed_count, on_warmed) = warmed_counter();
3852+
stream_builtin_chat(
3853+
&engine,
3854+
engine_target(),
3855+
"org/repo:m.gguf".to_string(),
3856+
false,
3857+
vec![],
3858+
&client,
3859+
CancellationToken::new(),
3860+
&warm_state,
3861+
on_warmed,
3862+
callback,
3863+
)
3864+
.await;
3865+
3866+
mock.assert_async().await;
3867+
assert_eq!(
3868+
warmed_count.load(Ordering::Relaxed),
3869+
1,
3870+
"two tokens stream but on_warmed fires only for the first"
3871+
);
3872+
assert!(
3873+
!warm_state.try_begin(port),
3874+
"the real request's first token marked this port as warmed"
3875+
);
3876+
engine.shutdown().await;
3877+
}
3878+
3879+
#[tokio::test]
3880+
async fn stream_builtin_chat_skips_on_warmed_when_the_port_is_already_marked() {
3881+
let mut server = mockito::Server::new_async().await;
3882+
let port: u16 = server
3883+
.url()
3884+
.rsplit(':')
3885+
.next()
3886+
.unwrap()
3887+
.parse()
3888+
.expect("mockito url ends in a port");
3889+
let mock = server
3890+
.mock("POST", "/v1/chat/completions")
3891+
.with_header("content-type", "text/event-stream")
3892+
.with_body("data: {\"choices\":[{\"delta\":{\"content\":\"Hi\"}}]}\n\ndata: [DONE]\n")
3893+
.create_async()
3894+
.await;
3895+
3896+
let engine = spawn_engine(ScriptedEngineProcess {
3897+
port,
3898+
spawn_error: None,
3899+
healthy: true,
3900+
});
3901+
let client = reqwest::Client::new();
3902+
let (_chunks, callback) = collect_chunks();
3903+
let warm_state = crate::warmup::BuiltinWarmState::default();
3904+
// A proactive prime already announced this port as warmed before the
3905+
// real request's first token arrives.
3906+
assert!(warm_state.mark_warmed_by_real_request(port));
3907+
let (warmed_count, on_warmed) = warmed_counter();
3908+
stream_builtin_chat(
3909+
&engine,
3910+
engine_target(),
3911+
"org/repo:m.gguf".to_string(),
3912+
false,
3913+
vec![],
3914+
&client,
3915+
CancellationToken::new(),
3916+
&warm_state,
3917+
on_warmed,
3918+
callback,
3919+
)
3920+
.await;
3921+
3922+
mock.assert_async().await;
3923+
assert_eq!(
3924+
warmed_count.load(Ordering::Relaxed),
3925+
0,
3926+
"the port was already announced warmed; no redundant emit"
3927+
);
3928+
engine.shutdown().await;
3929+
}
3930+
37683931
#[tokio::test]
37693932
async fn superseded_ensure_emits_cancelled() {
37703933
// Health probes hang, so the ensure stays in flight until the
@@ -3788,6 +3951,8 @@ mod tests {
37883951
vec![],
37893952
&client,
37903953
CancellationToken::new(),
3954+
&crate::warmup::BuiltinWarmState::default(),
3955+
noop_on_warmed(),
37913956
callback,
37923957
)
37933958
.await
@@ -3842,6 +4007,8 @@ mod tests {
38424007
vec![],
38434008
&client,
38444009
cancel_token,
4010+
&crate::warmup::BuiltinWarmState::default(),
4011+
noop_on_warmed(),
38454012
callback,
38464013
)
38474014
.await
@@ -3887,6 +4054,8 @@ mod tests {
38874054
vec![],
38884055
&client,
38894056
CancellationToken::new(),
4057+
&crate::warmup::BuiltinWarmState::default(),
4058+
noop_on_warmed(),
38904059
callback,
38914060
)
38924061
.await;
@@ -4028,6 +4197,8 @@ mod tests {
40284197
image_message(),
40294198
&client,
40304199
CancellationToken::new(),
4200+
&crate::warmup::BuiltinWarmState::default(),
4201+
noop_on_warmed(),
40314202
callback,
40324203
)
40334204
.await;

src-tauri/src/warmup.rs

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,32 @@ impl BuiltinWarmState {
201201
}
202202
}
203203

204+
/// Marks `port` as warmed because a real chat request's first token has
205+
/// already streamed - authoritative proof the prefill is done,
206+
/// independent of whether a proactive prime (`try_begin`/`finish`) is
207+
/// still queued behind this same request at the engine's single
208+
/// execution slot. Without this, a real request that races ahead of its
209+
/// own proactive prime in that queue leaves `warmup:builtin-warmed`
210+
/// unfired - and the Settings status stuck on "warming" - until the
211+
/// stale prime eventually finishes, which can be well after the real
212+
/// response has already completed.
213+
///
214+
/// Returns true the first time this fires for `port` (the caller should
215+
/// emit `warmup:builtin-warmed`); returns false on every later call for
216+
/// the same port, including if the queued prime's own `finish` already
217+
/// announced it, so the frontend never sees a redundant second emit.
218+
pub fn mark_warmed_by_real_request(&self, port: u16) -> bool {
219+
let mut g = self.inner.lock().unwrap();
220+
if g.primed_port == Some(port) {
221+
return false;
222+
}
223+
g.primed_port = Some(port);
224+
if g.in_flight == Some(port) {
225+
g.in_flight = None;
226+
}
227+
true
228+
}
229+
204230
/// Whether a prime is currently in flight. Seeds the Settings keep-warm
205231
/// status when the panel mounts during a cold prime (it otherwise learns
206232
/// the state only from the `warmup:builtin-warming`/`-warmed` events).
@@ -1809,6 +1835,57 @@ mod tests {
18091835
assert!(!s.is_warming(), "a finished prime is no longer warming");
18101836
}
18111837

1838+
#[test]
1839+
fn warm_state_real_request_marks_warmed_and_reports_true_once() {
1840+
let s = BuiltinWarmState::default();
1841+
assert!(s.try_begin(40000), "a proactive prime is in flight");
1842+
assert!(
1843+
s.mark_warmed_by_real_request(40000),
1844+
"the real request's first token is authoritative proof of warm"
1845+
);
1846+
assert!(
1847+
!s.mark_warmed_by_real_request(40000),
1848+
"a second call for the same port must not re-announce"
1849+
);
1850+
}
1851+
1852+
#[test]
1853+
fn warm_state_real_request_clears_the_in_flight_slot() {
1854+
let s = BuiltinWarmState::default();
1855+
assert!(s.try_begin(40000));
1856+
assert!(s.is_warming(), "the proactive prime is in flight");
1857+
s.mark_warmed_by_real_request(40000);
1858+
assert!(
1859+
!s.is_warming(),
1860+
"the real request proves warm even though the redundant prime is still queued"
1861+
);
1862+
}
1863+
1864+
#[test]
1865+
fn warm_state_real_request_dedups_against_an_already_finished_prime() {
1866+
let s = BuiltinWarmState::default();
1867+
assert!(s.try_begin(40000));
1868+
s.finish(40000, true);
1869+
assert!(
1870+
!s.mark_warmed_by_real_request(40000),
1871+
"the prime already announced warmed for this port; no second emit"
1872+
);
1873+
}
1874+
1875+
#[test]
1876+
fn warm_state_real_request_does_not_disturb_a_different_ports_in_flight_slot() {
1877+
let s = BuiltinWarmState::default();
1878+
assert!(s.try_begin(40000), "port 40000's prime is in flight");
1879+
assert!(
1880+
s.mark_warmed_by_real_request(40001),
1881+
"a real request on a different (newer) port still reports true"
1882+
);
1883+
assert!(
1884+
s.is_warming(),
1885+
"port 40000's own in-flight slot is untouched"
1886+
);
1887+
}
1888+
18121889
#[test]
18131890
fn builtin_loaded_model_names_the_resident_blob_not_the_selection() {
18141891
use std::path::PathBuf;

src/App.tsx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import type { Message } from './hooks/useModel';
2222
import { useConversationHistory } from './hooks/useConversationHistory';
2323
import { useModelSelection } from './hooks/useModelSelection';
2424
import { useModelCapabilities } from './hooks/useModelCapabilities';
25+
import { useEngineWarmupStatus } from './hooks/useEngineWarmupStatus';
2526
import { useDownloadCtx } from './contexts/DownloadContext';
2627
import {
2728
downloadFailureMessage,
@@ -456,6 +457,12 @@ function App() {
456457
const { capabilities: modelCapabilities, refresh: refreshModelCapabilities } =
457458
useModelCapabilities();
458459

460+
// Mounted here (app root, alive for the app's lifetime) rather than inside
461+
// ConversationView (which only mounts once chat starts) so the warming
462+
// state is already current the moment a turn needs it - see the hook's
463+
// doc comment for why a late-mounting subscriber would risk missing it.
464+
const { warming: builtinEngineWarming } = useEngineWarmupStatus();
465+
459466
/** Capability flags for the currently active model, or undefined if not loaded yet. */
460467
const activeModelCapabilities = activeModel
461468
? modelCapabilities[activeModel]
@@ -3583,6 +3590,10 @@ function App() {
35833590
: undefined
35843591
}
35853592
isExportOpen={isExportOpen}
3593+
providerKind={
3594+
config.inference.activeProviderKind
3595+
}
3596+
engineWarming={builtinEngineWarming}
35863597
/>
35873598
) : null}
35883599
</AnimatePresence>

0 commit comments

Comments
 (0)