From 0f70fb0219d7480d8ae670292d312f93e6680ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 12:05:36 +0900 Subject: [PATCH 01/13] fix: add backend startup heartbeat liveness probe --- docs/environment-variables.md | 4 +- scripts/backend/templates/launch_backend.py | 56 ++++++ src-tauri/src/app_constants.rs | 10 +- src-tauri/src/backend/config.rs | 84 +++++++++ src-tauri/src/backend/launch.rs | 11 +- src-tauri/src/backend/readiness.rs | 183 ++++++++++++++++++-- src-tauri/src/backend/runtime.rs | 28 ++- 7 files changed, 353 insertions(+), 23 deletions(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 97738dd9..cbea4c6a 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -9,7 +9,8 @@ | --- | --- | --- | | `ASTRBOT_BACKEND_URL` | 后端基础 URL | 默认 `http://127.0.0.1:6185/` | | `ASTRBOT_BACKEND_AUTO_START` | 是否自动拉起后端 | 默认 `1`(启用) | -| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `300000` | +| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `900000` | +| `ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS` | 后端启动 heartbeat 空闲超时 | 默认 `60000`,范围 `5000~900000` | | `ASTRBOT_BACKEND_READY_HTTP_PATH` | 就绪探针 HTTP 路径 | 默认 `/api/stat/start-time` | | `ASTRBOT_BACKEND_READY_PROBE_TIMEOUT_MS` | 就绪探针单次超时 | 默认回退到 `ASTRBOT_BACKEND_PING_TIMEOUT_MS` | | `ASTRBOT_BACKEND_READY_POLL_INTERVAL_MS` | 就绪轮询间隔 | 默认 `300`,并按边界 clamp | @@ -53,6 +54,7 @@ | 变量 | 用途 | 默认值/行为 | | --- | --- | --- | | `ASTRBOT_DESKTOP_CLIENT` | 标记桌面客户端环境 | 打包态启动后端时写入 `1` | +| `ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH` | 桌面端写给后端启动器的 heartbeat 文件路径 | 打包态默认写到 `ASTRBOT_ROOT/data/backend-startup-heartbeat.json` | ## 4. 发布/CI(GitHub Actions) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index ce31c4f0..eff148e7 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -1,14 +1,20 @@ from __future__ import annotations +import atexit import ctypes +import json import os import runpy import sys +import threading +import time from pathlib import Path BACKEND_DIR = Path(__file__).resolve().parent APP_DIR = BACKEND_DIR / "app" _WINDOWS_DLL_DIRECTORY_HANDLES: list[object] = [] +STARTUP_HEARTBEAT_ENV = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH" +STARTUP_HEARTBEAT_INTERVAL_SECONDS = 2.0 def configure_stdio_utf8() -> None: @@ -113,9 +119,59 @@ def preload_windows_runtime_dlls() -> None: continue +def resolve_startup_heartbeat_path() -> Path | None: + raw = os.environ.get(STARTUP_HEARTBEAT_ENV, "").strip() + if not raw: + return None + return Path(raw) + + +def write_startup_heartbeat(path: Path, state: str) -> None: + try: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "pid": os.getpid(), + "state": state, + "updated_at_ms": int(time.time() * 1000), + } + temp_path = path.with_name(f"{path.name}.tmp") + temp_path.write_text( + json.dumps(payload, separators=(",", ":")), + encoding="utf-8", + ) + temp_path.replace(path) + except Exception: + return + + +def start_startup_heartbeat() -> None: + heartbeat_path = resolve_startup_heartbeat_path() + if heartbeat_path is None: + return + + stop_event = threading.Event() + write_startup_heartbeat(heartbeat_path, "starting") + + def stop_heartbeat() -> None: + stop_event.set() + write_startup_heartbeat(heartbeat_path, "stopping") + + def heartbeat_loop() -> None: + while not stop_event.wait(STARTUP_HEARTBEAT_INTERVAL_SECONDS): + write_startup_heartbeat(heartbeat_path, "starting") + + atexit.register(stop_heartbeat) + threading.Thread( + target=heartbeat_loop, + name="astrbot-startup-heartbeat", + daemon=True, + ).start() + + configure_stdio_utf8() configure_windows_dll_search_path() preload_windows_runtime_dlls() +start_startup_heartbeat() sys.path.insert(0, str(APP_DIR)) diff --git a/src-tauri/src/app_constants.rs b/src-tauri/src/app_constants.rs index 4959f299..e42e2e29 100644 --- a/src-tauri/src/app_constants.rs +++ b/src-tauri/src/app_constants.rs @@ -2,7 +2,7 @@ use std::time::Duration; pub(crate) const DEFAULT_BACKEND_URL: &str = "http://127.0.0.1:6185/"; pub(crate) const BACKEND_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_TIMEOUT_MS"; -pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 5 * 60 * 1000; +pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 15 * 60 * 1000; pub(crate) const GRACEFUL_RESTART_REQUEST_TIMEOUT_MS: u64 = 2_500; pub(crate) const GRACEFUL_RESTART_START_TIME_TIMEOUT_MS: u64 = 1_800; pub(crate) const GRACEFUL_RESTART_POLL_INTERVAL_MS: u64 = 350; @@ -17,6 +17,14 @@ pub(crate) const BACKEND_READY_PROBE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_READY_ pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MIN_MS: u64 = 100; pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MAX_MS: u64 = 30_000; pub(crate) const BACKEND_READY_TCP_PROBE_TIMEOUT_MAX_MS: u64 = 1_000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS"; +pub(crate) const DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS: u64 = 60 * 1000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS: u64 = 5_000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS: u64 = 15 * 60 * 1000; +pub(crate) const BACKEND_STARTUP_HEARTBEAT_PATH_ENV: &str = + "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH"; +pub(crate) const DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH: &str = + "data/backend-startup-heartbeat.json"; pub(crate) const DEFAULT_BACKEND_PING_TIMEOUT_MS: u64 = 800; pub(crate) const BACKEND_PING_TIMEOUT_MIN_MS: u64 = 50; pub(crate) const BACKEND_PING_TIMEOUT_MAX_MS: u64 = 30_000; diff --git a/src-tauri/src/backend/config.rs b/src-tauri/src/backend/config.rs index c58676a2..f499723f 100644 --- a/src-tauri/src/backend/config.rs +++ b/src-tauri/src/backend/config.rs @@ -1,4 +1,5 @@ use std::env; +use std::path::{Path, PathBuf}; use std::time::Duration; use url::Url; @@ -7,6 +8,8 @@ pub struct BackendReadinessConfig { pub path: String, pub probe_timeout_ms: u64, pub poll_interval_ms: u64, + pub startup_idle_timeout_ms: u64, + pub startup_heartbeat_path: Option, } pub fn resolve_backend_ready_http_path(env_name: &str, default_path: &str, mut log: F) -> String @@ -97,6 +100,44 @@ where parse_clamped_timeout_env(raw, env_name, fallback_ms, min_ms, max_ms, log) } +pub fn resolve_backend_startup_idle_timeout_ms( + raw: &str, + env_name: &str, + fallback_ms: u64, + min_ms: u64, + max_ms: u64, + log: F, +) -> u64 +where + F: FnMut(String), +{ + parse_clamped_timeout_env(raw, env_name, fallback_ms, min_ms, max_ms, log) +} + +pub fn resolve_backend_startup_heartbeat_path( + root_dir: Option<&Path>, + packaged_root: Option, + relative_path: &str, +) -> Option { + let trimmed = relative_path.trim(); + if trimmed.is_empty() { + return None; + } + + if let Some(root) = root_dir { + return Some(root.join(trimmed)); + } + + if let Ok(root) = env::var("ASTRBOT_ROOT") { + let root = PathBuf::from(root.trim()); + if !root.as_os_str().is_empty() { + return Some(root.join(trimmed)); + } + } + + packaged_root.map(|root| root.join(trimmed)) +} + #[allow(clippy::too_many_arguments)] pub fn resolve_backend_readiness_config( ready_http_path_env: &str, @@ -221,6 +262,8 @@ where path, probe_timeout_ms, poll_interval_ms, + startup_idle_timeout_ms: 0, + startup_heartbeat_path: None, } } @@ -260,6 +303,47 @@ mod tests { assert_eq!(value, 3_000); } + #[test] + fn resolve_backend_startup_idle_timeout_clamps_large_value() { + let value = resolve_backend_startup_idle_timeout_ms( + "999999", + "TEST_STARTUP_IDLE_TIMEOUT_ENV", + 60_000, + 5_000, + 300_000, + |_| {}, + ); + assert_eq!(value, 300_000); + } + + #[test] + fn resolve_backend_startup_idle_timeout_clamps_small_value() { + let value = resolve_backend_startup_idle_timeout_ms( + "1000", + "TEST_STARTUP_IDLE_TIMEOUT_ENV", + 60_000, + 5_000, + 300_000, + |_| {}, + ); + assert_eq!(value, 5_000); + } + + #[test] + fn resolve_backend_startup_heartbeat_path_prefers_root_dir() { + let path = resolve_backend_startup_heartbeat_path( + Some(Path::new("/tmp/astrbot-root")), + Some(PathBuf::from("/tmp/packaged-root")), + "data/backend-startup-heartbeat.json", + ) + .expect("expected heartbeat path"); + + assert_eq!( + path, + PathBuf::from("/tmp/astrbot-root").join("data/backend-startup-heartbeat.json") + ); + } + #[test] fn resolve_backend_timeout_uses_packaged_fallback_when_zero() { let timeout = resolve_backend_timeout_ms(true, "TEST_TIMEOUT_ENV_MISSING", 20_000, 300_000); diff --git a/src-tauri/src/backend/launch.rs b/src-tauri/src/backend/launch.rs index ac91d7af..e9a13185 100644 --- a/src-tauri/src/backend/launch.rs +++ b/src-tauri/src/backend/launch.rs @@ -10,7 +10,7 @@ use std::os::windows::process::CommandExt; use tauri::AppHandle; use crate::{ - append_desktop_log, backend_path_override, build_debug_command, launch_plan, logging, + append_desktop_log, backend, backend_path_override, build_debug_command, launch_plan, logging, runtime_paths, BackendState, BACKEND_LOG_MAX_BYTES, DEFAULT_SHELL_LOCALE, LOG_BACKUP_COUNT, }; #[cfg(target_os = "windows")] @@ -125,6 +125,15 @@ impl BackendState { if let Some(root_dir) = &plan.root_dir { command.env("ASTRBOT_ROOT", root_dir); } + if let Some(heartbeat_path) = backend::config::resolve_backend_startup_heartbeat_path( + plan.root_dir.as_deref(), + plan.packaged_mode + .then(runtime_paths::default_packaged_root_dir) + .flatten(), + crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, + ) { + command.env(crate::BACKEND_STARTUP_HEARTBEAT_PATH_ENV, heartbeat_path); + } if let Some(webui_dir) = &plan.webui_dir { command.env("ASTRBOT_WEBUI_DIR", webui_dir); } diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 7eb3793d..75f5cc7f 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -1,6 +1,8 @@ use std::{ - env, thread, - time::{Duration, Instant}, + env, fs, + path::Path, + thread, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, }; use tauri::AppHandle; @@ -40,10 +42,13 @@ impl BackendState { 20_000, PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS, ); - let readiness = backend::runtime::backend_readiness_config(append_desktop_log); + let readiness = backend::runtime::backend_readiness_config(plan, append_desktop_log); + let startup_idle_timeout = Duration::from_millis(readiness.startup_idle_timeout_ms); let start_time = Instant::now(); let mut tcp_ready_logged = false; let mut ever_tcp_reachable = false; + let mut startup_heartbeat_logged = false; + let mut last_startup_heartbeat_at = None; loop { let (http_status, tcp_reachable) = @@ -52,22 +57,13 @@ impl BackendState { return Ok(()); } - if tcp_reachable { - ever_tcp_reachable = true; - if !tcp_ready_logged { - append_desktop_log( - "backend TCP port is reachable but HTTP dashboard is not ready yet; waiting", - ); - tcp_ready_logged = true; - } - } - - { + let child_pid = { let mut guard = self .child .lock() .map_err(|_| "Backend process lock poisoned.".to_string())?; if let Some(child) = guard.as_mut() { + let child_pid = child.id(); match child.try_wait() { Ok(Some(status)) => { *guard = None; @@ -75,7 +71,7 @@ impl BackendState { "Backend process exited before becoming reachable: {status}" )); } - Ok(None) => {} + Ok(None) => child_pid, Err(error) => { return Err(format!("Failed to poll backend process status: {error}")); } @@ -83,6 +79,50 @@ impl BackendState { } else { return Err("Backend process is not running.".to_string()); } + }; + + if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { + if let Some(updated_at) = + read_startup_heartbeat_updated_at(heartbeat_path, child_pid) + { + if last_startup_heartbeat_at + .map(|last_seen| updated_at > last_seen) + .unwrap_or(true) + { + last_startup_heartbeat_at = Some(updated_at); + } + } + + if startup_heartbeat_timestamp_is_fresh( + last_startup_heartbeat_at, + SystemTime::now(), + startup_idle_timeout, + ) { + if !startup_heartbeat_logged { + append_desktop_log( + "backend startup heartbeat is fresh while HTTP dashboard is not ready yet; waiting", + ); + startup_heartbeat_logged = true; + } + } else if last_startup_heartbeat_at.is_some() { + append_desktop_log( + "backend startup heartbeat went stale before HTTP dashboard became ready", + ); + return Err(format!( + "Backend startup heartbeat went stale after {}ms without HTTP readiness.", + readiness.startup_idle_timeout_ms + )); + } + } + + if tcp_reachable { + ever_tcp_reachable = true; + if !tcp_ready_logged { + append_desktop_log( + "backend TCP port is reachable but HTTP dashboard is not ready yet; waiting", + ); + tcp_ready_logged = true; + } } if let Some(limit) = timeout_ms { @@ -93,6 +133,7 @@ impl BackendState { readiness.probe_timeout_ms, http_status, ever_tcp_reachable, + last_startup_heartbeat_at, ); return Err(format!( "Timed out after {}ms waiting for backend startup.", @@ -124,18 +165,126 @@ impl BackendState { probe_timeout_ms: u64, last_http_status: Option, tcp_reachable: bool, + last_startup_heartbeat_at: Option, ) { let last_http_status_text = last_http_status .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); + let startup_heartbeat_age_ms = last_startup_heartbeat_at + .and_then(|updated_at| SystemTime::now().duration_since(updated_at).ok()) + .map(|age| age.as_millis().to_string()) + .unwrap_or_else(|| "none".to_string()); append_desktop_log(&format!( - "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}", + "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}, startup_heartbeat_age_ms={}", timeout.as_millis(), self.backend_url, ready_http_path, probe_timeout_ms, tcp_reachable, - last_http_status_text + last_http_status_text, + startup_heartbeat_age_ms + )); + } +} + +#[derive(serde::Deserialize)] +struct StartupHeartbeatFile { + pid: u32, + state: String, + updated_at_ms: u64, +} + +fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { + let payload = fs::read_to_string(path).ok()?; + let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; + if heartbeat.pid != expected_pid || heartbeat.state != "starting" { + return None; + } + Some(UNIX_EPOCH + Duration::from_millis(heartbeat.updated_at_ms)) +} + +fn startup_heartbeat_timestamp_is_fresh( + updated_at: Option, + now: SystemTime, + max_age: Duration, +) -> bool { + updated_at + .map(|updated_at| now.duration_since(updated_at).unwrap_or(Duration::ZERO)) + .is_some_and(|age| age <= max_age) +} + +fn startup_heartbeat_is_fresh( + path: &Path, + expected_pid: u32, + now: SystemTime, + max_age: Duration, +) -> bool { + startup_heartbeat_timestamp_is_fresh( + read_startup_heartbeat_updated_at(path, expected_pid), + now, + max_age, + ) +} + +#[cfg(test)] +mod tests { + use std::time::{Duration, SystemTime, UNIX_EPOCH}; + + use tempfile::TempDir; + + use super::*; + + #[test] + fn startup_heartbeat_is_fresh_for_recent_timestamp() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + r#"{"pid":42,"state":"starting","updated_at_ms":5000}"#, + ) + .expect("write heartbeat file"); + + assert!(startup_heartbeat_is_fresh( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + Duration::from_secs(1), + )); + } + + #[test] + fn startup_heartbeat_is_not_fresh_for_stale_timestamp() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + r#"{"pid":42,"state":"starting","updated_at_ms":1000}"#, + ) + .expect("write heartbeat file"); + + assert!(!startup_heartbeat_is_fresh( + &heartbeat_path, + 42, + SystemTime::UNIX_EPOCH + Duration::from_millis(5000), + Duration::from_secs(1), + )); + } + + #[test] + fn startup_heartbeat_is_not_fresh_for_mismatched_pid() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + r#"{"pid":7,"state":"starting","updated_at_ms":5000}"#, + ) + .expect("write heartbeat file"); + + assert!(!startup_heartbeat_is_fresh( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + Duration::from_secs(1), )); } } diff --git a/src-tauri/src/backend/runtime.rs b/src-tauri/src/backend/runtime.rs index c928c4a6..894a52dd 100644 --- a/src-tauri/src/backend/runtime.rs +++ b/src-tauri/src/backend/runtime.rs @@ -15,12 +15,15 @@ pub fn backend_wait_timeout(packaged_mode: bool) -> Duration { .unwrap_or(Duration::from_millis(20_000)) } -pub fn backend_readiness_config(log: F) -> backend::config::BackendReadinessConfig +pub fn backend_readiness_config( + plan: &crate::LaunchPlan, + log: F, +) -> backend::config::BackendReadinessConfig where F: Fn(&str) + Copy, { let probe_timeout_fallback = backend_ping_timeout_ms(log); - backend::config::backend_readiness_config( + let mut readiness = backend::config::backend_readiness_config( crate::BACKEND_READY_HTTP_PATH_ENV, crate::DEFAULT_BACKEND_READY_HTTP_PATH, crate::BACKEND_READY_PROBE_TIMEOUT_ENV, @@ -32,7 +35,26 @@ where crate::BACKEND_READY_POLL_INTERVAL_MIN_MS, crate::BACKEND_READY_POLL_INTERVAL_MAX_MS, |message| log(&message), - ) + ); + readiness.startup_idle_timeout_ms = match env::var(crate::BACKEND_STARTUP_IDLE_TIMEOUT_ENV) { + Ok(raw) => backend::config::resolve_backend_startup_idle_timeout_ms( + &raw, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_ENV, + crate::DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS, + |message| log(&message), + ), + Err(_) => crate::DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS, + }; + readiness.startup_heartbeat_path = backend::config::resolve_backend_startup_heartbeat_path( + plan.root_dir.as_deref(), + plan.packaged_mode + .then(crate::runtime_paths::default_packaged_root_dir) + .flatten(), + crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, + ); + readiness } pub fn backend_ping_timeout_ms(log: F) -> u64 From 068564a26fcedf53c010a62f72da07ccd117a3c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 12:13:07 +0900 Subject: [PATCH 02/13] fix: tighten startup heartbeat validation --- src-tauri/src/backend/readiness.rs | 70 +++++++++++++++++------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 75f5cc7f..f9fffce9 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -82,16 +82,10 @@ impl BackendState { }; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { - if let Some(updated_at) = - read_startup_heartbeat_updated_at(heartbeat_path, child_pid) - { - if last_startup_heartbeat_at - .map(|last_seen| updated_at > last_seen) - .unwrap_or(true) - { - last_startup_heartbeat_at = Some(updated_at); - } - } + last_startup_heartbeat_at = next_startup_heartbeat_at( + last_startup_heartbeat_at, + read_startup_heartbeat_updated_at(heartbeat_path, child_pid), + ); if startup_heartbeat_timestamp_is_fresh( last_startup_heartbeat_at, @@ -209,21 +203,19 @@ fn startup_heartbeat_timestamp_is_fresh( max_age: Duration, ) -> bool { updated_at - .map(|updated_at| now.duration_since(updated_at).unwrap_or(Duration::ZERO)) + .and_then(|updated_at| now.duration_since(updated_at).ok()) .is_some_and(|age| age <= max_age) } -fn startup_heartbeat_is_fresh( - path: &Path, - expected_pid: u32, - now: SystemTime, - max_age: Duration, -) -> bool { - startup_heartbeat_timestamp_is_fresh( - read_startup_heartbeat_updated_at(path, expected_pid), - now, - max_age, - ) +fn next_startup_heartbeat_at( + previous: Option, + current: Option, +) -> Option { + match (previous, current) { + (_, None) => None, + (Some(previous), Some(current)) if current <= previous => Some(previous), + (_, Some(current)) => Some(current), + } } #[cfg(test)] @@ -244,9 +236,11 @@ mod tests { ) .expect("write heartbeat file"); - assert!(startup_heartbeat_is_fresh( - &heartbeat_path, - 42, + let updated_at = + read_startup_heartbeat_updated_at(&heartbeat_path, 42).expect("heartbeat timestamp"); + + assert!(startup_heartbeat_timestamp_is_fresh( + Some(updated_at), UNIX_EPOCH + Duration::from_millis(5500), Duration::from_secs(1), )); @@ -262,9 +256,11 @@ mod tests { ) .expect("write heartbeat file"); - assert!(!startup_heartbeat_is_fresh( - &heartbeat_path, - 42, + let updated_at = + read_startup_heartbeat_updated_at(&heartbeat_path, 42).expect("heartbeat timestamp"); + + assert!(!startup_heartbeat_timestamp_is_fresh( + Some(updated_at), SystemTime::UNIX_EPOCH + Duration::from_millis(5000), Duration::from_secs(1), )); @@ -280,11 +276,23 @@ mod tests { ) .expect("write heartbeat file"); - assert!(!startup_heartbeat_is_fresh( - &heartbeat_path, - 42, + assert_eq!(read_startup_heartbeat_updated_at(&heartbeat_path, 42), None); + } + + #[test] + fn startup_heartbeat_is_not_fresh_for_future_timestamp() { + assert!(!startup_heartbeat_timestamp_is_fresh( + Some(UNIX_EPOCH + Duration::from_millis(6000)), UNIX_EPOCH + Duration::from_millis(5500), Duration::from_secs(1), )); } + + #[test] + fn next_startup_heartbeat_at_clears_previous_timestamp_when_current_is_invalid() { + assert_eq!( + next_startup_heartbeat_at(Some(UNIX_EPOCH + Duration::from_millis(5000)), None), + None + ); + } } From 011aeaa79b25ddf0a6848a87d3b3cbe46d111d12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 12:42:37 +0900 Subject: [PATCH 03/13] refactor: centralize startup heartbeat metadata --- src-tauri/src/app_helpers.rs | 1 + src-tauri/src/app_types.rs | 1 + src-tauri/src/backend/launch.rs | 10 ++---- src-tauri/src/backend/readiness.rs | 19 ++++++++-- src-tauri/src/backend/restart.rs | 1 + src-tauri/src/backend/runtime.rs | 8 +---- src-tauri/src/launch_plan.rs | 58 ++++++++++++++++++++++++++++-- 7 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src-tauri/src/app_helpers.rs b/src-tauri/src/app_helpers.rs index 6d85071e..11e37307 100644 --- a/src-tauri/src/app_helpers.rs +++ b/src-tauri/src/app_helpers.rs @@ -79,6 +79,7 @@ mod tests { cwd: PathBuf::from("."), root_dir: None, webui_dir: None, + startup_heartbeat_path: None, packaged_mode: false, }; diff --git a/src-tauri/src/app_types.rs b/src-tauri/src/app_types.rs index 53e0bbf7..aea509a3 100644 --- a/src-tauri/src/app_types.rs +++ b/src-tauri/src/app_types.rs @@ -33,6 +33,7 @@ pub(crate) struct LaunchPlan { pub(crate) cwd: PathBuf, pub(crate) root_dir: Option, pub(crate) webui_dir: Option, + pub(crate) startup_heartbeat_path: Option, pub(crate) packaged_mode: bool, } diff --git a/src-tauri/src/backend/launch.rs b/src-tauri/src/backend/launch.rs index e9a13185..a96e9449 100644 --- a/src-tauri/src/backend/launch.rs +++ b/src-tauri/src/backend/launch.rs @@ -10,7 +10,7 @@ use std::os::windows::process::CommandExt; use tauri::AppHandle; use crate::{ - append_desktop_log, backend, backend_path_override, build_debug_command, launch_plan, logging, + append_desktop_log, backend_path_override, build_debug_command, launch_plan, logging, runtime_paths, BackendState, BACKEND_LOG_MAX_BYTES, DEFAULT_SHELL_LOCALE, LOG_BACKUP_COUNT, }; #[cfg(target_os = "windows")] @@ -125,13 +125,7 @@ impl BackendState { if let Some(root_dir) = &plan.root_dir { command.env("ASTRBOT_ROOT", root_dir); } - if let Some(heartbeat_path) = backend::config::resolve_backend_startup_heartbeat_path( - plan.root_dir.as_deref(), - plan.packaged_mode - .then(runtime_paths::default_packaged_root_dir) - .flatten(), - crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, - ) { + if let Some(heartbeat_path) = plan.startup_heartbeat_path.as_ref() { command.env(crate::BACKEND_STARTUP_HEARTBEAT_PATH_ENV, heartbeat_path); } if let Some(webui_dir) = &plan.webui_dir { diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index f9fffce9..4e1313bd 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -184,14 +184,21 @@ impl BackendState { #[derive(serde::Deserialize)] struct StartupHeartbeatFile { pid: u32, - state: String, + state: StartupHeartbeatState, updated_at_ms: u64, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Deserialize)] +#[serde(rename_all = "lowercase")] +enum StartupHeartbeatState { + Starting, + Stopping, +} + fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { let payload = fs::read_to_string(path).ok()?; let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; - if heartbeat.pid != expected_pid || heartbeat.state != "starting" { + if heartbeat.pid != expected_pid || heartbeat.state != StartupHeartbeatState::Starting { return None; } Some(UNIX_EPOCH + Duration::from_millis(heartbeat.updated_at_ms)) @@ -295,4 +302,12 @@ mod tests { None ); } + + #[test] + fn startup_heartbeat_file_rejects_unknown_state() { + assert!(serde_json::from_str::( + r#"{"pid":42,"state":"unexpected","updated_at_ms":5000}"# + ) + .is_err()); + } } diff --git a/src-tauri/src/backend/restart.rs b/src-tauri/src/backend/restart.rs index bc93c3bd..7372d6f9 100644 --- a/src-tauri/src/backend/restart.rs +++ b/src-tauri/src/backend/restart.rs @@ -340,6 +340,7 @@ mod tests { cwd: std::path::PathBuf::from("."), root_dir: None, webui_dir: None, + startup_heartbeat_path: None, packaged_mode: true, }; let state = BackendState::default(); diff --git a/src-tauri/src/backend/runtime.rs b/src-tauri/src/backend/runtime.rs index 894a52dd..990191fd 100644 --- a/src-tauri/src/backend/runtime.rs +++ b/src-tauri/src/backend/runtime.rs @@ -47,13 +47,7 @@ where ), Err(_) => crate::DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS, }; - readiness.startup_heartbeat_path = backend::config::resolve_backend_startup_heartbeat_path( - plan.root_dir.as_deref(), - plan.packaged_mode - .then(crate::runtime_paths::default_packaged_root_dir) - .flatten(), - crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, - ); + readiness.startup_heartbeat_path = plan.startup_heartbeat_path.clone(); readiness } diff --git a/src-tauri/src/launch_plan.rs b/src-tauri/src/launch_plan.rs index cb421bbe..21e1121c 100644 --- a/src-tauri/src/launch_plan.rs +++ b/src-tauri/src/launch_plan.rs @@ -5,7 +5,7 @@ use std::{ use tauri::AppHandle; -use crate::{packaged_webui, runtime_paths, LaunchPlan, RuntimeManifest}; +use crate::{backend, packaged_webui, runtime_paths, LaunchPlan, RuntimeManifest}; const BACKEND_RESOURCE_ALIAS: &str = env!("ASTRBOT_BACKEND_RESOURCE_ALIAS"); const WEBUI_RESOURCE_ALIAS: &str = env!("ASTRBOT_WEBUI_RESOURCE_ALIAS"); @@ -14,6 +14,19 @@ fn build_packaged_resource_relative_path(resource_alias: &str, leaf_name: &str) PathBuf::from(resource_alias).join(leaf_name) } +fn resolve_launch_startup_heartbeat_path( + root_dir: Option<&Path>, + packaged_mode: bool, +) -> Option { + backend::config::resolve_backend_startup_heartbeat_path( + root_dir, + packaged_mode + .then(runtime_paths::default_packaged_root_dir) + .flatten(), + crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, + ) +} + pub fn resolve_custom_launch(custom_cmd: String) -> Result { let mut pieces = shlex::split(&custom_cmd) .ok_or_else(|| format!("Invalid ASTRBOT_BACKEND_CMD: {custom_cmd}"))?; @@ -29,6 +42,7 @@ pub fn resolve_custom_launch(custom_cmd: String) -> Result { .unwrap_or_else(runtime_paths::workspace_root_dir); let root_dir = env::var("ASTRBOT_ROOT").ok().map(PathBuf::from); let webui_dir = env::var("ASTRBOT_WEBUI_DIR").ok().map(PathBuf::from); + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); Ok(LaunchPlan { cmd, @@ -36,6 +50,7 @@ pub fn resolve_custom_launch(custom_cmd: String) -> Result { cwd, root_dir, webui_dir, + startup_heartbeat_path, packaged_mode: false, }) } @@ -141,6 +156,7 @@ where "--webui-dir".to_string(), webui_dir.to_string_lossy().to_string(), ]; + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), true); let plan = LaunchPlan { cmd: python_path.to_string_lossy().to_string(), @@ -148,6 +164,7 @@ where cwd, root_dir, webui_dir: Some(webui_dir), + startup_heartbeat_path, packaged_mode: true, }; Ok(Some(plan)) @@ -174,6 +191,8 @@ pub fn resolve_dev_launch() -> Result { args.push("--webui-dir".to_string()); args.push(path.to_string_lossy().to_string()); } + let root_dir = env::var("ASTRBOT_ROOT").ok().map(PathBuf::from); + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); Ok(LaunchPlan { cmd: "uv".to_string(), @@ -181,8 +200,9 @@ pub fn resolve_dev_launch() -> Result { cwd: env::var("ASTRBOT_BACKEND_CWD") .map(PathBuf::from) .unwrap_or(source_root), - root_dir: env::var("ASTRBOT_ROOT").ok().map(PathBuf::from), + root_dir, webui_dir, + startup_heartbeat_path, packaged_mode: false, }) } @@ -191,6 +211,28 @@ pub fn resolve_dev_launch() -> Result { mod tests { use super::*; + struct EnvVarGuard { + key: &'static str, + previous: Option, + } + + impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + let previous = env::var(key).ok(); + env::set_var(key, value); + Self { key, previous } + } + } + + impl Drop for EnvVarGuard { + fn drop(&mut self) { + match &self.previous { + Some(value) => env::set_var(self.key, value), + None => env::remove_var(self.key), + } + } + } + #[test] fn build_packaged_resource_relative_path_joins_alias_and_leaf_name() { assert_eq!( @@ -202,4 +244,16 @@ mod tests { PathBuf::from("runtime/webui").join("index.html") ); } + + #[test] + fn resolve_custom_launch_sets_startup_heartbeat_path_from_root_dir() { + let _root_guard = EnvVarGuard::set("ASTRBOT_ROOT", "/tmp/astrbot-root"); + + let plan = resolve_custom_launch("python main.py".to_string()).expect("custom plan"); + + assert_eq!( + plan.startup_heartbeat_path, + Some(PathBuf::from("/tmp/astrbot-root").join("data/backend-startup-heartbeat.json")) + ); + } } From c72be6e60f862314e75b82fa70fbb87358d9c33e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 12:50:34 +0900 Subject: [PATCH 04/13] fix: surface heartbeat invalidation sooner --- scripts/backend/templates/launch_backend.py | 33 ++++++++++++--- src-tauri/src/backend/readiness.rs | 47 +++++++++++++++++---- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index eff148e7..1a64f404 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -126,7 +126,9 @@ def resolve_startup_heartbeat_path() -> Path | None: return Path(raw) -def write_startup_heartbeat(path: Path, state: str) -> None: +def write_startup_heartbeat( + path: Path, state: str, *, warn_on_error: bool = False +) -> bool: try: path.parent.mkdir(parents=True, exist_ok=True) payload = { @@ -140,8 +142,14 @@ def write_startup_heartbeat(path: Path, state: str) -> None: encoding="utf-8", ) temp_path.replace(path) - except Exception: - return + return True + except Exception as exc: + if warn_on_error: + print( + f"[startup-heartbeat] failed to write heartbeat to {path}: {exc.__class__.__name__}: {exc}", + file=sys.stderr, + ) + return False def start_startup_heartbeat() -> None: @@ -150,15 +158,28 @@ def start_startup_heartbeat() -> None: return stop_event = threading.Event() - write_startup_heartbeat(heartbeat_path, "starting") + warning_emitted = not write_startup_heartbeat( + heartbeat_path, + "starting", + warn_on_error=True, + ) + + def refresh_heartbeat(state: str) -> None: + nonlocal warning_emitted + if not write_startup_heartbeat( + heartbeat_path, + state, + warn_on_error=not warning_emitted, + ): + warning_emitted = True def stop_heartbeat() -> None: stop_event.set() - write_startup_heartbeat(heartbeat_path, "stopping") + refresh_heartbeat("stopping") def heartbeat_loop() -> None: while not stop_event.wait(STARTUP_HEARTBEAT_INTERVAL_SECONDS): - write_startup_heartbeat(heartbeat_path, "starting") + refresh_heartbeat("starting") atexit.register(stop_heartbeat) threading.Thread( diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 4e1313bd..d3f38f2f 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -82,10 +82,31 @@ impl BackendState { }; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { - last_startup_heartbeat_at = next_startup_heartbeat_at( + match next_startup_heartbeat_at( last_startup_heartbeat_at, read_startup_heartbeat_updated_at(heartbeat_path, child_pid), - ); + ) { + StartupHeartbeatObservation::Missing => { + last_startup_heartbeat_at = None; + } + StartupHeartbeatObservation::Observed(updated_at) => { + last_startup_heartbeat_at = Some(updated_at); + } + StartupHeartbeatObservation::Invalidated(previous) => { + let heartbeat_age_ms = SystemTime::now() + .duration_since(previous) + .ok() + .map(|age| age.as_millis().to_string()) + .unwrap_or_else(|| "unknown".to_string()); + append_desktop_log(&format!( + "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" + )); + return Err( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." + .to_string(), + ); + } + } if startup_heartbeat_timestamp_is_fresh( last_startup_heartbeat_at, @@ -195,6 +216,13 @@ enum StartupHeartbeatState { Stopping, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum StartupHeartbeatObservation { + Missing, + Observed(SystemTime), + Invalidated(SystemTime), +} + fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { let payload = fs::read_to_string(path).ok()?; let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; @@ -217,11 +245,14 @@ fn startup_heartbeat_timestamp_is_fresh( fn next_startup_heartbeat_at( previous: Option, current: Option, -) -> Option { +) -> StartupHeartbeatObservation { match (previous, current) { - (_, None) => None, - (Some(previous), Some(current)) if current <= previous => Some(previous), - (_, Some(current)) => Some(current), + (Some(previous), None) => StartupHeartbeatObservation::Invalidated(previous), + (None, None) => StartupHeartbeatObservation::Missing, + (Some(previous), Some(current)) if current <= previous => { + StartupHeartbeatObservation::Observed(previous) + } + (_, Some(current)) => StartupHeartbeatObservation::Observed(current), } } @@ -296,10 +327,10 @@ mod tests { } #[test] - fn next_startup_heartbeat_at_clears_previous_timestamp_when_current_is_invalid() { + fn next_startup_heartbeat_at_marks_previous_timestamp_invalid_when_current_is_missing() { assert_eq!( next_startup_heartbeat_at(Some(UNIX_EPOCH + Duration::from_millis(5000)), None), - None + StartupHeartbeatObservation::Invalidated(UNIX_EPOCH + Duration::from_millis(5000)) ); } From d8f8200d91ba0edb943aa42f912500e9881ae252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 13:02:09 +0900 Subject: [PATCH 05/13] fix: harden startup heartbeat parsing --- scripts/backend/templates/launch_backend.py | 66 +++++++++++---------- src-tauri/src/backend/readiness.rs | 59 ++++++++++++++---- 2 files changed, 82 insertions(+), 43 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 1a64f404..9eab090b 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -152,41 +152,47 @@ def write_startup_heartbeat( return False +class StartupHeartbeat: + def __init__(self, path: Path, interval_seconds: float) -> None: + self._path = path + self._interval_seconds = interval_seconds + self._stop_event = threading.Event() + self._warning_emitted = False + + def _write(self, state: str, *, warn_on_error: bool) -> bool: + ok = write_startup_heartbeat( + self._path, + state, + warn_on_error=warn_on_error, + ) + if not ok: + self._warning_emitted = True + return ok + + def start(self) -> None: + self._write("starting", warn_on_error=True) + atexit.register(self.stop) + threading.Thread( + target=self._loop, + name="astrbot-startup-heartbeat", + daemon=True, + ).start() + + def stop(self) -> None: + self._stop_event.set() + self._write("stopping", warn_on_error=not self._warning_emitted) + + def _loop(self) -> None: + while not self._stop_event.wait(self._interval_seconds): + self._write("starting", warn_on_error=not self._warning_emitted) + + def start_startup_heartbeat() -> None: heartbeat_path = resolve_startup_heartbeat_path() if heartbeat_path is None: return - stop_event = threading.Event() - warning_emitted = not write_startup_heartbeat( - heartbeat_path, - "starting", - warn_on_error=True, - ) - - def refresh_heartbeat(state: str) -> None: - nonlocal warning_emitted - if not write_startup_heartbeat( - heartbeat_path, - state, - warn_on_error=not warning_emitted, - ): - warning_emitted = True - - def stop_heartbeat() -> None: - stop_event.set() - refresh_heartbeat("stopping") - - def heartbeat_loop() -> None: - while not stop_event.wait(STARTUP_HEARTBEAT_INTERVAL_SECONDS): - refresh_heartbeat("starting") - - atexit.register(stop_heartbeat) - threading.Thread( - target=heartbeat_loop, - name="astrbot-startup-heartbeat", - daemon=True, - ).start() + StartupHeartbeat(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS).start() configure_stdio_utf8() diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index d3f38f2f..4ee401fd 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -56,6 +56,7 @@ impl BackendState { if matches!(http_status, Some(status_code) if (200..400).contains(&status_code)) { return Ok(()); } + let now = SystemTime::now(); let child_pid = { let mut guard = self @@ -93,7 +94,7 @@ impl BackendState { last_startup_heartbeat_at = Some(updated_at); } StartupHeartbeatObservation::Invalidated(previous) => { - let heartbeat_age_ms = SystemTime::now() + let heartbeat_age_ms = now .duration_since(previous) .ok() .map(|age| age.as_millis().to_string()) @@ -110,7 +111,7 @@ impl BackendState { if startup_heartbeat_timestamp_is_fresh( last_startup_heartbeat_at, - SystemTime::now(), + now, startup_idle_timeout, ) { if !startup_heartbeat_logged { @@ -146,9 +147,12 @@ impl BackendState { limit, &readiness.path, readiness.probe_timeout_ms, - http_status, - ever_tcp_reachable, - last_startup_heartbeat_at, + ReadinessTimeoutSnapshot { + now, + last_http_status: http_status, + tcp_reachable: ever_tcp_reachable, + last_startup_heartbeat_at, + }, ); return Err(format!( "Timed out after {}ms waiting for backend startup.", @@ -178,15 +182,15 @@ impl BackendState { timeout: Duration, ready_http_path: &str, probe_timeout_ms: u64, - last_http_status: Option, - tcp_reachable: bool, - last_startup_heartbeat_at: Option, + snapshot: ReadinessTimeoutSnapshot, ) { - let last_http_status_text = last_http_status + let last_http_status_text = snapshot + .last_http_status .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); - let startup_heartbeat_age_ms = last_startup_heartbeat_at - .and_then(|updated_at| SystemTime::now().duration_since(updated_at).ok()) + let startup_heartbeat_age_ms = snapshot + .last_startup_heartbeat_at + .and_then(|updated_at| snapshot.now.duration_since(updated_at).ok()) .map(|age| age.as_millis().to_string()) .unwrap_or_else(|| "none".to_string()); append_desktop_log(&format!( @@ -195,7 +199,7 @@ impl BackendState { self.backend_url, ready_http_path, probe_timeout_ms, - tcp_reachable, + snapshot.tcp_reachable, last_http_status_text, startup_heartbeat_age_ms )); @@ -203,6 +207,7 @@ impl BackendState { } #[derive(serde::Deserialize)] +#[serde(deny_unknown_fields)] struct StartupHeartbeatFile { pid: u32, state: StartupHeartbeatState, @@ -223,13 +228,25 @@ enum StartupHeartbeatObservation { Invalidated(SystemTime), } +#[derive(Debug, Clone, Copy)] +struct ReadinessTimeoutSnapshot { + now: SystemTime, + last_http_status: Option, + tcp_reachable: bool, + last_startup_heartbeat_at: Option, +} + fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { let payload = fs::read_to_string(path).ok()?; let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; if heartbeat.pid != expected_pid || heartbeat.state != StartupHeartbeatState::Starting { return None; } - Some(UNIX_EPOCH + Duration::from_millis(heartbeat.updated_at_ms)) + heartbeat_updated_at_ms_to_system_time(heartbeat.updated_at_ms) +} + +fn heartbeat_updated_at_ms_to_system_time(updated_at_ms: u64) -> Option { + UNIX_EPOCH.checked_add(Duration::from_millis(updated_at_ms)) } fn startup_heartbeat_timestamp_is_fresh( @@ -341,4 +358,20 @@ mod tests { ) .is_err()); } + + #[test] + fn startup_heartbeat_file_rejects_unknown_fields() { + assert!(serde_json::from_str::( + r#"{"pid":42,"state":"starting","updated_at_ms":5000,"unexpected":true}"# + ) + .is_err()); + } + + #[test] + fn heartbeat_updated_at_ms_to_system_time_matches_checked_add() { + assert_eq!( + heartbeat_updated_at_ms_to_system_time(u64::MAX), + UNIX_EPOCH.checked_add(Duration::from_millis(u64::MAX)) + ); + } } From 8638ba9a0993014a599a293050efccfdb917afc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 13:44:30 +0900 Subject: [PATCH 06/13] fix: warn on stop-time heartbeat failures --- scripts/backend/templates/launch_backend.py | 42 +++++++---- .../backend/templates/test_launch_backend.py | 71 +++++++++++++++++++ 2 files changed, 99 insertions(+), 14 deletions(-) create mode 100644 scripts/backend/templates/test_launch_backend.py diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 9eab090b..4715bf60 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -157,15 +157,24 @@ def __init__(self, path: Path, interval_seconds: float) -> None: self._path = path self._interval_seconds = interval_seconds self._stop_event = threading.Event() + self._had_successful_write = False self._warning_emitted = False def _write(self, state: str, *, warn_on_error: bool) -> bool: + effective_warn_on_error = warn_on_error and ( + state == "stopping" + or not self._warning_emitted + or not self._had_successful_write + ) ok = write_startup_heartbeat( self._path, state, - warn_on_error=warn_on_error, + warn_on_error=effective_warn_on_error, ) - if not ok: + if ok: + self._had_successful_write = True + self._warning_emitted = False + elif effective_warn_on_error: self._warning_emitted = True return ok @@ -180,11 +189,11 @@ def start(self) -> None: def stop(self) -> None: self._stop_event.set() - self._write("stopping", warn_on_error=not self._warning_emitted) + self._write("stopping", warn_on_error=True) def _loop(self) -> None: while not self._stop_event.wait(self._interval_seconds): - self._write("starting", warn_on_error=not self._warning_emitted) + self._write("starting", warn_on_error=True) def start_startup_heartbeat() -> None: @@ -195,16 +204,21 @@ def start_startup_heartbeat() -> None: StartupHeartbeat(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS).start() -configure_stdio_utf8() -configure_windows_dll_search_path() -preload_windows_runtime_dlls() -start_startup_heartbeat() +def main() -> None: + configure_stdio_utf8() + configure_windows_dll_search_path() + preload_windows_runtime_dlls() + start_startup_heartbeat() + + sys.path.insert(0, str(APP_DIR)) + + main_file = APP_DIR / "main.py" + if not main_file.is_file(): + raise FileNotFoundError(f"Backend entrypoint not found: {main_file}") -sys.path.insert(0, str(APP_DIR)) + sys.argv[0] = str(main_file) + runpy.run_path(str(main_file), run_name="__main__") -main_file = APP_DIR / "main.py" -if not main_file.is_file(): - raise FileNotFoundError(f"Backend entrypoint not found: {main_file}") -sys.argv[0] = str(main_file) -runpy.run_path(str(main_file), run_name="__main__") +if __name__ == "__main__": + main() diff --git a/scripts/backend/templates/test_launch_backend.py b/scripts/backend/templates/test_launch_backend.py new file mode 100644 index 00000000..9f316794 --- /dev/null +++ b/scripts/backend/templates/test_launch_backend.py @@ -0,0 +1,71 @@ +import importlib.util +import unittest +from pathlib import Path +from unittest import mock + + +MODULE_PATH = Path(__file__).with_name("launch_backend.py") +SPEC = importlib.util.spec_from_file_location("launch_backend_under_test", MODULE_PATH) +if SPEC is None or SPEC.loader is None: + raise RuntimeError(f"Cannot load launch_backend module from {MODULE_PATH}") +launch_backend = importlib.util.module_from_spec(SPEC) +SPEC.loader.exec_module(launch_backend) + + +class StartupHeartbeatTests(unittest.TestCase): + def test_repeated_failures_warn_before_first_success(self) -> None: + heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + side_effect=[False, False], + ) as write_mock: + heartbeat._write("starting", warn_on_error=True) + heartbeat._write("starting", warn_on_error=True) + + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True, True], + ) + + def test_repeated_failures_after_success_are_suppressed(self) -> None: + heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + side_effect=[True, False, False], + ) as write_mock: + heartbeat._write("starting", warn_on_error=True) + heartbeat._write("starting", warn_on_error=True) + heartbeat._write("starting", warn_on_error=True) + + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True, True, False], + ) + + def test_stop_failure_still_warns_after_earlier_failure(self) -> None: + heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + side_effect=[False, False], + ) as write_mock: + heartbeat._write("starting", warn_on_error=True) + heartbeat.stop() + + self.assertEqual( + [call.args[1] for call in write_mock.call_args_list], + ["starting", "stopping"], + ) + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True, True], + ) + + +if __name__ == "__main__": + unittest.main() From cb423aa5e7c3e4ffc38d7959a7a3ed4e8ba4d098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 13:56:20 +0900 Subject: [PATCH 07/13] refactor: simplify startup heartbeat control flow --- scripts/backend/templates/launch_backend.py | 74 +++++----- .../backend/templates/test_launch_backend.py | 42 ++++-- src-tauri/src/backend/readiness.rs | 127 ++++++++++++------ 3 files changed, 144 insertions(+), 99 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 4715bf60..bfe6ba32 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -152,48 +152,26 @@ def write_startup_heartbeat( return False -class StartupHeartbeat: - def __init__(self, path: Path, interval_seconds: float) -> None: - self._path = path - self._interval_seconds = interval_seconds - self._stop_event = threading.Event() - self._had_successful_write = False - self._warning_emitted = False - - def _write(self, state: str, *, warn_on_error: bool) -> bool: - effective_warn_on_error = warn_on_error and ( - state == "stopping" - or not self._warning_emitted - or not self._had_successful_write - ) - ok = write_startup_heartbeat( - self._path, - state, - warn_on_error=effective_warn_on_error, - ) +def heartbeat_loop( + path: Path, interval_seconds: float, stop_event: threading.Event +) -> None: + had_successful_write = False + warning_emitted = False + + ok = write_startup_heartbeat(path, "starting", warn_on_error=True) + if ok: + had_successful_write = True + else: + warning_emitted = True + + while not stop_event.wait(interval_seconds): + warn_now = (not warning_emitted) or (not had_successful_write) + ok = write_startup_heartbeat(path, "starting", warn_on_error=warn_now) if ok: - self._had_successful_write = True - self._warning_emitted = False - elif effective_warn_on_error: - self._warning_emitted = True - return ok - - def start(self) -> None: - self._write("starting", warn_on_error=True) - atexit.register(self.stop) - threading.Thread( - target=self._loop, - name="astrbot-startup-heartbeat", - daemon=True, - ).start() - - def stop(self) -> None: - self._stop_event.set() - self._write("stopping", warn_on_error=True) - - def _loop(self) -> None: - while not self._stop_event.wait(self._interval_seconds): - self._write("starting", warn_on_error=True) + had_successful_write = True + warning_emitted = False + elif warn_now: + warning_emitted = True def start_startup_heartbeat() -> None: @@ -201,7 +179,19 @@ def start_startup_heartbeat() -> None: if heartbeat_path is None: return - StartupHeartbeat(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS).start() + stop_event = threading.Event() + + def on_exit() -> None: + stop_event.set() + write_startup_heartbeat(heartbeat_path, "stopping", warn_on_error=True) + + atexit.register(on_exit) + threading.Thread( + target=heartbeat_loop, + args=(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS, stop_event), + name="astrbot-startup-heartbeat", + daemon=True, + ).start() def main() -> None: diff --git a/scripts/backend/templates/test_launch_backend.py b/scripts/backend/templates/test_launch_backend.py index 9f316794..00f55d74 100644 --- a/scripts/backend/templates/test_launch_backend.py +++ b/scripts/backend/templates/test_launch_backend.py @@ -14,15 +14,15 @@ class StartupHeartbeatTests(unittest.TestCase): def test_repeated_failures_warn_before_first_success(self) -> None: - heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + stop_event = mock.Mock() + stop_event.wait.side_effect = [False, True] with mock.patch.object( launch_backend, "write_startup_heartbeat", side_effect=[False, False], ) as write_mock: - heartbeat._write("starting", warn_on_error=True) - heartbeat._write("starting", warn_on_error=True) + launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event) self.assertEqual( [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], @@ -30,16 +30,15 @@ def test_repeated_failures_warn_before_first_success(self) -> None: ) def test_repeated_failures_after_success_are_suppressed(self) -> None: - heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + stop_event = mock.Mock() + stop_event.wait.side_effect = [False, False, True] with mock.patch.object( launch_backend, "write_startup_heartbeat", side_effect=[True, False, False], ) as write_mock: - heartbeat._write("starting", warn_on_error=True) - heartbeat._write("starting", warn_on_error=True) - heartbeat._write("starting", warn_on_error=True) + launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event) self.assertEqual( [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], @@ -47,23 +46,40 @@ def test_repeated_failures_after_success_are_suppressed(self) -> None: ) def test_stop_failure_still_warns_after_earlier_failure(self) -> None: - heartbeat = launch_backend.StartupHeartbeat(Path("/tmp/heartbeat.json"), 2.0) + stop_event = mock.Mock() + thread = mock.Mock() + register = mock.Mock() with mock.patch.object( launch_backend, "write_startup_heartbeat", - side_effect=[False, False], + return_value=False, ) as write_mock: - heartbeat._write("starting", warn_on_error=True) - heartbeat.stop() + with mock.patch.object( + launch_backend, + "resolve_startup_heartbeat_path", + return_value=Path("/tmp/heartbeat.json"), + ): + with mock.patch.object( + launch_backend.threading, "Event", return_value=stop_event + ): + with mock.patch.object( + launch_backend.threading, "Thread", return_value=thread + ): + with mock.patch.object( + launch_backend.atexit, "register", register + ): + launch_backend.start_startup_heartbeat() + on_exit = register.call_args.args[0] + on_exit() self.assertEqual( [call.args[1] for call in write_mock.call_args_list], - ["starting", "stopping"], + ["stopping"], ) self.assertEqual( [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], - [True, True], + [True], ) diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 4ee401fd..6eebac52 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -47,8 +47,7 @@ impl BackendState { let start_time = Instant::now(); let mut tcp_ready_logged = false; let mut ever_tcp_reachable = false; - let mut startup_heartbeat_logged = false; - let mut last_startup_heartbeat_at = None; + let mut startup_heartbeat_state = StartupHeartbeatTracker::new(); loop { let (http_status, tcp_reachable) = @@ -83,51 +82,19 @@ impl BackendState { }; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { - match next_startup_heartbeat_at( - last_startup_heartbeat_at, - read_startup_heartbeat_updated_at(heartbeat_path, child_pid), - ) { - StartupHeartbeatObservation::Missing => { - last_startup_heartbeat_at = None; - } - StartupHeartbeatObservation::Observed(updated_at) => { - last_startup_heartbeat_at = Some(updated_at); - } - StartupHeartbeatObservation::Invalidated(previous) => { - let heartbeat_age_ms = now - .duration_since(previous) - .ok() - .map(|age| age.as_millis().to_string()) - .unwrap_or_else(|| "unknown".to_string()); - append_desktop_log(&format!( - "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" - )); - return Err( - "Backend startup heartbeat disappeared or became invalid before HTTP readiness." - .to_string(), - ); - } - } - - if startup_heartbeat_timestamp_is_fresh( - last_startup_heartbeat_at, + match step_startup_heartbeat( + heartbeat_path, + child_pid, now, startup_idle_timeout, + startup_heartbeat_state, ) { - if !startup_heartbeat_logged { - append_desktop_log( - "backend startup heartbeat is fresh while HTTP dashboard is not ready yet; waiting", - ); - startup_heartbeat_logged = true; + StartupHeartbeatStep::Continue(next_state) => { + startup_heartbeat_state = next_state; + } + StartupHeartbeatStep::Failed(reason) => { + return Err(reason); } - } else if last_startup_heartbeat_at.is_some() { - append_desktop_log( - "backend startup heartbeat went stale before HTTP dashboard became ready", - ); - return Err(format!( - "Backend startup heartbeat went stale after {}ms without HTTP readiness.", - readiness.startup_idle_timeout_ms - )); } } @@ -151,7 +118,7 @@ impl BackendState { now, last_http_status: http_status, tcp_reachable: ever_tcp_reachable, - last_startup_heartbeat_at, + last_startup_heartbeat_at: startup_heartbeat_state.last_seen_at, }, ); return Err(format!( @@ -228,6 +195,26 @@ enum StartupHeartbeatObservation { Invalidated(SystemTime), } +#[derive(Debug, Clone, Copy)] +struct StartupHeartbeatTracker { + last_seen_at: Option, + logged_fresh: bool, +} + +impl StartupHeartbeatTracker { + fn new() -> Self { + Self { + last_seen_at: None, + logged_fresh: false, + } + } +} + +enum StartupHeartbeatStep { + Continue(StartupHeartbeatTracker), + Failed(String), +} + #[derive(Debug, Clone, Copy)] struct ReadinessTimeoutSnapshot { now: SystemTime, @@ -273,6 +260,58 @@ fn next_startup_heartbeat_at( } } +fn step_startup_heartbeat( + heartbeat_path: &Path, + child_pid: u32, + now: SystemTime, + idle_timeout: Duration, + mut state: StartupHeartbeatTracker, +) -> StartupHeartbeatStep { + match next_startup_heartbeat_at( + state.last_seen_at, + read_startup_heartbeat_updated_at(heartbeat_path, child_pid), + ) { + StartupHeartbeatObservation::Missing => { + state.last_seen_at = None; + StartupHeartbeatStep::Continue(state) + } + StartupHeartbeatObservation::Observed(updated_at) => { + state.last_seen_at = Some(updated_at); + if startup_heartbeat_timestamp_is_fresh(state.last_seen_at, now, idle_timeout) { + if !state.logged_fresh { + append_desktop_log( + "backend startup heartbeat is fresh while HTTP dashboard is not ready yet; waiting", + ); + state.logged_fresh = true; + } + StartupHeartbeatStep::Continue(state) + } else { + append_desktop_log( + "backend startup heartbeat went stale before HTTP dashboard became ready", + ); + StartupHeartbeatStep::Failed(format!( + "Backend startup heartbeat went stale after {}ms without HTTP readiness.", + idle_timeout.as_millis() + )) + } + } + StartupHeartbeatObservation::Invalidated(previous) => { + let heartbeat_age_ms = now + .duration_since(previous) + .ok() + .map(|age| age.as_millis().to_string()) + .unwrap_or_else(|| "unknown".to_string()); + append_desktop_log(&format!( + "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" + )); + StartupHeartbeatStep::Failed( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." + .to_string(), + ) + } + } +} + #[cfg(test)] mod tests { use std::time::{Duration, SystemTime, UNIX_EPOCH}; From b64664ebdaabc0ec1ba26f9bc39a0fbfa20af1e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 14:07:27 +0900 Subject: [PATCH 08/13] refactor: flatten readiness heartbeat helpers --- src-tauri/src/backend/readiness.rs | 227 ++++++++++++++--------------- 1 file changed, 106 insertions(+), 121 deletions(-) diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 6eebac52..4914d6f7 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -57,45 +57,16 @@ impl BackendState { } let now = SystemTime::now(); - let child_pid = { - let mut guard = self - .child - .lock() - .map_err(|_| "Backend process lock poisoned.".to_string())?; - if let Some(child) = guard.as_mut() { - let child_pid = child.id(); - match child.try_wait() { - Ok(Some(status)) => { - *guard = None; - return Err(format!( - "Backend process exited before becoming reachable: {status}" - )); - } - Ok(None) => child_pid, - Err(error) => { - return Err(format!("Failed to poll backend process status: {error}")); - } - } - } else { - return Err("Backend process is not running.".to_string()); - } - }; + let child_pid = self.child_pid_or_error()?; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { - match step_startup_heartbeat( + step_startup_heartbeat( heartbeat_path, child_pid, now, startup_idle_timeout, - startup_heartbeat_state, - ) { - StartupHeartbeatStep::Continue(next_state) => { - startup_heartbeat_state = next_state; - } - StartupHeartbeatStep::Failed(reason) => { - return Err(reason); - } - } + &mut startup_heartbeat_state, + )?; } if tcp_reachable { @@ -112,14 +83,11 @@ impl BackendState { if start_time.elapsed() >= limit { self.log_backend_readiness_timeout( limit, - &readiness.path, - readiness.probe_timeout_ms, - ReadinessTimeoutSnapshot { - now, - last_http_status: http_status, - tcp_reachable: ever_tcp_reachable, - last_startup_heartbeat_at: startup_heartbeat_state.last_seen_at, - }, + &readiness, + now, + http_status, + ever_tcp_reachable, + startup_heartbeat_state.last_seen_at, ); return Err(format!( "Timed out after {}ms waiting for backend startup.", @@ -144,29 +112,51 @@ impl BackendState { (http_status, tcp_reachable) } + fn child_pid_or_error(&self) -> Result { + let mut guard = self + .child + .lock() + .map_err(|_| "Backend process lock poisoned.".to_string())?; + if let Some(child) = guard.as_mut() { + let pid = child.id(); + match child.try_wait() { + Ok(Some(status)) => { + *guard = None; + Err(format!( + "Backend process exited before becoming reachable: {status}" + )) + } + Ok(None) => Ok(pid), + Err(error) => Err(format!("Failed to poll backend process status: {error}")), + } + } else { + Err("Backend process is not running.".to_string()) + } + } + fn log_backend_readiness_timeout( &self, timeout: Duration, - ready_http_path: &str, - probe_timeout_ms: u64, - snapshot: ReadinessTimeoutSnapshot, + readiness: &backend::config::BackendReadinessConfig, + now: SystemTime, + last_http_status: Option, + tcp_reachable: bool, + last_startup_heartbeat_at: Option, ) { - let last_http_status_text = snapshot - .last_http_status + let last_http_status_text = last_http_status .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); - let startup_heartbeat_age_ms = snapshot - .last_startup_heartbeat_at - .and_then(|updated_at| snapshot.now.duration_since(updated_at).ok()) + let startup_heartbeat_age_ms = last_startup_heartbeat_at + .and_then(|updated_at| now.duration_since(updated_at).ok()) .map(|age| age.as_millis().to_string()) .unwrap_or_else(|| "none".to_string()); append_desktop_log(&format!( "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}, startup_heartbeat_age_ms={}", timeout.as_millis(), self.backend_url, - ready_http_path, - probe_timeout_ms, - snapshot.tcp_reachable, + readiness.path, + readiness.probe_timeout_ms, + tcp_reachable, last_http_status_text, startup_heartbeat_age_ms )); @@ -188,13 +178,6 @@ enum StartupHeartbeatState { Stopping, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum StartupHeartbeatObservation { - Missing, - Observed(SystemTime), - Invalidated(SystemTime), -} - #[derive(Debug, Clone, Copy)] struct StartupHeartbeatTracker { last_seen_at: Option, @@ -210,30 +193,13 @@ impl StartupHeartbeatTracker { } } -enum StartupHeartbeatStep { - Continue(StartupHeartbeatTracker), - Failed(String), -} - -#[derive(Debug, Clone, Copy)] -struct ReadinessTimeoutSnapshot { - now: SystemTime, - last_http_status: Option, - tcp_reachable: bool, - last_startup_heartbeat_at: Option, -} - fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { let payload = fs::read_to_string(path).ok()?; let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; if heartbeat.pid != expected_pid || heartbeat.state != StartupHeartbeatState::Starting { return None; } - heartbeat_updated_at_ms_to_system_time(heartbeat.updated_at_ms) -} - -fn heartbeat_updated_at_ms_to_system_time(updated_at_ms: u64) -> Option { - UNIX_EPOCH.checked_add(Duration::from_millis(updated_at_ms)) + UNIX_EPOCH.checked_add(Duration::from_millis(heartbeat.updated_at_ms)) } fn startup_heartbeat_timestamp_is_fresh( @@ -246,36 +212,40 @@ fn startup_heartbeat_timestamp_is_fresh( .is_some_and(|age| age <= max_age) } -fn next_startup_heartbeat_at( - previous: Option, - current: Option, -) -> StartupHeartbeatObservation { - match (previous, current) { - (Some(previous), None) => StartupHeartbeatObservation::Invalidated(previous), - (None, None) => StartupHeartbeatObservation::Missing, - (Some(previous), Some(current)) if current <= previous => { - StartupHeartbeatObservation::Observed(previous) - } - (_, Some(current)) => StartupHeartbeatObservation::Observed(current), - } -} - fn step_startup_heartbeat( heartbeat_path: &Path, child_pid: u32, now: SystemTime, idle_timeout: Duration, - mut state: StartupHeartbeatTracker, -) -> StartupHeartbeatStep { - match next_startup_heartbeat_at( - state.last_seen_at, - read_startup_heartbeat_updated_at(heartbeat_path, child_pid), - ) { - StartupHeartbeatObservation::Missing => { + state: &mut StartupHeartbeatTracker, +) -> Result<(), String> { + let previous = state.last_seen_at; + let current = read_startup_heartbeat_updated_at(heartbeat_path, child_pid); + + match (previous, current) { + (Some(previous), None) => { + let heartbeat_age_ms = now + .duration_since(previous) + .ok() + .map(|age| age.as_millis().to_string()) + .unwrap_or_else(|| "unknown".to_string()); + append_desktop_log(&format!( + "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" + )); + Err( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." + .to_string(), + ) + } + (None, None) => { state.last_seen_at = None; - StartupHeartbeatStep::Continue(state) + Ok(()) } - StartupHeartbeatObservation::Observed(updated_at) => { + (_, Some(current)) => { + let updated_at = match previous { + Some(previous) if current <= previous => previous, + _ => current, + }; state.last_seen_at = Some(updated_at); if startup_heartbeat_timestamp_is_fresh(state.last_seen_at, now, idle_timeout) { if !state.logged_fresh { @@ -284,31 +254,17 @@ fn step_startup_heartbeat( ); state.logged_fresh = true; } - StartupHeartbeatStep::Continue(state) + Ok(()) } else { append_desktop_log( "backend startup heartbeat went stale before HTTP dashboard became ready", ); - StartupHeartbeatStep::Failed(format!( + Err(format!( "Backend startup heartbeat went stale after {}ms without HTTP readiness.", idle_timeout.as_millis() )) } } - StartupHeartbeatObservation::Invalidated(previous) => { - let heartbeat_age_ms = now - .duration_since(previous) - .ok() - .map(|age| age.as_millis().to_string()) - .unwrap_or_else(|| "unknown".to_string()); - append_desktop_log(&format!( - "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" - )); - StartupHeartbeatStep::Failed( - "Backend startup heartbeat disappeared or became invalid before HTTP readiness." - .to_string(), - ) - } } } @@ -383,10 +339,28 @@ mod tests { } #[test] - fn next_startup_heartbeat_at_marks_previous_timestamp_invalid_when_current_is_missing() { + fn step_startup_heartbeat_fails_when_existing_heartbeat_disappears() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("missing-startup-heartbeat.json"); + let mut tracker = StartupHeartbeatTracker { + last_seen_at: Some(UNIX_EPOCH + Duration::from_millis(5000)), + logged_fresh: false, + }; + + let result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + Duration::from_secs(1), + &mut tracker, + ); + assert_eq!( - next_startup_heartbeat_at(Some(UNIX_EPOCH + Duration::from_millis(5000)), None), - StartupHeartbeatObservation::Invalidated(UNIX_EPOCH + Duration::from_millis(5000)) + result, + Err( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." + .to_string() + ) ); } @@ -407,9 +381,20 @@ mod tests { } #[test] - fn heartbeat_updated_at_ms_to_system_time_matches_checked_add() { + fn read_startup_heartbeat_updated_at_handles_large_timestamp_without_panic() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + format!( + r#"{{"pid":42,"state":"starting","updated_at_ms":{}}}"#, + u64::MAX + ), + ) + .expect("write heartbeat file"); + assert_eq!( - heartbeat_updated_at_ms_to_system_time(u64::MAX), + read_startup_heartbeat_updated_at(&heartbeat_path, 42), UNIX_EPOCH.checked_add(Duration::from_millis(u64::MAX)) ); } From 47f00a098017d233684aca3744e9232a3cb2fc8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 14:22:25 +0900 Subject: [PATCH 09/13] refactor: clarify heartbeat helper responsibilities --- scripts/backend/templates/launch_backend.py | 44 +++++++++++++------- src-tauri/src/backend/readiness.rs | 46 ++++++++++++++------- 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index bfe6ba32..06eba7af 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -126,22 +126,29 @@ def resolve_startup_heartbeat_path() -> Path | None: return Path(raw) +def build_heartbeat_payload(state: str) -> dict[str, object]: + return { + "pid": os.getpid(), + "state": state, + "updated_at_ms": int(time.time() * 1000), + } + + +def atomic_write_json(path: Path, payload: dict[str, object]) -> None: + temp_path = path.with_name(f"{path.name}.tmp") + temp_path.write_text( + json.dumps(payload, separators=(",", ":")), + encoding="utf-8", + ) + temp_path.replace(path) + + def write_startup_heartbeat( path: Path, state: str, *, warn_on_error: bool = False ) -> bool: try: path.parent.mkdir(parents=True, exist_ok=True) - payload = { - "pid": os.getpid(), - "state": state, - "updated_at_ms": int(time.time() * 1000), - } - temp_path = path.with_name(f"{path.name}.tmp") - temp_path.write_text( - json.dumps(payload, separators=(",", ":")), - encoding="utf-8", - ) - temp_path.replace(path) + atomic_write_json(path, build_heartbeat_payload(state)) return True except Exception as exc: if warn_on_error: @@ -155,23 +162,28 @@ def write_startup_heartbeat( def heartbeat_loop( path: Path, interval_seconds: float, stop_event: threading.Event ) -> None: + # At least one successful write has happened. had_successful_write = False - warning_emitted = False + # A warning has already been emitted since the last successful write. + warning_emitted_since_last_success = False + + def should_warn() -> bool: + return (not had_successful_write) or (not warning_emitted_since_last_success) ok = write_startup_heartbeat(path, "starting", warn_on_error=True) if ok: had_successful_write = True else: - warning_emitted = True + warning_emitted_since_last_success = True while not stop_event.wait(interval_seconds): - warn_now = (not warning_emitted) or (not had_successful_write) + warn_now = should_warn() ok = write_startup_heartbeat(path, "starting", warn_on_error=warn_now) if ok: had_successful_write = True - warning_emitted = False + warning_emitted_since_last_success = False elif warn_now: - warning_emitted = True + warning_emitted_since_last_success = True def start_startup_heartbeat() -> None: diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 4914d6f7..eb0ec504 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -57,7 +57,8 @@ impl BackendState { } let now = SystemTime::now(); - let child_pid = self.child_pid_or_error()?; + self.ensure_child_alive()?; + let child_pid = self.child_pid()?; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { step_startup_heartbeat( @@ -112,13 +113,25 @@ impl BackendState { (http_status, tcp_reachable) } - fn child_pid_or_error(&self) -> Result { + fn child_pid(&self) -> Result { + let guard = self + .child + .lock() + .map_err(|_| "Backend process lock poisoned.".to_string())?; + + guard + .as_ref() + .map(|child| child.id()) + .ok_or_else(|| "Backend process is not running.".to_string()) + } + + fn ensure_child_alive(&self) -> Result<(), String> { let mut guard = self .child .lock() .map_err(|_| "Backend process lock poisoned.".to_string())?; + if let Some(child) = guard.as_mut() { - let pid = child.id(); match child.try_wait() { Ok(Some(status)) => { *guard = None; @@ -126,7 +139,7 @@ impl BackendState { "Backend process exited before becoming reachable: {status}" )) } - Ok(None) => Ok(pid), + Ok(None) => Ok(()), Err(error) => Err(format!("Failed to poll backend process status: {error}")), } } else { @@ -147,8 +160,8 @@ impl BackendState { .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); let startup_heartbeat_age_ms = last_startup_heartbeat_at - .and_then(|updated_at| now.duration_since(updated_at).ok()) - .map(|age| age.as_millis().to_string()) + .and_then(|updated_at| ms_since(updated_at, now)) + .map(|age| age.to_string()) .unwrap_or_else(|| "none".to_string()); append_desktop_log(&format!( "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}, startup_heartbeat_age_ms={}", @@ -208,8 +221,14 @@ fn startup_heartbeat_timestamp_is_fresh( max_age: Duration, ) -> bool { updated_at - .and_then(|updated_at| now.duration_since(updated_at).ok()) - .is_some_and(|age| age <= max_age) + .and_then(|updated_at| ms_since(updated_at, now)) + .is_some_and(|age_ms| age_ms <= max_age.as_millis()) +} + +fn ms_since(earlier: SystemTime, now: SystemTime) -> Option { + now.duration_since(earlier) + .ok() + .map(|duration| duration.as_millis()) } fn step_startup_heartbeat( @@ -224,10 +243,8 @@ fn step_startup_heartbeat( match (previous, current) { (Some(previous), None) => { - let heartbeat_age_ms = now - .duration_since(previous) - .ok() - .map(|age| age.as_millis().to_string()) + let heartbeat_age_ms = ms_since(previous, now) + .map(|age| age.to_string()) .unwrap_or_else(|| "unknown".to_string()); append_desktop_log(&format!( "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" @@ -237,10 +254,7 @@ fn step_startup_heartbeat( .to_string(), ) } - (None, None) => { - state.last_seen_at = None; - Ok(()) - } + (None, None) => Ok(()), (_, Some(current)) => { let updated_at = match previous { Some(previous) if current <= previous => previous, From e43de52069007aef9db4eb26737c32dac09656b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 14:33:39 +0900 Subject: [PATCH 10/13] docs: clarify startup heartbeat path coupling --- scripts/backend/templates/launch_backend.py | 4 ++++ src-tauri/src/app_constants.rs | 1 + src-tauri/src/backend/config.rs | 3 +++ 3 files changed, 8 insertions(+) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 06eba7af..0ad084af 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -13,6 +13,7 @@ BACKEND_DIR = Path(__file__).resolve().parent APP_DIR = BACKEND_DIR / "app" _WINDOWS_DLL_DIRECTORY_HANDLES: list[object] = [] +# Keep this in sync with BACKEND_STARTUP_HEARTBEAT_PATH_ENV in src-tauri/src/app_constants.rs. STARTUP_HEARTBEAT_ENV = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH" STARTUP_HEARTBEAT_INTERVAL_SECONDS = 2.0 @@ -168,6 +169,9 @@ def heartbeat_loop( warning_emitted_since_last_success = False def should_warn() -> bool: + # Before the first successful heartbeat we want every failure to surface so startup + # path/permission issues stay visible. After a success, only warn on the first failure in + # each consecutive failure run to avoid log spam. return (not had_successful_write) or (not warning_emitted_since_last_success) ok = write_startup_heartbeat(path, "starting", warn_on_error=True) diff --git a/src-tauri/src/app_constants.rs b/src-tauri/src/app_constants.rs index e42e2e29..e5f61b47 100644 --- a/src-tauri/src/app_constants.rs +++ b/src-tauri/src/app_constants.rs @@ -21,6 +21,7 @@ pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_START pub(crate) const DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS: u64 = 60 * 1000; pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS: u64 = 5_000; pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS: u64 = 15 * 60 * 1000; +// Keep this in sync with STARTUP_HEARTBEAT_ENV in scripts/backend/templates/launch_backend.py. pub(crate) const BACKEND_STARTUP_HEARTBEAT_PATH_ENV: &str = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH"; pub(crate) const DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH: &str = diff --git a/src-tauri/src/backend/config.rs b/src-tauri/src/backend/config.rs index f499723f..285dbad6 100644 --- a/src-tauri/src/backend/config.rs +++ b/src-tauri/src/backend/config.rs @@ -124,6 +124,9 @@ pub fn resolve_backend_startup_heartbeat_path( return None; } + // Prefer the launch plan's resolved root so spawn-time and readiness-time heartbeat paths + // stay aligned. Falling back to ASTRBOT_ROOT only helps older/custom call sites that do not + // pass a root dir; packaged launches may finally fall back to the default packaged root. if let Some(root) = root_dir { return Some(root.join(trimmed)); } From 4f4c5c9a43e11fdb6a3716f747b63bda51278860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 14:46:30 +0900 Subject: [PATCH 11/13] fix: harden startup heartbeat coordination --- scripts/backend/templates/launch_backend.py | 15 +++--- .../backend/templates/test_launch_backend.py | 4 ++ src-tauri/src/app_constants.rs | 1 + src-tauri/src/backend/config.rs | 2 +- src-tauri/src/backend/launch.rs | 2 +- src-tauri/src/backend/readiness.rs | 53 ++++++++++++------- src-tauri/src/desktop_state.rs | 2 +- src-tauri/src/launch_plan.rs | 8 +-- src-tauri/src/logging.rs | 4 +- src-tauri/src/update_channel.rs | 8 +-- 10 files changed, 60 insertions(+), 39 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 0ad084af..8693261f 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -16,6 +16,7 @@ # Keep this in sync with BACKEND_STARTUP_HEARTBEAT_PATH_ENV in src-tauri/src/app_constants.rs. STARTUP_HEARTBEAT_ENV = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH" STARTUP_HEARTBEAT_INTERVAL_SECONDS = 2.0 +STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS = 1.0 def configure_stdio_utf8() -> None: @@ -196,18 +197,20 @@ def start_startup_heartbeat() -> None: return stop_event = threading.Event() + thread = threading.Thread( + target=heartbeat_loop, + args=(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS, stop_event), + name="astrbot-startup-heartbeat", + daemon=True, + ) def on_exit() -> None: stop_event.set() + thread.join(timeout=STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS) write_startup_heartbeat(heartbeat_path, "stopping", warn_on_error=True) atexit.register(on_exit) - threading.Thread( - target=heartbeat_loop, - args=(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS, stop_event), - name="astrbot-startup-heartbeat", - daemon=True, - ).start() + thread.start() def main() -> None: diff --git a/scripts/backend/templates/test_launch_backend.py b/scripts/backend/templates/test_launch_backend.py index 00f55d74..4cdf1163 100644 --- a/scripts/backend/templates/test_launch_backend.py +++ b/scripts/backend/templates/test_launch_backend.py @@ -70,9 +70,13 @@ def test_stop_failure_still_warns_after_earlier_failure(self) -> None: launch_backend.atexit, "register", register ): launch_backend.start_startup_heartbeat() + thread.join.assert_not_called() on_exit = register.call_args.args[0] on_exit() + thread.join.assert_called_once_with( + timeout=launch_backend.STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS + ) self.assertEqual( [call.args[1] for call in write_mock.call_args_list], ["stopping"], diff --git a/src-tauri/src/app_constants.rs b/src-tauri/src/app_constants.rs index e5f61b47..e7de6dd9 100644 --- a/src-tauri/src/app_constants.rs +++ b/src-tauri/src/app_constants.rs @@ -1,6 +1,7 @@ use std::time::Duration; pub(crate) const DEFAULT_BACKEND_URL: &str = "http://127.0.0.1:6185/"; +pub(crate) const ASTRBOT_ROOT_ENV: &str = "ASTRBOT_ROOT"; pub(crate) const BACKEND_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_TIMEOUT_MS"; pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 15 * 60 * 1000; pub(crate) const GRACEFUL_RESTART_REQUEST_TIMEOUT_MS: u64 = 2_500; diff --git a/src-tauri/src/backend/config.rs b/src-tauri/src/backend/config.rs index 285dbad6..a2b5e995 100644 --- a/src-tauri/src/backend/config.rs +++ b/src-tauri/src/backend/config.rs @@ -131,7 +131,7 @@ pub fn resolve_backend_startup_heartbeat_path( return Some(root.join(trimmed)); } - if let Ok(root) = env::var("ASTRBOT_ROOT") { + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { let root = PathBuf::from(root.trim()); if !root.as_os_str().is_empty() { return Some(root.join(trimmed)); diff --git a/src-tauri/src/backend/launch.rs b/src-tauri/src/backend/launch.rs index a96e9449..b161b7bf 100644 --- a/src-tauri/src/backend/launch.rs +++ b/src-tauri/src/backend/launch.rs @@ -123,7 +123,7 @@ impl BackendState { } if let Some(root_dir) = &plan.root_dir { - command.env("ASTRBOT_ROOT", root_dir); + command.env(crate::ASTRBOT_ROOT_ENV, root_dir); } if let Some(heartbeat_path) = plan.startup_heartbeat_path.as_ref() { command.env(crate::BACKEND_STARTUP_HEARTBEAT_PATH_ENV, heartbeat_path); diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index eb0ec504..06d29dbe 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -57,8 +57,7 @@ impl BackendState { } let now = SystemTime::now(); - self.ensure_child_alive()?; - let child_pid = self.child_pid()?; + let child_pid = self.live_child_pid()?; if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { step_startup_heartbeat( @@ -113,25 +112,14 @@ impl BackendState { (http_status, tcp_reachable) } - fn child_pid(&self) -> Result { - let guard = self - .child - .lock() - .map_err(|_| "Backend process lock poisoned.".to_string())?; - - guard - .as_ref() - .map(|child| child.id()) - .ok_or_else(|| "Backend process is not running.".to_string()) - } - - fn ensure_child_alive(&self) -> Result<(), String> { + fn live_child_pid(&self) -> Result { let mut guard = self .child .lock() .map_err(|_| "Backend process lock poisoned.".to_string())?; if let Some(child) = guard.as_mut() { + let pid = child.id(); match child.try_wait() { Ok(Some(status)) => { *guard = None; @@ -139,7 +127,7 @@ impl BackendState { "Backend process exited before becoming reachable: {status}" )) } - Ok(None) => Ok(()), + Ok(None) => Ok(pid), Err(error) => Err(format!("Failed to poll backend process status: {error}")), } } else { @@ -159,10 +147,7 @@ impl BackendState { let last_http_status_text = last_http_status .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); - let startup_heartbeat_age_ms = last_startup_heartbeat_at - .and_then(|updated_at| ms_since(updated_at, now)) - .map(|age| age.to_string()) - .unwrap_or_else(|| "none".to_string()); + let startup_heartbeat_age_ms = describe_heartbeat_age(last_startup_heartbeat_at, now); append_desktop_log(&format!( "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}, startup_heartbeat_age_ms={}", timeout.as_millis(), @@ -231,6 +216,19 @@ fn ms_since(earlier: SystemTime, now: SystemTime) -> Option { .map(|duration| duration.as_millis()) } +fn describe_heartbeat_age( + last_startup_heartbeat_at: Option, + now: SystemTime, +) -> String { + match last_startup_heartbeat_at { + Some(updated_at) => match ms_since(updated_at, now) { + Some(age) => age.to_string(), + None => format!("future ({updated_at:?})"), + }, + None => "none".to_string(), + } +} + fn step_startup_heartbeat( heartbeat_path: &Path, child_pid: u32, @@ -412,4 +410,19 @@ mod tests { UNIX_EPOCH.checked_add(Duration::from_millis(u64::MAX)) ); } + + #[test] + fn describe_heartbeat_age_distinguishes_future_timestamp_from_missing() { + assert_eq!( + describe_heartbeat_age( + Some(UNIX_EPOCH + Duration::from_millis(6_000)), + UNIX_EPOCH + Duration::from_millis(5_500) + ), + format!("future ({:?})", UNIX_EPOCH + Duration::from_millis(6_000)) + ); + assert_eq!( + describe_heartbeat_age(None, UNIX_EPOCH + Duration::from_millis(5_500)), + "none" + ); + } } diff --git a/src-tauri/src/desktop_state.rs b/src-tauri/src/desktop_state.rs index c822c701..c7e011bf 100644 --- a/src-tauri/src/desktop_state.rs +++ b/src-tauri/src/desktop_state.rs @@ -5,7 +5,7 @@ use std::{ pub(crate) fn resolve_desktop_state_path(packaged_root_dir: Option<&Path>) -> Option { resolve_desktop_state_path_with_root( - env::var("ASTRBOT_ROOT").ok().as_deref(), + env::var(crate::ASTRBOT_ROOT_ENV).ok().as_deref(), packaged_root_dir, ) } diff --git a/src-tauri/src/launch_plan.rs b/src-tauri/src/launch_plan.rs index 21e1121c..8069ce98 100644 --- a/src-tauri/src/launch_plan.rs +++ b/src-tauri/src/launch_plan.rs @@ -40,7 +40,7 @@ pub fn resolve_custom_launch(custom_cmd: String) -> Result { .ok() .or_else(runtime_paths::detect_astrbot_source_root) .unwrap_or_else(runtime_paths::workspace_root_dir); - let root_dir = env::var("ASTRBOT_ROOT").ok().map(PathBuf::from); + let root_dir = env::var(crate::ASTRBOT_ROOT_ENV).ok().map(PathBuf::from); let webui_dir = env::var("ASTRBOT_WEBUI_DIR").ok().map(PathBuf::from); let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); @@ -122,7 +122,7 @@ where )); } - let root_dir = env::var("ASTRBOT_ROOT") + let root_dir = env::var(crate::ASTRBOT_ROOT_ENV) .map(PathBuf::from) .ok() .or_else(runtime_paths::default_packaged_root_dir); @@ -191,7 +191,7 @@ pub fn resolve_dev_launch() -> Result { args.push("--webui-dir".to_string()); args.push(path.to_string_lossy().to_string()); } - let root_dir = env::var("ASTRBOT_ROOT").ok().map(PathBuf::from); + let root_dir = env::var(crate::ASTRBOT_ROOT_ENV).ok().map(PathBuf::from); let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); Ok(LaunchPlan { @@ -247,7 +247,7 @@ mod tests { #[test] fn resolve_custom_launch_sets_startup_heartbeat_path_from_root_dir() { - let _root_guard = EnvVarGuard::set("ASTRBOT_ROOT", "/tmp/astrbot-root"); + let _root_guard = EnvVarGuard::set(crate::ASTRBOT_ROOT_ENV, "/tmp/astrbot-root"); let plan = resolve_custom_launch("python main.py".to_string()).expect("custom plan"); diff --git a/src-tauri/src/logging.rs b/src-tauri/src/logging.rs index 99df4ad5..c641cf48 100644 --- a/src-tauri/src/logging.rs +++ b/src-tauri/src/logging.rs @@ -145,7 +145,7 @@ pub fn resolve_desktop_log_path(packaged_root: Option, desktop_log_file } } - if let Ok(root) = env::var("ASTRBOT_ROOT") { + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { let root = PathBuf::from(root.trim()); if !root.as_os_str().is_empty() { return root.join("logs").join(desktop_log_file); @@ -169,7 +169,7 @@ pub fn resolve_backend_log_path( if let Some(root) = root_dir { return root.join("logs").join("backend.log"); } - if let Ok(root) = env::var("ASTRBOT_ROOT") { + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { let path = PathBuf::from(root.trim()); if !path.as_os_str().is_empty() { return path.join("logs").join("backend.log"); diff --git a/src-tauri/src/update_channel.rs b/src-tauri/src/update_channel.rs index 90b1d28c..6042b4f5 100644 --- a/src-tauri/src/update_channel.rs +++ b/src-tauri/src/update_channel.rs @@ -625,7 +625,7 @@ mod tests { #[test] fn write_cached_channel_errors_when_state_path_unavailable() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let result = write_cached_update_channel(Some(UpdateChannel::Nightly), None); @@ -637,7 +637,7 @@ mod tests { #[test] fn read_cached_channel_round_trips_written_value() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("round-trip"); write_cached_update_channel(Some(UpdateChannel::Nightly), Some(&dir)) .expect("write cached channel"); @@ -652,7 +652,7 @@ mod tests { #[test] fn write_cached_channel_preserves_unrelated_state_fields() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("preserve-fields"); let state_path = dir.join("data").join("desktop_state.json"); fs::create_dir_all(state_path.parent().expect("state dir")).expect("create state dir"); @@ -693,7 +693,7 @@ mod tests { #[test] fn resolve_preferred_channel_falls_back_to_installed_version_channel() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("fallback"); assert_eq!( From 15f582b92dcb1656eac8b18fa5cc997d1b97e215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 15:02:39 +0900 Subject: [PATCH 12/13] fix: make startup heartbeat checks monotonic --- src-tauri/src/backend/readiness.rs | 130 ++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 40 deletions(-) diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 06d29dbe..432dda2f 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -55,7 +55,8 @@ impl BackendState { if matches!(http_status, Some(status_code) if (200..400).contains(&status_code)) { return Ok(()); } - let now = SystemTime::now(); + let wall_now = SystemTime::now(); + let monotonic_now = Instant::now(); let child_pid = self.live_child_pid()?; @@ -63,7 +64,8 @@ impl BackendState { step_startup_heartbeat( heartbeat_path, child_pid, - now, + wall_now, + monotonic_now, startup_idle_timeout, &mut startup_heartbeat_state, )?; @@ -84,7 +86,7 @@ impl BackendState { self.log_backend_readiness_timeout( limit, &readiness, - now, + wall_now, http_status, ever_tcp_reachable, startup_heartbeat_state.last_seen_at, @@ -179,6 +181,8 @@ enum StartupHeartbeatState { #[derive(Debug, Clone, Copy)] struct StartupHeartbeatTracker { last_seen_at: Option, + last_progress_at: Option, + consecutive_invalid_reads: u8, logged_fresh: bool, } @@ -186,11 +190,15 @@ impl StartupHeartbeatTracker { fn new() -> Self { Self { last_seen_at: None, + last_progress_at: None, + consecutive_invalid_reads: 0, logged_fresh: false, } } } +const STARTUP_HEARTBEAT_INVALID_READ_THRESHOLD: u8 = 2; + fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { let payload = fs::read_to_string(path).ok()?; let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; @@ -200,14 +208,12 @@ fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option, - now: SystemTime, +fn startup_heartbeat_progress_is_fresh( + last_progress_at: Option, + now: Instant, max_age: Duration, ) -> bool { - updated_at - .and_then(|updated_at| ms_since(updated_at, now)) - .is_some_and(|age_ms| age_ms <= max_age.as_millis()) + last_progress_at.is_some_and(|updated_at| now.duration_since(updated_at) <= max_age) } fn ms_since(earlier: SystemTime, now: SystemTime) -> Option { @@ -232,7 +238,8 @@ fn describe_heartbeat_age( fn step_startup_heartbeat( heartbeat_path: &Path, child_pid: u32, - now: SystemTime, + wall_now: SystemTime, + monotonic_now: Instant, idle_timeout: Duration, state: &mut StartupHeartbeatTracker, ) -> Result<(), String> { @@ -241,9 +248,12 @@ fn step_startup_heartbeat( match (previous, current) { (Some(previous), None) => { - let heartbeat_age_ms = ms_since(previous, now) - .map(|age| age.to_string()) - .unwrap_or_else(|| "unknown".to_string()); + state.consecutive_invalid_reads = state.consecutive_invalid_reads.saturating_add(1); + if state.consecutive_invalid_reads < STARTUP_HEARTBEAT_INVALID_READ_THRESHOLD { + return Ok(()); + } + + let heartbeat_age_ms = describe_heartbeat_age(Some(previous), wall_now); append_desktop_log(&format!( "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" )); @@ -252,14 +262,30 @@ fn step_startup_heartbeat( .to_string(), ) } - (None, None) => Ok(()), + (None, None) => { + state.consecutive_invalid_reads = 0; + Ok(()) + } (_, Some(current)) => { + state.consecutive_invalid_reads = 0; let updated_at = match previous { Some(previous) if current <= previous => previous, _ => current, }; state.last_seen_at = Some(updated_at); - if startup_heartbeat_timestamp_is_fresh(state.last_seen_at, now, idle_timeout) { + + if previous.is_none() + || Some(updated_at) != previous + || state.last_progress_at.is_none() + { + state.last_progress_at = Some(monotonic_now); + } + + if startup_heartbeat_progress_is_fresh( + state.last_progress_at, + monotonic_now, + idle_timeout, + ) { if !state.logged_fresh { append_desktop_log( "backend startup heartbeat is fresh while HTTP dashboard is not ready yet; waiting", @@ -282,14 +308,14 @@ fn step_startup_heartbeat( #[cfg(test)] mod tests { - use std::time::{Duration, SystemTime, UNIX_EPOCH}; + use std::time::{Duration, Instant, UNIX_EPOCH}; use tempfile::TempDir; use super::*; #[test] - fn startup_heartbeat_is_fresh_for_recent_timestamp() { + fn startup_heartbeat_progress_is_fresh_for_recent_instant() { let temp_dir = TempDir::new().expect("create temp dir"); let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); std::fs::write( @@ -298,18 +324,15 @@ mod tests { ) .expect("write heartbeat file"); - let updated_at = - read_startup_heartbeat_updated_at(&heartbeat_path, 42).expect("heartbeat timestamp"); - - assert!(startup_heartbeat_timestamp_is_fresh( - Some(updated_at), - UNIX_EPOCH + Duration::from_millis(5500), + assert!(startup_heartbeat_progress_is_fresh( + Some(Instant::now()), + Instant::now() + Duration::from_millis(500), Duration::from_secs(1), )); } #[test] - fn startup_heartbeat_is_not_fresh_for_stale_timestamp() { + fn startup_heartbeat_progress_is_not_fresh_when_stale() { let temp_dir = TempDir::new().expect("create temp dir"); let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); std::fs::write( @@ -318,12 +341,9 @@ mod tests { ) .expect("write heartbeat file"); - let updated_at = - read_startup_heartbeat_updated_at(&heartbeat_path, 42).expect("heartbeat timestamp"); - - assert!(!startup_heartbeat_timestamp_is_fresh( - Some(updated_at), - SystemTime::UNIX_EPOCH + Duration::from_millis(5000), + assert!(!startup_heartbeat_progress_is_fresh( + Some(Instant::now()), + Instant::now() + Duration::from_millis(1500), Duration::from_secs(1), )); } @@ -341,32 +361,37 @@ mod tests { assert_eq!(read_startup_heartbeat_updated_at(&heartbeat_path, 42), None); } - #[test] - fn startup_heartbeat_is_not_fresh_for_future_timestamp() { - assert!(!startup_heartbeat_timestamp_is_fresh( - Some(UNIX_EPOCH + Duration::from_millis(6000)), - UNIX_EPOCH + Duration::from_millis(5500), - Duration::from_secs(1), - )); - } - #[test] fn step_startup_heartbeat_fails_when_existing_heartbeat_disappears() { let temp_dir = TempDir::new().expect("create temp dir"); let heartbeat_path = temp_dir.path().join("missing-startup-heartbeat.json"); + let monotonic_now = Instant::now(); let mut tracker = StartupHeartbeatTracker { last_seen_at: Some(UNIX_EPOCH + Duration::from_millis(5000)), + last_progress_at: Some(monotonic_now), + consecutive_invalid_reads: 0, logged_fresh: false, }; - let result = step_startup_heartbeat( + let first_result = step_startup_heartbeat( &heartbeat_path, 42, UNIX_EPOCH + Duration::from_millis(5500), + monotonic_now, Duration::from_secs(1), &mut tracker, ); + let result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5600), + monotonic_now + Duration::from_millis(100), + Duration::from_secs(1), + &mut tracker, + ); + + assert_eq!(first_result, Ok(())); assert_eq!( result, Err( @@ -376,6 +401,31 @@ mod tests { ); } + #[test] + fn step_startup_heartbeat_tolerates_single_missing_read_after_valid_heartbeat() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("missing-startup-heartbeat.json"); + let monotonic_now = Instant::now(); + let mut tracker = StartupHeartbeatTracker { + last_seen_at: Some(UNIX_EPOCH + Duration::from_millis(5000)), + last_progress_at: Some(monotonic_now), + consecutive_invalid_reads: 0, + logged_fresh: false, + }; + + let result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + monotonic_now, + Duration::from_secs(1), + &mut tracker, + ); + + assert_eq!(result, Ok(())); + assert_eq!(tracker.consecutive_invalid_reads, 1); + } + #[test] fn startup_heartbeat_file_rejects_unknown_state() { assert!(serde_json::from_str::( From 46e2f1380c6953385df006a611a79481881d8f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Fri, 10 Apr 2026 15:16:05 +0900 Subject: [PATCH 13/13] fix: clean up heartbeat test and exit handling --- scripts/backend/templates/launch_backend.py | 11 ++++- .../backend/templates/test_launch_backend.py | 45 +++++++++++++++++++ src-tauri/src/backend/readiness.rs | 16 ------- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index 8693261f..72a48f23 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -142,7 +142,14 @@ def atomic_write_json(path: Path, payload: dict[str, object]) -> None: json.dumps(payload, separators=(",", ":")), encoding="utf-8", ) - temp_path.replace(path) + try: + temp_path.replace(path) + except Exception: + try: + temp_path.unlink(missing_ok=True) + except Exception: + pass + raise def write_startup_heartbeat( @@ -209,8 +216,8 @@ def on_exit() -> None: thread.join(timeout=STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS) write_startup_heartbeat(heartbeat_path, "stopping", warn_on_error=True) - atexit.register(on_exit) thread.start() + atexit.register(on_exit) def main() -> None: diff --git a/scripts/backend/templates/test_launch_backend.py b/scripts/backend/templates/test_launch_backend.py index 4cdf1163..0fc501a8 100644 --- a/scripts/backend/templates/test_launch_backend.py +++ b/scripts/backend/templates/test_launch_backend.py @@ -1,4 +1,5 @@ import importlib.util +import tempfile import unittest from pathlib import Path from unittest import mock @@ -13,6 +14,25 @@ class StartupHeartbeatTests(unittest.TestCase): + def test_atomic_write_json_cleans_up_temp_file_when_replace_fails(self) -> None: + with tempfile.TemporaryDirectory() as temp_dir: + heartbeat_path = Path(temp_dir) / "heartbeat.json" + temp_path = heartbeat_path.with_name(f"{heartbeat_path.name}.tmp") + + with mock.patch.object( + Path, + "replace", + autospec=True, + side_effect=OSError("replace failed"), + ): + with self.assertRaises(OSError): + launch_backend.atomic_write_json( + heartbeat_path, + {"pid": 42, "state": "starting", "updated_at_ms": 5000}, + ) + + self.assertFalse(temp_path.exists()) + def test_repeated_failures_warn_before_first_success(self) -> None: stop_event = mock.Mock() stop_event.wait.side_effect = [False, True] @@ -86,6 +106,31 @@ def test_stop_failure_still_warns_after_earlier_failure(self) -> None: [True], ) + def test_start_startup_heartbeat_does_not_register_exit_handler_when_thread_start_fails( + self, + ) -> None: + stop_event = mock.Mock() + thread = mock.Mock() + thread.start.side_effect = RuntimeError("thread start failed") + register = mock.Mock() + + with mock.patch.object( + launch_backend, + "resolve_startup_heartbeat_path", + return_value=Path("/tmp/heartbeat.json"), + ): + with mock.patch.object( + launch_backend.threading, "Event", return_value=stop_event + ): + with mock.patch.object( + launch_backend.threading, "Thread", return_value=thread + ): + with mock.patch.object(launch_backend.atexit, "register", register): + with self.assertRaises(RuntimeError): + launch_backend.start_startup_heartbeat() + + register.assert_not_called() + if __name__ == "__main__": unittest.main() diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 432dda2f..dc49725f 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -316,14 +316,6 @@ mod tests { #[test] fn startup_heartbeat_progress_is_fresh_for_recent_instant() { - let temp_dir = TempDir::new().expect("create temp dir"); - let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); - std::fs::write( - &heartbeat_path, - r#"{"pid":42,"state":"starting","updated_at_ms":5000}"#, - ) - .expect("write heartbeat file"); - assert!(startup_heartbeat_progress_is_fresh( Some(Instant::now()), Instant::now() + Duration::from_millis(500), @@ -333,14 +325,6 @@ mod tests { #[test] fn startup_heartbeat_progress_is_not_fresh_when_stale() { - let temp_dir = TempDir::new().expect("create temp dir"); - let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); - std::fs::write( - &heartbeat_path, - r#"{"pid":42,"state":"starting","updated_at_ms":1000}"#, - ) - .expect("write heartbeat file"); - assert!(!startup_heartbeat_progress_is_fresh( Some(Instant::now()), Instant::now() + Duration::from_millis(1500),