diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 97738dd9..cbea4c6a 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -9,7 +9,8 @@ | --- | --- | --- | | `ASTRBOT_BACKEND_URL` | 后端基础 URL | 默认 `http://127.0.0.1:6185/` | | `ASTRBOT_BACKEND_AUTO_START` | 是否自动拉起后端 | 默认 `1`(启用) | -| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `300000` | +| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `900000` | +| `ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS` | 后端启动 heartbeat 空闲超时 | 默认 `60000`,范围 `5000~900000` | | `ASTRBOT_BACKEND_READY_HTTP_PATH` | 就绪探针 HTTP 路径 | 默认 `/api/stat/start-time` | | `ASTRBOT_BACKEND_READY_PROBE_TIMEOUT_MS` | 就绪探针单次超时 | 默认回退到 `ASTRBOT_BACKEND_PING_TIMEOUT_MS` | | `ASTRBOT_BACKEND_READY_POLL_INTERVAL_MS` | 就绪轮询间隔 | 默认 `300`,并按边界 clamp | @@ -53,6 +54,7 @@ | 变量 | 用途 | 默认值/行为 | | --- | --- | --- | | `ASTRBOT_DESKTOP_CLIENT` | 标记桌面客户端环境 | 打包态启动后端时写入 `1` | +| `ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH` | 桌面端写给后端启动器的 heartbeat 文件路径 | 打包态默认写到 `ASTRBOT_ROOT/data/backend-startup-heartbeat.json` | ## 4. 发布/CI(GitHub Actions) diff --git a/scripts/backend/templates/launch_backend.py b/scripts/backend/templates/launch_backend.py index ce31c4f0..72a48f23 100644 --- a/scripts/backend/templates/launch_backend.py +++ b/scripts/backend/templates/launch_backend.py @@ -1,14 +1,22 @@ from __future__ import annotations +import atexit import ctypes +import json import os import runpy import sys +import threading +import time from pathlib import Path BACKEND_DIR = Path(__file__).resolve().parent APP_DIR = BACKEND_DIR / "app" _WINDOWS_DLL_DIRECTORY_HANDLES: list[object] = [] +# Keep this in sync with BACKEND_STARTUP_HEARTBEAT_PATH_ENV in src-tauri/src/app_constants.rs. 
STARTUP_HEARTBEAT_ENV = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH"
# Seconds between two "starting" heartbeat writes.
STARTUP_HEARTBEAT_INTERVAL_SECONDS = 2.0
# Upper bound on how long the exit hook waits for the heartbeat thread to finish.
STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS = 1.0


def resolve_startup_heartbeat_path() -> Path | None:
    """Return the heartbeat file path taken from the environment.

    Returns:
        The path named by ``STARTUP_HEARTBEAT_ENV`` with surrounding whitespace
        removed, or ``None`` when the variable is unset or blank (heartbeat
        disabled).
    """
    raw = os.environ.get(STARTUP_HEARTBEAT_ENV, "").strip()
    if not raw:
        return None
    return Path(raw)


def build_heartbeat_payload(state: str) -> dict[str, object]:
    """Build the JSON-serializable heartbeat record for *state*.

    The record holds exactly three keys: the current process id, the given
    state string and the current wall-clock time in milliseconds since the
    Unix epoch.
    """
    return {
        "pid": os.getpid(),
        "state": state,
        "updated_at_ms": int(time.time() * 1000),
    }


def atomic_write_json(path: Path, payload: dict[str, object]) -> None:
    """Write *payload* to *path* as compact JSON via a temp file and a rename.

    The staging file lives next to *path* and is named after the current
    process and thread, so the heartbeat thread and the exit hook can never
    truncate or rename each other's half-written file.

    Raises:
        TypeError, ValueError: *payload* is not JSON-serializable (nothing is
            written in that case).
        OSError: writing or renaming failed; the staging file is removed
            before the error propagates.
    """
    # Serialize first so an unserializable payload never creates a file.
    serialized = json.dumps(payload, separators=(",", ":"))
    temp_path = path.with_name(
        f"{path.name}.{os.getpid()}.{threading.get_ident()}.tmp"
    )
    try:
        temp_path.write_text(serialized, encoding="utf-8")
        temp_path.replace(path)
    except Exception:
        # Covers both a failed/partial write and a failed rename: never leave
        # the staging file behind. Cleanup is best-effort; the original error
        # is the one callers need to see.
        try:
            temp_path.unlink(missing_ok=True)
        except OSError:
            pass
        raise


def write_startup_heartbeat(
    path: Path, state: str, *, warn_on_error: bool = False
) -> bool:
    """Best-effort write of one heartbeat record; never raises.

    Args:
        path: Heartbeat file to (re)write; missing parent directories are created.
        state: State string stored in the record.
        warn_on_error: When true, a failure is reported on stderr.

    Returns:
        ``True`` if the record was written, ``False`` otherwise.
    """
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        atomic_write_json(path, build_heartbeat_payload(state))
        return True
    except Exception as exc:
        # Deliberately broad: a heartbeat problem must not take the backend down.
        if warn_on_error:
            print(
                f"[startup-heartbeat] failed to write heartbeat to {path}: {exc.__class__.__name__}: {exc}",
                file=sys.stderr,
            )
        return False


def heartbeat_loop(
    path: Path, interval_seconds: float, stop_event: threading.Event
) -> None:
    """Write a "starting" heartbeat immediately and then once per interval.

    The loop ends as soon as ``stop_event.wait(interval_seconds)`` returns true.

    Warning policy: until the first successful write every failure is
    reported, so path/permission problems stay visible during startup. After a
    success only the first failure of each consecutive failure run is
    reported, to avoid log spam.
    """
    had_successful_write = False
    warning_emitted_since_last_success = False

    while True:
        warn_now = (not had_successful_write) or (
            not warning_emitted_since_last_success
        )
        if write_startup_heartbeat(path, "starting", warn_on_error=warn_now):
            had_successful_write = True
            warning_emitted_since_last_success = False
        elif warn_now:
            warning_emitted_since_last_success = True

        if stop_event.wait(interval_seconds):
            return


def start_startup_heartbeat() -> None:
    """Start the daemon heartbeat thread and register its exit hook.

    Does nothing when no heartbeat path is configured. An error raised by
    ``Thread.start`` propagates, and in that case no exit hook is registered.
    """
    heartbeat_path = resolve_startup_heartbeat_path()
    if heartbeat_path is None:
        return

    stop_event = threading.Event()
    thread = threading.Thread(
        target=heartbeat_loop,
        args=(heartbeat_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS, stop_event),
        name="astrbot-startup-heartbeat",
        daemon=True,
    )

    def on_exit() -> None:
        # Stop the writer before publishing the final state. The join is
        # bounded so a stuck write cannot hang interpreter shutdown; if it
        # does time out, the worker may still publish one last "starting"
        # record after this one.
        stop_event.set()
        thread.join(timeout=STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS)
        write_startup_heartbeat(heartbeat_path, "stopping", warn_on_error=True)

    # Start first: registering the hook only after a successful start keeps a
    # failed start from leaving a hook that joins a never-started thread.
    thread.start()
    atexit.register(on_exit)


def main() -> None:
    """Prepare the process environment, start the heartbeat and run the backend.

    Raises:
        FileNotFoundError: ``APP_DIR / "main.py"`` does not exist.
    """
    configure_stdio_utf8()
    configure_windows_dll_search_path()
    preload_windows_runtime_dlls()
    start_startup_heartbeat()

    sys.path.insert(0, str(APP_DIR))

    main_file = APP_DIR / "main.py"
    if not main_file.is_file():
        raise FileNotFoundError(f"Backend entrypoint not found: {main_file}")

    # Make the backend see itself as the launched script.
    sys.argv[0] = str(main_file)
    runpy.run_path(str(main_file), run_name="__main__")


if __name__ == "__main__":
    main()
mode 100644 index 00000000..0fc501a8 --- /dev/null +++ b/scripts/backend/templates/test_launch_backend.py @@ -0,0 +1,136 @@ +import importlib.util +import tempfile +import unittest +from pathlib import Path +from unittest import mock + + +MODULE_PATH = Path(__file__).with_name("launch_backend.py") +SPEC = importlib.util.spec_from_file_location("launch_backend_under_test", MODULE_PATH) +if SPEC is None or SPEC.loader is None: + raise RuntimeError(f"Cannot load launch_backend module from {MODULE_PATH}") +launch_backend = importlib.util.module_from_spec(SPEC) +SPEC.loader.exec_module(launch_backend) + + +class StartupHeartbeatTests(unittest.TestCase): + def test_atomic_write_json_cleans_up_temp_file_when_replace_fails(self) -> None: + with tempfile.TemporaryDirectory() as temp_dir: + heartbeat_path = Path(temp_dir) / "heartbeat.json" + temp_path = heartbeat_path.with_name(f"{heartbeat_path.name}.tmp") + + with mock.patch.object( + Path, + "replace", + autospec=True, + side_effect=OSError("replace failed"), + ): + with self.assertRaises(OSError): + launch_backend.atomic_write_json( + heartbeat_path, + {"pid": 42, "state": "starting", "updated_at_ms": 5000}, + ) + + self.assertFalse(temp_path.exists()) + + def test_repeated_failures_warn_before_first_success(self) -> None: + stop_event = mock.Mock() + stop_event.wait.side_effect = [False, True] + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + side_effect=[False, False], + ) as write_mock: + launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event) + + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True, True], + ) + + def test_repeated_failures_after_success_are_suppressed(self) -> None: + stop_event = mock.Mock() + stop_event.wait.side_effect = [False, False, True] + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + side_effect=[True, False, False], + ) as write_mock: + 
launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event) + + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True, True, False], + ) + + def test_stop_failure_still_warns_after_earlier_failure(self) -> None: + stop_event = mock.Mock() + thread = mock.Mock() + register = mock.Mock() + + with mock.patch.object( + launch_backend, + "write_startup_heartbeat", + return_value=False, + ) as write_mock: + with mock.patch.object( + launch_backend, + "resolve_startup_heartbeat_path", + return_value=Path("/tmp/heartbeat.json"), + ): + with mock.patch.object( + launch_backend.threading, "Event", return_value=stop_event + ): + with mock.patch.object( + launch_backend.threading, "Thread", return_value=thread + ): + with mock.patch.object( + launch_backend.atexit, "register", register + ): + launch_backend.start_startup_heartbeat() + thread.join.assert_not_called() + on_exit = register.call_args.args[0] + on_exit() + + thread.join.assert_called_once_with( + timeout=launch_backend.STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS + ) + self.assertEqual( + [call.args[1] for call in write_mock.call_args_list], + ["stopping"], + ) + self.assertEqual( + [call.kwargs["warn_on_error"] for call in write_mock.call_args_list], + [True], + ) + + def test_start_startup_heartbeat_does_not_register_exit_handler_when_thread_start_fails( + self, + ) -> None: + stop_event = mock.Mock() + thread = mock.Mock() + thread.start.side_effect = RuntimeError("thread start failed") + register = mock.Mock() + + with mock.patch.object( + launch_backend, + "resolve_startup_heartbeat_path", + return_value=Path("/tmp/heartbeat.json"), + ): + with mock.patch.object( + launch_backend.threading, "Event", return_value=stop_event + ): + with mock.patch.object( + launch_backend.threading, "Thread", return_value=thread + ): + with mock.patch.object(launch_backend.atexit, "register", register): + with self.assertRaises(RuntimeError): + 
launch_backend.start_startup_heartbeat() + + register.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/src-tauri/src/app_constants.rs b/src-tauri/src/app_constants.rs index 4959f299..e7de6dd9 100644 --- a/src-tauri/src/app_constants.rs +++ b/src-tauri/src/app_constants.rs @@ -1,8 +1,9 @@ use std::time::Duration; pub(crate) const DEFAULT_BACKEND_URL: &str = "http://127.0.0.1:6185/"; +pub(crate) const ASTRBOT_ROOT_ENV: &str = "ASTRBOT_ROOT"; pub(crate) const BACKEND_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_TIMEOUT_MS"; -pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 5 * 60 * 1000; +pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 15 * 60 * 1000; pub(crate) const GRACEFUL_RESTART_REQUEST_TIMEOUT_MS: u64 = 2_500; pub(crate) const GRACEFUL_RESTART_START_TIME_TIMEOUT_MS: u64 = 1_800; pub(crate) const GRACEFUL_RESTART_POLL_INTERVAL_MS: u64 = 350; @@ -17,6 +18,15 @@ pub(crate) const BACKEND_READY_PROBE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_READY_ pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MIN_MS: u64 = 100; pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MAX_MS: u64 = 30_000; pub(crate) const BACKEND_READY_TCP_PROBE_TIMEOUT_MAX_MS: u64 = 1_000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS"; +pub(crate) const DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS: u64 = 60 * 1000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS: u64 = 5_000; +pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS: u64 = 15 * 60 * 1000; +// Keep this in sync with STARTUP_HEARTBEAT_ENV in scripts/backend/templates/launch_backend.py. 
+pub(crate) const BACKEND_STARTUP_HEARTBEAT_PATH_ENV: &str = + "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH"; +pub(crate) const DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH: &str = + "data/backend-startup-heartbeat.json"; pub(crate) const DEFAULT_BACKEND_PING_TIMEOUT_MS: u64 = 800; pub(crate) const BACKEND_PING_TIMEOUT_MIN_MS: u64 = 50; pub(crate) const BACKEND_PING_TIMEOUT_MAX_MS: u64 = 30_000; diff --git a/src-tauri/src/app_helpers.rs b/src-tauri/src/app_helpers.rs index 6d85071e..11e37307 100644 --- a/src-tauri/src/app_helpers.rs +++ b/src-tauri/src/app_helpers.rs @@ -79,6 +79,7 @@ mod tests { cwd: PathBuf::from("."), root_dir: None, webui_dir: None, + startup_heartbeat_path: None, packaged_mode: false, }; diff --git a/src-tauri/src/app_types.rs b/src-tauri/src/app_types.rs index 53e0bbf7..aea509a3 100644 --- a/src-tauri/src/app_types.rs +++ b/src-tauri/src/app_types.rs @@ -33,6 +33,7 @@ pub(crate) struct LaunchPlan { pub(crate) cwd: PathBuf, pub(crate) root_dir: Option, pub(crate) webui_dir: Option, + pub(crate) startup_heartbeat_path: Option, pub(crate) packaged_mode: bool, } diff --git a/src-tauri/src/backend/config.rs b/src-tauri/src/backend/config.rs index c58676a2..a2b5e995 100644 --- a/src-tauri/src/backend/config.rs +++ b/src-tauri/src/backend/config.rs @@ -1,4 +1,5 @@ use std::env; +use std::path::{Path, PathBuf}; use std::time::Duration; use url::Url; @@ -7,6 +8,8 @@ pub struct BackendReadinessConfig { pub path: String, pub probe_timeout_ms: u64, pub poll_interval_ms: u64, + pub startup_idle_timeout_ms: u64, + pub startup_heartbeat_path: Option, } pub fn resolve_backend_ready_http_path(env_name: &str, default_path: &str, mut log: F) -> String @@ -97,6 +100,47 @@ where parse_clamped_timeout_env(raw, env_name, fallback_ms, min_ms, max_ms, log) } +pub fn resolve_backend_startup_idle_timeout_ms( + raw: &str, + env_name: &str, + fallback_ms: u64, + min_ms: u64, + max_ms: u64, + log: F, +) -> u64 +where + F: FnMut(String), +{ + parse_clamped_timeout_env(raw, 
env_name, fallback_ms, min_ms, max_ms, log) +} + +pub fn resolve_backend_startup_heartbeat_path( + root_dir: Option<&Path>, + packaged_root: Option, + relative_path: &str, +) -> Option { + let trimmed = relative_path.trim(); + if trimmed.is_empty() { + return None; + } + + // Prefer the launch plan's resolved root so spawn-time and readiness-time heartbeat paths + // stay aligned. Falling back to ASTRBOT_ROOT only helps older/custom call sites that do not + // pass a root dir; packaged launches may finally fall back to the default packaged root. + if let Some(root) = root_dir { + return Some(root.join(trimmed)); + } + + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { + let root = PathBuf::from(root.trim()); + if !root.as_os_str().is_empty() { + return Some(root.join(trimmed)); + } + } + + packaged_root.map(|root| root.join(trimmed)) +} + #[allow(clippy::too_many_arguments)] pub fn resolve_backend_readiness_config( ready_http_path_env: &str, @@ -221,6 +265,8 @@ where path, probe_timeout_ms, poll_interval_ms, + startup_idle_timeout_ms: 0, + startup_heartbeat_path: None, } } @@ -260,6 +306,47 @@ mod tests { assert_eq!(value, 3_000); } + #[test] + fn resolve_backend_startup_idle_timeout_clamps_large_value() { + let value = resolve_backend_startup_idle_timeout_ms( + "999999", + "TEST_STARTUP_IDLE_TIMEOUT_ENV", + 60_000, + 5_000, + 300_000, + |_| {}, + ); + assert_eq!(value, 300_000); + } + + #[test] + fn resolve_backend_startup_idle_timeout_clamps_small_value() { + let value = resolve_backend_startup_idle_timeout_ms( + "1000", + "TEST_STARTUP_IDLE_TIMEOUT_ENV", + 60_000, + 5_000, + 300_000, + |_| {}, + ); + assert_eq!(value, 5_000); + } + + #[test] + fn resolve_backend_startup_heartbeat_path_prefers_root_dir() { + let path = resolve_backend_startup_heartbeat_path( + Some(Path::new("/tmp/astrbot-root")), + Some(PathBuf::from("/tmp/packaged-root")), + "data/backend-startup-heartbeat.json", + ) + .expect("expected heartbeat path"); + + assert_eq!( + path, + 
PathBuf::from("/tmp/astrbot-root").join("data/backend-startup-heartbeat.json") + ); + } + #[test] fn resolve_backend_timeout_uses_packaged_fallback_when_zero() { let timeout = resolve_backend_timeout_ms(true, "TEST_TIMEOUT_ENV_MISSING", 20_000, 300_000); diff --git a/src-tauri/src/backend/launch.rs b/src-tauri/src/backend/launch.rs index ac91d7af..b161b7bf 100644 --- a/src-tauri/src/backend/launch.rs +++ b/src-tauri/src/backend/launch.rs @@ -123,7 +123,10 @@ impl BackendState { } if let Some(root_dir) = &plan.root_dir { - command.env("ASTRBOT_ROOT", root_dir); + command.env(crate::ASTRBOT_ROOT_ENV, root_dir); + } + if let Some(heartbeat_path) = plan.startup_heartbeat_path.as_ref() { + command.env(crate::BACKEND_STARTUP_HEARTBEAT_PATH_ENV, heartbeat_path); } if let Some(webui_dir) = &plan.webui_dir { command.env("ASTRBOT_WEBUI_DIR", webui_dir); diff --git a/src-tauri/src/backend/readiness.rs b/src-tauri/src/backend/readiness.rs index 7eb3793d..dc49725f 100644 --- a/src-tauri/src/backend/readiness.rs +++ b/src-tauri/src/backend/readiness.rs @@ -1,6 +1,8 @@ use std::{ - env, thread, - time::{Duration, Instant}, + env, fs, + path::Path, + thread, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, }; use tauri::AppHandle; @@ -40,10 +42,12 @@ impl BackendState { 20_000, PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS, ); - let readiness = backend::runtime::backend_readiness_config(append_desktop_log); + let readiness = backend::runtime::backend_readiness_config(plan, append_desktop_log); + let startup_idle_timeout = Duration::from_millis(readiness.startup_idle_timeout_ms); let start_time = Instant::now(); let mut tcp_ready_logged = false; let mut ever_tcp_reachable = false; + let mut startup_heartbeat_state = StartupHeartbeatTracker::new(); loop { let (http_status, tcp_reachable) = @@ -51,6 +55,21 @@ impl BackendState { if matches!(http_status, Some(status_code) if (200..400).contains(&status_code)) { return Ok(()); } + let wall_now = SystemTime::now(); + let monotonic_now = 
Instant::now(); + + let child_pid = self.live_child_pid()?; + + if let Some(heartbeat_path) = readiness.startup_heartbeat_path.as_deref() { + step_startup_heartbeat( + heartbeat_path, + child_pid, + wall_now, + monotonic_now, + startup_idle_timeout, + &mut startup_heartbeat_state, + )?; + } if tcp_reachable { ever_tcp_reachable = true; @@ -62,37 +81,15 @@ impl BackendState { } } - { - let mut guard = self - .child - .lock() - .map_err(|_| "Backend process lock poisoned.".to_string())?; - if let Some(child) = guard.as_mut() { - match child.try_wait() { - Ok(Some(status)) => { - *guard = None; - return Err(format!( - "Backend process exited before becoming reachable: {status}" - )); - } - Ok(None) => {} - Err(error) => { - return Err(format!("Failed to poll backend process status: {error}")); - } - } - } else { - return Err("Backend process is not running.".to_string()); - } - } - if let Some(limit) = timeout_ms { if start_time.elapsed() >= limit { self.log_backend_readiness_timeout( limit, - &readiness.path, - readiness.probe_timeout_ms, + &readiness, + wall_now, http_status, ever_tcp_reachable, + startup_heartbeat_state.last_seen_at, ); return Err(format!( "Timed out after {}ms waiting for backend startup.", @@ -117,25 +114,349 @@ impl BackendState { (http_status, tcp_reachable) } + fn live_child_pid(&self) -> Result { + let mut guard = self + .child + .lock() + .map_err(|_| "Backend process lock poisoned.".to_string())?; + + if let Some(child) = guard.as_mut() { + let pid = child.id(); + match child.try_wait() { + Ok(Some(status)) => { + *guard = None; + Err(format!( + "Backend process exited before becoming reachable: {status}" + )) + } + Ok(None) => Ok(pid), + Err(error) => Err(format!("Failed to poll backend process status: {error}")), + } + } else { + Err("Backend process is not running.".to_string()) + } + } + fn log_backend_readiness_timeout( &self, timeout: Duration, - ready_http_path: &str, - probe_timeout_ms: u64, + readiness: 
&backend::config::BackendReadinessConfig, + now: SystemTime, last_http_status: Option, tcp_reachable: bool, + last_startup_heartbeat_at: Option, ) { let last_http_status_text = last_http_status .map(|status| status.to_string()) .unwrap_or_else(|| "none".to_string()); + let startup_heartbeat_age_ms = describe_heartbeat_age(last_startup_heartbeat_at, now); append_desktop_log(&format!( - "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}", + "backend HTTP readiness check timed out after {}ms: backend_url={}, path={}, probe_timeout_ms={}, tcp_reachable={}, last_http_status={}, startup_heartbeat_age_ms={}", timeout.as_millis(), self.backend_url, - ready_http_path, - probe_timeout_ms, + readiness.path, + readiness.probe_timeout_ms, tcp_reachable, - last_http_status_text + last_http_status_text, + startup_heartbeat_age_ms + )); + } +} + +#[derive(serde::Deserialize)] +#[serde(deny_unknown_fields)] +struct StartupHeartbeatFile { + pid: u32, + state: StartupHeartbeatState, + updated_at_ms: u64, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Deserialize)] +#[serde(rename_all = "lowercase")] +enum StartupHeartbeatState { + Starting, + Stopping, +} + +#[derive(Debug, Clone, Copy)] +struct StartupHeartbeatTracker { + last_seen_at: Option, + last_progress_at: Option, + consecutive_invalid_reads: u8, + logged_fresh: bool, +} + +impl StartupHeartbeatTracker { + fn new() -> Self { + Self { + last_seen_at: None, + last_progress_at: None, + consecutive_invalid_reads: 0, + logged_fresh: false, + } + } +} + +const STARTUP_HEARTBEAT_INVALID_READ_THRESHOLD: u8 = 2; + +fn read_startup_heartbeat_updated_at(path: &Path, expected_pid: u32) -> Option { + let payload = fs::read_to_string(path).ok()?; + let heartbeat: StartupHeartbeatFile = serde_json::from_str(&payload).ok()?; + if heartbeat.pid != expected_pid || heartbeat.state != StartupHeartbeatState::Starting { + return None; + } + 
UNIX_EPOCH.checked_add(Duration::from_millis(heartbeat.updated_at_ms)) +} + +fn startup_heartbeat_progress_is_fresh( + last_progress_at: Option, + now: Instant, + max_age: Duration, +) -> bool { + last_progress_at.is_some_and(|updated_at| now.duration_since(updated_at) <= max_age) +} + +fn ms_since(earlier: SystemTime, now: SystemTime) -> Option { + now.duration_since(earlier) + .ok() + .map(|duration| duration.as_millis()) +} + +fn describe_heartbeat_age( + last_startup_heartbeat_at: Option, + now: SystemTime, +) -> String { + match last_startup_heartbeat_at { + Some(updated_at) => match ms_since(updated_at, now) { + Some(age) => age.to_string(), + None => format!("future ({updated_at:?})"), + }, + None => "none".to_string(), + } +} + +fn step_startup_heartbeat( + heartbeat_path: &Path, + child_pid: u32, + wall_now: SystemTime, + monotonic_now: Instant, + idle_timeout: Duration, + state: &mut StartupHeartbeatTracker, +) -> Result<(), String> { + let previous = state.last_seen_at; + let current = read_startup_heartbeat_updated_at(heartbeat_path, child_pid); + + match (previous, current) { + (Some(previous), None) => { + state.consecutive_invalid_reads = state.consecutive_invalid_reads.saturating_add(1); + if state.consecutive_invalid_reads < STARTUP_HEARTBEAT_INVALID_READ_THRESHOLD { + return Ok(()); + } + + let heartbeat_age_ms = describe_heartbeat_age(Some(previous), wall_now); + append_desktop_log(&format!( + "backend startup heartbeat disappeared or became invalid before HTTP dashboard became ready: last_valid_age_ms={heartbeat_age_ms}" + )); + Err( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." 
+ .to_string(), + ) + } + (None, None) => { + state.consecutive_invalid_reads = 0; + Ok(()) + } + (_, Some(current)) => { + state.consecutive_invalid_reads = 0; + let updated_at = match previous { + Some(previous) if current <= previous => previous, + _ => current, + }; + state.last_seen_at = Some(updated_at); + + if previous.is_none() + || Some(updated_at) != previous + || state.last_progress_at.is_none() + { + state.last_progress_at = Some(monotonic_now); + } + + if startup_heartbeat_progress_is_fresh( + state.last_progress_at, + monotonic_now, + idle_timeout, + ) { + if !state.logged_fresh { + append_desktop_log( + "backend startup heartbeat is fresh while HTTP dashboard is not ready yet; waiting", + ); + state.logged_fresh = true; + } + Ok(()) + } else { + append_desktop_log( + "backend startup heartbeat went stale before HTTP dashboard became ready", + ); + Err(format!( + "Backend startup heartbeat went stale after {}ms without HTTP readiness.", + idle_timeout.as_millis() + )) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::{Duration, Instant, UNIX_EPOCH}; + + use tempfile::TempDir; + + use super::*; + + #[test] + fn startup_heartbeat_progress_is_fresh_for_recent_instant() { + assert!(startup_heartbeat_progress_is_fresh( + Some(Instant::now()), + Instant::now() + Duration::from_millis(500), + Duration::from_secs(1), + )); + } + + #[test] + fn startup_heartbeat_progress_is_not_fresh_when_stale() { + assert!(!startup_heartbeat_progress_is_fresh( + Some(Instant::now()), + Instant::now() + Duration::from_millis(1500), + Duration::from_secs(1), )); } + + #[test] + fn startup_heartbeat_is_not_fresh_for_mismatched_pid() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + r#"{"pid":7,"state":"starting","updated_at_ms":5000}"#, + ) + .expect("write heartbeat file"); + + assert_eq!(read_startup_heartbeat_updated_at(&heartbeat_path, 42), 
None); + } + + #[test] + fn step_startup_heartbeat_fails_when_existing_heartbeat_disappears() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("missing-startup-heartbeat.json"); + let monotonic_now = Instant::now(); + let mut tracker = StartupHeartbeatTracker { + last_seen_at: Some(UNIX_EPOCH + Duration::from_millis(5000)), + last_progress_at: Some(monotonic_now), + consecutive_invalid_reads: 0, + logged_fresh: false, + }; + + let first_result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + monotonic_now, + Duration::from_secs(1), + &mut tracker, + ); + + let result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5600), + monotonic_now + Duration::from_millis(100), + Duration::from_secs(1), + &mut tracker, + ); + + assert_eq!(first_result, Ok(())); + assert_eq!( + result, + Err( + "Backend startup heartbeat disappeared or became invalid before HTTP readiness." 
+ .to_string() + ) + ); + } + + #[test] + fn step_startup_heartbeat_tolerates_single_missing_read_after_valid_heartbeat() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("missing-startup-heartbeat.json"); + let monotonic_now = Instant::now(); + let mut tracker = StartupHeartbeatTracker { + last_seen_at: Some(UNIX_EPOCH + Duration::from_millis(5000)), + last_progress_at: Some(monotonic_now), + consecutive_invalid_reads: 0, + logged_fresh: false, + }; + + let result = step_startup_heartbeat( + &heartbeat_path, + 42, + UNIX_EPOCH + Duration::from_millis(5500), + monotonic_now, + Duration::from_secs(1), + &mut tracker, + ); + + assert_eq!(result, Ok(())); + assert_eq!(tracker.consecutive_invalid_reads, 1); + } + + #[test] + fn startup_heartbeat_file_rejects_unknown_state() { + assert!(serde_json::from_str::( + r#"{"pid":42,"state":"unexpected","updated_at_ms":5000}"# + ) + .is_err()); + } + + #[test] + fn startup_heartbeat_file_rejects_unknown_fields() { + assert!(serde_json::from_str::( + r#"{"pid":42,"state":"starting","updated_at_ms":5000,"unexpected":true}"# + ) + .is_err()); + } + + #[test] + fn read_startup_heartbeat_updated_at_handles_large_timestamp_without_panic() { + let temp_dir = TempDir::new().expect("create temp dir"); + let heartbeat_path = temp_dir.path().join("startup-heartbeat.json"); + std::fs::write( + &heartbeat_path, + format!( + r#"{{"pid":42,"state":"starting","updated_at_ms":{}}}"#, + u64::MAX + ), + ) + .expect("write heartbeat file"); + + assert_eq!( + read_startup_heartbeat_updated_at(&heartbeat_path, 42), + UNIX_EPOCH.checked_add(Duration::from_millis(u64::MAX)) + ); + } + + #[test] + fn describe_heartbeat_age_distinguishes_future_timestamp_from_missing() { + assert_eq!( + describe_heartbeat_age( + Some(UNIX_EPOCH + Duration::from_millis(6_000)), + UNIX_EPOCH + Duration::from_millis(5_500) + ), + format!("future ({:?})", UNIX_EPOCH + Duration::from_millis(6_000)) + ); + assert_eq!( + 
describe_heartbeat_age(None, UNIX_EPOCH + Duration::from_millis(5_500)), + "none" + ); + } } diff --git a/src-tauri/src/backend/restart.rs b/src-tauri/src/backend/restart.rs index bc93c3bd..7372d6f9 100644 --- a/src-tauri/src/backend/restart.rs +++ b/src-tauri/src/backend/restart.rs @@ -340,6 +340,7 @@ mod tests { cwd: std::path::PathBuf::from("."), root_dir: None, webui_dir: None, + startup_heartbeat_path: None, packaged_mode: true, }; let state = BackendState::default(); diff --git a/src-tauri/src/backend/runtime.rs b/src-tauri/src/backend/runtime.rs index c928c4a6..990191fd 100644 --- a/src-tauri/src/backend/runtime.rs +++ b/src-tauri/src/backend/runtime.rs @@ -15,12 +15,15 @@ pub fn backend_wait_timeout(packaged_mode: bool) -> Duration { .unwrap_or(Duration::from_millis(20_000)) } -pub fn backend_readiness_config(log: F) -> backend::config::BackendReadinessConfig +pub fn backend_readiness_config( + plan: &crate::LaunchPlan, + log: F, +) -> backend::config::BackendReadinessConfig where F: Fn(&str) + Copy, { let probe_timeout_fallback = backend_ping_timeout_ms(log); - backend::config::backend_readiness_config( + let mut readiness = backend::config::backend_readiness_config( crate::BACKEND_READY_HTTP_PATH_ENV, crate::DEFAULT_BACKEND_READY_HTTP_PATH, crate::BACKEND_READY_PROBE_TIMEOUT_ENV, @@ -32,7 +35,20 @@ where crate::BACKEND_READY_POLL_INTERVAL_MIN_MS, crate::BACKEND_READY_POLL_INTERVAL_MAX_MS, |message| log(&message), - ) + ); + readiness.startup_idle_timeout_ms = match env::var(crate::BACKEND_STARTUP_IDLE_TIMEOUT_ENV) { + Ok(raw) => backend::config::resolve_backend_startup_idle_timeout_ms( + &raw, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_ENV, + crate::DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS, + crate::BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS, + |message| log(&message), + ), + Err(_) => crate::DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS, + }; + readiness.startup_heartbeat_path = plan.startup_heartbeat_path.clone(); + readiness 
} pub fn backend_ping_timeout_ms(log: F) -> u64 diff --git a/src-tauri/src/desktop_state.rs b/src-tauri/src/desktop_state.rs index c822c701..c7e011bf 100644 --- a/src-tauri/src/desktop_state.rs +++ b/src-tauri/src/desktop_state.rs @@ -5,7 +5,7 @@ use std::{ pub(crate) fn resolve_desktop_state_path(packaged_root_dir: Option<&Path>) -> Option { resolve_desktop_state_path_with_root( - env::var("ASTRBOT_ROOT").ok().as_deref(), + env::var(crate::ASTRBOT_ROOT_ENV).ok().as_deref(), packaged_root_dir, ) } diff --git a/src-tauri/src/launch_plan.rs b/src-tauri/src/launch_plan.rs index cb421bbe..8069ce98 100644 --- a/src-tauri/src/launch_plan.rs +++ b/src-tauri/src/launch_plan.rs @@ -5,7 +5,7 @@ use std::{ use tauri::AppHandle; -use crate::{packaged_webui, runtime_paths, LaunchPlan, RuntimeManifest}; +use crate::{backend, packaged_webui, runtime_paths, LaunchPlan, RuntimeManifest}; const BACKEND_RESOURCE_ALIAS: &str = env!("ASTRBOT_BACKEND_RESOURCE_ALIAS"); const WEBUI_RESOURCE_ALIAS: &str = env!("ASTRBOT_WEBUI_RESOURCE_ALIAS"); @@ -14,6 +14,19 @@ fn build_packaged_resource_relative_path(resource_alias: &str, leaf_name: &str) PathBuf::from(resource_alias).join(leaf_name) } +fn resolve_launch_startup_heartbeat_path( + root_dir: Option<&Path>, + packaged_mode: bool, +) -> Option { + backend::config::resolve_backend_startup_heartbeat_path( + root_dir, + packaged_mode + .then(runtime_paths::default_packaged_root_dir) + .flatten(), + crate::DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH, + ) +} + pub fn resolve_custom_launch(custom_cmd: String) -> Result { let mut pieces = shlex::split(&custom_cmd) .ok_or_else(|| format!("Invalid ASTRBOT_BACKEND_CMD: {custom_cmd}"))?; @@ -27,8 +40,9 @@ pub fn resolve_custom_launch(custom_cmd: String) -> Result { .ok() .or_else(runtime_paths::detect_astrbot_source_root) .unwrap_or_else(runtime_paths::workspace_root_dir); - let root_dir = env::var("ASTRBOT_ROOT").ok().map(PathBuf::from); + let root_dir = 
env::var(crate::ASTRBOT_ROOT_ENV).ok().map(PathBuf::from); let webui_dir = env::var("ASTRBOT_WEBUI_DIR").ok().map(PathBuf::from); + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); Ok(LaunchPlan { cmd, @@ -36,6 +50,7 @@ pub fn resolve_custom_launch(custom_cmd: String) -> Result { cwd, root_dir, webui_dir, + startup_heartbeat_path, packaged_mode: false, }) } @@ -107,7 +122,7 @@ where )); } - let root_dir = env::var("ASTRBOT_ROOT") + let root_dir = env::var(crate::ASTRBOT_ROOT_ENV) .map(PathBuf::from) .ok() .or_else(runtime_paths::default_packaged_root_dir); @@ -141,6 +156,7 @@ where "--webui-dir".to_string(), webui_dir.to_string_lossy().to_string(), ]; + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), true); let plan = LaunchPlan { cmd: python_path.to_string_lossy().to_string(), @@ -148,6 +164,7 @@ where cwd, root_dir, webui_dir: Some(webui_dir), + startup_heartbeat_path, packaged_mode: true, }; Ok(Some(plan)) @@ -174,6 +191,8 @@ pub fn resolve_dev_launch() -> Result { args.push("--webui-dir".to_string()); args.push(path.to_string_lossy().to_string()); } + let root_dir = env::var(crate::ASTRBOT_ROOT_ENV).ok().map(PathBuf::from); + let startup_heartbeat_path = resolve_launch_startup_heartbeat_path(root_dir.as_deref(), false); Ok(LaunchPlan { cmd: "uv".to_string(), @@ -181,8 +200,9 @@ pub fn resolve_dev_launch() -> Result { cwd: env::var("ASTRBOT_BACKEND_CWD") .map(PathBuf::from) .unwrap_or(source_root), - root_dir: env::var("ASTRBOT_ROOT").ok().map(PathBuf::from), + root_dir, webui_dir, + startup_heartbeat_path, packaged_mode: false, }) } @@ -191,6 +211,28 @@ pub fn resolve_dev_launch() -> Result { mod tests { use super::*; + struct EnvVarGuard { + key: &'static str, + previous: Option, + } + + impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + let previous = env::var(key).ok(); + env::set_var(key, value); + Self { key, previous } + } + } + + impl Drop for 
EnvVarGuard { + fn drop(&mut self) { + match &self.previous { + Some(value) => env::set_var(self.key, value), + None => env::remove_var(self.key), + } + } + } + #[test] fn build_packaged_resource_relative_path_joins_alias_and_leaf_name() { assert_eq!( @@ -202,4 +244,16 @@ mod tests { PathBuf::from("runtime/webui").join("index.html") ); } + + #[test] + fn resolve_custom_launch_sets_startup_heartbeat_path_from_root_dir() { + let _root_guard = EnvVarGuard::set(crate::ASTRBOT_ROOT_ENV, "/tmp/astrbot-root"); + + let plan = resolve_custom_launch("python main.py".to_string()).expect("custom plan"); + + assert_eq!( + plan.startup_heartbeat_path, + Some(PathBuf::from("/tmp/astrbot-root").join("data/backend-startup-heartbeat.json")) + ); + } } diff --git a/src-tauri/src/logging.rs b/src-tauri/src/logging.rs index 99df4ad5..c641cf48 100644 --- a/src-tauri/src/logging.rs +++ b/src-tauri/src/logging.rs @@ -145,7 +145,7 @@ pub fn resolve_desktop_log_path(packaged_root: Option, desktop_log_file } } - if let Ok(root) = env::var("ASTRBOT_ROOT") { + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { let root = PathBuf::from(root.trim()); if !root.as_os_str().is_empty() { return root.join("logs").join(desktop_log_file); @@ -169,7 +169,7 @@ pub fn resolve_backend_log_path( if let Some(root) = root_dir { return root.join("logs").join("backend.log"); } - if let Ok(root) = env::var("ASTRBOT_ROOT") { + if let Ok(root) = env::var(crate::ASTRBOT_ROOT_ENV) { let path = PathBuf::from(root.trim()); if !path.as_os_str().is_empty() { return path.join("logs").join("backend.log"); diff --git a/src-tauri/src/update_channel.rs b/src-tauri/src/update_channel.rs index 90b1d28c..6042b4f5 100644 --- a/src-tauri/src/update_channel.rs +++ b/src-tauri/src/update_channel.rs @@ -625,7 +625,7 @@ mod tests { #[test] fn write_cached_channel_errors_when_state_path_unavailable() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let result 
= write_cached_update_channel(Some(UpdateChannel::Nightly), None); @@ -637,7 +637,7 @@ mod tests { #[test] fn read_cached_channel_round_trips_written_value() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("round-trip"); write_cached_update_channel(Some(UpdateChannel::Nightly), Some(&dir)) .expect("write cached channel"); @@ -652,7 +652,7 @@ mod tests { #[test] fn write_cached_channel_preserves_unrelated_state_fields() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("preserve-fields"); let state_path = dir.join("data").join("desktop_state.json"); fs::create_dir_all(state_path.parent().expect("state dir")).expect("create state dir"); @@ -693,7 +693,7 @@ mod tests { #[test] fn resolve_preferred_channel_falls_back_to_installed_version_channel() { - let _root_guard = EnvVarGuard::clear("ASTRBOT_ROOT"); + let _root_guard = EnvVarGuard::clear(crate::ASTRBOT_ROOT_ENV); let dir = create_temp_case_dir("fallback"); assert_eq!(