Skip to content

Commit e90f366

Browse files
authored
fix: add backend startup heartbeat liveness probe (#114)
* fix: add backend startup heartbeat liveness probe * fix: tighten startup heartbeat validation * refactor: centralize startup heartbeat metadata * fix: surface heartbeat invalidation sooner * fix: harden startup heartbeat parsing * fix: warn on stop-time heartbeat failures * refactor: simplify startup heartbeat control flow * refactor: flatten readiness heartbeat helpers * refactor: clarify heartbeat helper responsibilities * docs: clarify startup heartbeat path coupling * fix: harden startup heartbeat coordination * fix: make startup heartbeat checks monotonic * fix: clean up heartbeat test and exit handling
1 parent c6ca243 commit e90f366

File tree

15 files changed

+805
-60
lines changed

15 files changed

+805
-60
lines changed

docs/environment-variables.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
| --- | --- | --- |
1010
| `ASTRBOT_BACKEND_URL` | 后端基础 URL | 默认 `http://127.0.0.1:6185/` |
1111
| `ASTRBOT_BACKEND_AUTO_START` | 是否自动拉起后端 | 默认 `1`(启用) |
12-
| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `300000` |
12+
| `ASTRBOT_BACKEND_TIMEOUT_MS` | 后端就绪等待超时 | 开发模式默认 `20000`;打包模式默认回退 `900000` |
13+
| `ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS` | 后端启动 heartbeat 空闲超时 | 默认 `60000`,范围 `5000~900000` |
1314
| `ASTRBOT_BACKEND_READY_HTTP_PATH` | 就绪探针 HTTP 路径 | 默认 `/api/stat/start-time` |
1415
| `ASTRBOT_BACKEND_READY_PROBE_TIMEOUT_MS` | 就绪探针单次超时 | 默认回退到 `ASTRBOT_BACKEND_PING_TIMEOUT_MS` |
1516
| `ASTRBOT_BACKEND_READY_POLL_INTERVAL_MS` | 就绪轮询间隔 | 默认 `300`,并按边界 clamp |
@@ -53,6 +54,7 @@
5354
| 变量 | 用途 | 默认值/行为 |
5455
| --- | --- | --- |
5556
| `ASTRBOT_DESKTOP_CLIENT` | 标记桌面客户端环境 | 打包态启动后端时写入 `1` |
57+
| `ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH` | 桌面端写给后端启动器的 heartbeat 文件路径 | 打包态默认写到 `ASTRBOT_ROOT/data/backend-startup-heartbeat.json` |
5658

5759
## 4. 发布/CI(GitHub Actions)
5860

scripts/backend/templates/launch_backend.py

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,22 @@
11
from __future__ import annotations
22

3+
import atexit
34
import ctypes
5+
import json
46
import os
57
import runpy
68
import sys
9+
import threading
10+
import time
711
from pathlib import Path
812

913
# Directory that contains this launcher script (symlinks resolved).
BACKEND_DIR = Path(__file__).resolve().parent
# The backend application is looked up in the "app" directory next to the launcher.
APP_DIR = BACKEND_DIR / "app"
# NOTE(review): presumably keeps Windows DLL-directory handles referenced for the
# lifetime of the process; the code that fills this list is outside this view.
_WINDOWS_DLL_DIRECTORY_HANDLES: list[object] = []

# Name of the environment variable that carries the heartbeat file path.
# Keep this in sync with BACKEND_STARTUP_HEARTBEAT_PATH_ENV in src-tauri/src/app_constants.rs.
STARTUP_HEARTBEAT_ENV = "ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH"
# Seconds the heartbeat thread waits between two consecutive heartbeat writes.
STARTUP_HEARTBEAT_INTERVAL_SECONDS = 2.0
# Upper bound, in seconds, on how long the exit handler waits for the heartbeat thread.
STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS = 1.0
1220

1321

1422
def configure_stdio_utf8() -> None:
@@ -113,15 +121,120 @@ def preload_windows_runtime_dlls() -> None:
113121
continue
114122

115123

116-
configure_stdio_utf8()
117-
configure_windows_dll_search_path()
118-
preload_windows_runtime_dlls()
124+
def resolve_startup_heartbeat_path() -> Path | None:
    """Return the heartbeat file path requested through the environment.

    The value is read from the environment variable named by
    ``STARTUP_HEARTBEAT_ENV``; surrounding whitespace is ignored.

    Returns:
        The configured path, or ``None`` when the variable is unset, empty or
        contains only whitespace.
    """
    configured = os.environ.get(STARTUP_HEARTBEAT_ENV, "").strip()
    return Path(configured) if configured else None
119129

120-
sys.path.insert(0, str(APP_DIR))
121130

122-
main_file = APP_DIR / "main.py"
123-
if not main_file.is_file():
124-
raise FileNotFoundError(f"Backend entrypoint not found: {main_file}")
131+
def build_heartbeat_payload(state: str) -> dict[str, object]:
    """Build the record that is stored in the heartbeat file.

    Args:
        state: Lifecycle label to store under the ``"state"`` key.

    Returns:
        A mapping with, in this order, ``"pid"`` (id of the current process),
        ``"state"`` and ``"updated_at_ms"`` (current wall-clock time in whole
        milliseconds since the epoch).
    """
    now_ms = int(time.time() * 1000)
    return dict(pid=os.getpid(), state=state, updated_at_ms=now_ms)
125137

126-
sys.argv[0] = str(main_file)
127-
runpy.run_path(str(main_file), run_name="__main__")
138+
139+
def atomic_write_json(path: Path, payload: dict[str, object]) -> None:
    """Serialize *payload* as compact JSON and publish it at *path* via a rename.

    The document is first written to a sibling ``<name>.tmp`` file and then
    moved over *path* with ``Path.replace``, so *path* itself is never opened
    for writing.

    Args:
        path: Destination file. Its parent directory must already exist.
        payload: JSON-serializable mapping to store.

    Raises:
        Exception: Whatever the serialization, the temporary-file write or the
            rename raised. When the write or the rename fails, the temporary
            file is removed (best effort) before the error is re-raised.
    """
    temp_path = path.with_name(f"{path.name}.tmp")
    # Serialize before touching the filesystem so that a non-serializable
    # payload never creates a temporary file.
    document = json.dumps(payload, separators=(",", ":"))
    try:
        temp_path.write_text(document, encoding="utf-8")
        temp_path.replace(path)
    except Exception:
        # Covers a failed or partial write as well as a failed rename: never
        # leave a stale temporary file behind.
        try:
            temp_path.unlink(missing_ok=True)
        except OSError:
            # Cleanup is best effort; the original error is the one to report.
            pass
        raise
153+
154+
155+
def write_startup_heartbeat(
    path: Path, state: str, *, warn_on_error: bool = False
) -> bool:
    """Write one heartbeat record for *state* to *path* without ever raising.

    The parent directory of *path* is created when missing, then the payload
    from ``build_heartbeat_payload(state)`` is stored with ``atomic_write_json``.

    Args:
        path: Heartbeat file to (over)write.
        state: Lifecycle label forwarded to ``build_heartbeat_payload``.
        warn_on_error: When true, a failure is reported on ``sys.stderr``.

    Returns:
        ``True`` when the heartbeat was written, ``False`` when any step failed.
    """
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        atomic_write_json(path, build_heartbeat_payload(state))
    except Exception as exc:
        # Deliberately broad: the heartbeat is best effort and callers only
        # look at the boolean result.
        if not warn_on_error:
            return False
        print(
            f"[startup-heartbeat] failed to write heartbeat to {path}: {exc.__class__.__name__}: {exc}",
            file=sys.stderr,
        )
        return False
    return True
169+
170+
171+
def heartbeat_loop(
    path: Path, interval_seconds: float, stop_event: threading.Event
) -> None:
    """Keep writing ``"starting"`` heartbeats to *path* until *stop_event* is set.

    One heartbeat is written immediately. Afterwards another one is written
    every time ``stop_event.wait(interval_seconds)`` times out; the loop ends
    as soon as that wait reports the event as set.

    Warning policy (forwarded to ``write_startup_heartbeat`` as
    ``warn_on_error``):

    * Until the first successful write, every failure is reported so that
      startup path/permission problems stay visible.
    * After a success, only the first failure of each run of consecutive
      failures is reported, to avoid log spam.

    Args:
        path: Heartbeat file to write.
        interval_seconds: Wait between two writes, in seconds.
        stop_event: Event whose ``wait`` result terminates the loop.
    """
    ever_succeeded = False
    # True once the current run of consecutive failures has been reported.
    failure_run_reported = False

    while True:
        # Stay silent only when a write has succeeded before AND the ongoing
        # failure run already produced its warning.
        report_failure = not (ever_succeeded and failure_run_reported)
        if write_startup_heartbeat(path, "starting", warn_on_error=report_failure):
            ever_succeeded = True
            failure_run_reported = False
        elif report_failure:
            failure_run_reported = True

        if stop_event.wait(interval_seconds):
            break
199+
200+
201+
def start_startup_heartbeat() -> None:
    """Start the background heartbeat writer when a heartbeat path is configured.

    Does nothing when ``resolve_startup_heartbeat_path()`` returns ``None``.
    Otherwise a daemon thread running ``heartbeat_loop`` is started and an
    ``atexit`` handler is registered that stops the thread and writes a final
    ``"stopping"`` heartbeat.

    Raises:
        Exception: Whatever ``Thread.start`` raises; in that case no exit
            handler is registered.
    """
    target_path = resolve_startup_heartbeat_path()
    if target_path is None:
        return

    shutdown_requested = threading.Event()
    writer = threading.Thread(
        name="astrbot-startup-heartbeat",
        target=heartbeat_loop,
        args=(target_path, STARTUP_HEARTBEAT_INTERVAL_SECONDS, shutdown_requested),
        daemon=True,
    )

    def on_exit() -> None:
        """Stop the writer thread, then record the ``"stopping"`` state."""
        shutdown_requested.set()
        # Bounded wait: process exit must not hang on a stuck heartbeat write.
        writer.join(timeout=STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS)
        # Always warn here; a failure at exit is reported regardless of what
        # the loop has already logged.
        write_startup_heartbeat(target_path, "stopping", warn_on_error=True)

    # Start first, register second: a failed start must not leave an exit
    # handler behind for a thread that never ran.
    writer.start()
    atexit.register(on_exit)
221+
222+
223+
def main() -> None:
    """Prepare the process and run the backend entrypoint as ``__main__``.

    Steps, in order: run the stdio and Windows DLL preparation helpers defined
    elsewhere in this file, start the startup heartbeat, put ``APP_DIR`` at the
    front of ``sys.path``, then execute ``APP_DIR / "main.py"`` through
    ``runpy`` with ``sys.argv[0]`` pointing at that file.

    Raises:
        FileNotFoundError: When ``APP_DIR / "main.py"`` is not a file.
    """
    configure_stdio_utf8()
    configure_windows_dll_search_path()
    preload_windows_runtime_dlls()
    start_startup_heartbeat()

    # Done before the entrypoint check, so sys.path is extended even when the
    # check below fails.
    sys.path.insert(0, str(APP_DIR))

    main_file = APP_DIR / "main.py"
    if not main_file.is_file():
        raise FileNotFoundError(f"Backend entrypoint not found: {main_file}")

    entry_script = str(main_file)
    # Make the backend see its own script path as argv[0].
    sys.argv[0] = entry_script
    runpy.run_path(entry_script, run_name="__main__")


if __name__ == "__main__":
    main()
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import importlib.util
2+
import tempfile
3+
import unittest
4+
from pathlib import Path
5+
from unittest import mock
6+
7+
8+
# launch_backend.py is loaded by file path from the directory of this test
# module, under a dedicated module name.
MODULE_PATH = Path(__file__).with_name("launch_backend.py")
SPEC = importlib.util.spec_from_file_location("launch_backend_under_test", MODULE_PATH)


def _load_launch_backend():
    """Execute the module described by ``SPEC`` and return the module object.

    Raises:
        RuntimeError: When no import spec or loader could be created for
            ``MODULE_PATH``.
    """
    if SPEC is None or SPEC.loader is None:
        raise RuntimeError(f"Cannot load launch_backend module from {MODULE_PATH}")
    module = importlib.util.module_from_spec(SPEC)
    SPEC.loader.exec_module(module)
    return module


launch_backend = _load_launch_backend()
14+
15+
16+
class StartupHeartbeatTests(unittest.TestCase):
    """Unit tests for the startup heartbeat helpers in ``launch_backend``."""

    @staticmethod
    def _warn_flags(write_mock) -> list:
        """Return the ``warn_on_error`` keyword of every recorded call, in order."""
        return [call.kwargs["warn_on_error"] for call in write_mock.call_args_list]

    @staticmethod
    def _states(write_mock) -> list:
        """Return the positional ``state`` argument of every recorded call, in order."""
        return [call.args[1] for call in write_mock.call_args_list]

    def test_atomic_write_json_cleans_up_temp_file_when_replace_fails(self) -> None:
        """A failing rename must not leave the ``.tmp`` sibling behind."""
        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "heartbeat.json"
            leftover = target.with_name(f"{target.name}.tmp")
            failing_replace = mock.patch.object(
                Path,
                "replace",
                autospec=True,
                side_effect=OSError("replace failed"),
            )

            with failing_replace, self.assertRaises(OSError):
                launch_backend.atomic_write_json(
                    target,
                    {"pid": 42, "state": "starting", "updated_at_ms": 5000},
                )

            self.assertFalse(leftover.exists())

    def test_repeated_failures_warn_before_first_success(self) -> None:
        """Before any success, every failed write is asked to warn."""
        stop_event = mock.Mock()
        # One loop iteration after the initial write, then stop.
        stop_event.wait.side_effect = [False, True]
        always_failing = mock.patch.object(
            launch_backend,
            "write_startup_heartbeat",
            side_effect=[False, False],
        )

        with always_failing as write_mock:
            launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event)

        self.assertEqual(self._warn_flags(write_mock), [True, True])

    def test_repeated_failures_after_success_are_suppressed(self) -> None:
        """After a success, only the first failure of a failure run warns."""
        stop_event = mock.Mock()
        # Two loop iterations after the initial write, then stop.
        stop_event.wait.side_effect = [False, False, True]
        success_then_failures = mock.patch.object(
            launch_backend,
            "write_startup_heartbeat",
            side_effect=[True, False, False],
        )

        with success_then_failures as write_mock:
            launch_backend.heartbeat_loop(Path("/tmp/heartbeat.json"), 2.0, stop_event)

        self.assertEqual(self._warn_flags(write_mock), [True, True, False])

    def test_stop_failure_still_warns_after_earlier_failure(self) -> None:
        """The exit handler joins the thread and writes ``"stopping"`` with warnings on."""
        stop_event = mock.Mock()
        thread = mock.Mock()
        register = mock.Mock()

        with mock.patch.object(
            launch_backend,
            "write_startup_heartbeat",
            return_value=False,
        ) as write_mock, mock.patch.object(
            launch_backend,
            "resolve_startup_heartbeat_path",
            return_value=Path("/tmp/heartbeat.json"),
        ), mock.patch.object(
            launch_backend.threading, "Event", return_value=stop_event
        ), mock.patch.object(
            launch_backend.threading, "Thread", return_value=thread
        ), mock.patch.object(
            launch_backend.atexit, "register", register
        ):
            launch_backend.start_startup_heartbeat()
            # Starting the heartbeat must not join; only the exit handler does.
            thread.join.assert_not_called()
            exit_handler = register.call_args.args[0]
            exit_handler()

        thread.join.assert_called_once_with(
            timeout=launch_backend.STARTUP_HEARTBEAT_STOP_JOIN_TIMEOUT_SECONDS
        )
        self.assertEqual(self._states(write_mock), ["stopping"])
        self.assertEqual(self._warn_flags(write_mock), [True])

    def test_start_startup_heartbeat_does_not_register_exit_handler_when_thread_start_fails(
        self,
    ) -> None:
        """A thread that fails to start must not leave an exit handler behind."""
        stop_event = mock.Mock()
        thread = mock.Mock()
        thread.start.side_effect = RuntimeError("thread start failed")
        register = mock.Mock()

        with mock.patch.object(
            launch_backend,
            "resolve_startup_heartbeat_path",
            return_value=Path("/tmp/heartbeat.json"),
        ), mock.patch.object(
            launch_backend.threading, "Event", return_value=stop_event
        ), mock.patch.object(
            launch_backend.threading, "Thread", return_value=thread
        ), mock.patch.object(
            launch_backend.atexit, "register", register
        ), self.assertRaises(RuntimeError):
            launch_backend.start_startup_heartbeat()

        register.assert_not_called()


if __name__ == "__main__":
    unittest.main()

src-tauri/src/app_constants.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use std::time::Duration;
22

33
pub(crate) const DEFAULT_BACKEND_URL: &str = "http://127.0.0.1:6185/";
4+
pub(crate) const ASTRBOT_ROOT_ENV: &str = "ASTRBOT_ROOT";
45
pub(crate) const BACKEND_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_TIMEOUT_MS";
5-
pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 5 * 60 * 1000;
6+
pub(crate) const PACKAGED_BACKEND_TIMEOUT_FALLBACK_MS: u64 = 15 * 60 * 1000;
67
pub(crate) const GRACEFUL_RESTART_REQUEST_TIMEOUT_MS: u64 = 2_500;
78
pub(crate) const GRACEFUL_RESTART_START_TIME_TIMEOUT_MS: u64 = 1_800;
89
pub(crate) const GRACEFUL_RESTART_POLL_INTERVAL_MS: u64 = 350;
@@ -17,6 +18,15 @@ pub(crate) const BACKEND_READY_PROBE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_READY_
1718
pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MIN_MS: u64 = 100;
1819
pub(crate) const BACKEND_READY_PROBE_TIMEOUT_MAX_MS: u64 = 30_000;
1920
pub(crate) const BACKEND_READY_TCP_PROBE_TIMEOUT_MAX_MS: u64 = 1_000;
21+
pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_ENV: &str = "ASTRBOT_BACKEND_STARTUP_IDLE_TIMEOUT_MS";
22+
pub(crate) const DEFAULT_BACKEND_STARTUP_IDLE_TIMEOUT_MS: u64 = 60 * 1000;
23+
pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MIN_MS: u64 = 5_000;
24+
pub(crate) const BACKEND_STARTUP_IDLE_TIMEOUT_MAX_MS: u64 = 15 * 60 * 1000;
25+
// Keep this in sync with STARTUP_HEARTBEAT_ENV in scripts/backend/templates/launch_backend.py.
26+
pub(crate) const BACKEND_STARTUP_HEARTBEAT_PATH_ENV: &str =
27+
"ASTRBOT_BACKEND_STARTUP_HEARTBEAT_PATH";
28+
pub(crate) const DEFAULT_BACKEND_STARTUP_HEARTBEAT_RELATIVE_PATH: &str =
29+
"data/backend-startup-heartbeat.json";
2030
pub(crate) const DEFAULT_BACKEND_PING_TIMEOUT_MS: u64 = 800;
2131
pub(crate) const BACKEND_PING_TIMEOUT_MIN_MS: u64 = 50;
2232
pub(crate) const BACKEND_PING_TIMEOUT_MAX_MS: u64 = 30_000;

src-tauri/src/app_helpers.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ mod tests {
7979
cwd: PathBuf::from("."),
8080
root_dir: None,
8181
webui_dir: None,
82+
startup_heartbeat_path: None,
8283
packaged_mode: false,
8384
};
8485

src-tauri/src/app_types.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ pub(crate) struct LaunchPlan {
3333
pub(crate) cwd: PathBuf,
3434
pub(crate) root_dir: Option<PathBuf>,
3535
pub(crate) webui_dir: Option<PathBuf>,
36+
pub(crate) startup_heartbeat_path: Option<PathBuf>,
3637
pub(crate) packaged_mode: bool,
3738
}
3839

0 commit comments

Comments
 (0)