Skip to content

Commit eeb373c

Browse files
committed
feat(coding): 补齐窗口预热执行日志
1 parent f655e3f commit eeb373c

5 files changed

Lines changed: 187 additions & 8 deletions

File tree

.trellis/spec/infra/coding-plan-window-warmer.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@
4848
- Warm requests use `litellm.completion(model=plan.model, messages=[...], api_base=target.base_url, api_key=api_key, timeout=..., max_tokens=..., temperature=...)`.
4949
- The warmer must not call local LiteLLM Proxy `/v1/chat/completions` for default GLM warmup.
5050
- Health checks may use direct HTTP GET because they are a readiness probe, not the warmup completion.
51+
- Logging contract:
52+
- Watch mode logs the next scheduled event and the due event before calling the warmer.
53+
- Each real warmup logs start, container check, API key source, health check, request send, retry wait, final success/failure, and millisecond durations.
54+
- Logs may include target name, model, base URL, timeout, max tokens, retry counts, and API key source (`env:<name>` or `file:<path>`).
55+
- Logs must not include prompt text, API key values, request headers, or full request body.
5156

5257
### 4. Validation & Error Matrix
5358

@@ -81,6 +86,7 @@
8186
- Unit tests for `interval` continuous-window rollover across midnight.
8287
- Unit tests for multiple plans with simultaneous base time remaining independently executable.
8388
- Config parse tests for multiple `[[plans]]`.
89+
- Logging regression test asserting real warmups log lifecycle checkpoints without exposing prompt text or API key values.
8490
- SDK call test mocking the local wrapper around `litellm.completion`, asserting:
8591
- `model` keeps the configured provider-prefixed model.
8692
- `api_base` is the direct target URL.

ai/coding/window-warmer/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ pm2 start ai/coding/window-warmer/window-warmer.pm2.config.cjs
9393
pm2 logs coding-window-warmer
9494
```
9595

96+
执行日志会包含调度触发、容器检查、API key 来源、健康检查、请求发送、重试、成功/失败和耗时;不会输出 prompt、API key、请求头或完整请求体。
97+
9698
重启:
9799

98100
```bash

ai/coding/window-warmer/tests/test_window_warmer.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,71 @@ def test_dry_run_skips_readiness_checks(self) -> None:
252252

253253
self.assertTrue(warmer.warm_plan(config, plan, dry_run=True))
254254

255+
def test_warm_plan_logs_execution_lifecycle_without_secret(self) -> None:
256+
"""真实执行应记录关键链路日志且不泄露 prompt 与 API key。
257+
258+
Args:
259+
None.
260+
261+
Returns:
262+
无返回值。
263+
"""
264+
plan = warmer.parse_plan_config(
265+
{
266+
"name": "glm",
267+
"model": "openai/GLM-5.1",
268+
"prompt": "secret prompt",
269+
"schedule_mode": "fixed_times",
270+
"times": ["08:00"],
271+
},
272+
warmer.SchedulerConfig(True, 60, 120, 1, 30, False),
273+
)
274+
config = warmer.AppConfig(
275+
target=warmer.TargetConfig(
276+
name="z-ai",
277+
base_url="https://open.bigmodel.cn/api/coding/paas/v4",
278+
container_name=None,
279+
api_key_env="Z_AI_API_KEY",
280+
env_file=None,
281+
health_path="/models",
282+
request_timeout_seconds=30,
283+
),
284+
scheduler=warmer.SchedulerConfig(True, 60, 120, 1, 30, False),
285+
plans=(plan,),
286+
)
287+
288+
def ready_with_logs(target: warmer.TargetConfig, log_fn: warmer.target.LogFn | None = None):
289+
"""模拟已通过前置检查并输出检查日志。
290+
291+
Args:
292+
target: 目标服务配置。
293+
log_fn: 可选日志函数。
294+
295+
Returns:
296+
目标可用状态、诊断消息与模拟 API key。
297+
"""
298+
self.assertEqual(target.name, "z-ai")
299+
if log_fn is not None:
300+
log_fn("container check skipped")
301+
log_fn("api key check passed source=file:.env.local")
302+
log_fn("health check passed result=target api is ready")
303+
return True, "z-ai 已就绪 api_key_source=file:.env.local", "sk-secret"
304+
305+
with (
306+
patch("window_warmer_lib.runner.ensure_target_ready", side_effect=ready_with_logs),
307+
patch("window_warmer_lib.runner.send_warm_completion"),
308+
patch("window_warmer_lib.runner.log") as log_mock,
309+
):
310+
self.assertTrue(warmer.warm_plan(config, plan))
311+
312+
messages = "\n".join(call.args[0] for call in log_mock.call_args_list)
313+
self.assertIn("warm started", messages)
314+
self.assertIn("api key check passed source=file:.env.local", messages)
315+
self.assertIn("sending warm request", messages)
316+
self.assertIn("warm succeeded", messages)
317+
self.assertNotIn("sk-secret", messages)
318+
self.assertNotIn("secret prompt", messages)
319+
255320
def test_read_api_key_falls_back_to_dotenv_file(self) -> None:
256321
"""API key 应支持从配置指定的 dotenv 文件读取。
257322

ai/coding/window-warmer/window_warmer_lib/runner.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,28 +23,50 @@ def warm_plan(config: AppConfig, plan: PlanConfig, dry_run: bool = False) -> boo
2323
Returns:
2424
预热成功时返回 True,否则返回 False。
2525
"""
26+
started_at = time.monotonic()
27+
log(f"[{plan.name}] warm started target={config.target.name} model={plan.model} schedule_mode={plan.schedule_mode}")
28+
2629
if dry_run or config.scheduler.dry_run:
27-
log(f"[{plan.name}] dry-run warm target={config.target.name} model={plan.model}")
30+
log(
31+
f"[{plan.name}] dry-run warm target={config.target.name} model={plan.model} "
32+
f"duration_ms={elapsed_ms(started_at)}"
33+
)
2834
return True
2935

30-
ready, message, api_key = ensure_target_ready(config.target)
36+
ready, message, api_key = ensure_target_ready(
37+
config.target,
38+
log_fn=lambda readiness_message: log(f"[{plan.name}] {readiness_message}"),
39+
)
3140
if not ready:
32-
log(f"[{plan.name}] skip warm: {message}")
41+
log(f"[{plan.name}] skip warm: {message} duration_ms={elapsed_ms(started_at)}")
3342
return False
43+
log(f"[{plan.name}] readiness check passed: {message}")
3444

3545
attempts = plan.retry_count + 1
3646
for attempt in range(1, attempts + 1):
47+
attempt_started_at = time.monotonic()
48+
log(
49+
f"[{plan.name}] sending warm request target={config.target.name} model={plan.model} "
50+
f"base_url={config.target.base_url} attempt={attempt}/{attempts} "
51+
f"timeout={config.target.request_timeout_seconds}s max_tokens={plan.max_tokens}"
52+
)
3753
try:
3854
send_warm_completion(config.target, plan, api_key)
3955
log(
4056
f"[{plan.name}] warm succeeded target={config.target.name} "
41-
f"model={plan.model} attempt={attempt}/{attempts}"
57+
f"model={plan.model} attempt={attempt}/{attempts} "
58+
f"duration_ms={elapsed_ms(attempt_started_at)} total_duration_ms={elapsed_ms(started_at)}"
4259
)
4360
return True
4461
except Exception as exc:
45-
log(f"[{plan.name}] warm failed attempt={attempt}/{attempts}: {exc}")
62+
log(
63+
f"[{plan.name}] warm failed attempt={attempt}/{attempts} "
64+
f"duration_ms={elapsed_ms(attempt_started_at)} error={exc}"
65+
)
4666
if attempt < attempts and plan.retry_delay_seconds > 0:
67+
log(f"[{plan.name}] retrying warm in {plan.retry_delay_seconds}s")
4768
time.sleep(plan.retry_delay_seconds)
69+
log(f"[{plan.name}] warm exhausted attempts={attempts} total_duration_ms={elapsed_ms(started_at)}")
4870
return False
4971

5072

@@ -63,9 +85,11 @@ def run_once(config: AppConfig, dry_run: bool = False) -> int:
6385
log("没有启用的 plan。")
6486
return 1
6587

88+
log(f"run once started enabled_plans={len(enabled_plans)}")
6689
success = True
6790
for plan in enabled_plans:
6891
success = warm_plan(config, plan, dry_run=dry_run) and success
92+
log(f"run once finished success={str(success).lower()}")
6993
return 0 if success else 1
7094

7195

@@ -120,6 +144,10 @@ def run_watch(config: AppConfig, rng: random.Random, dry_run: bool = False) -> i
120144
f"run_at={event.run_at.isoformat()} jitter={event.jitter_seconds}s"
121145
)
122146
sleep_until(event.run_at)
147+
log(
148+
f"warm due plan={event.plan.name} base_at={event.base_at.isoformat()} "
149+
f"run_at={event.run_at.isoformat()} now={datetime.now().isoformat(timespec='seconds')}"
150+
)
123151
warm_plan(config, event.plan, dry_run=dry_run)
124152
events[event.plan.name] = build_warm_event(event.plan, datetime.now(), rng)
125153

@@ -152,6 +180,18 @@ def interruptible_sleep(seconds: float) -> None:
152180
time.sleep(max(0, seconds))
153181

154182

183+
def elapsed_ms(started_at: float) -> int:
184+
"""计算从起点到当前的毫秒耗时。
185+
186+
Args:
187+
started_at: `time.monotonic()` 记录的起点。
188+
189+
Returns:
190+
非负毫秒数。
191+
"""
192+
return max(0, round((time.monotonic() - started_at) * 1000))
193+
194+
155195
def log(message: str) -> None:
156196
"""输出带本地时间戳的日志。
157197

ai/coding/window-warmer/window_warmer_lib/target.py

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77
import subprocess
88
import urllib.error
99
import urllib.request
10+
from collections.abc import Callable
1011
from pathlib import Path
1112
from typing import Any
1213

1314
from dotenv import dotenv_values
1415

1516
from .models import PlanConfig, TargetConfig
1617

18+
LogFn = Callable[[str], None]
19+
1720

1821
def read_env_file(path: Path | None) -> dict[str, str]:
1922
"""读取 dotenv 环境变量文件。
@@ -47,6 +50,29 @@ def read_api_key(config: TargetConfig) -> str | None:
4750
return read_env_file(config.env_file).get(config.api_key_env)
4851

4952

53+
def read_api_key_with_source(config: TargetConfig) -> tuple[str | None, str]:
54+
"""读取目标服务 API key 及其来源。
55+
56+
Args:
57+
config: 目标服务连接配置。
58+
59+
Returns:
60+
`(API key, 来源描述)`;未配置或未找到时 API key 为 None。
61+
"""
62+
if config.api_key_env is None:
63+
return None, "disabled"
64+
65+
env_value = os.getenv(config.api_key_env)
66+
if env_value:
67+
return env_value, f"env:{config.api_key_env}"
68+
69+
env_values = read_env_file(config.env_file)
70+
file_value = env_values.get(config.api_key_env)
71+
if file_value:
72+
return file_value, f"file:{format_path(config.env_file)}"
73+
return None, "missing"
74+
75+
5076
def is_container_running(container_name: str) -> tuple[bool, str]:
5177
"""检查 Docker 容器是否处于 running 状态。
5278
@@ -199,25 +225,65 @@ def call_litellm_completion(**kwargs: Any) -> None:
199225
completion(**kwargs)
200226

201227

202-
def ensure_target_ready(config: TargetConfig) -> tuple[bool, str, str | None]:
228+
def ensure_target_ready(config: TargetConfig, log_fn: LogFn | None = None) -> tuple[bool, str, str | None]:
203229
"""确认目标前置条件与 API 均可用。
204230
205231
Args:
206232
config: 目标服务连接配置。
233+
log_fn: 可选日志函数,用于输出检查链路。
207234
208235
Returns:
209236
`(是否可用, 诊断信息, API key)`。
210237
"""
211238
if config.container_name is not None:
239+
emit(log_fn, f"container check started container={config.container_name}")
212240
container_running, container_message = is_container_running(config.container_name)
213241
if not container_running:
242+
emit(log_fn, f"container check failed container={config.container_name} reason={container_message}")
214243
return False, f"容器 {config.container_name} 未就绪: {container_message}", None
244+
emit(log_fn, f"container check passed container={config.container_name}")
245+
else:
246+
emit(log_fn, "container check skipped")
215247

216-
api_key = read_api_key(config)
248+
api_key, api_key_source = read_api_key_with_source(config)
217249
if config.api_key_env is not None and api_key is None:
250+
emit(log_fn, f"api key check failed source={api_key_source} env={config.api_key_env}")
218251
return False, f"未找到 {config.api_key_env}", None
252+
emit(log_fn, f"api key check passed source={api_key_source}")
219253

254+
if config.health_path is not None:
255+
emit(log_fn, f"health check started url={join_url(config.base_url, config.health_path)}")
220256
api_ready, api_message = is_target_api_ready(config, api_key)
221257
if not api_ready:
258+
emit(log_fn, f"health check failed reason={api_message}")
222259
return False, f"{config.name} API 未就绪: {api_message}", api_key
223-
return True, f"{config.name} 已就绪", api_key
260+
emit(log_fn, f"health check passed result={api_message}")
261+
return True, f"{config.name} 已就绪 api_key_source={api_key_source}", api_key
262+
263+
264+
def emit(log_fn: LogFn | None, message: str) -> None:
265+
"""按需输出目标检查日志。
266+
267+
Args:
268+
log_fn: 可选日志函数。
269+
message: 日志消息。
270+
271+
Returns:
272+
无返回值。
273+
"""
274+
if log_fn is not None:
275+
log_fn(message)
276+
277+
278+
def format_path(path: Path | None) -> str:
279+
"""格式化日志中的路径值。
280+
281+
Args:
282+
path: 可选路径。
283+
284+
Returns:
285+
适合日志展示的路径文本。
286+
"""
287+
if path is None:
288+
return "none"
289+
return str(path)

0 commit comments

Comments
 (0)