diff --git a/CLAUDE.md b/CLAUDE.md
index 31929fd..e341739 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -61,6 +61,8 @@ src/clayde/
   telemetry.py         # OpenTelemetry tracing: init_tracer(), get_tracer(),
                        # FileSpanExporter (JSONL)
   orchestrator.py      # main() — single cycle, run_loop() — container entry point
+  disk.py              # check_disk_and_alert() — best-effort host disk guard,
+                       # ntfy alert (cooldown-rate-limited) when usage ≥ threshold
   prompts/
     work.j2            # Jinja2 template for the unified work prompt
   tasks/
@@ -118,6 +120,10 @@ Plain `KEY=VALUE` file (no shell quoting). All keys use `CLAYDE_` prefix and are
 | `CLAYDE_NTFY_BASE_URL` | ntfy base URL (override for self-host) |
 | `CLAYDE_NTFY_TIMEOUT_S` | ntfy POST timeout seconds (default 10) |
 | `CLAYDE_KB_PATH` | In-container KB path; Pebble per-request cwd (default `/home/clayde/knowledge_base`) |
+| `CLAYDE_DISK_ALERT_ENABLED` | Enable the per-tick disk-usage guard (default `true`) |
+| `CLAYDE_DISK_ALERT_THRESHOLD_PCT` | Usage % that triggers an ntfy alert (default `85`) |
+| `CLAYDE_DISK_ALERT_PATH` | Path whose partition is checked — same volume as host root (default `/data`) |
+| `CLAYDE_DISK_ALERT_COOLDOWN_S` | Min seconds between repeat alerts while over threshold (default `21600`) |
 
 Config is loaded via `get_settings()` (singleton). `GH_TOKEN` is exported at startup for the `gh` CLI.
 
diff --git a/src/clayde/config.py b/src/clayde/config.py
index d3eef8c..68c8bab 100644
--- a/src/clayde/config.py
+++ b/src/clayde/config.py
@@ -55,6 +55,12 @@ def effective_git_name(self) -> str:
     ntfy_base_url: str = "https://ntfy.sh"
     ntfy_timeout_s: int = 10
 
+    # Disk-usage self-check (alerts via ntfy when the host disk fills up)
+    disk_alert_enabled: bool = True
+    disk_alert_threshold_pct: int = 85
+    disk_alert_path: str = "/data"  # same partition as the host root volume
+    disk_alert_cooldown_s: int = 21600  # 6h — avoid re-alerting every cycle
+
     # Knowledge base (default cwd for Pebble runs)
     kb_path: str = "/home/clayde/knowledge_base"
 
diff --git a/src/clayde/disk.py b/src/clayde/disk.py
new file mode 100644
index 0000000..8f0f6ce
--- /dev/null
+++ b/src/clayde/disk.py
@@ -0,0 +1,146 @@
+"""Best-effort host disk-usage guard.
+
+Clayde runs in a container whose ``/data`` bind-mount lives on the host root
+partition, so ``shutil.disk_usage("/data")`` reflects how full the host disk
+is. When usage crosses a threshold we ntfy Max so the disk gets cleaned before
+it fills up (a full disk silently breaks clones, builds, and the agent loop).
+
+Best-effort: any error is logged, never raised — a missed alert must never
+take down the main loop. Re-alerts are rate-limited by a cooldown persisted in
+a tiny JSON file so the 5-minute tick loop doesn't spam the same warning.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import shutil
+import time
+from pathlib import Path
+
+from clayde.config import DATA_DIR, Settings
+from clayde.webhook.notify import send_ntfy
+
+log = logging.getLogger("clayde.disk")
+
+
+def _default_state_path() -> Path:
+    """Return the cooldown state file location inside ``DATA_DIR``."""
+    return DATA_DIR / "disk_alert_state.json"
+
+
+def _read_last_alert_ts(path: Path) -> float:
+    """Return the persisted last-alert timestamp, or ``0.0`` if unusable.
+
+    A missing, unreadable, or malformed state file means "no previous alert".
+    The top-level JSON value is checked explicitly: valid JSON that is not an
+    object (``[]``, ``null``, ``3``) has no ``.get`` and would otherwise escape
+    as an ``AttributeError``.
+    """
+    try:
+        state = json.loads(path.read_text())
+    except (OSError, ValueError):
+        return 0.0
+    if not isinstance(state, dict):
+        return 0.0
+    try:
+        return float(state.get("last_alert_ts", 0.0))
+    except (TypeError, ValueError, OverflowError):
+        return 0.0
+
+
+def _write_last_alert_ts(path: Path, ts: float) -> None:
+    """Persist ``ts`` as the last-alert timestamp; failures are only logged."""
+    try:
+        path.write_text(json.dumps({"last_alert_ts": ts}))
+    except OSError as exc:
+        log.warning("could not persist disk alert state: %s", exc)
+
+
+def _send(settings: Settings, *, usage_pct: int, free_gb: float) -> None:
+    """Emit a disk-full warning via the shared ntfy helper.
+
+    Reuses ``webhook.notify.send_ntfy`` (itself best-effort — errors logged,
+    never raised). ``success=False`` gives it warning styling (priority 5,
+    rotating_light tag). Called from the sync tick loop, so the async helper
+    is driven via ``asyncio.run`` — safe here because ``main()`` never runs
+    inside an active event loop (in Pebble mode it runs via ``to_thread``).
+    """
+    title = f"clayde.net disk {usage_pct}% full"
+    body = (
+        f"Only {free_gb:.1f} GB free on {settings.disk_alert_path}. "
+        "Run disk cleanup (KB: vm-disk-cleanup)."
+    )
+    asyncio.run(
+        send_ntfy(
+            title=title,
+            body=body,
+            success=False,
+            base_url=settings.ntfy_base_url,
+            topic=settings.ntfy_topic,
+            timeout_s=settings.ntfy_timeout_s,
+        )
+    )
+
+
+def check_disk_and_alert(
+    settings: Settings,
+    *,
+    state_path: Path | None = None,
+    now: float | None = None,
+) -> int | None:
+    """Check disk usage and ntfy when at/over the threshold (rate-limited).
+
+    Args:
+        settings: Supplies the ``disk_alert_*`` values; ``_send`` reads ``ntfy_*``.
+        state_path: Cooldown state file; defaults to ``_default_state_path()``.
+        now: Current epoch seconds; defaults to ``time.time()``.
+
+    Returns the usage percentage, or ``None`` when disabled or the check
+    itself failed. Never raises.
+    """
+    if not settings.disk_alert_enabled:
+        return None
+    try:
+        total, used, free = shutil.disk_usage(settings.disk_alert_path)
+    except OSError as exc:
+        log.warning(
+            "disk usage check failed for %s: %s", settings.disk_alert_path, exc
+        )
+        return None
+    if total <= 0:
+        # A zero-sized filesystem would make the percentage below divide by zero.
+        log.warning(
+            "disk usage check failed for %s: reported total size is %d",
+            settings.disk_alert_path,
+            total,
+        )
+        return None
+
+    usage_pct = round(used / total * 100)
+    if usage_pct < settings.disk_alert_threshold_pct:
+        return usage_pct
+
+    now = time.time() if now is None else now
+    sf = state_path if state_path is not None else _default_state_path()
+    last = _read_last_alert_ts(sf)
+    # ``0 <=`` keeps a timestamp from the future (clock stepped back, bogus
+    # state) from suppressing alerts until the clock catches up with it.
+    if last and 0 <= now - last < settings.disk_alert_cooldown_s:
+        log.info("disk %d%% over threshold but within alert cooldown", usage_pct)
+        return usage_pct
+
+    log.warning(
+        "disk %d%% >= threshold %d%% — sending alert",
+        usage_pct,
+        settings.disk_alert_threshold_pct,
+    )
+    try:
+        _send(settings, usage_pct=usage_pct, free_gb=free / 1e9)
+    except Exception:
+        # Honour "never raises"; leave the cooldown unset so the next tick retries.
+        log.exception("disk alert could not be sent")
+        return usage_pct
+    _write_last_alert_ts(sf, now)
+    return usage_pct
diff --git a/src/clayde/orchestrator.py b/src/clayde/orchestrator.py
index 74ef059..dc08048 100644
--- a/src/clayde/orchestrator.py
+++ b/src/clayde/orchestrator.py
@@ -29,6 +29,7 @@
 
 from clayde.claude import InvocationTimeoutError, UsageLimitError, is_claude_available
 from clayde.config import get_github_client, get_settings, setup_logging
+from clayde.disk import check_disk_and_alert
 from clayde.webhook import JobQueue, create_app, worker_loop
 from clayde.github import (
     fetch_issue,
@@ -324,6 +325,13 @@ def main():
 
     log.info("=== Starting Clayde Tick [%s] ===", datetime.now().strftime("%Y-%m-%d %H:%M"))
 
+    # Disk-usage guard runs before any work so it still fires when Claude is
+    # rate-limited — a full disk would break everything else regardless.
+    try:
+        check_disk_and_alert(settings)
+    except Exception:
+        log.exception("disk usage check failed")
+
     os.environ["GH_TOKEN"] = settings.github_token
     _configure_global_git_identity(settings)
 
diff --git a/tests/test_disk.py b/tests/test_disk.py
new file mode 100644
index 0000000..823e72d
--- /dev/null
+++ b/tests/test_disk.py
@@ -0,0 +1,140 @@
+"""Tests for clayde.disk — best-effort disk-usage alert."""
+
+from pathlib import Path
+
+import clayde.disk
+from clayde.config import Settings
+from clayde.disk import check_disk_and_alert
+
+
+def _settings(**over) -> Settings:
+    base = dict(
+        disk_alert_enabled=True,
+        disk_alert_threshold_pct=85,
+        disk_alert_path="/data",
+        disk_alert_cooldown_s=21600,
+    )
+    base.update(over)
+    return Settings(_env_file=None, **base)
+
+
+def _patch_usage(monkeypatch, *, total, used):
+    free = total - used
+    monkeypatch.setattr(
+        clayde.disk.shutil, "disk_usage", lambda _p: (total, used, free)
+    )
+
+
+def _capture_sends(monkeypatch) -> list:
+    sent = []
+    monkeypatch.setattr(
+        clayde.disk, "_send", lambda settings, **kw: sent.append(kw)
+    )
+    return sent
+
+
+class TestCheckDiskAndAlert:
+    def test_under_threshold_no_alert(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=50)
+        sent = _capture_sends(monkeypatch)
+        pct = check_disk_and_alert(_settings(), state_path=tmp_path / "s.json")
+        assert pct == 50
+        assert sent == []
+
+    def test_over_threshold_alerts_once(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sent = _capture_sends(monkeypatch)
+        pct = check_disk_and_alert(
+            _settings(), state_path=tmp_path / "s.json", now=1000.0
+        )
+        assert pct == 91
+        assert len(sent) == 1
+        assert sent[0]["usage_pct"] == 91
+
+    def test_within_cooldown_suppressed(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sent = _capture_sends(monkeypatch)
+        sf = tmp_path / "s.json"
+        check_disk_and_alert(_settings(), state_path=sf, now=1000.0)
+        check_disk_and_alert(_settings(), state_path=sf, now=1000.0 + 3600)
+        assert len(sent) == 1  # second within 6h cooldown -> suppressed
+
+    def test_after_cooldown_realerts(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sent = _capture_sends(monkeypatch)
+        sf = tmp_path / "s.json"
+        check_disk_and_alert(_settings(), state_path=sf, now=1000.0)
+        check_disk_and_alert(_settings(), state_path=sf, now=1000.0 + 21600 + 1)
+        assert len(sent) == 2
+
+    def test_disabled_skips(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=99)
+        sent = _capture_sends(monkeypatch)
+        pct = check_disk_and_alert(
+            _settings(disk_alert_enabled=False), state_path=tmp_path / "s.json"
+        )
+        assert pct is None
+        assert sent == []
+
+    def test_send_uses_shared_ntfy_helper(self, monkeypatch):
+        calls = []
+
+        async def fake_send_ntfy(**kw):
+            calls.append(kw)
+
+        monkeypatch.setattr(clayde.disk, "send_ntfy", fake_send_ntfy)
+        clayde.disk._send(_settings(), usage_pct=91, free_gb=4.2)
+        assert len(calls) == 1
+        assert calls[0]["success"] is False
+        assert calls[0]["topic"] == _settings().ntfy_topic
+        assert "91%" in calls[0]["title"]
+
+    def test_usage_error_swallowed(self, monkeypatch, tmp_path):
+        def boom(_p):
+            raise OSError("no such path")
+
+        monkeypatch.setattr(clayde.disk.shutil, "disk_usage", boom)
+        sent = _capture_sends(monkeypatch)
+        pct = check_disk_and_alert(_settings(), state_path=tmp_path / "s.json")
+        assert pct is None
+        assert sent == []
+
+    def test_non_object_state_file_is_ignored(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sent = _capture_sends(monkeypatch)
+        sf = tmp_path / "s.json"
+        sf.write_text("[]")  # valid JSON, but not the expected object
+        pct = check_disk_and_alert(_settings(), state_path=sf, now=1000.0)
+        assert pct == 91
+        assert len(sent) == 1
+
+    def test_zero_total_returns_none(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=0, used=0)
+        sent = _capture_sends(monkeypatch)
+        pct = check_disk_and_alert(_settings(), state_path=tmp_path / "s.json")
+        assert pct is None
+        assert sent == []
+
+    def test_future_timestamp_does_not_suppress(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sent = _capture_sends(monkeypatch)
+        sf = tmp_path / "s.json"
+        sf.write_text('{"last_alert_ts": 5000.0}')  # later than ``now`` below
+        check_disk_and_alert(_settings(), state_path=sf, now=1000.0)
+        assert len(sent) == 1
+
+    def test_send_failure_does_not_start_cooldown(self, monkeypatch, tmp_path):
+        _patch_usage(monkeypatch, total=100, used=91)
+        sf = tmp_path / "s.json"
+
+        def boom(settings, **kw):
+            raise RuntimeError("ntfy unreachable")
+
+        monkeypatch.setattr(clayde.disk, "_send", boom)
+        pct = check_disk_and_alert(_settings(), state_path=sf, now=1000.0)
+        assert pct == 91
+        assert not sf.exists()  # nothing persisted, so the next tick retries
+
+        sent = _capture_sends(monkeypatch)
+        check_disk_and_alert(_settings(), state_path=sf, now=1060.0)
+        assert len(sent) == 1