Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ src/clayde/
telemetry.py # OpenTelemetry tracing: init_tracer(), get_tracer(),
# FileSpanExporter (JSONL)
orchestrator.py # main() — single cycle, run_loop() — container entry point
disk.py # check_disk_and_alert() — best-effort host disk guard,
# ntfy alert (cooldown-rate-limited) when usage ≥ threshold
prompts/
work.j2 # Jinja2 template for the unified work prompt
tasks/
Expand Down Expand Up @@ -118,6 +120,10 @@ Plain `KEY=VALUE` file (no shell quoting). All keys use `CLAYDE_` prefix and are
| `CLAYDE_NTFY_BASE_URL` | ntfy base URL (override for self-host) |
| `CLAYDE_NTFY_TIMEOUT_S` | ntfy POST timeout seconds (default 10) |
| `CLAYDE_KB_PATH` | In-container KB path; Pebble per-request cwd (default `/home/clayde/knowledge_base`) |
| `CLAYDE_DISK_ALERT_ENABLED` | Enable the per-tick disk-usage guard (default `true`) |
| `CLAYDE_DISK_ALERT_THRESHOLD_PCT` | Usage % that triggers an ntfy alert (default `85`) |
| `CLAYDE_DISK_ALERT_PATH` | Path whose partition is checked; it should live on the same volume as the host root (default `/data`) |
| `CLAYDE_DISK_ALERT_COOLDOWN_S` | Min seconds between repeat alerts while over threshold (default `21600`) |

Config is loaded via `get_settings()` (singleton). `GH_TOKEN` is exported at startup for the `gh` CLI.

Expand Down
6 changes: 6 additions & 0 deletions src/clayde/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ def effective_git_name(self) -> str:
ntfy_base_url: str = "https://ntfy.sh"
ntfy_timeout_s: int = 10

# Disk-usage self-check (alerts via ntfy when the host disk fills up)
disk_alert_enabled: bool = True
disk_alert_threshold_pct: int = 85
disk_alert_path: str = "/data" # same partition as the host root volume
disk_alert_cooldown_s: int = 21600 # 6h — avoid re-alerting every cycle

# Knowledge base (default cwd for Pebble runs)
kb_path: str = "/home/clayde/knowledge_base"

Expand Down
111 changes: 111 additions & 0 deletions src/clayde/disk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Best-effort host disk-usage guard.

Clayde runs in a container whose ``/data`` bind-mount lives on the host root
partition, so ``shutil.disk_usage("/data")`` reflects how full the host disk
is. When usage reaches the configured threshold we send Max an ntfy alert so the
disk gets cleaned before it fills up (a full disk silently breaks clones, builds,
and the agent loop).

Best-effort: any error is logged, never raised — a missed alert must never
take down the main loop. Re-alerts are rate-limited by a cooldown persisted in
a tiny JSON file so the 5-minute tick loop doesn't spam the same warning.
"""

from __future__ import annotations

import asyncio
import json
import logging
import shutil
import time
from pathlib import Path

from clayde.config import DATA_DIR, Settings
from clayde.webhook.notify import send_ntfy

log = logging.getLogger("clayde.disk")


def _default_state_path() -> Path:
    """Return the default location of the alert-cooldown state file.

    The file is named ``disk_alert_state.json`` and sits directly under
    ``DATA_DIR``.
    """
    state_file_name = "disk_alert_state.json"
    return DATA_DIR.joinpath(state_file_name)


def _read_last_alert_ts(path: Path) -> float:
    """Return the timestamp of the last alert recorded in *path*.

    The state file is expected to hold a JSON object with a
    ``last_alert_ts`` key. Anything else — a missing or unreadable file,
    invalid JSON, a JSON value that is not an object (``[]``, ``null``, a
    bare number), or a non-numeric timestamp — yields ``0.0``, which callers
    treat as "no alert sent yet". Never raises for these cases.
    """
    try:
        state = json.loads(path.read_text())
    except (OSError, ValueError):
        # Missing/unreadable file, undecodable bytes, or malformed JSON.
        return 0.0

    # Valid JSON that is not an object has no ``.get``; treat it as empty
    # state instead of letting an AttributeError escape.
    if not isinstance(state, dict):
        return 0.0

    try:
        return float(state.get("last_alert_ts", 0.0))
    except (TypeError, ValueError):
        # e.g. ``null`` or a non-numeric string stored under the key.
        return 0.0


def _write_last_alert_ts(path: Path, ts: float) -> None:
    """Persist *ts* as the last-alert timestamp in the JSON file at *path*.

    The file content is a single JSON object with the key
    ``last_alert_ts``. An ``OSError`` while writing is logged as a warning
    and otherwise ignored.
    """
    payload = json.dumps({"last_alert_ts": ts})
    try:
        path.write_text(payload)
    except OSError as err:
        log.warning("could not persist disk alert state: %s", err)


def _send(settings: Settings, *, usage_pct: int, free_gb: float) -> None:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe there should be a helper method for sending via ntfy somewhere. Use it if there is one.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — now delegates to webhook.notify.send_ntfy instead of the inline httpx POST. Added a test asserting the helper is called (success=False, right topic).

"""Emit a disk-full warning via the shared ntfy helper.

Reuses ``webhook.notify.send_ntfy`` (itself best-effort — errors logged,
never raised). ``success=False`` gives it warning styling (priority 5,
rotating_light tag). Called from the sync tick loop, so the async helper
is driven via ``asyncio.run`` — safe here because ``main()`` never runs
inside an active event loop (in Pebble mode it runs via ``to_thread``).
"""
title = f"clayde.net disk {usage_pct}% full"
body = (
f"Only {free_gb:.1f} GB free on {settings.disk_alert_path}. "
"Run disk cleanup (KB: vm-disk-cleanup)."
)
asyncio.run(
send_ntfy(
title=title,
body=body,
success=False,
base_url=settings.ntfy_base_url,
topic=settings.ntfy_topic,
timeout_s=settings.ntfy_timeout_s,
)
)


def check_disk_and_alert(
    settings: Settings,
    *,
    state_path: Path | None = None,
    now: float | None = None,
) -> int | None:
    """Check disk usage and ntfy when at/over the threshold (rate-limited).

    Args:
        settings: Supplies ``disk_alert_enabled``, ``disk_alert_path``,
            ``disk_alert_threshold_pct`` and ``disk_alert_cooldown_s``.
        state_path: File holding the last-alert timestamp; defaults to
            ``_default_state_path()``.
        now: Current time in epoch seconds; defaults to ``time.time()``.

    Returns:
        The rounded usage percentage, or ``None`` when the check is disabled
        or the usage could not be determined (``shutil.disk_usage`` failed
        or reported a total size of zero).

    Never raises: a failure to read usage or to send the alert is logged.
    When sending fails the cooldown timestamp is not written, so the next
    call tries again.
    """
    if not settings.disk_alert_enabled:
        return None
    try:
        total, used, free = shutil.disk_usage(settings.disk_alert_path)
    except OSError as exc:
        log.warning(
            "disk usage check failed for %s: %s", settings.disk_alert_path, exc
        )
        return None

    # A zero total would make the percentage below divide by zero; treat it
    # as a failed check rather than raising.
    if total <= 0:
        log.warning(
            "disk usage check failed for %s: filesystem reports zero total size",
            settings.disk_alert_path,
        )
        return None

    usage_pct = round(used / total * 100)
    if usage_pct < settings.disk_alert_threshold_pct:
        return usage_pct

    now = time.time() if now is None else now
    sf = state_path if state_path is not None else _default_state_path()
    last = _read_last_alert_ts(sf)
    # ``last`` is 0.0 when no alert has been recorded, which skips the
    # cooldown check entirely.
    if last and now - last < settings.disk_alert_cooldown_s:
        log.info("disk %d%% over threshold but within alert cooldown", usage_pct)
        return usage_pct

    log.warning(
        "disk %d%% >= threshold %d%% — sending alert",
        usage_pct,
        settings.disk_alert_threshold_pct,
    )
    try:
        _send(settings, usage_pct=usage_pct, free_gb=free / 1e9)
    except Exception:
        # Best-effort boundary: keep the documented "never raises" contract
        # and leave the cooldown unstamped so the alert is retried.
        log.exception("disk alert could not be sent")
        return usage_pct
    _write_last_alert_ts(sf, now)
    return usage_pct
8 changes: 8 additions & 0 deletions src/clayde/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from clayde.claude import InvocationTimeoutError, UsageLimitError, is_claude_available
from clayde.config import get_github_client, get_settings, setup_logging
from clayde.disk import check_disk_and_alert
from clayde.webhook import JobQueue, create_app, worker_loop
from clayde.github import (
fetch_issue,
Expand Down Expand Up @@ -324,6 +325,13 @@ def main():

log.info("=== Starting Clayde Tick [%s] ===", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Disk-usage guard runs before any work so it still fires when Claude is
# rate-limited — a full disk would break everything else regardless.
try:
check_disk_and_alert(settings)
except Exception:
log.exception("disk usage check failed")

os.environ["GH_TOKEN"] = settings.github_token

_configure_global_git_identity(settings)
Expand Down
100 changes: 100 additions & 0 deletions tests/test_disk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Tests for clayde.disk — best-effort disk-usage alert."""

from pathlib import Path

import clayde.disk
from clayde.config import Settings
from clayde.disk import check_disk_and_alert


def _settings(**over) -> Settings:
    """Build a ``Settings`` for these tests without reading an env file.

    Starts from fixed disk-alert defaults (enabled, 85% threshold, ``/data``,
    21600 s cooldown); keyword arguments in *over* replace or extend them.
    """
    defaults = {
        "disk_alert_enabled": True,
        "disk_alert_threshold_pct": 85,
        "disk_alert_path": "/data",
        "disk_alert_cooldown_s": 21600,
    }
    merged = {**defaults, **over}
    return Settings(_env_file=None, **merged)


def _patch_usage(monkeypatch, *, total, used):
    """Stub ``disk_usage`` on the ``shutil`` module referenced by ``clayde.disk``.

    The stub ignores its path argument and returns
    ``(total, used, total - used)``.
    """
    figures = (total, used, total - used)

    def fake_disk_usage(_p):
        return figures

    monkeypatch.setattr(clayde.disk.shutil, "disk_usage", fake_disk_usage)


def _capture_sends(monkeypatch) -> list:
    """Replace ``clayde.disk._send`` with a recorder and return its log.

    Each call to the patched ``_send`` appends its keyword arguments (as a
    dict) to the returned list; the ``settings`` argument is discarded.
    """
    captured = []

    def record(settings, **kw):
        captured.append(kw)

    monkeypatch.setattr(clayde.disk, "_send", record)
    return captured


class TestCheckDiskAndAlert:
    """Tests for ``check_disk_and_alert`` and the ``_send`` helper."""

    def test_under_threshold_no_alert(self, monkeypatch, tmp_path):
        """Usage below the threshold is returned and nothing is sent."""
        _patch_usage(monkeypatch, total=100, used=50)
        alerts = _capture_sends(monkeypatch)
        state_file = tmp_path / "s.json"

        result = check_disk_and_alert(_settings(), state_path=state_file)

        assert result == 50
        assert alerts == []

    def test_over_threshold_alerts_once(self, monkeypatch, tmp_path):
        """Usage above the threshold sends exactly one alert with the percentage."""
        _patch_usage(monkeypatch, total=100, used=91)
        alerts = _capture_sends(monkeypatch)
        state_file = tmp_path / "s.json"

        result = check_disk_and_alert(
            _settings(), state_path=state_file, now=1000.0
        )

        assert result == 91
        assert len(alerts) == 1
        assert alerts[0]["usage_pct"] == 91

    def test_within_cooldown_suppressed(self, monkeypatch, tmp_path):
        """A second check one hour later stays inside the 6h cooldown."""
        _patch_usage(monkeypatch, total=100, used=91)
        alerts = _capture_sends(monkeypatch)
        state_file = tmp_path / "s.json"
        first_tick = 1000.0

        for tick in (first_tick, first_tick + 3600):
            check_disk_and_alert(_settings(), state_path=state_file, now=tick)

        assert len(alerts) == 1  # second within 6h cooldown -> suppressed

    def test_after_cooldown_realerts(self, monkeypatch, tmp_path):
        """Once the cooldown has elapsed, a second alert is sent."""
        _patch_usage(monkeypatch, total=100, used=91)
        alerts = _capture_sends(monkeypatch)
        state_file = tmp_path / "s.json"
        first_tick = 1000.0

        for tick in (first_tick, first_tick + 21600 + 1):
            check_disk_and_alert(_settings(), state_path=state_file, now=tick)

        assert len(alerts) == 2

    def test_disabled_skips(self, monkeypatch, tmp_path):
        """With the guard disabled the check returns ``None`` and sends nothing."""
        _patch_usage(monkeypatch, total=100, used=99)
        alerts = _capture_sends(monkeypatch)
        disabled = _settings(disk_alert_enabled=False)

        result = check_disk_and_alert(disabled, state_path=tmp_path / "s.json")

        assert result is None
        assert alerts == []

    def test_send_uses_shared_ntfy_helper(self, monkeypatch):
        """``_send`` calls ``send_ntfy`` once with ``success=False`` and the configured topic."""
        recorded = []

        async def fake_send_ntfy(**kw):
            recorded.append(kw)

        monkeypatch.setattr(clayde.disk, "send_ntfy", fake_send_ntfy)

        clayde.disk._send(_settings(), usage_pct=91, free_gb=4.2)

        assert len(recorded) == 1
        call = recorded[0]
        assert call["success"] is False
        assert call["topic"] == _settings().ntfy_topic
        assert "91%" in call["title"]

    def test_usage_error_swallowed(self, monkeypatch, tmp_path):
        """An ``OSError`` from ``disk_usage`` yields ``None`` and no alert."""

        def boom(_p):
            raise OSError("no such path")

        monkeypatch.setattr(clayde.disk.shutil, "disk_usage", boom)
        alerts = _capture_sends(monkeypatch)

        result = check_disk_and_alert(_settings(), state_path=tmp_path / "s.json")

        assert result is None
        assert alerts == []
Loading