Skip to content

Commit a7bc97a

Browse files
authored
Merge pull request #18 from rxf-sys/claude/setup-workflow-process-knX5t
v2 redesign: Settings tab, rich service stats, section rebuilds, drill-down drawers
2 parents 08983d4 + 1e6660d commit a7bc97a

46 files changed

Lines changed: 4860 additions & 575 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,12 @@ Configuration → Access Control → **Add** user `admin-dashboard@pbs`.
130130
Datastore → `<your-datastore>` → Permissions → **Add** for the user with
131131
role `DatastoreAudit`.
132132

133+
If you want the "Verify" button next to each snapshot in the dashboard
134+
to actually work, the same user additionally needs `DatastoreReader` or
135+
a custom role that includes `Datastore.Verify` on the datastore.
136+
Read-only `DatastoreAudit` will return 403 on `POST /admin/datastore/<store>/verify`
137+
— the dashboard surfaces that as a toast.
138+
133139
Copy into `PBS_TOKEN_ID` / `PBS_TOKEN_SECRET`.
134140

135141
#### Cloudflare API
@@ -227,6 +233,14 @@ so you can iterate without Cloudflare in the loop.
227233
as `ext=false`. The internal LAN probes (`probe_targets`) keep TLS
228234
verification off because home-lab services typically use self-signed
229235
or private-CA certs.
236+
- **Probe history**: the backend persists each probe sample in a SQLite
237+
file (default `/data/rxf-admin.db`, configurable via
238+
`STORAGE_DB_PATH`). Retention defaults to 7 days. The service drawer
239+
shows the resulting uptime % and a 60-bucket history strip. To run
240+
the dashboard *without* persistent history, set `STORAGE_DB_PATH=`
241+
in the env — the UI falls back to "history disabled" gracefully.
242+
When deploying via docker-compose, mount a volume on `/data` so the
243+
history survives container restarts.
230244

231245
## Tech-stack rationale
232246

backend/app/clients/cloudflare.py

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from datetime import datetime, timezone
3+
from datetime import datetime, timedelta, timezone
44

55
import httpx
66
import structlog
@@ -191,6 +191,76 @@ async def fetch_certs(settings: Settings) -> list[CertInfo]:
191191
return sorted(dedup.values(), key=lambda c: c.days_left)
192192

193193

194+
def _access_sessions_unreachable(error: str) -> dict:
    """Build the ``reachable=false`` payload shared by every failure path."""
    return {
        "reachable": False,
        "error": error,
        "last_login_iso": None,
        "sessions_24h": 0,
        "items": [],
    }


async def fetch_access_sessions(settings: Settings, hours: int = 24, limit: int = 100) -> dict:
    """Pull recent Cloudflare Access login events.

    Uses the audit-log API at ``/accounts/{id}/access/logs/access_requests``.
    The CF token needs the "Access: Apps and Policies: Read" permission
    (or equivalent audit-log scope) — without it the call returns 403 and
    we surface ``reachable=false`` with the error so the UI can explain.

    Args:
        settings: Must provide ``cf_api_token`` and ``cf_account_id``; if
            either is empty no request is made.
        hours: Size of the look-back window; sent to the API as ``since``.
        limit: Maximum number of events requested and kept. Negative values
            are treated as 0.

    Returns:
        ``{reachable, error, last_login_iso, sessions_24h, items: [...]}``.
        Each item has ``email``, ``app_uid``, ``allowed``, ``created_at``,
        ``ip``, ``country``. ``sessions_24h`` is simply ``len(items)`` — the
        key name is fixed even when ``hours`` is not 24. ``last_login_iso``
        is the ``created_at`` of the first returned item (assumes the API
        returns newest first — TODO confirm).
    """
    if not (settings.cf_api_token and settings.cf_account_id):
        return _access_sessions_unreachable(
            "CF_API_TOKEN oder CF_ACCOUNT_ID nicht konfiguriert"
        )

    # A negative limit would turn ``raw[:limit]`` below into "drop the last
    # N items" instead of "keep at most N".
    limit = max(0, limit)

    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    since_iso = since.isoformat().replace("+00:00", "Z")
    path = (
        f"/accounts/{settings.cf_account_id}/access/logs/access_requests"
        f"?since={since_iso}&limit={limit}"
    )
    async with httpx.AsyncClient(timeout=8.0) as client:
        try:
            body = await _get_raw(client, settings, path)
        except httpx.HTTPError as e:
            log.info(
                "cloudflare.access_sessions_failed",
                account=settings.cf_account_id,
                error=str(e),
                error_type=type(e).__name__,
            )
            return _access_sessions_unreachable(
                f"Access-Audit-Log nicht abrufbar — Token-Scope prüfen ({type(e).__name__})"
            )

    # Tolerate a body that is not a JSON object the same way a non-list
    # ``result`` is tolerated: report zero sessions rather than crash.
    raw = body.get("result") if isinstance(body, dict) else None
    items: list[dict] = []
    if isinstance(raw, list):
        items = [
            {
                "email": r.get("user_email"),
                "app_uid": r.get("app_uid"),
                "allowed": bool(r.get("allowed", True)),
                "created_at": r.get("created_at"),
                "ip": r.get("ip_address"),
                "country": r.get("country"),
            }
            for r in raw[:limit]
            if isinstance(r, dict)
        ]
    last = items[0]["created_at"] if items else None
    return {
        "reachable": True,
        "error": None,
        "last_login_iso": last,
        "sessions_24h": len(items),
        "items": items,
    }
262+
263+
194264
async def fetch_dns_consistency(settings: Settings) -> list[DNSRecordCheck]:
195265
if not (settings.cf_api_token and settings.cf_zone_id and settings.cf_tunnel_id):
196266
return []

backend/app/clients/pbs.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,49 @@ def _verify_status(verification: dict | None) -> str:
4848
return "—"
4949

5050

51+
async def trigger_verify(
    settings: Settings, backup_type: str, backup_id: str, backup_time: int
) -> str | None:
    """Kick off a verification job for a single PBS snapshot.

    The token needs the ``Datastore.Verify`` privilege on the datastore —
    ``DatastoreAudit`` alone (the default for read-only dashboards) is not
    sufficient.

    Args:
        settings: Supplies the PBS token, datastore name and TLS-verify flag.
        backup_type: Sent as ``backup-type`` in the request body.
        backup_id: Sent as ``backup-id`` in the request body.
        backup_time: Sent as ``backup-time`` in the request body.

    Returns:
        The resulting UPID on success. ``None`` when no token is configured,
        when the request fails (any ``httpx.HTTPError``, including non-2xx
        responses), or when the response does not carry a ``UPID:`` string
        in its ``data`` field.
    """
    if not (settings.pbs_token_id and settings.pbs_token_secret):
        return None
    payload = {
        "backup-type": backup_type,
        "backup-id": backup_id,
        "backup-time": backup_time,
    }
    async with httpx.AsyncClient(verify=settings.pbs_verify_tls, timeout=10.0) as client:
        try:
            r = await client.post(
                f"{_base_url(settings)}/admin/datastore/{settings.pbs_datastore}/verify",
                headers={**_auth_header(settings), "Content-Type": "application/json"},
                json=payload,
            )
            r.raise_for_status()
        except httpx.HTTPError as e:
            log.warning(
                "pbs.verify_trigger_failed",
                backup_type=backup_type,
                backup_id=backup_id,
                backup_time=backup_time,
                error=str(e),
                error_type=type(e).__name__,
            )
            return None
    try:
        body = r.json()
    except ValueError:
        return None
    # A 2xx response whose JSON is not an object (list, string, null) must
    # yield ``None`` like any other unusable body, not an AttributeError.
    data = body.get("data") if isinstance(body, dict) else None
    if isinstance(data, str) and data.startswith("UPID:"):
        return data
    return None
92+
93+
5194
async def fetch_backup_summary(settings: Settings) -> BackupSummary:
5295
if not (settings.pbs_token_id and settings.pbs_token_secret):
5396
return BackupSummary(
@@ -96,12 +139,17 @@ async def fetch_backup_summary(settings: Settings) -> BackupSummary:
96139
when_iso = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
97140
verify = _verify_status(snap.get("verification"))
98141
size = int(snap.get("size", 0))
99-
target = f"{snap.get('backup-type', '?')}/{snap.get('backup-id', '?')}"
142+
b_type = str(snap.get("backup-type", "?"))
143+
b_id = str(snap.get("backup-id", "?"))
144+
target = f"{b_type}/{b_id}"
100145
status = "err" if verify == "failed" else ("warn" if verify == "pending" else "ok")
101146
jobs.append(
102147
BackupSnapshot(
103148
id=f"{target}@{ts}",
104149
target=target,
150+
backup_type=b_type,
151+
backup_id=b_id,
152+
backup_time=ts,
105153
status=status, # type: ignore[arg-type]
106154
verify=verify, # type: ignore[arg-type]
107155
size_b=size,

backend/app/clients/probes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import httpx
77

8+
from .. import storage
89
from ..config import Settings
910
from ..models import ServiceStatus
1011

@@ -85,4 +86,13 @@ async def run(svc: dict[str, str]) -> ServiceStatus:
8586
)
8687

8788
results = await asyncio.gather(*(run(s) for s in SERVICES))
89+
90+
# Persist a sample per service for the uptime view. Best-effort; if
91+
# storage is disabled or the write fails it's a no-op (see storage.py).
92+
await storage.record_probes([(r.id, r.status, int(r.ms)) for r in results])
93+
# Track incident transitions so we can answer "last_incident_iso" across
94+
# restarts. One row per state-change; serial calls within a single
95+
# incident just bump the worst_status if the new probe is more severe.
96+
for r in results:
97+
await storage.update_service_incident(r.id, r.status)
8898
return list(results)

backend/app/clients/proxmox.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,43 @@ async def fetch_task_log(settings: Settings, upid: str, limit: int = 200) -> lis
259259
return out
260260

261261

262+
def _vmid_journal_pattern(vmid: int):
    """Compile the regex that decides whether a journal line mentions ``vmid``.

    Returns a compiled ``re.Pattern`` matching ``lxc-<vmid>``, ``CT <vmid>`` /
    ``CT<vmid>``, ``@<vmid>.service`` and the bare id as a standalone token.
    """
    import re

    vid = re.escape(str(vmid))
    # The bare-id branch uses lookarounds instead of a plain ``\b`` so the id
    # is not matched inside dotted numbers: an IPv4 octet (``192.168.1.101``)
    # or a decimal (``101.5``) would otherwise count as a hit, because ``.``
    # is a word boundary. A trailing sentence period (``VM 101.``) still
    # matches, since only "dot followed by a digit" is excluded.
    return re.compile(
        rf"(?:\blxc-{vid}\b"
        rf"|\bCT\s*{vid}\b"
        rf"|@{vid}\.service"
        rf"|(?<!\w)(?<!\d\.){vid}(?!\w)(?!\.\d))"
    )


async def fetch_host_journal_for_vmid(
    settings: Settings, vmid: int, lastentries: int = 500
) -> list[str]:
    """Return host-journal lines that mention this VMID.

    Proxmox does not expose an API to read inside a container — the
    Integration API only offers ``/journal`` for the host. We fetch the last
    ``lastentries`` entries and keep the lines that reference the VMID, which
    catches the usual ``lxc-<vmid>``, ``pve-container@<vmid>.service``, and
    bare-id references that systemd / pveproxy / pve-firewall emit around
    lifecycle events. This is *not* a replacement for ``journalctl`` inside
    the guest, but it surfaces host-side events for that container without
    requiring SSH access.

    Returns an empty list when the request fails (``httpx.HTTPError``) or the
    response payload is not a list; non-string entries are ignored.
    """
    async with httpx.AsyncClient(verify=settings.proxmox_verify_tls, timeout=10.0) as client:
        try:
            data = await _get(
                client,
                settings,
                f"/nodes/{settings.proxmox_node}/journal?lastentries={lastentries}",
            )
        except httpx.HTTPError as e:
            log.warning("proxmox.journal_failed", vmid=vmid, error=str(e))
            return []

    if not isinstance(data, list):
        return []

    pattern = _vmid_journal_pattern(vmid)
    return [line for line in data if isinstance(line, str) and pattern.search(line)]
297+
298+
262299
async def fetch_guest_tasks(settings: Settings, vmid: int, limit: int = 10) -> list[dict]:
263300
"""Recent Proxmox cluster tasks scoped to a single VMID."""
264301
async with httpx.AsyncClient(verify=settings.proxmox_verify_tls, timeout=8.0) as client:

backend/app/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,21 @@ class Settings(BaseSettings):
9090
# ---- IP Geolocation (for ISP name when UniFi doesn't expose it) ----
9191
geoip_enabled: bool = True
9292

93+
# ---- Probe history (SQLite) ----
94+
# Where to keep persisted service-probe history. Set to "" to disable
95+
# storage entirely; the dashboard then falls back to in-memory only.
96+
storage_db_path: str = "/data/rxf-admin.db"
97+
# How long to keep individual probe samples (days). Older rows are dropped
98+
# by a periodic cleanup task in the lifespan.
99+
history_retention_days: int = 7
100+
# Cleanup loop tick (seconds).
101+
history_cleanup_interval_s: int = 3600
102+
# Metrics sampling loop tick (seconds). Pulls guest CPU/RAM from Proxmox
103+
# and WAN throughput from UniFi, writing one row per guest + one row for
104+
# network. 60 s is a sensible default; lower values trade DB churn for
105+
# finer-grained charts. Set to 0 to disable the loop.
106+
metrics_sample_interval_s: int = 60
107+
93108
# ---- Notifications ----
94109
# Discord/Slack-compatible incoming webhook URL. Empty = disabled.
95110
notify_webhook_url: str = ""

backend/app/main.py

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,21 @@
99
from fastapi import Depends, FastAPI
1010
from fastapi.middleware.cors import CORSMiddleware
1111

12+
from . import storage
1213
from .auth import verify_cf_access
13-
from .clients import cloudflare, pbs, probes
14+
from .clients import cloudflare, pbs, probes, proxmox, unifi
1415
from .config import get_settings
1516
from .notify import NotificationCenter, run_notification_loop
16-
from .routers import audit, backups, certs, network, services, system, tunnel
17+
from .routers import (
18+
audit,
19+
backups,
20+
certs,
21+
cloudflare as cloudflare_router,
22+
network,
23+
services,
24+
system,
25+
tunnel,
26+
)
1727

1828
_settings = get_settings()
1929
logging.basicConfig(
@@ -70,12 +80,76 @@ async def _gather_notify_snapshot() -> dict:
7080
}
7181

7282

83+
async def _history_cleanup_loop() -> None:
    """Periodically drop samples older than the retention window.

    Sleeps for ``history_cleanup_interval_s`` (at least one second), then
    calls ``storage.cleanup_old`` with ``history_retention_days``. Runs until
    cancelled; any other exception is logged and the loop continues.
    """
    while True:
        try:
            # Clamp the tick: with a zero or negative interval
            # ``asyncio.sleep`` returns immediately and the loop would spin,
            # issuing cleanup calls back-to-back.
            await asyncio.sleep(max(1, _settings.history_cleanup_interval_s))
            await storage.cleanup_old(_settings.history_retention_days)
        except asyncio.CancelledError:
            # Shutdown path — propagate so the awaiting lifespan sees it.
            raise
        except Exception as e:  # noqa: BLE001 - never let the loop die
            structlog.get_logger().error(
                "history.cleanup_error", error=str(e), error_type=type(e).__name__
            )
95+
96+
97+
async def _metrics_sample_loop() -> None:
    """Sample guest CPU/RAM and WAN throughput on a fixed tick and store them.

    The persisted rows let the dashboard draw 24h CPU/RAM-per-guest and 1h
    WAN-throughput charts without relying on history endpoints of the
    upstreams. Each tick fetches the Proxmox guests and the UniFi network
    snapshot concurrently; a side that failed is logged and skipped for that
    tick instead of ending the loop. The tick is never shorter than 15 s.
    """
    tick_s = max(15, _settings.metrics_sample_interval_s)
    metrics_log = structlog.get_logger("metrics")
    while True:
        try:
            await asyncio.sleep(tick_s)

            # return_exceptions=True: a failing upstream comes back as an
            # exception object in its slot rather than raising here.
            guests, net = await asyncio.gather(
                asyncio.create_task(proxmox.fetch_guests(_settings)),
                asyncio.create_task(unifi.fetch_network_snapshot(_settings)),
                return_exceptions=True,
            )

            # --- guests: one row per running, non-host guest ---
            if not isinstance(guests, list):
                metrics_log.info("metrics.guests_skip", error=str(guests))
            else:
                samples = []
                for guest in guests:
                    if not guest.running or guest.type == "HOST":
                        continue
                    samples.append(
                        (
                            guest.id,
                            float(guest.cpu_pct),
                            int(guest.ram_used_b),
                            int(guest.ram_total_b),
                        )
                    )
                await storage.record_guest_metrics(samples)

            # --- network: a single row, only when the snapshot is reachable ---
            if isinstance(net, BaseException):
                metrics_log.info("metrics.network_skip", error=str(net))
            elif net.reachable:
                down_mbit = float(net.throughput_down_mbit)
                up_mbit = float(net.throughput_up_mbit)
                await storage.record_network_metrics(down_mbit, up_mbit)
        except asyncio.CancelledError:
            raise
        except Exception as e:  # noqa: BLE001 - never let the loop die
            metrics_log.error(
                "metrics.sample_error", error=str(e), error_type=type(e).__name__
            )
136+
137+
73138
@asynccontextmanager
74139
async def lifespan(app: FastAPI):
75-
task: asyncio.Task | None = None
140+
notify_task: asyncio.Task | None = None
141+
cleanup_task: asyncio.Task | None = None
142+
metrics_task: asyncio.Task | None = None
143+
144+
await storage.ensure_schema(_settings)
145+
if storage.is_enabled():
146+
cleanup_task = asyncio.create_task(_history_cleanup_loop())
147+
if _settings.metrics_sample_interval_s > 0:
148+
metrics_task = asyncio.create_task(_metrics_sample_loop())
149+
76150
if _settings.notify_webhook_url:
77151
center = NotificationCenter(settings=_settings)
78-
task = asyncio.create_task(
152+
notify_task = asyncio.create_task(
79153
run_notification_loop(
80154
center, _gather_notify_snapshot, interval_s=_settings.notify_interval_s
81155
)
@@ -86,7 +160,9 @@ async def lifespan(app: FastAPI):
86160
try:
87161
yield
88162
finally:
89-
if task:
163+
for task in (notify_task, cleanup_task, metrics_task):
164+
if task is None:
165+
continue
90166
task.cancel()
91167
try:
92168
await task
@@ -135,3 +211,5 @@ async def me(claims: dict = Depends(verify_cf_access)) -> dict:
135211
app.include_router(network.router)
136212
app.include_router(certs.router)
137213
app.include_router(audit.router)
214+
app.include_router(audit.events_router)
215+
app.include_router(cloudflare_router.router)

0 commit comments

Comments
 (0)