Skip to content

Commit 89576b4

Browse files
widgetii and claude authored
install/restore: auto-fallback to host TFTP when pod PSRAM is too small (#95)
## Summary

- `defib install/restore --tftp-via=auto` (the default when `power=rack`) now does a pre-flight `GET /tftp` on the pod and falls back to host TFTP if `psram_largest_free_block < total_bytes + 256 KiB` headroom — instead of crashing partway through the staging POST with a 503 OOM.
- Strict `--tftp-via=pod` is unchanged — still errors on OOM, since the user explicitly opted out of the host path.
- New `RackController.psram_can_fit(total_bytes, headroom_bytes=256 KiB) -> (bool, stats)` — any transport error returns `(False, {})` so an unreachable pod is treated the same as "won't fit", and the CLI cleanly falls back rather than crashing the pre-check.

## Test plan

- [x] `uv run pytest tests/test_power_rack.py -x -v` — 29 pass, 4 new in `TestPsramCanFit`
- [x] `uv run pytest tests/ -x --ignore=tests/fuzz` — full suite clean
- [x] `uv run ruff check src/ tests/` — clean
- [x] `uv run mypy src/defib --ignore-missing-imports` — clean
- [x] Live against pod `10.216.128.69`:
  - 6 MB filler staged → `largest_free=2 MB` → `auto` falls back: *"Pod PSRAM has 2016 KB contiguous free, need 6535 KB ..."*
  - same condition with `--tftp-via=pod` → 503 OOM (strict mode preserved)
  - cleared pod (`DELETE /tftp`) → `auto` uses pod path normally

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Dmitry Ilyin <widgetii@users.noreply.github.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a5ae454 commit 89576b4

3 files changed

Lines changed: 129 additions & 0 deletions

File tree

src/defib/cli/app.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2174,6 +2174,26 @@ async def _cmd(cmd: str, timeout: float = 60.0, **kw: object) -> str:
21742174
rootfs_name: rootfs_data,
21752175
}
21762176

2177+
# --tftp-via=auto pre-flight: if the pod doesn't have enough
2178+
# contiguous PSRAM for the firmware, fall back to host TFTP.
2179+
# Surfaces "too-big rootfs" cleanly instead of OOMing the staging
2180+
# POST mid-way. --tftp-via=pod stays strict (error on OOM, no
2181+
# silent fallback).
2182+
if use_pod_tftp and tftp_via == "auto":
2183+
assert isinstance(power_controller, RackController)
2184+
total_bytes = sum(len(d) for d in tftp_files.values())
2185+
fits, pod_stats = await power_controller.psram_can_fit(total_bytes)
2186+
if not fits:
2187+
_raw = pod_stats.get("psram_largest_free_block", 0)
2188+
largest = int(_raw) if isinstance(_raw, (int, float)) else 0
2189+
if output == "human":
2190+
console.print(
2191+
f" [yellow]Pod PSRAM has {largest // 1024} KB contiguous free, "
2192+
f"need {total_bytes // 1024} KB for this install — falling back "
2193+
f"to host TFTP.[/yellow]"
2194+
)
2195+
use_pod_tftp = False
2196+
21772197
if not use_pod_tftp:
21782198
# Host TFTP needs a NIC + host_ip; pod path needs neither.
21792199
if not nic:
@@ -2893,6 +2913,24 @@ async def _send(cmd: str, timeout: float = 60.0) -> str:
28932913
await transport.close()
28942914
raise typer.Exit(1)
28952915

2916+
# Auto-fallback: if the pod's PSRAM can't fit the dump, drop to
2917+
# host TFTP rather than OOMing mid-stage. Explicit --tftp-via=pod
2918+
# stays strict.
2919+
if use_pod_tftp and tftp_via == "auto":
2920+
assert isinstance(power_controller, RackController)
2921+
total_bytes = sum(len(d) for _, d in partitions)
2922+
fits, pod_stats = await power_controller.psram_can_fit(total_bytes)
2923+
if not fits:
2924+
_raw = pod_stats.get("psram_largest_free_block", 0)
2925+
largest = int(_raw) if isinstance(_raw, (int, float)) else 0
2926+
if output == "human":
2927+
console.print(
2928+
f" [yellow]Pod PSRAM has {largest // 1024} KB contiguous free, "
2929+
f"need {total_bytes // 1024} KB for this dump — falling back "
2930+
f"to host TFTP.[/yellow]"
2931+
)
2932+
use_pod_tftp = False
2933+
28962934
if use_pod_tftp:
28972935
# Override host_ip + device_ip so they live on the pod's camera-
28982936
# side subnet — the host_ip auto-detect picks something on the

src/defib/power/rack.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,30 @@ async def tftp_list(self, timeout: float = 10.0) -> dict[str, object]:
138138
url = f"http://{self._host}:{self._port}/tftp"
139139
return await asyncio.to_thread(self._http_send_sync, "GET", url, None, timeout)
140140

141+
async def psram_can_fit(
    self,
    total_bytes: int,
    headroom_bytes: int = 256 * 1024,
) -> tuple[bool, dict[str, object]]:
    """Report whether the pod can stage ``total_bytes`` in contiguous PSRAM.

    Best-effort pre-flight check: calls ``tftp_list()`` (the pod's
    ``GET /tftp``), reads ``psram_largest_free_block`` from the reply and
    compares it against ``total_bytes + headroom_bytes``.

    Used by ``defib install --tftp-via=auto`` to pick between pod and
    host TFTP without making the user predict file sizes vs PSRAM.

    Args:
        total_bytes: Combined size of the payload to stage, in bytes.
        headroom_bytes: Extra slack that must remain on top of
            ``total_bytes``; an exactly-equal block does not fit unless
            this is 0.

    Returns:
        ``(fits, stats)``. ``stats`` is the pod's reply so the caller can
        report the numbers. ``(False, {})`` is returned when the query
        fails or the reply is not a mapping. A missing, non-numeric or
        non-finite ``psram_largest_free_block`` counts as 0 bytes free.
        In every such case the caller can treat the answer as "fall back
        to host TFTP" — this method never raises on a bad reply.
    """
    try:
        stats = await self.tftp_list()
    except Exception:
        # Deliberately broad: an unreachable or misbehaving pod must look
        # the same as "won't fit" instead of crashing the pre-check.
        return False, {}

    # A reply that is not a JSON object (e.g. ``null`` or a list) has no
    # stats to read; treat it like a failed query.
    if not isinstance(stats, dict):
        return False, {}

    raw = stats.get("psram_largest_free_block", 0)
    largest = 0
    # bool is an int subclass but is not a byte count.
    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
        try:
            largest = int(raw)
        except (ValueError, OverflowError):
            # int() rejects NaN / +-Infinity; a malformed size is reported
            # as "no room" so the caller falls back rather than crashes.
            largest = 0
    return largest >= total_bytes + headroom_bytes, stats
164+
141165
@staticmethod
142166
def _http_send_sync(
143167
method: str, url: str, body: bytes | None, timeout: float,

tests/test_power_rack.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,3 +506,70 @@ def http503(req: Any, timeout: float | None = None) -> None:
506506
ctrl = RackController(host="pod", port=8080)
507507
with pytest.raises(PowerControllerError, match="503"):
508508
await ctrl.tftp_put("rootfs", b"X" * 4096)
509+
510+
511+
class TestPsramCanFit:
    """Pre-flight PSRAM check (``RackController.psram_can_fit``) that lets
    ``--tftp-via=auto`` choose between pod and host TFTP."""

    @pytest.mark.asyncio
    async def test_fits_with_headroom(
        self, monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        """A 7,000,000-byte block holds 6 MiB plus the default headroom."""
        payload = (
            b'{"files":[],"max_size_bytes":8388608,"max_slots":4,'
            b'"psram_free_bytes":8000000,"psram_largest_free_block":7000000}'
        )
        rack = RackController(host="pod", port=8080)
        with patched_urlopen(monkeypatch, body=payload):
            verdict, reply = await rack.psram_can_fit(6 * 1024 * 1024)
        assert verdict is True
        assert reply["psram_largest_free_block"] == 7000000

    @pytest.mark.asyncio
    async def test_does_not_fit_when_largest_block_too_small(
        self, monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        """A ~10 MB image (OpenIPC nor-ultimate size) cannot be hosted by a
        pod whose largest free block is under 2 MB; the answer must be a
        clear "no" so auto-mode falls back."""
        payload = (
            b'{"files":[],"max_size_bytes":1572864,"max_slots":4,'
            b'"psram_free_bytes":1900000,"psram_largest_free_block":1700000}'
        )
        rack = RackController(host="pod", port=8080)
        with patched_urlopen(monkeypatch, body=payload):
            verdict, reply = await rack.psram_can_fit(10 * 1024 * 1024)
        assert verdict is False
        assert reply["psram_largest_free_block"] == 1700000

    @pytest.mark.asyncio
    async def test_does_not_fit_when_headroom_eats_margin(
        self, monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        """A block exactly as large as the payload must be rejected: the
        default headroom is reserved for the HTTP handler's transient
        scratch + lwip buffers."""
        payload = (
            b'{"psram_largest_free_block":1048576}'  # 1 MiB exactly
        )
        rack = RackController(host="pod", port=8080)
        with patched_urlopen(monkeypatch, body=payload):
            # 1 MiB payload + 256 KiB default headroom = 1.25 MiB required
            verdict, _ = await rack.psram_can_fit(1024 * 1024)
        assert verdict is False

    @pytest.mark.asyncio
    async def test_pod_unreachable_returns_false(
        self, monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        """A network failure is reported as "doesn't fit" with empty stats,
        so the CLI falls back to host TFTP instead of crashing on the
        pre-check."""
        import urllib.error

        def refuse_connection(req: Any, timeout: float | None = None) -> None:
            raise urllib.error.URLError("connection refused")

        monkeypatch.setattr(rack_mod.urllib.request, "urlopen", refuse_connection)
        rack = RackController(host="pod", port=8080)
        verdict, reply = await rack.psram_can_fit(1024)
        assert verdict is False
        assert reply == {}

0 commit comments

Comments
 (0)