Skip to content

Commit a4208f9

Browse files
authored
Merge pull request #60 from FluffyAIcode/AgentMemory/v030-pr-g5-prewarm-cli-8e7f
PR-G5: model prewarm CLI + gRPC server first-run cache check
2 parents 43335da + cd563f3 commit a4208f9

9 files changed

Lines changed: 965 additions & 2 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,16 +94,17 @@ jobs:
9494
tests/inference_engine/pipeline/ \
9595
tests/inference_engine/session/ \
9696
tests/inference_engine/bench/ \
97+
tests/inference_engine/setup/ \
9798
tests/sdk/python/ \
9899
tests/training/repr_align/ \
99100
tests/backends/mlx/test_env.py \
100101
--junitxml=junit.xml \
101102
-v
102103
coverage report \
103-
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*' \
104+
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*' \
104105
--fail-under=100
105106
coverage xml -o coverage.xml \
106-
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*'
107+
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*'
107108
108109
- name: Upload coverage artifact
109110
if: always()

inference_engine/setup/__init__.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""First-run UX helpers for the v0.3 runtime.
2+
3+
Today this package is just the ``prewarm`` module — a small set of
4+
pure-Python helpers used by ``scripts/kakeya_prewarm.py`` and by
5+
``scripts/start_grpc_runtime_server.py``'s cache-check pre-flight.
6+
Kept under ``inference_engine.setup`` rather than under
7+
``inference_engine.server`` because it is platform-neutral
8+
(operates on the HF cache filesystem, no torch / mlx dependency).
9+
"""
10+
11+
from .prewarm import (
12+
HF_CACHE_DEFAULT,
13+
PrewarmStatus,
14+
assert_cached_or_raise,
15+
cache_dir_for_model,
16+
free_disk_bytes,
17+
is_model_in_cache,
18+
prewarm_model_id,
19+
snapshot_size_bytes,
20+
)
21+
22+
__all__ = [
23+
"HF_CACHE_DEFAULT",
24+
"PrewarmStatus",
25+
"assert_cached_or_raise",
26+
"cache_dir_for_model",
27+
"free_disk_bytes",
28+
"is_model_in_cache",
29+
"prewarm_model_id",
30+
"snapshot_size_bytes",
31+
]

inference_engine/setup/prewarm.py

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
"""HuggingFace model-cache pre-warm helpers.
2+
3+
The v0.3 runtime loads its verifier from the HuggingFace cache at
4+
startup. Without a pre-warmed cache a first-run download blocks the
5+
server boot for 1-10 minutes (depending on bandwidth) with no
6+
progress feedback — a poor first-time-user experience.
7+
8+
This module exposes:
9+
10+
* :data:`HF_CACHE_DEFAULT` — canonical cache root.
11+
* :func:`cache_dir_for_model` — the directory a given HF model id
12+
lands in under the cache.
13+
* :func:`is_model_in_cache` — fast read-only check; no network.
14+
* :func:`snapshot_size_bytes` — total bytes resident on disk for a
15+
cached model (informational).
16+
* :func:`prewarm_model_id` — explicit download with progress;
17+
raises on failure rather than silently re-trying.
18+
19+
These helpers are platform-neutral (no torch / mlx imports) so they
20+
run quickly during the gRPC server's pre-flight check and don't
21+
trigger the lazy-loaded backend dependencies. The CLI driver lives
22+
at ``scripts/kakeya_prewarm.py``.
23+
24+
Per ADR 0008 §9: this is verifier-independent infrastructure;
25+
exercised by Linux unit tests against synthetic cache directories.
26+
The full HF download path is exercised by the integration suite +
27+
the Mac M4 reviewer aid (``scripts/review_pr_g5_on_mac.sh``).
28+
"""
29+
30+
from __future__ import annotations
31+
32+
import os
33+
import shutil
34+
from dataclasses import dataclass
35+
from pathlib import Path
36+
from typing import Optional
37+
38+
39+
def _resolve_default_cache_root(environ) -> Path:
    """Compute the default HuggingFace cache root from *environ*.

    *environ* is any mapping with a ``get`` method (normally
    ``os.environ``); taking it as a parameter keeps the resolution
    logic testable without mutating the process environment.

    The lookup chain follows ``huggingface_hub``'s defaults, including
    the ``$XDG_CACHE_HOME`` fallback and ``~`` expansion.
    """
    home_cache = os.path.join(os.path.expanduser("~"), ".cache")
    hf_home = environ.get(
        "HF_HOME",
        os.path.join(environ.get("XDG_CACHE_HOME", home_cache), "huggingface"),
    )
    # expanduser so values such as ``HF_HOME=~/hf`` resolve to a real
    # directory instead of a literal "~" path component.
    return Path(os.path.expanduser(environ.get("HF_HUB_CACHE", hf_home)))


HF_CACHE_DEFAULT = _resolve_default_cache_root(os.environ)
"""Default HuggingFace cache root, resolved once at import time.

Resolution order (first match wins):
    1. ``$HF_HUB_CACHE`` if set.
    2. ``$HF_HOME`` if set.
    3. ``$XDG_CACHE_HOME/huggingface`` if ``$XDG_CACHE_HOME`` is set.
    4. ``~/.cache/huggingface``.

A leading ``~`` in the selected value is expanded.

``HF_HOME`` historically points at the cache root (models live under
its ``hub/`` subdirectory) while ``HF_HUB_CACHE`` points at the hub
directory itself. The helpers in this module derive the actual hub
directory from this value; see :func:`cache_dir_for_model`.
"""
59+
60+
61+
def _hub_root(cache_root: Path) -> Path:
    """Normalize a cache root to its ``hub`` directory.

    A path whose final component is already ``hub`` is returned as-is;
    any other path gets ``hub`` appended. Applying the function to its
    own result therefore changes nothing, and callers need not care
    which of the two styles they were handed.
    """
    root = Path(cache_root)
    return root if root.name == "hub" else root / "hub"
72+
73+
74+
def cache_dir_for_model(
    model_id: str, *, cache_root: Optional[Path] = None,
) -> Path:
    """Return the directory under the HF cache where a model id lands.

    HF caches use a ``models--<owner>--<repo>`` directory naming
    scheme. This helper only computes the path; it does NOT check
    whether the directory exists. Pair with :func:`is_model_in_cache`
    for the existence check.

    Args:
        model_id: Repository id in ``owner/repo`` form.
        cache_root: Optional cache location. Either an ``HF_HOME``-style
            root or a path already ending in ``hub``; both are
            normalized via :func:`_hub_root`. When omitted (or falsy),
            the module default :data:`HF_CACHE_DEFAULT` is used.

    Returns:
        ``<hub dir>/models--<owner>--<repo>``.

    Raises:
        ValueError: if ``model_id`` contains no ``/``.
    """
    if "/" not in model_id:
        raise ValueError(
            f"model_id must be 'owner/repo' shape, got {model_id!r}"
        )
    flat = "models--" + model_id.replace("/", "--")
    if cache_root:
        return _hub_root(cache_root) / flat

    # ``$HF_HUB_CACHE`` names the hub directory itself (it is where
    # huggingface_hub stores ``models--*``), so when the default root
    # came from that variable it must be used verbatim. Appending
    # ``hub`` would make us inspect a directory the downloader never
    # writes to. The equality check ties the decision to the value that
    # actually produced HF_CACHE_DEFAULT at import time.
    hub_env = os.environ.get("HF_HUB_CACHE")
    if hub_env is not None and HF_CACHE_DEFAULT in (
        Path(hub_env),
        Path(os.path.expanduser(hub_env)),
    ):
        return Path(HF_CACHE_DEFAULT) / flat
    return _hub_root(HF_CACHE_DEFAULT) / flat
91+
92+
93+
def is_model_in_cache(
    model_id: str, *, cache_root: Optional[Path] = None,
) -> bool:
    """Report whether the model has at least one snapshot on disk.

    Returns True only when the model's cache directory exists, holds
    a ``snapshots`` subdirectory, and that subdirectory has at least
    one entry. Nothing is validated beyond that: an interrupted
    download can leave a snapshot tree behind, so a True result does
    not guarantee the snapshot is complete.
    """
    model_dir = cache_dir_for_model(model_id, cache_root=cache_root)
    snapshots_dir = model_dir / "snapshots"
    if not (model_dir.is_dir() and snapshots_dir.is_dir()):
        return False
    # A single entry is enough, so stop at the first one.
    for _ in snapshots_dir.iterdir():
        return True
    return False
111+
112+
113+
def snapshot_size_bytes(
    model_id: str, *, cache_root: Optional[Path] = None,
) -> int:
    """Total bytes resident on disk for a cached model.

    Walks the model's cache directory and sums the size of every
    distinct regular file reachable from it. Returns 0 if the model's
    cache directory does not exist.

    The HF cache stores file contents once (under ``blobs/``) and
    exposes them through symlinks under ``snapshots/<rev>/``. Each
    entry is therefore resolved to its real path and counted only the
    first time that path is seen; otherwise every blob would be
    counted once directly and once more per snapshot that links to it.
    """
    cache_dir = cache_dir_for_model(model_id, cache_root=cache_root)
    if not cache_dir.is_dir():
        return 0
    total = 0
    counted = set()
    for entry in cache_dir.rglob("*"):
        # Dangling symlinks (left behind by partial downloads) resolve
        # to a path that is not a file and are skipped rather than
        # raising. RuntimeError covers symlink loops on Python versions
        # where ``resolve`` reports them that way.
        try:
            real = entry.resolve(strict=False)
            if real in counted or not real.is_file():
                continue
            size = real.stat().st_size
        except (OSError, RuntimeError):  # pragma: no cover - filesystem races
            continue
        counted.add(real)
        total += size
    return total
140+
141+
142+
@dataclass(frozen=True)
class PrewarmStatus:
    """Immutable outcome record returned by :func:`prewarm_model_id`."""

    # Model id the status refers to.
    model_id: str
    # Cache directory computed for that model id.
    cache_dir: Path
    # Size reported for the cached model, in bytes.
    snapshot_bytes: int
    # True when no download was needed.
    was_already_cached: bool

    def human(self) -> str:
        """Render a one-line, human-readable summary of this status."""
        mebibytes = self.snapshot_bytes / (1024 * 1024)
        if self.was_already_cached:
            action = "already cached"
        else:
            action = "downloaded"
        location = f"{self.model_id}: {action} at {self.cache_dir} "
        return location + f"({mebibytes:.1f} MiB on disk)"
157+
158+
159+
def prewarm_model_id(
    model_id: str,
    *,
    cache_root: Optional[Path] = None,
    include_tokenizer: bool = True,
    progress_callback=None,
) -> PrewarmStatus:
    """Make sure a HuggingFace model snapshot is present in the cache.

    If :func:`is_model_in_cache` already reports the model as present,
    nothing is downloaded and the returned status has
    ``was_already_cached=True``. Otherwise
    ``huggingface_hub.snapshot_download`` is invoked for the repo and
    the status has ``was_already_cached=False``.

    Args:
        model_id: Repository id in ``owner/repo`` form.
        cache_root: Optional cache location. When given, its ``hub``
            directory is passed to ``snapshot_download`` as
            ``cache_dir``; when ``None``, the library's own default
            location is used.
        include_tokenizer: When False, files matching ``tokenizer*``,
            ``vocab*``, ``merges*`` and ``*.txt`` are excluded from the
            download. Defaults to True (full snapshot).
        progress_callback: Accepted but unused; reserved for a future
            release.

    Returns:
        A :class:`PrewarmStatus` describing the cache directory and its
        on-disk size after the call.

    Raises:
        Whatever ``snapshot_download`` raises. Errors are not caught
        or retried here.
    """
    del progress_callback  # reserved for v0.4

    def _status(already_cached: bool) -> PrewarmStatus:
        # Single place that assembles the result for both outcomes.
        return PrewarmStatus(
            model_id=model_id,
            cache_dir=cache_dir_for_model(model_id, cache_root=cache_root),
            snapshot_bytes=snapshot_size_bytes(
                model_id, cache_root=cache_root,
            ),
            was_already_cached=already_cached,
        )

    if is_model_in_cache(model_id, cache_root=cache_root):
        return _status(True)

    # Imported lazily so that importing this module (and the cached
    # fast path above) never pulls in huggingface_hub.
    from huggingface_hub import snapshot_download

    request = {"repo_id": model_id}
    if cache_root is not None:
        request["cache_dir"] = str(_hub_root(cache_root))
    if not include_tokenizer:
        # Without these patterns the whole snapshot (weights, config
        # and tokenizer files) is fetched in one call.
        request["ignore_patterns"] = [
            "tokenizer*",
            "vocab*",
            "merges*",
            "*.txt",  # plain-text files, e.g. vocab/merges lists
        ]

    snapshot_download(**request)
    return _status(False)
227+
228+
229+
def assert_cached_or_raise(
    model_id: str,
    *,
    cache_root: Optional[Path] = None,
    prewarm_command_hint: str = (
        "python3 scripts/kakeya_prewarm.py --verifier-id {model_id}"
    ),
) -> None:
    """Pre-flight assertion: raise a friendly error on a cache miss.

    Returns silently when :func:`is_model_in_cache` reports the model
    as present. Otherwise raises ``FileNotFoundError`` whose message
    names the directory that was inspected and the command to run.
    Intended for server start-up, so that a cold cache fails fast
    instead of triggering a large download during boot.

    Args:
        model_id: Repository id in ``owner/repo`` form.
        cache_root: Optional cache location forwarded to the lookup.
        prewarm_command_hint: ``str.format`` template for the suggested
            command; the ``{model_id}`` placeholder is filled in with
            ``model_id``.

    Raises:
        FileNotFoundError: if the model is not in the cache.
    """
    if not is_model_in_cache(model_id, cache_root=cache_root):
        looked_in = cache_dir_for_model(model_id, cache_root=cache_root)
        command = prewarm_command_hint.format(model_id=model_id)
        message = (
            f"HF cache miss for {model_id!r} (looked in {looked_in}).\n"
            f"Pre-warm the cache before starting the server:\n"
            f" {command}\n"
            f"This avoids blocking server boot on a multi-GB download "
            f"with no progress feedback."
        )
        raise FileNotFoundError(message)
255+
256+
257+
def free_disk_bytes(path: Optional[Path] = None) -> int:
    """Best-effort free-disk-bytes for the filesystem that holds *path*.

    Useful for the prewarm CLI to give the user a "this won't fit"
    error before starting the download instead of after.

    Args:
        path: Location to probe; defaults to :data:`HF_CACHE_DEFAULT`
            when omitted or falsy. It does not have to exist yet.

    Returns:
        Free bytes reported for the nearest existing ancestor of
        *path* (or *path* itself if it exists). Returns 0 if no
        ancestor exists or the filesystem query fails.
    """
    target = Path(path or HF_CACHE_DEFAULT)
    try:
        # On a fresh machine neither the cache directory nor its
        # parent may exist yet, so walk up until something does rather
        # than giving up after one level and reporting 0 bytes free.
        for candidate in (target, *target.parents):
            if candidate.exists():
                return shutil.disk_usage(candidate).free
    except OSError:  # pragma: no cover - filesystem-dependent
        return 0
    return 0
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"schema_version": 1,
3+
"kind": "pr_g5_mac_prewarm",
4+
"host": {
5+
"platform": "macOS-26.5-arm64-arm-64bit-Mach-O",
6+
"machine": "arm64",
7+
"python": "3.13.12"
8+
},
9+
"server_preflight_exit_code": 2,
10+
"server_preflight_passed": true
11+
}

0 commit comments

Comments
 (0)