Skip to content

Commit 4c23b23

Browse files
author
Francisco
committed
feat(training): enforce HF cache guard at job dispatch
- Add _is_model_in_hf_cache() to training_service.py — checks for a non-empty snapshot directory under HF_CACHE_PATH/hub/models--*/snapshots/, mirroring the actual HuggingFace cache directory structure.
- Reject job creation with HTTP 400 if the model is not cached locally — the user gets a clear error at dispatch rather than a silent GPU failure 10 minutes into training.
- Fix HF_HUB_OFFLINE = '1' in unsloth_train.py — enforces cache-only mode in the training subprocess; the comment now matches the code.
1 parent 9e46e7b commit 4c23b23

2 files changed

Lines changed: 53 additions & 3 deletions

File tree

src/api/training/services/training_service.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
# src/api/training/services/training_service.py
12
import json
23
import os
34
import time
5+
from pathlib import Path
46
from typing import List, Optional
57

68
import redis
@@ -22,6 +24,35 @@ def get_redis_client() -> redis.Redis:
2224
return redis.from_url(redis_url, decode_responses=True)
2325

2426

27+
def _is_model_in_hf_cache(model_id: str) -> bool:
    """
    Check whether a HuggingFace model is present in the local cache.

    Looks for a snapshot directory under:
        $HF_CACHE_PATH/hub/models--<org>--<model>/snapshots/

    HF_CACHE_PATH defaults to /root/.cache/huggingface when unset. A model is
    considered cached if at least one snapshot directory exists and contains
    at least one entry. The contents of the snapshot are not validated.

    Args:
        model_id: Hub identifier such as 'org/model-name'.

    Returns:
        True if cached. False otherwise — including when model_id is empty,
        when the snapshots path is missing or is not a directory, or when the
        cache cannot be read.
    """
    # An empty identifier can never name a cached model; bail out before
    # building a path from it.
    if not model_id:
        return False

    hf_cache = os.getenv("HF_CACHE_PATH", "/root/.cache/huggingface")
    # Convert 'org/model-name' → 'models--org--model-name'
    safe_name = "models--" + model_id.replace("/", "--")
    snapshots_path = Path(hf_cache) / "hub" / safe_name / "snapshots"

    try:
        # is_dir() rather than exists(): a stray regular file named
        # 'snapshots' must not reach iterdir(), which would raise.
        if not snapshots_path.is_dir():
            return False

        # At least one non-empty snapshot directory must exist
        return any(
            entry.is_dir() and any(entry.iterdir())
            for entry in snapshots_path.iterdir()
        )
    except OSError:
        # The cache was removed mid-scan or is unreadable. Report "not cached"
        # so the caller rejects the job cleanly instead of crashing.
        return False
54+
55+
2556
class TrainingService:
2657
"""
2758
Service layer for training job lifecycle management.
@@ -63,7 +94,26 @@ def create_training_job(
6394
detail=f"Dataset {dataset_id} is not ready. Current status: {current_status}",
6495
)
6596

66-
# 2. Create job record — node binding deferred to activation
97+
# 2. HF cache guard — reject immediately if model is not cached locally.
98+
# This prevents the training worker from attempting a download mid-job,
99+
# which would fail silently after claiming the GPU and consuming queue time.
100+
# Users must pre-cache models via the model registry before fine-tuning.
101+
if not _is_model_in_hf_cache(base_model):
102+
logging_utility.warning(
103+
"Training job rejected — model '%s' not found in HF cache for user %s",
104+
base_model,
105+
user_id,
106+
)
107+
raise HTTPException(
108+
status_code=400,
109+
detail=(
110+
f"Model '{base_model}' is not available in the local HuggingFace cache. "
111+
f"Register and activate the base model via the model registry before "
112+
f"submitting a fine-tuning job."
113+
),
114+
)
115+
116+
# 3. Create job record — node binding deferred to activation
67117
job_id = IdentifierService.generate_prefixed_id("job")
68118
now = int(time.time())
69119

@@ -95,7 +145,7 @@ def create_training_job(
95145
status_code=500, detail="Failed to save training job to database."
96146
)
97147

98-
# 3. Enqueue to Redis
148+
# 4. Enqueue to Redis
99149
self._enqueue(job)
100150

101151
return job

src/api/training/unsloth_train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# HF_HUB_OFFLINE = "1" enforces cache-only mode — no downloads permitted.
66
# Set to "0" only if you explicitly want to allow HuggingFace hub downloads.
77
# For production sovereign deployments this should be "1".
8-
os.environ["HF_HUB_OFFLINE"] = "0"
8+
os.environ["HF_HUB_OFFLINE"] = "1"
99
# ──────────────────────────────────────────────────────────────────────────────
1010

1111
# isort: split

0 commit comments

Comments
 (0)