Skip to content

Commit eaefade

Browse files
committed
fix(env_config): graceful CPU fallback when ROCm PyTorch unavailable
- load_optimized() now catches device='cuda' failures on ROCm systems where PyTorch-ROCm is not installed and degrades gracefully to CPU
- deploy.sh removes the CPU-only onnxruntime package before installing onnxruntime-rocm to prevent the shadowing bug
1 parent 26aef50 commit eaefade

File tree

3 files changed

+32
-4
lines changed

3 files changed

+32
-4
lines changed

skills/detection/yolo-detection-2026/deploy.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,14 @@ fi
160160
log "Installing dependencies from $REQ_FILE ..."
161161
emit "{\"event\": \"progress\", \"stage\": \"install\", \"message\": \"Installing $BACKEND dependencies...\"}"
162162

163+
# ROCm: remove CPU-only onnxruntime if present (it shadows onnxruntime-rocm)
164+
if [ "$BACKEND" = "rocm" ]; then
165+
if "$PIP" show onnxruntime &>/dev/null 2>&1; then
166+
log "Removing CPU-only onnxruntime to avoid shadowing onnxruntime-rocm..."
167+
"$PIP" uninstall -y onnxruntime -q 2>&1 || true
168+
fi
169+
fi
170+
163171
"$PIP" install -r "$REQ_FILE" -q 2>&1 | tail -5 >&2
164172

165173
# ─── Step 5: Pre-convert model to optimized format ───────────────────────────

skills/detection/yolo-detection-2026/scripts/env_config.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,13 +467,23 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467467

468468
# Fallback: use the PT model we already loaded
469469
_log("Falling back to PyTorch model")
470-
pt_model.to(self.device)
470+
try:
471+
pt_model.to(self.device)
472+
except Exception as e:
473+
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
474+
self.device = "cpu"
475+
pt_model.to("cpu")
471476
self.load_ms = (time.perf_counter() - t0) * 1000
472477
return pt_model, "pytorch"
473478

474479
# No optimization requested or framework missing
475480
model = YOLO(f"{model_name}.pt")
476-
model.to(self.device)
481+
try:
482+
model.to(self.device)
483+
except Exception as e:
484+
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
485+
self.device = "cpu"
486+
model.to("cpu")
477487
self.load_ms = (time.perf_counter() - t0) * 1000
478488
return model, "pytorch"
479489

skills/lib/env_config.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,13 +467,23 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467467

468468
# Fallback: use the PT model we already loaded
469469
_log("Falling back to PyTorch model")
470-
pt_model.to(self.device)
470+
try:
471+
pt_model.to(self.device)
472+
except Exception as e:
473+
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
474+
self.device = "cpu"
475+
pt_model.to("cpu")
471476
self.load_ms = (time.perf_counter() - t0) * 1000
472477
return pt_model, "pytorch"
473478

474479
# No optimization requested or framework missing
475480
model = YOLO(f"{model_name}.pt")
476-
model.to(self.device)
481+
try:
482+
model.to(self.device)
483+
except Exception as e:
484+
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
485+
self.device = "cpu"
486+
model.to("cpu")
477487
self.load_ms = (time.perf_counter() - t0) * 1000
478488
return model, "pytorch"
479489

0 commit comments

Comments
 (0)