Skip to content

Commit fd87c7f

Browse files
committed
fix(env_config): proactive torch.cuda guard for ROCm PyTorch fallback
- `_try_rocm()` checks `torch.cuda.is_available()` before setting `device='cuda'`; if PyTorch-ROCm is not installed, device stays `'cpu'` from the start
- `load_optimized()` fallback pre-checks `torch.cuda` instead of catching NVIDIA driver exceptions reactively (cleaner logs, no crash)
- Added test: no-PyTorch-ROCm falls back to CPU device (15 tests total)
1 parent eaefade commit fd87c7f

File tree

3 files changed

+86
-28
lines changed

3 files changed

+86
-28
lines changed

skills/detection/yolo-detection-2026/scripts/env_config.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,16 @@ def _try_rocm(self) -> bool:
165165
return False
166166

167167
self.backend = "rocm"
168-
self.device = "cuda" # ROCm exposes as CUDA in PyTorch
168+
# ROCm exposes as CUDA in PyTorch — but only if PyTorch-ROCm is installed
169+
try:
170+
import torch
171+
if torch.cuda.is_available():
172+
self.device = "cuda"
173+
else:
174+
self.device = "cpu"
175+
_log("PyTorch CUDA/ROCm not available, using CPU for PyTorch fallback")
176+
except ImportError:
177+
self.device = "cpu"
169178

170179
# Strategy 1: amd-smi static --json (ROCm 6.3+/7.x, richest output)
171180
if has_amd_smi:
@@ -467,23 +476,33 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467476

468477
# Fallback: use the PT model we already loaded
469478
_log("Falling back to PyTorch model")
470-
try:
471-
pt_model.to(self.device)
472-
except Exception as e:
473-
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
474-
self.device = "cpu"
475-
pt_model.to("cpu")
479+
fallback_device = self.device
480+
if fallback_device == "cuda":
481+
try:
482+
import torch
483+
if not torch.cuda.is_available():
484+
fallback_device = "cpu"
485+
_log("torch.cuda not available, falling back to CPU")
486+
except ImportError:
487+
fallback_device = "cpu"
488+
pt_model.to(fallback_device)
489+
self.device = fallback_device
476490
self.load_ms = (time.perf_counter() - t0) * 1000
477491
return pt_model, "pytorch"
478492

479493
# No optimization requested or framework missing
480494
model = YOLO(f"{model_name}.pt")
481-
try:
482-
model.to(self.device)
483-
except Exception as e:
484-
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
485-
self.device = "cpu"
486-
model.to("cpu")
495+
fallback_device = self.device
496+
if fallback_device == "cuda":
497+
try:
498+
import torch
499+
if not torch.cuda.is_available():
500+
fallback_device = "cpu"
501+
_log("torch.cuda not available, falling back to CPU")
502+
except ImportError:
503+
fallback_device = "cpu"
504+
model.to(fallback_device)
505+
self.device = fallback_device
487506
self.load_ms = (time.perf_counter() - t0) * 1000
488507
return model, "pytorch"
489508

skills/lib/env_config.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,16 @@ def _try_rocm(self) -> bool:
165165
return False
166166

167167
self.backend = "rocm"
168-
self.device = "cuda" # ROCm exposes as CUDA in PyTorch
168+
# ROCm exposes as CUDA in PyTorch — but only if PyTorch-ROCm is installed
169+
try:
170+
import torch
171+
if torch.cuda.is_available():
172+
self.device = "cuda"
173+
else:
174+
self.device = "cpu"
175+
_log("PyTorch CUDA/ROCm not available, using CPU for PyTorch fallback")
176+
except ImportError:
177+
self.device = "cpu"
169178

170179
# Strategy 1: amd-smi static --json (ROCm 6.3+/7.x, richest output)
171180
if has_amd_smi:
@@ -467,23 +476,33 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467476

468477
# Fallback: use the PT model we already loaded
469478
_log("Falling back to PyTorch model")
470-
try:
471-
pt_model.to(self.device)
472-
except Exception as e:
473-
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
474-
self.device = "cpu"
475-
pt_model.to("cpu")
479+
fallback_device = self.device
480+
if fallback_device == "cuda":
481+
try:
482+
import torch
483+
if not torch.cuda.is_available():
484+
fallback_device = "cpu"
485+
_log("torch.cuda not available, falling back to CPU")
486+
except ImportError:
487+
fallback_device = "cpu"
488+
pt_model.to(fallback_device)
489+
self.device = fallback_device
476490
self.load_ms = (time.perf_counter() - t0) * 1000
477491
return pt_model, "pytorch"
478492

479493
# No optimization requested or framework missing
480494
model = YOLO(f"{model_name}.pt")
481-
try:
482-
model.to(self.device)
483-
except Exception as e:
484-
_log(f"Device {self.device} unavailable ({e}), falling back to CPU")
485-
self.device = "cpu"
486-
model.to("cpu")
495+
fallback_device = self.device
496+
if fallback_device == "cuda":
497+
try:
498+
import torch
499+
if not torch.cuda.is_available():
500+
fallback_device = "cpu"
501+
_log("torch.cuda not available, falling back to CPU")
502+
except ImportError:
503+
fallback_device = "cpu"
504+
model.to(fallback_device)
505+
self.device = fallback_device
487506
self.load_ms = (time.perf_counter() - t0) * 1000
488507
return model, "pytorch"
489508

skills/lib/test_env_config_rocm.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,11 @@ def test_dual_gpu_picks_discrete(self, mock_run, _mock_dir):
9393
"""With 2 GPUs, picks the R9700 (32 GB) over iGPU (2 GB)."""
9494
mock_run.return_value = _make_run_result(AMD_SMI_DUAL_GPU)
9595

96-
env = HardwareEnv()
97-
result = env._try_rocm()
96+
mock_torch = mock.MagicMock()
97+
mock_torch.cuda.is_available.return_value = True
98+
with mock.patch.dict("sys.modules", {"torch": mock_torch}):
99+
env = HardwareEnv()
100+
result = env._try_rocm()
98101

99102
assert result is True
100103
assert env.backend == "rocm"
@@ -170,6 +173,23 @@ def test_amd_smi_failure_returns_true_with_defaults(self, mock_run, _mock_dir):
170173
assert env.backend == "rocm"
171174
assert env.gpu_name == "" # No name parsed, but backend detected
172175

176+
@mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
177+
@mock.patch("env_config.Path.is_dir", return_value=False)
178+
@mock.patch("env_config.subprocess.run")
179+
def test_no_pytorch_rocm_falls_back_to_cpu_device(self, mock_run, _mock_dir):
180+
"""When torch.cuda.is_available() is False, device stays 'cpu'."""
181+
mock_run.return_value = _make_run_result(AMD_SMI_SINGLE_GPU)
182+
183+
mock_torch = mock.MagicMock()
184+
mock_torch.cuda.is_available.return_value = False
185+
with mock.patch.dict("sys.modules", {"torch": mock_torch}):
186+
env = HardwareEnv()
187+
env._try_rocm()
188+
189+
assert env.backend == "rocm"
190+
assert env.device == "cpu" # No PyTorch-ROCm → CPU fallback
191+
assert env.gpu_name == "AMD Radeon RX 7900 XTX" # GPU still detected
192+
173193

174194
class TestTryRocmFallback:
175195
"""rocm-smi fallback (amd-smi not available)."""

0 commit comments

Comments (0)