fix: fall back to torch.cuda.mem_get_info when NVML memory query is unsupported

dbuos · dbuos · commit db8ae128721d · 2026-04-03T06:56:23.000-05:00
nvmlDeviceGetMemoryInfo returns NVML_ERROR_NOT_SUPPORTED on DGX Spark
(GB10). Log the error and fall back to torch.cuda.mem_get_info, which
works on all CUDA devices.

Signed-off-by: Daniel Bustamante Ospina <dbustamante70@gmail.com>
diff --git a/nemo_rl/utils/nvml.py b/nemo_rl/utils/nvml.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+import logging
 import os
 from typing import Generator
 
 import pynvml
 
+logger = logging.getLogger(__name__)
+
 
 @contextlib.contextmanager
 def nvml_context() -> Generator[None, None, None]:
@@ -84,10 +87,9 @@ def get_free_memory_bytes(device_idx: int) -> float:
         try:
             handle = pynvml.nvmlDeviceGetHandleByIndex(global_device_idx)
             return pynvml.nvmlDeviceGetMemoryInfo(handle).free
-        except pynvml.NVMLError:
-            pass
+        except pynvml.NVMLError as e:
+            logger.warning("NVML memory query failed for device %d: %s. Falling back to torch.cuda.mem_get_info.", device_idx, e)
 
-    # Fallback for GPUs where NVML memory query is not supported (e.g. DGX Spark)
     import torch
 
     free, _total = torch.cuda.mem_get_info(device_idx)