Skip to content

Commit a39f532

Browse files
CPU fallback if possible
1 parent bb4284e commit a39f532

7 files changed

Lines changed: 96 additions & 22 deletions

File tree

src/config.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pydantic import BaseModel, model_validator, validator
1313
from pathlib import Path
1414
from typing import List
15+
import torch
1516

1617

1718
def type_validator(lut):
@@ -191,6 +192,12 @@ def _load_from_file(filepath):
191192

192193
@staticmethod
193194
def _create_from_dict(config_data, filepath=None):
195+
# Apply CPU fallback for device settings if CUDA is not available
196+
if not torch.cuda.is_available():
197+
print("CUDA not available, falling back to CPU for all devices")
198+
# Recursively replace any cuda device settings with cpu
199+
Config._apply_cpu_fallback(config_data)
200+
194201
# Instantiate RootConfig with the loaded data
195202
root_config = RootConfig(**config_data)
196203

@@ -203,3 +210,22 @@ def _create_from_dict(config_data, filepath=None):
203210
)
204211

205212
return root_config
213+
214+
@staticmethod
215+
def _apply_cpu_fallback(config_data):
216+
"""Recursively replace CUDA device settings with CPU when CUDA is not available"""
217+
if isinstance(config_data, dict):
218+
for key, value in config_data.items():
219+
if (
220+
key == "device"
221+
and isinstance(value, str)
222+
and "cuda" in value.lower()
223+
):
224+
config_data[key] = "cpu"
225+
print(f" {key}: {value} -> cpu")
226+
elif isinstance(value, (dict, list)):
227+
Config._apply_cpu_fallback(value)
228+
elif isinstance(config_data, list):
229+
for item in config_data:
230+
if isinstance(item, (dict, list)):
231+
Config._apply_cpu_fallback(item)

src/losses/quaild_facility_location_loss_test.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,22 +91,22 @@ def test_submodularity(self):
9191

9292
best_diversity, best_candidate = pick_most_diverse([b], [a, c, d, e])
9393

94-
assert best_diversity == 1.0, best_diversity
94+
self.assertAlmostEqual(best_diversity, 1.0, places=4)
9595
assert best_candidate == e, best_candidate
9696

9797
best_diversity, best_candidate = pick_most_diverse([b, e], [a, d, c])
9898

99-
assert best_diversity == 0.5, best_diversity
99+
self.assertAlmostEqual(best_diversity, 0.5, places=4)
100100
assert best_candidate == c, best_candidate
101101

102102
best_diversity, best_candidate = pick_most_diverse([b, c, e], [a, d])
103103

104-
assert best_diversity == 0.2642977237701416, best_diversity
104+
self.assertAlmostEqual(best_diversity, 0.2642977237701416, places=4)
105105
assert best_candidate == a, best_candidate
106106

107107
best_diversity, best_candidate = pick_most_diverse([a, b, c, e], [d])
108108

109-
assert best_diversity == 0.13720381259918213, best_diversity
109+
self.assertAlmostEqual(best_diversity, 0.13720381259918213, places=4)
110110
assert best_candidate == d, best_candidate
111111

112112
# python -m unittest losses.quaild_facility_location_loss_test.TestQuaildFacilityLocation.test_submodularity_with_arbitary_order -v
@@ -124,22 +124,22 @@ def test_submodularity_with_arbitary_order(self):
124124

125125
best_diversity, best_candidate = pick_most_diverse([b], [a, c, d, e])
126126

127-
assert best_diversity == 1.0, best_diversity
127+
self.assertAlmostEqual(best_diversity, 1.0, places=4)
128128
assert best_candidate == e, best_candidate
129129

130130
best_diversity, best_candidate = pick_most_diverse([a, b], [c, d, e])
131131

132-
assert best_diversity == 0.8779611587524414, best_diversity
132+
self.assertAlmostEqual(best_diversity, 0.8779611587524414, places=4)
133133
assert best_candidate == e, best_candidate
134134

135135
best_diversity, best_candidate = pick_most_diverse([a, b, c], [d, e])
136136

137-
assert best_diversity == 0.5948392152786255, best_diversity
137+
self.assertAlmostEqual(best_diversity, 0.5948392152786255, places=4)
138138
assert best_candidate == e, best_candidate
139139

140140
best_diversity, best_candidate = pick_most_diverse([a, b, c, d], [e])
141141

142-
assert best_diversity == 0.6127961277961731, best_diversity
142+
self.assertAlmostEqual(best_diversity, 0.6127961277961731, places=4)
143143
assert best_candidate == e, best_candidate
144144

145145
# python -m unittest losses.quaild_facility_location_loss_test.TestQuaildFacilityLocation.test_overfit -v

src/losses/quaild_log_det_mi_loss_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from train_utils import set_seed
88
import torch.nn.functional as F
99
from torch.cuda.amp import GradScaler, autocast
10+
from test_utils import skip_if_no_gpu
1011

1112

1213
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss -v
@@ -17,6 +18,7 @@ def setUp(self):
1718
self.loss_fn = QuaidLogDetMILoss(config)
1819

1920
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_log_det_happy -v
21+
@skip_if_no_gpu
2022
def test_log_det_happy(self):
2123
# Create a tensor representing positive infinity
2224
matrix = torch.tensor(
@@ -41,6 +43,7 @@ def test_log_det_happy(self):
4143
], matrix.grad.tolist()
4244

4345
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_log_det_singular -v
46+
@skip_if_no_gpu
4447
def test_log_det_singular(self):
4548
# Create a tensor representing positive infinity
4649
matrix = torch.tensor(
@@ -65,6 +68,7 @@ def test_log_det_singular(self):
6568
], matrix.grad.tolist()
6669

6770
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_log_det_weird -v
71+
@skip_if_no_gpu
6872
def test_log_det_weird(self):
6973
# Create a tensor representing positive infinity
7074
matrix = torch.tensor(
@@ -95,6 +99,7 @@ def test_log_det_weird(self):
9599
], matrix.grad.tolist()
96100

97101
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_safe_pinverse_happy -v
102+
@skip_if_no_gpu
98103
def test_safe_pinverse_happy(self):
99104
# Create a tensor representing positive infinity
100105
matrix = torch.tensor(
@@ -123,6 +128,7 @@ def test_safe_pinverse_happy(self):
123128
], matrix.grad.tolist()
124129

125130
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_safe_pinverse_singular -v
131+
@skip_if_no_gpu
126132
def test_safe_pinverse_singular(self):
127133
# Create a tensor representing positive infinity
128134
matrix = torch.tensor(
@@ -151,6 +157,7 @@ def test_safe_pinverse_singular(self):
151157
], matrix.grad.tolist()
152158

153159
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_safe_pinverse_weird -v
160+
@skip_if_no_gpu
154161
def test_safe_pinverse_weird(self):
155162
# Create a tensor representing positive infinity
156163
matrix = torch.tensor(
@@ -194,6 +201,7 @@ def test_safe_pinverse_weird(self):
194201
], matrix.grad.tolist()
195202

196203
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_theoretical_lower_bound -v
204+
@skip_if_no_gpu
197205
def test_theoretical_lower_bound(self):
198206
# Construct vectors that should ideally minimize mutual information
199207
original_a = torch.tensor(
@@ -220,6 +228,7 @@ def test_theoretical_lower_bound(self):
220228
loss.backward()
221229

222230
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_theoretical_upper_bound -v
231+
@skip_if_no_gpu
223232
def test_theoretical_upper_bound(self):
224233
original_a = torch.tensor(
225234
[[[1.0, 0.0], [-1.0, 0.0]]], requires_grad=True, device="cuda:0"
@@ -246,6 +255,7 @@ def test_theoretical_upper_bound(self):
246255
loss.backward()
247256

248257
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_dimension_mismatch -v
258+
@skip_if_no_gpu
249259
def test_dimension_mismatch(self):
250260
a = torch.tensor(
251261
[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]], [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]],
@@ -266,6 +276,7 @@ def test_dimension_mismatch(self):
266276
loss.backward()
267277

268278
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_submodularity -v
279+
@skip_if_no_gpu
269280
def test_submodularity(self):
270281
# q = [0.7071, 0.7071, 0.0000] # query
271282
a = [1.0000, 0.0000, 0.0000] # 0 # partial match
@@ -297,6 +308,7 @@ def test_submodularity(self):
297308
assert best_candidate == e, best_candidate
298309

299310
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_submodularity_with_arbitary_order -v
311+
@skip_if_no_gpu
300312
def test_submodularity_with_arbitary_order(self):
301313
# q = [0.7071, 0.7071, 0.0000] # query
302314
a = [1.0000, 0.0000, 0.0000] # 0 # partial match
@@ -330,6 +342,7 @@ def test_submodularity_with_arbitary_order(self):
330342
assert best_candidate == e, best_candidate
331343

332344
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_overfit -v
345+
@skip_if_no_gpu
333346
def test_overfit(self):
334347
set_seed(42)
335348

@@ -416,6 +429,7 @@ def test_overfit(self):
416429
# assert mse_loss.item() <= 1.5, mse_loss.item()
417430

418431
# python -m unittest losses.quaild_log_det_mi_loss_test.TestQuaidLogDetMILoss.test_overfit_amp -v
432+
@skip_if_no_gpu
419433
def test_overfit_amp(self):
420434
set_seed(42)
421435

src/offline_eval_pipeline.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,10 @@ def cleanup(self):
4545
gc.collect()
4646

4747
# Hopefully fix oom
48-
torch.cuda.synchronize()
49-
torch.cuda.empty_cache()
50-
torch.cuda.synchronize()
48+
if torch.cuda.is_available():
49+
torch.cuda.synchronize()
50+
torch.cuda.empty_cache()
51+
torch.cuda.synchronize()
5152

5253
def is_done(self):
5354
if self.current_dataset_name is None:

src/subset_selection_strategies/quaild_submodular_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from subset_selection_strategies.quaild_submodular import QuaildSubmodularStrategy
55
import unittest
66
import torch.nn.functional as F
7+
from test_utils import skip_if_no_gpu
78

89

910
# python -m unittest subset_selection_strategies.quaild_submodular_test.TestQuaildSubmodularStrategy -v
@@ -368,6 +369,7 @@ def test_subset_select_with_similarity_many_fl(self):
368369
assert scores.tolist() == expected_scores, scores.tolist()
369370

370371
# python -m unittest subset_selection_strategies.quaild_submodular_test.TestQuaildSubmodularStrategy.test_subset_select_many_ld -v
372+
@skip_if_no_gpu
371373
def test_subset_select_many_ld(self):
372374
config = Config.from_file("experiments/tests/quaild_test_experiment.yaml")
373375
config.architecture.semantic_search_model.type = "noop"
@@ -426,6 +428,7 @@ def test_subset_select_many_ld(self):
426428
assert scores.tolist() == expected_scores, scores.tolist()
427429

428430
# python -m unittest subset_selection_strategies.quaild_submodular_test.TestQuaildSubmodularStrategy.test_subset_select_with_similarity_many_ld -v
431+
@skip_if_no_gpu
429432
def test_subset_select_with_similarity_many_ld(self):
430433
config = Config.from_file("experiments/tests/quaild_test_experiment.yaml")
431434
config.architecture.semantic_search_model.type = "noop"

src/test_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import torch
2+
3+
4+
def skip_if_no_gpu(test_func):
5+
"""Decorator to skip test if GPU is not available"""
6+
7+
def wrapper(self):
8+
if not torch.cuda.is_available():
9+
self.skipTest("GPU not available")
10+
return test_func(self)
11+
12+
return wrapper

src/training_pipeline.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,14 @@
1919
)
2020
from training_strategies import TRAINING_STRATEGIES_LUT
2121
from config import RootConfig
22-
from torch.cuda.amp import GradScaler
22+
23+
try:
24+
from torch.cuda.amp import GradScaler
25+
26+
CUDA_AVAILABLE = torch.cuda.is_available()
27+
except ImportError:
28+
GradScaler = None
29+
CUDA_AVAILABLE = False
2330
import torch.optim as optim
2431
from tqdm import tqdm
2532

@@ -107,7 +114,7 @@ def _load_parts(self, config: RootConfig):
107114

108115
# Optimizer
109116
print("Preparing optimizer")
110-
self.scaler = GradScaler()
117+
self.scaler = GradScaler() if CUDA_AVAILABLE and GradScaler else None
111118
self.optimizer = optim.AdamW(
112119
self.semantic_search_model.get_all_trainable_parameters(),
113120
lr=config.training.learning_rate,
@@ -171,17 +178,23 @@ def train_one_epoch(self):
171178
self.optimizer.zero_grad()
172179

173180
# Automatic Mixed Precision
174-
with torch.cuda.amp.autocast():
181+
if CUDA_AVAILABLE:
182+
with torch.cuda.amp.autocast():
183+
loss = self.training_strategy.train_step(batch)
184+
else:
175185
loss = self.training_strategy.train_step(batch)
176186

177-
# Bad batch
178-
if check_for_nan_then_dump(loss, batch):
179-
continue
187+
# Bad batch
188+
if check_for_nan_then_dump(loss, batch):
189+
continue
180190

181-
pbar.set_description(f"Loss: {round(loss.item()*10000)/10000}")
191+
pbar.set_description(f"Loss: {round(loss.item()*10000)/10000}")
182192

183193
# Scales loss. Calls backward() on scaled loss to create scaled gradients.
184-
self.scaler.scale(loss).backward()
194+
if self.scaler:
195+
self.scaler.scale(loss).backward()
196+
else:
197+
loss.backward()
185198

186199
extra_metrics = {}
187200
if self.current_step % 100 == 0:
@@ -206,15 +219,20 @@ def train_one_epoch(self):
206219
)
207220

208221
# Unscales gradients and calls or skips optimizer.step()
209-
self.scaler.step(self.optimizer)
222+
if self.scaler:
223+
self.scaler.step(self.optimizer)
224+
else:
225+
self.optimizer.step()
210226
self.lr_scheduler.step()
211227

212228
# Updates the scale for next iteration
213-
self.scaler.update()
229+
if self.scaler:
230+
self.scaler.update()
214231
except Exception as e:
215232
traceback.print_exc()
216233
print("[train_one_epoch]", e)
217-
torch.cuda.empty_cache()
234+
if torch.cuda.is_available():
235+
torch.cuda.empty_cache()
218236

219237
self.current_step += 1
220238

0 commit comments

Comments (0)