rework float32_matmul_precision hack

Erotemic · Erotemic · commit 823312793cf1 · 2025-02-23T13:41:29.000-05:00
diff --git a/train_kwcoco_demo.sh b/train_kwcoco_demo.sh
@@ -79,35 +79,36 @@ validation: $VALI_FPATH
 
 $CLASS_YAML
 "
-
 echo "$CONFIG_YAML" > "$DATASET_CONFIG_FPATH"
 
-
+TRAIN_DPATH="$BUNDLE_DPATH/kwcoco-demo-train-dir"
 # This might only work in development mode, otherwise we will get site packages
 # That still might be fine, but we do want to fix this to run anywhere.
 cd "$REPO_DPATH"
 LOG_BATCH_VIZ_TO_DISK=1 python -m yolo.lazy \
     task=train \
     dataset=kwcoco-demo \
+    use_tensorboard=True \
     use_wandb=False \
-    out_path="$BUNDLE_DPATH"/training \
+    out_path="$TRAIN_DPATH" \
     name=kwcoco-demo \
     cpu_num=0 \
     device=0 \
     accelerator=auto \
     task.data.batch_size=2 \
     "image_size=[640, 640]" \
-    task.optimizer.args.lr=0.0003
+    task.optimizer.args.lr=0.03
 
 
 ### show how to run inference
 
 BUNDLE_DPATH=$HOME/demo-yolo-kwcoco-train
+TRAIN_DPATH="$BUNDLE_DPATH/kwcoco-demo-train-dir"
 TEST_FPATH=$BUNDLE_DPATH/vidshapes_rgb_test/data.kwcoco.json
 # Grab a checkpoint
 CKPT_FPATH=$(python -c "if 1:
     import pathlib
-    ckpt_dpath = pathlib.Path('$BUNDLE_DPATH') / 'training/train/kwcoco-demo/checkpoints'
+    ckpt_dpath = pathlib.Path('$TRAIN_DPATH') / 'train/kwcoco-demo/checkpoints'
     checkpoints = sorted(ckpt_dpath.glob('*'))
     print(checkpoints[-1])
 ")
@@ -133,9 +134,11 @@ python yolo/lazy.py \
 ### Show how to run validation
 
 # Grab a checkpoint
+BUNDLE_DPATH=$HOME/demo-yolo-kwcoco-train
+TRAIN_DPATH="$BUNDLE_DPATH/kwcoco-demo-train-dir"
 CKPT_FPATH=$(python -c "if 1:
     import pathlib
-    ckpt_dpath = pathlib.Path('$BUNDLE_DPATH') / 'training/train/kwcoco-demo/checkpoints'
+    ckpt_dpath = pathlib.Path('$TRAIN_DPATH') / 'train/kwcoco-demo/checkpoints'
     checkpoints = sorted(ckpt_dpath.glob('*'))
     print(checkpoints[-1])
 ")
@@ -146,7 +149,7 @@ LOG_BATCH_VIZ_TO_DISK=1 python -m yolo.lazy \
     task=validation \
     dataset=kwcoco-demo \
     use_wandb=False \
-    out_path="$BUNDLE_DPATH"/training \
+    out_path="$TRAIN_DPATH" \
     name=kwcoco-demo \
     cpu_num=0 \
     device=0 \
diff --git a/yolo/utils/callbacks.py b/yolo/utils/callbacks.py
diff --git a/yolo/utils/logging_utils.py b/yolo/utils/logging_utils.py
@@ -339,21 +339,20 @@ def custom_wandb_log(string="", level=int, newline=True, repeat=True, prefix=Tru
         callbacks.append(YOLORichModelSummary())
 
     if 1:
-        from yolo.utils.callbacks import TorchGlobals
-        callbacks.append(TorchGlobals(float32_matmul_precision='auto'))
+        import lightning
         checkpoint_init_args = {
             'monitor': 'train_loss',
             'mode': 'min',
             'save_top_k': 5,
             'filename': '{epoch:04d}-{step:06d}-trainloss{train_loss:.3f}.ckpt',
             'save_last': True,
         }
-        import lightning
         checkpointer = lightning.pytorch.callbacks.ModelCheckpoint(**checkpoint_init_args)
         callbacks.append(checkpointer)
 
     callbacks.append(ImageLogger())
 
+    print(f'cfg.use_tensorboard={cfg.use_tensorboard}')
     if cfg.use_tensorboard:
         loggers.append(TensorBoardLogger(log_graph="all", save_dir=save_path))
     if cfg.use_wandb:
diff --git a/yolo/utils/trainer.py b/yolo/utils/trainer.py
@@ -10,11 +10,18 @@ class YoloTrainer(lightning.Trainer):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self._hacked_torch_global_callback = TorchGlobals(float32_matmul_precision='auto')
 
-    def _run_stage(self, *args, **kwargs):
+    def _run(self, *args, **kwargs):
-        # All I want is to print this  directly before training starts.
-        # Is that so hard to do?
+        # Torch globals must be set before the accelerators are set up, so
+        # run our hook first and hand the parent's result back to the caller.
         self._on_before_run()
+        return super()._run(*args, **kwargs)
+
+    def _run_stage(self, *args, **kwargs):
+        # All I want is to print this directly before training starts.
+        # Is that so hard to do?
+        self._on_before_run_stage()
-        super()._run_stage(*args, **kwargs)
+        return super()._run_stage(*args, **kwargs)
 
     @property
@@ -32,6 +39,12 @@ def log_dpath(self):
         return ub.Path(self.logger.log_dir)
 
     def _on_before_run(self):
+        """
+        Apply the torch-globals hack; `_run` calls this before `super()._run`.
+        """
+        self._hacked_torch_global_callback.before_setup_environment(self)
+
+    def _on_before_run_stage(self):
         """
         Our custom "callback"
         """
@@ -43,3 +56,45 @@ def _on_before_run_rank0(self):
         import rich
         dpath = self.log_dpath
         rich.print(f"Trainer log dpath:\n\n[link={dpath}]{dpath}[/link]\n")
+
+
+class TorchGlobals(lightning.pytorch.callbacks.Callback):
+    """
+    Callback-style helper that configures process-wide torch globals.
+
+    Note: this needs to be called before the accelerators are setup, and
+    existing callbacks don't have mechanisms for that, so YoloTrainer calls
+    ``before_setup_environment`` directly instead of relying on Lightning.
+
+    Args:
+        float32_matmul_precision (str):
+            can be 'medium', 'high', 'highest', 'default', or 'auto'.
+            The 'default' value does not change any setting.
+            The 'auto' value selects 'medium' when CUDA is available and
+            every training device reports compute capability >= 8 (Ampere
+            or newer); otherwise no setting is changed.
+    """
+
+    def __init__(self, float32_matmul_precision='default'):
+        super().__init__()
+        self.float32_matmul_precision = float32_matmul_precision
+
+    def before_setup_environment(self, trainer):
+        import torch
+        print('Setup Torch Globals')
+        float32_matmul_precision = self.float32_matmul_precision
+        if float32_matmul_precision == 'default':
+            float32_matmul_precision = None
+        elif float32_matmul_precision == 'auto':
+            # Ampere (compute capability 8.x) and later leverage tensor cores,
+            # where medium float32_matmul_precision becomes useful.
+            float32_matmul_precision = None
+            if torch.cuda.is_available():
+                device_versions = [torch.cuda.get_device_capability(device_id)[0]
+                                   for device_id in trainer.device_ids]
+                # all([]) is True, so require at least one device.
+                if device_versions and all(v >= 8 for v in device_versions):
+                    float32_matmul_precision = 'medium'
+        if float32_matmul_precision is not None:
+            print(f'Update: float32_matmul_precision={float32_matmul_precision}')
+            torch.set_float32_matmul_precision(float32_matmul_precision)