Adds quant stats logging support

jomitchellnv · jomitchellnv · commit 57a50b337035 · 2026-04-06T20:11:13.000-07:00
Adds unpadded tokens-per-second (train/unpadded_tokens_per_sec) to wandb charts
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/fp8_debugging_stats.yaml b/bionemo-recipes/recipes/esm2_minifold_te/fp8_debugging_stats.yaml
@@ -0,0 +1,23 @@
+example_fp8_tensor_stat_collection:
+    enabled: True
+    layers:
+        # Match the te.Linear sublayers within MiniFormer blocks
+        layer_types: [pi, gi, po, go, fc1, fc2]
+    transformer_engine:
+        LogFp8TensorStats:
+            enabled: True
+            tensors_struct:
+            - tensor: activation
+              stats: [underflows%, scale_inv_min, scale_inv_max, mse]
+              freq: 10
+            - tensor: gradient
+              stats: [underflows%, scale_inv_min, scale_inv_max, mse]
+              freq: 10
+            - tensor: weight
+              stats: [underflows%, scale_inv_min, scale_inv_max, mse]
+              freq: 10
+        LogTensorStats:
+            enabled: True
+            stats: [max, min, mean, std, l1_norm]
+            tensors: [dgrad, wgrad]
+            freq: 1
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/L0_sanity.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/L0_sanity.yaml
@@ -73,6 +73,12 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 # Log every step for sanity check
 logger:
   frequency: 1
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/defaults.yaml
@@ -81,6 +81,13 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+# Quantization stats logging (requires nvdlfw_inspect)
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 # Logging
 logger:
   frequency: 100
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/eval.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/eval.yaml
@@ -49,6 +49,12 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 wandb_init_args:
   project: esm2_minifold_te
   name: eval_${now:%Y%m%d_%H%M%S}
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100.yaml
@@ -64,6 +64,12 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 logger:
   frequency: 5
 
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real.yaml
@@ -71,6 +71,12 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 logger:
   frequency: 5
 
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real_3B.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real_3B.yaml
@@ -70,6 +70,12 @@ component_precision:
   seq_proj: false      # Sequence/pair feature projections (fc_s_1, fc_s_2, fc_z_1, fc_z_2, seq_to_pair te.Linear layers)
   dist_head: false     # Distogram output head (fc_out_1, fc_out_2 te.Linear layers) — kept in model's base precision per MXFP8 paper
 
+quant_stats_config:
+  enabled: false
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
+
 logger:
   frequency: 5
 
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/perf_logger.py b/bionemo-recipes/recipes/esm2_minifold_te/perf_logger.py
@@ -18,6 +18,7 @@
 import logging
 import time
 
+import nvdlfw_inspect.api as debug_api
 import torch
 import torchmetrics
 from omegaconf import DictConfig, OmegaConf
@@ -44,6 +45,7 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
 
         self.min_loss = torch.tensor(float("inf"), device=torch.device(f"cuda:{dist_config.local_rank}"))
         self.logging_frequency = args.logger.frequency
+        self.quant_stats_enabled = args.quant_stats_config.enabled
 
         metrics_dict = {
             "train/loss": torchmetrics.MeanMetric(),
@@ -57,6 +59,7 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
             "train/contact_recall_8A": torchmetrics.MeanMetric(),
             "train/lddt_from_distogram": torchmetrics.MeanMetric(),
             "train/mean_distance_error": torchmetrics.MeanMetric(),
+            "train/unpadded_tokens_per_sec": torchmetrics.MeanMetric(),
         }
 
         self.metrics = torchmetrics.MetricCollection(metrics_dict)
@@ -75,6 +78,7 @@ def log_step(
         grad_norm: torch.Tensor | DTensor | float = 0.0,
         lr: float = 0.0,
         structure_metrics: dict[str, torch.Tensor] | None = None,
+        unpadded_tokens: float = 0.0,
     ):
         """Log a training step."""
         with torch.no_grad():
@@ -95,6 +99,8 @@ def log_step(
                 self.metrics["train/learning_rate"].update(lr)
                 self.metrics["train/grad_norm"].update(grad_norm)
                 self.metrics["train/step_time"].update(step_time)
+                if unpadded_tokens > 0 and step_time > 0:
+                    self.metrics["train/unpadded_tokens_per_sec"].update(unpadded_tokens / step_time)
 
                 if structure_metrics is not None:
                     for key, value in structure_metrics.items():
@@ -121,8 +127,13 @@ def log_step(
                 if self._dist_config.local_rank == 0:
                     logger.info(", ".join([f"{k.split('/')[1]}: {v:.3g}" for k, v in metrics.items()]))
 
+                if self.quant_stats_enabled:
+                    debug_api.step()
+
     def finish(self):
         """Finish the logger."""
+        if self.quant_stats_enabled:
+            debug_api.end_debug()
         if not self._dist_config.is_main_process():
             return
         wandb.finish()
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/quantization.py b/bionemo-recipes/recipes/esm2_minifold_te/quantization.py
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/tests/test_precisions.py b/bionemo-recipes/recipes/esm2_minifold_te/tests/test_precisions.py
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/tests/test_quantization.py b/bionemo-recipes/recipes/esm2_minifold_te/tests/test_quantization.py
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/train_fsdp2.py b/bionemo-recipes/recipes/esm2_minifold_te/train_fsdp2.py