Add wandb support for quant stats logging (#1526)

jomitchellnv · web-flow · commit 807782ad9e79 · 2026-03-17T22:07:58.000Z
### Description  #### Usage  ```python TODO: Add code snippet ``` ### Type of changes  - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### CI Pipeline Configuration Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run. - [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR - [ciflow:notebooks](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:notebooks) - Run Jupyter notebook execution tests for bionemo2 - [ciflow:slow](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:slow) - Run slow single-GPU integration tests marked as @pytest.mark.slow for bionemo2 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running all tests for bionemo2. - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes. Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline. For more details, see [CONTRIBUTING](CONTRIBUTING.md) > [!NOTE] > By default, only basic unit tests are run. Add appropriate labels to enable additional test coverage. #### Authorizing CI Runs We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources. 
- If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a `pull-request/`-prefixed branch in the source repository (e.g. pull-request/123). - If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit. #### Triggering CodeRabbit AI Review To trigger a code review from CodeRabbit, comment on a pull request with one of these commands: - @coderabbitai review - Triggers a standard review - @coderabbitai full review - Triggers a comprehensive review See https://docs.coderabbit.ai/reference/review-commands for a full list of commands. ### Pre-submit Checklist  - [ ] I have tested these changes locally - [ ] I have updated the documentation accordingly - [ ] I have added/updated tests as needed - [ ] All existing tests pass successfully --------- Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
diff --git a/bionemo-recipes/recipes/esm2_native_te/checkpoint.py b/bionemo-recipes/recipes/esm2_native_te/checkpoint.py
@@ -358,6 +358,7 @@ class AppState(Stateful):
         default_factory=lambda: StateDictOptions(
             full_state_dict=False,
             cpu_offload=True,
+            strict=False,
         )
     )
 
diff --git a/bionemo-recipes/recipes/esm2_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/esm2_native_te/hydra_config/defaults.yaml
@@ -87,6 +87,7 @@ quant_stats_config:
   enabled: false
   quant_stats_file: ./fp8_debugging_stats.yaml
   quant_log_dir: ./log_quant_stats
+  log_to_wandb: false
 
 # Note: The layers are going to come in 1 indexed and we convert them to be 0 indexed at runtime.
 fp8_layers: null
diff --git a/bionemo-recipes/recipes/esm2_native_te/quantization.py b/bionemo-recipes/recipes/esm2_native_te/quantization.py
@@ -20,11 +20,26 @@
 from pathlib import Path
 
 import yaml
+from nvdlfw_inspect.logging import BaseLogger
 
 
 logger = logging.getLogger(__name__)
 
 
+class WandBQuantLogger(BaseLogger):
+    """Forward nvdlfw_inspect quant stats to WandB as scalars.
+
+    Each stat is logged under the ``quant/`` prefix so it appears alongside
+    training metrics (loss, perplexity, etc.) in a single WandB dashboard.
+    """
+
+    def log_scalar(self, name: str, value: float | int, iteration: int, **kwargs):
+        """Log a single quant stat to WandB."""
+        import wandb
+
+        wandb.log({f"quant/{name}": value}, step=iteration)
+
+
 def generate_layer_regex(layer_numbers: list[int] | None) -> str:
     """Generate a regex pattern to match specific layer numbers (1-indexed).
 
@@ -99,6 +114,7 @@ def initialize_quant_stats_logging(
     quant_log_dir: str,
     rank: int,
     layer_precision: list[str | None],
+    statistics_logger: BaseLogger | None = None,
 ) -> None:
     """Set up quantization stats logging via nvdlfw_inspect.
 
@@ -111,6 +127,9 @@ def initialize_quant_stats_logging(
         rank: The global rank of this process.
         layer_precision: Per-layer precision list (0-indexed by position). Each element is
             ``"fp8"``, ``"fp4"``, or ``None``.
+        statistics_logger: Optional custom logger (e.g. :class:`WandBQuantLogger`) that receives
+            every ``log_scalar`` call from the debug API.  When provided together with
+            ``default_logging_enabled=True`` the file logger is kept as well.
     """
     import nvdlfw_inspect.api as debug_api
     import transformer_engine
@@ -133,6 +152,7 @@ def initialize_quant_stats_logging(
         config_file=updated_config,
         feature_dirs=[te_features_dir],
         log_dir=rank_log_dir,
+        statistics_logger=statistics_logger,
         default_logging_enabled=True,
     )
 
diff --git a/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py b/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
@@ -34,7 +34,7 @@
 from distributed_config import DistributedConfig
 from modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM
 from perf_logger import PerfLogger
-from quantization import initialize_quant_stats_logging, resolve_layer_precision
+from quantization import WandBQuantLogger, initialize_quant_stats_logging, resolve_layer_precision
 from scheduler import get_linear_schedule_with_warmup
 
 
@@ -82,11 +82,15 @@ def main(args: DictConfig) -> float | None:
         )
         config.layer_precision = layer_precision
         if args.quant_stats_config.enabled:
+            wandb_logger = None
+            if args.quant_stats_config.log_to_wandb and dist_config.is_main_process():
+                wandb_logger = WandBQuantLogger()
             initialize_quant_stats_logging(
                 quant_stats_file=args.quant_stats_config.quant_stats_file,
                 quant_log_dir=args.quant_stats_config.quant_log_dir,
                 rank=dist_config.rank,
                 layer_precision=layer_precision,
+                statistics_logger=wandb_logger,
             )
 
         # Create quantization recipes -- these are only used if FP8/FP4 is enabled in the config.

Original file line number	Diff line number	Diff line change
`@@ -358,6 +358,7 @@ class AppState(Stateful):`
`358`	`358`	`default_factory=lambda: StateDictOptions(`
`359`	`359`	`full_state_dict=False,`
`360`	`360`	`cpu_offload=True,`
	`361`	`+ strict=False,`
`361`	`362`	`)`
`362`	`363`	`)`
`363`	`364`