Add new PTL callback to measure the wall-clock time per optimizer step to match native recipe

balvisio · balvisio · commit 1431df393c8f · 2026-05-24T22:47:57.000Z
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py
@@ -35,6 +35,7 @@
 from src.tokenizer import Tokenizer
 from src.utils.fsdp_config import get_fsdp_strategy
 from src.utils.grad_norm_callback import GradientNormLogger
+from src.utils.interval_step_timing import IntervalStepTimingCallback
 from src.utils.pred_writer import PredWriter
 from src.utils.scheduler import linear_scheduler_with_warmup_lr_lambda
 from src.utils.throughput_logger import ThroughputLogger
@@ -136,6 +137,7 @@ def get_callbacks_config(args: Any) -> Dict[str, fdl.Config]:
         "lr_monitor": fdl.Config(LearningRateMonitor, logging_interval="step", log_weight_decay=True),
         "grad_norm_callback": fdl.Config(GradientNormLogger, log_every_n_steps=args.log_every_n_steps),
         "timer_callback": fdl.Config(StepTimingCallback, log_every_n_steps=args.log_every_n_steps, mode="train"),
+        "interval_timer_callback": fdl.Config(IntervalStepTimingCallback, log_every_n_steps=args.log_every_n_steps),
         "throughput_callback": fdl.Config(ThroughputLogger, log_every_n_steps=args.log_every_n_steps, warmup_steps=40),
     }
     if args.mode == "eval":
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/utils/__init__.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/utils/__init__.py
@@ -15,13 +15,15 @@
 
 
 from src.utils.grad_norm_callback import GradientNormLogger
+from src.utils.interval_step_timing import IntervalStepTimingCallback
 from src.utils.pred_writer import PredWriter
 from src.utils.pylogger import RankedLogger
 from src.utils.throughput_logger import ThroughputLogger
 
 
 __all__ = [
     "GradientNormLogger",
+    "IntervalStepTimingCallback",
     "PredWriter",
     "RankedLogger",
     "ThroughputLogger",
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/utils/interval_step_timing.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/utils/interval_step_timing.py
@@ -0,0 +1,85 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import torch
+from lightning.pytorch.callbacks import Callback
+
+
+class IntervalStepTimingCallback(Callback):
+    """Logs mean wall-clock time per optimizer step over a logging interval.
+
+    Mirrors the semantics of `train/step_time` in the native_te recipe's `PerfLogger`:
+    samples `time.perf_counter()` only at log boundaries, yielding the average
+    optimizer-step wall time over the last interval rather than a per-step measurement.
+
+    The elapsed time is divided by the number of optimizer steps completed since the
+    previous sample. In steady state that is exactly `log_every_n_steps`; it differs
+    only for irregular intervals, e.g. the first interval after resuming from a
+    checkpoint whose global step is not a multiple of `log_every_n_steps`.
+
+    Args:
+        log_every_n_steps: Number of optimizer steps between two logged values.
+
+    Raises:
+        ValueError: If `log_every_n_steps` is not positive.
+    """
+
+    def __init__(self, log_every_n_steps: int = 10):  # noqa: D107
+        super().__init__()
+        if log_every_n_steps <= 0:
+            raise ValueError(f"log_every_n_steps must be positive, got {log_every_n_steps}")
+        self.log_every_n_steps = log_every_n_steps
+        # Wall-clock sample and global step taken at the start of the current interval.
+        self.previous_log_time: float | None = None
+        self.previous_log_step: int = 0
+
+    def on_train_start(self, trainer, pl_module):  # noqa: D102
+        # Open the first interval. The step is recorded as well because `global_step`
+        # need not be 0 here (e.g. when training resumes from a checkpoint).
+        self.previous_log_time = time.perf_counter()
+        self.previous_log_step = trainer.global_step
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):  # noqa: D102
+        # Skip micro-batches that do not close a gradient-accumulation window.
+        if (batch_idx + 1) % trainer.accumulate_grad_batches != 0:
+            return
+
+        step = trainer.global_step
+        if step == 0 or step % self.log_every_n_steps != 0:
+            return
+
+        # Wait for queued GPU work so the sample covers the kernels of this interval.
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        now = time.perf_counter()
+        previous_time, previous_step = self.previous_log_time, self.previous_log_step
+        self.previous_log_time, self.previous_log_step = now, step
+
+        steps_elapsed = step - previous_step
+        if previous_time is None or steps_elapsed <= 0:
+            # No usable baseline (hook called without `on_train_start`, or no optimizer
+            # step since the last sample): start a new interval instead of logging.
+            return
+
+        pl_module.log(
+            "timing_train/step_time",
+            (now - previous_time) / steps_elapsed,
+            prog_bar=True,
+            on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )