Skip to content

Commit cdda167

Browse files
committed
add throughput-related metrics
1 parent 5023bd5 commit cdda167

2 files changed

Lines changed: 46 additions & 22 deletions

File tree

recml/core/training/jax_trainer.py

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import os
2121
import pprint
2222
from typing import Any, Generic, Protocol, Self, TypeVar
23+
import time
2324

2425
from absl import logging
2526
from clu import data as clu_data
@@ -558,28 +559,49 @@ def _write_marker_file(self):
558559
f.write("COMPLETED")
559560

560561
def _train_n_steps(
561-
self,
562-
train_iter: Iterator[PyTree],
563-
train_step: partitioning.StepFn,
564-
state: State,
565-
start_step: int,
566-
num_steps: int,
567-
summary_writer: metrics_tools.AsyncMultiWriter,
568-
) -> tuple[State, Mapping[str, Any]]:
569-
"""Performs a training loop and returns the updated state and metrics."""
570-
metrics_accum = metrics_tools.MetricAccumulator(summary_writer)
571-
for step in range(start_step, start_step + num_steps):
572-
with jax.profiler.StepTraceAnnotation("train", step_num=step):
573-
train_batch = next(train_iter)
574-
inputs = self._partitioner.shard_inputs(train_batch)
575-
state, metrics_update = train_step(inputs, state)
576-
metrics_accum.accumulate(metrics_update, step)
577-
self.report_progress(step)
578-
if (step != start_step + num_steps - 1) and self._enable_checkpointing:
579-
self._maybe_save_checkpoint(step, state)
580-
581-
metrics = metrics_accum.compute_and_log_scalars(start_step + num_steps - 1)
582-
return state, metrics
562+
self,
563+
train_iter: Iterator[PyTree],
564+
train_step: partitioning.StepFn,
565+
state: State,
566+
start_step: int,
567+
num_steps: int,
568+
summary_writer: metrics_tools.AsyncMultiWriter,
569+
) -> tuple[State, Mapping[str, Any]]:
570+
"""Performs a training loop and returns the updated state and metrics."""
571+
metrics_accum = metrics_tools.MetricAccumulator(summary_writer)
572+
for step in range(start_step, start_step + num_steps):
573+
with jax.profiler.StepTraceAnnotation("train", step_num=step):
574+
train_batch = next(train_iter)
575+
step_start = time.time()
576+
inputs = self._partitioner.shard_inputs(train_batch)
577+
state, metrics_update = train_step(inputs, state)
578+
579+
timing_metrics = {}
580+
if step - start_step > 10:
581+
jax.block_until_ready(metrics_update)
582+
step_duration = time.time() - step_start
583+
584+
timing_metrics = {
585+
"perf/step_time_ms": base_metrics.scalar(step_duration * 1000),
586+
"perf/steps_per_sec": base_metrics.scalar(
587+
1.0 / step_duration if step_duration > 0 else 0
588+
),
589+
}
590+
591+
if "common/batch_size" in metrics_update:
592+
bs = metrics_update["common/batch_size"].compute()
593+
timing_metrics["perf/throughput_ex_per_sec"] = (
594+
base_metrics.scalar(bs / step_duration)
595+
)
596+
597+
metrics_accum.accumulate({**metrics_update, **timing_metrics}, step)
598+
599+
self.report_progress(step)
600+
if (step != start_step + num_steps - 1) and self._enable_checkpointing:
601+
self._maybe_save_checkpoint(step, state)
602+
603+
metrics = metrics_accum.compute_and_log_scalars(start_step + num_steps - 1)
604+
return state, metrics
583605

584606
def _evaluate_n_steps(
585607
self,

recml/examples/dlrm_experiment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ def _loss_fn(params: jt.PyTree) -> tuple[jt.Scalar, jt.Array]:
276276
loss = jnp.mean(optax.sigmoid_binary_cross_entropy(logits, label), axis=0)
277277
return loss, logits
278278

279+
global_batch_size = self.train_data.global_batch_size
279280
grad_fn = jax.value_and_grad(_loss_fn, has_aux=True, allow_int=True)
280281
(loss, logits), grads = grad_fn(state.params)
281282
state = state.update(grads=grads)
@@ -287,6 +288,7 @@ def _loss_fn(params: jt.PyTree) -> tuple[jt.Scalar, jt.Array]:
287288
'aucroc': recml.metrics.aucroc(label, logits, from_logits=True),
288289
'label/mean': recml.metrics.mean(label),
289290
'prediction/mean': recml.metrics.mean(jax.nn.sigmoid(logits)),
291+
"common/batch_size": recml.metrics.scalar(global_batch_size),
290292
}
291293
return state, metrics
292294

0 commit comments

Comments (0)