Added per-batch timings, added eval and test timings (#390)

klemen1999 · web-flow · commit e68c353a8d9b · 2026-06-02T12:51:56.000+02:00
diff --git a/luxonis_train/callbacks/README.md b/luxonis_train/callbacks/README.md
@@ -198,9 +198,17 @@ Callback that publishes training progress and timing metrics.
 
 **Published Metrics:**
 
-| Metric Key                     | Description                                             |
-| ------------------------------ | ------------------------------------------------------- |
-| `train/epoch_progress_percent` | Percentage (0-100) of current epoch completed           |
-| `train/epoch_duration_sec`     | Time elapsed so far in current epoch                    |
-| `train/epoch_completion_sec`   | Total duration of completed training epoch in seconds   |
-| `val/epoch_completion_sec`     | Total duration of completed validation epoch in seconds |
+| Metric Key                     | Description                                              |
+| ------------------------------ | -------------------------------------------------------- |
+| `train/batch_total_sec`        | Time spent processing one training batch                 |
+| `train/epoch_progress_percent` | Percentage (0-100) of current epoch completed            |
+| `train/epoch_duration_sec`     | Time elapsed so far in current epoch                     |
+| `train/epoch_completion_sec`   | Total duration of completed training epoch in seconds    |
+| `val/batch_total_sec`          | Time spent processing one validation batch               |
+| `val/epoch_progress_percent`   | Percentage (0-100) of current validation epoch completed |
+| `val/epoch_duration_sec`       | Time elapsed so far in current validation epoch          |
+| `val/epoch_completion_sec`     | Total duration of completed validation epoch in seconds  |
+| `test/batch_total_sec`         | Time spent processing one test batch                     |
+| `test/epoch_progress_percent`  | Percentage (0-100) of current test epoch completed       |
+| `test/epoch_duration_sec`      | Time elapsed so far in current test epoch                |
+| `test/epoch_completion_sec`    | Total duration of completed test epoch in seconds        |
diff --git a/luxonis_train/callbacks/training_progress_callback.py b/luxonis_train/callbacks/training_progress_callback.py
@@ -1,4 +1,5 @@
 import time
+from math import isfinite
 from typing import Any
 
 import lightning.pytorch as pl
@@ -17,7 +18,15 @@ class TrainingProgressCallback(pl.Callback):
         - C{train/epoch_progress_percent}: Percentage of current epoch completed
         - C{train/epoch_duration_sec}: Time elapsed so far in current epoch (updated per batch)
         - C{train/epoch_completion_sec}: Total duration of completed training epoch in seconds
+        - C{train/batch_total_sec}: Time spent processing one training batch
+        - C{val/epoch_progress_percent}: Percentage of current validation epoch completed
+        - C{val/epoch_duration_sec}: Time elapsed so far in current validation epoch
         - C{val/epoch_completion_sec}: Total duration of completed validation epoch in seconds
+        - C{val/batch_total_sec}: Time spent processing one validation batch
+        - C{test/epoch_progress_percent}: Percentage of current test epoch completed
+        - C{test/epoch_duration_sec}: Time elapsed so far in current test epoch
+        - C{test/epoch_completion_sec}: Total duration of completed test epoch in seconds
+        - C{test/batch_total_sec}: Time spent processing one test batch
     """
 
     def __init__(self, log_every_n_batches: int = 1):
@@ -32,28 +41,78 @@ def __init__(self, log_every_n_batches: int = 1):
         self.log_every_n_batches = max(1, log_every_n_batches)
         self._train_epoch_start_time: float | None = None
         self._val_epoch_start_time: float | None = None
+        self._test_epoch_start_time: float | None = None
+        self._train_batch_start_time: float | None = None
+        self._val_batch_start_time: float | None = None
+        self._test_batch_start_time: float | None = None
+        self._train_batch_step = 0
+        self._val_batch_step = 0
+        self._test_batch_step = 0
+        self._val_epoch_batch_count = 0
+        self._test_epoch_batch_count = 0
+
+    @staticmethod
+    def _now() -> float:
+        return time.perf_counter()
+
+    @staticmethod
+    def _elapsed(start_time: float | None) -> float:
+        if start_time is None:
+            return 0.0
+        return time.perf_counter() - start_time
+
+    @staticmethod
+    def _total_batches(
+        total_batches: float | list[int | float],
+    ) -> int:
+        """Return the combined batch count of the eval dataloaders,
+        counting non-finite entries (e.g. C{inf}) as 0.
+        """
+        if isinstance(total_batches, list):
+            return sum(
+                int(batch_count)
+                for batch_count in total_batches
+                if isfinite(batch_count)
+            )
+        if not isfinite(total_batches):
+            return 0
+        return int(total_batches)
 
     @override
     def on_train_epoch_start(
         self,
         trainer: pl.Trainer,
         pl_module: "lxt.LuxonisLightningModule",
     ) -> None:
-        self._train_epoch_start_time = time.time()
+        self._train_epoch_start_time = self._now()
 
         if trainer.logger is None:
             logger.warning(
                 "TrainingProgressCallback requires a logger to be configured."
             )
             return
 
+        # Keep train progress/timing metrics on a cumulative batch axis.
+        # `global_step` tracks optimizer steps, so with gradient
+        # accumulation multiple train batches can collapse onto the same
+        # step and stop being truly per-batch aligned.
         trainer.logger.log_metrics(
             {
                 "train/epoch_progress_percent": 0.0,
             },
-            step=trainer.global_step,
+            step=self._train_batch_step,
         )
 
+    @override
+    def on_train_batch_start(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        self._train_batch_start_time = self._now()
+
     @rank_zero_only
     @override
     def on_train_batch_end(
@@ -64,11 +123,13 @@ def on_train_batch_end(
         batch: Any,
         batch_idx: int,
     ) -> None:
+        self._train_batch_step += 1
+
         if trainer.logger is None:
             return
 
         # Log every N batches to reduce overhead
-        if (batch_idx + 1) % self.log_every_n_batches != 0:
+        if not self._should_log_batch(batch_idx + 1):
             return
 
         total_batches = trainer.num_training_batches
@@ -79,18 +140,16 @@ def on_train_batch_end(
             else 0.0
         )
 
-        epoch_duration = (
-            time.time() - self._train_epoch_start_time
-            if self._train_epoch_start_time is not None
-            else 0.0
-        )
+        epoch_duration = self._elapsed(self._train_epoch_start_time)
+        batch_total = self._elapsed(self._train_batch_start_time)
 
         trainer.logger.log_metrics(
             {
                 "train/epoch_progress_percent": progress_percent,
                 "train/epoch_duration_sec": epoch_duration,
+                "train/batch_total_sec": batch_total,
             },
-            step=trainer.global_step,
+            step=self._train_batch_step,
         )
 
     @rank_zero_only
@@ -103,18 +162,14 @@ def on_train_epoch_end(
         if trainer.logger is None:
             return
 
-        epoch_duration = (
-            time.time() - self._train_epoch_start_time
-            if self._train_epoch_start_time is not None
-            else 0.0
-        )
+        epoch_duration = self._elapsed(self._train_epoch_start_time)
 
         trainer.logger.log_metrics(
             {
                 "train/epoch_completion_sec": epoch_duration,
                 "train/epoch_progress_percent": 100.0,
             },
-            step=trainer.current_epoch,
+            step=self._train_batch_step,
         )
 
     @override
@@ -123,7 +178,72 @@ def on_validation_epoch_start(
         trainer: pl.Trainer,
         pl_module: "lxt.LuxonisLightningModule",
     ) -> None:
-        self._val_epoch_start_time = time.time()
+        self._val_epoch_start_time = self._now()
+        self._val_epoch_batch_count = 0
+
+        if trainer.sanity_checking or trainer.logger is None:
+            return
+
+        trainer.logger.log_metrics(
+            {"val/epoch_progress_percent": 0.0},
+            step=self._val_batch_step,
+        )
+
+    @override
+    def on_validation_batch_start(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        if trainer.sanity_checking:
+            return
+
+        self._val_batch_start_time = self._now()
+
+    @rank_zero_only
+    @override
+    def on_validation_batch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+        outputs: STEP_OUTPUT,
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        if trainer.sanity_checking:
+            return
+
+        self._val_epoch_batch_count += 1
+        self._val_batch_step += 1
+
+        if trainer.logger is None:
+            return
+
+        if not self._should_log_batch(self._val_epoch_batch_count):
+            return
+
+        total_batches = self._total_batches(trainer.num_val_batches)
+        progress_percent = (
+            (self._val_epoch_batch_count / total_batches) * 100
+            if total_batches > 0
+            else 0.0
+        )
+        epoch_duration = self._elapsed(self._val_epoch_start_time)
+
+        trainer.logger.log_metrics(
+            {
+                "val/batch_total_sec": self._elapsed(
+                    self._val_batch_start_time
+                ),
+                "val/epoch_progress_percent": progress_percent,
+                "val/epoch_duration_sec": epoch_duration,
+            },
+            step=self._val_batch_step,
+        )
 
     @rank_zero_only
     @override
@@ -135,13 +255,116 @@ def on_validation_epoch_end(
         if trainer.sanity_checking or trainer.logger is None:
             return
 
-        epoch_duration = (
-            time.time() - self._val_epoch_start_time
-            if self._val_epoch_start_time is not None
+        epoch_duration = self._elapsed(self._val_epoch_start_time)
+
+        if self._val_epoch_batch_count > 0 and not self._should_log_batch(
+            self._val_epoch_batch_count
+        ):
+            trainer.logger.log_metrics(
+                {
+                    "val/epoch_progress_percent": 100.0,
+                    "val/epoch_duration_sec": epoch_duration,
+                },
+                step=self._val_batch_step,
+            )
+        trainer.logger.log_metrics(
+            {"val/epoch_completion_sec": epoch_duration},
+            step=trainer.current_epoch,
+        )
+
+    @override
+    def on_test_epoch_start(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+    ) -> None:
+        self._test_epoch_start_time = self._now()
+        self._test_epoch_batch_count = 0
+
+        if trainer.logger is None:
+            return
+
+        trainer.logger.log_metrics(
+            {"test/epoch_progress_percent": 0.0},
+            step=self._test_batch_step,
+        )
+
+    @override
+    def on_test_batch_start(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        self._test_batch_start_time = self._now()
+
+    @rank_zero_only
+    @override
+    def on_test_batch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+        outputs: STEP_OUTPUT,
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        self._test_epoch_batch_count += 1
+        self._test_batch_step += 1
+
+        if trainer.logger is None:
+            return
+
+        if not self._should_log_batch(self._test_epoch_batch_count):
+            return
+
+        total_batches = self._total_batches(trainer.num_test_batches)
+        progress_percent = (
+            (self._test_epoch_batch_count / total_batches) * 100
+            if total_batches > 0
             else 0.0
         )
+        epoch_duration = self._elapsed(self._test_epoch_start_time)
 
         trainer.logger.log_metrics(
-            {"val/epoch_completion_sec": epoch_duration},
+            {
+                "test/batch_total_sec": self._elapsed(
+                    self._test_batch_start_time
+                ),
+                "test/epoch_progress_percent": progress_percent,
+                "test/epoch_duration_sec": epoch_duration,
+            },
+            step=self._test_batch_step,
+        )
+
+    @rank_zero_only
+    @override
+    def on_test_epoch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "lxt.LuxonisLightningModule",
+    ) -> None:
+        if trainer.logger is None:
+            return
+
+        epoch_duration = self._elapsed(self._test_epoch_start_time)
+
+        if self._test_epoch_batch_count > 0 and not self._should_log_batch(
+            self._test_epoch_batch_count
+        ):
+            trainer.logger.log_metrics(
+                {
+                    "test/epoch_progress_percent": 100.0,
+                    "test/epoch_duration_sec": epoch_duration,
+                },
+                step=self._test_batch_step,
+            )
+        trainer.logger.log_metrics(
+            {"test/epoch_completion_sec": epoch_duration},
             step=trainer.current_epoch,
         )
+
+    def _should_log_batch(self, seen_batches: int) -> bool:
+        return seen_batches % self.log_every_n_batches == 0
diff --git a/luxonis_train/lightning/luxonis_lightning.py b/luxonis_train/lightning/luxonis_lightning.py
@@ -1025,10 +1025,18 @@ def get_mlflow_logging_keys(self) -> dict[str, list[str]]:
             elif callback.name == "TrainingProgressCallback":
                 metric_keys.update(
                     {
+                        "train/batch_total_sec",
                         "train/epoch_progress_percent",
                         "train/epoch_duration_sec",
                         "train/epoch_completion_sec",
+                        "val/batch_total_sec",
+                        "val/epoch_progress_percent",
+                        "val/epoch_duration_sec",
                         "val/epoch_completion_sec",
+                        "test/batch_total_sec",
+                        "test/epoch_progress_percent",
+                        "test/epoch_duration_sec",
+                        "test/epoch_completion_sec",
                     }
                 )
 
diff --git a/tests/integration/test_mlflow_logging.py b/tests/integration/test_mlflow_logging.py

Original file line number	Diff line number	Diff line change
`@@ -1025,10 +1025,18 @@ def get_mlflow_logging_keys(self) -> dict[str, list[str]]:`
`1025`	`1025`	`elif callback.name == "TrainingProgressCallback":`
`1026`	`1026`	`metric_keys.update(`
`1027`	`1027`	`{`
	`1028`	`+ "train/batch_total_sec",`
`1028`	`1029`	`"train/epoch_progress_percent",`
`1029`	`1030`	`"train/epoch_duration_sec",`
`1030`	`1031`	`"train/epoch_completion_sec",`
	`1032`	`+ "val/batch_total_sec",`
	`1033`	`+ "val/epoch_progress_percent",`
	`1034`	`+ "val/epoch_duration_sec",`
`1031`	`1035`	`"val/epoch_completion_sec",`
	`1036`	`+ "test/batch_total_sec",`
	`1037`	`+ "test/epoch_progress_percent",`
	`1038`	`+ "test/epoch_duration_sec",`
	`1039`	`+ "test/epoch_completion_sec",`
`1032`	`1040`	`}`
`1033`	`1041`	`)`
`1034`	`1042`