feat(train): show finish time in ETA logs (#5328)

OutisLi · web-flow · commit b97ad98514b2 · 2026-03-21T17:41:45.000Z
Make long-running training progress easier to read by keeping the
relative ETA and appending a concise absolute finish time across the pt,
pd, tf, and pt_expt backends.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
  * Training logs now include both remaining ETA and an estimated local
    finish time (YYYY-MM-DD HH:MM).
  * Timezone-aware local timestamps are shown across training frameworks
    for clearer cross-region monitoring and more consistent periodic timing
    output.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
diff --git a/deepmd/loggers/training.py b/deepmd/loggers/training.py
@@ -6,15 +6,65 @@
 log = logging.getLogger(__name__)
 
 
+def _format_estimated_finish_time(
+    eta_seconds: int,
+    current_time: datetime.datetime | None = None,
+) -> str:
+    """Format the estimated local finish time.
+
+    Parameters
+    ----------
+    eta_seconds : int
+        Remaining time in seconds.
+    current_time : datetime.datetime | None, optional
+        Current local time used to estimate the finish timestamp. If ``None``,
+        the current local time is used.
+
+    Returns
+    -------
+    str
+        Estimated local finish time in ``YYYY-MM-DD HH:MM`` format.
+    """
+    if current_time is None:
+        current_time = datetime.datetime.now(datetime.timezone.utc).astimezone()
+    elif current_time.tzinfo is not None:
+        current_time = current_time.astimezone()
+    finish_time = current_time + datetime.timedelta(seconds=eta_seconds)
+    return finish_time.strftime("%Y-%m-%d %H:%M")
+
+
 def format_training_message(
     batch: int,
     wall_time: float,
     eta: int | None = None,
+    current_time: datetime.datetime | None = None,
 ) -> str:
-    """Format a training message."""
+    """Format the summary message for one training interval.
+
+    Parameters
+    ----------
+    batch : int
+        The batch index.
+    wall_time : float
+        Wall-clock time shown in the progress message in seconds.
+    eta : int | None, optional
+        Remaining time in seconds.
+    current_time : datetime.datetime | None, optional
+        Current local time used to estimate the finish timestamp. This is only
+        used when ``eta`` is provided.
+
+    Returns
+    -------
+    str
+        The formatted training message.
+    """
     msg = f"Batch {batch:7d}: total wall time = {wall_time:.2f} s"
     if isinstance(eta, int):
-        msg += f", eta = {datetime.timedelta(seconds=int(eta))!s}"
+        eta_seconds = int(eta)
+        msg += (
+            f", eta = {datetime.timedelta(seconds=eta_seconds)!s} at "
+            f"{_format_estimated_finish_time(eta_seconds, current_time=current_time)}"
+        )
     return msg
 
 
@@ -39,6 +89,11 @@ def format_training_message_per_task(
         The learning rate
     check_total_rmse_nan : bool
         Whether to throw an error if the total RMSE is NaN
+
+    Returns
+    -------
+    str
+        The formatted training message for the task.
     """
     if task_name:
         task_name += ": "
diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import contextlib
+import datetime
 import functools
 import logging
 import time
@@ -982,6 +983,10 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
                             batch=display_step_id,
                             wall_time=train_time,
                             eta=eta,
+                            current_time=datetime.datetime.fromtimestamp(
+                                current_time,
+                                tz=datetime.timezone.utc,
+                            ).astimezone(),
                         )
                     )
                 # the first training time is not accurate
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import datetime
 import functools
 import json
 import logging
@@ -1334,6 +1335,10 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
                             batch=display_step_id,
                             wall_time=train_time,
                             eta=eta,
+                            current_time=datetime.datetime.fromtimestamp(
+                                current_time,
+                                tz=datetime.timezone.utc,
+                            ).astimezone(),
                         )
                     )
                 if (
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
@@ -6,6 +6,7 @@
 converted to torch tensors at the boundary.
 """
 
+import datetime
 import functools
 import logging
 import time
@@ -30,6 +31,7 @@
     LearningRateExp,
 )
 from deepmd.loggers.training import (
+    format_training_message,
     format_training_message_per_task,
 )
 from deepmd.pt_expt.loss import (
@@ -732,6 +734,7 @@ def run(self) -> None:
 
         self.wrapper.train()
         wall_start = time.time()
+        last_log_time = wall_start
 
         for step_id in range(self.start_step, self.num_steps):
             cur_lr = float(self.lr_schedule.value(step_id))
@@ -792,17 +795,40 @@ def run(self) -> None:
                         }
 
                 # wall-clock time
-                wall_elapsed = time.time() - wall_start
+                current_time = time.time()
+                wall_elapsed = current_time - wall_start
+                interval_wall_time = current_time - last_log_time
+                last_log_time = current_time
                 if self.timing_in_training:
                     step_time = t_end - t_start
+                    steps_completed_since_restart = max(
+                        1,
+                        display_step_id - self.start_step,
+                    )
+                    eta = int(
+                        (self.num_steps - display_step_id)
+                        / steps_completed_since_restart
+                        * wall_elapsed
+                    )
                     log.info(
-                        "step=%d  wall=%.2fs  step_time=%.4fs",
-                        display_step_id,
-                        wall_elapsed,
-                        step_time,
+                        format_training_message(
+                            batch=display_step_id,
+                            wall_time=interval_wall_time,
+                            eta=eta,
+                            current_time=datetime.datetime.fromtimestamp(
+                                current_time,
+                                tz=datetime.timezone.utc,
+                            ).astimezone(),
+                        )
                     )
+                    log.info("step=%d  step_time=%.4fs", display_step_id, step_time)
                 else:
-                    log.info("step=%d  wall=%.2fs", display_step_id, wall_elapsed)
+                    log.info(
+                        format_training_message(
+                            batch=display_step_id,
+                            wall_time=interval_wall_time,
+                        )
+                    )
 
                 # log
                 log.info(
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import datetime
 import logging
 import os
 import shutil
@@ -603,10 +604,20 @@ def train(
                     toc = time.time()
                     test_time = toc - tic
                     wall_time = toc - wall_time_tic
+                    displayed_batches = max(
+                        1,
+                        min(self.disp_freq, int(cur_batch - start_batch)),
+                    )
+                    eta = int((stop_batch - cur_batch) / displayed_batches * wall_time)
                     log.info(
                         format_training_message(
                             batch=cur_batch,
                             wall_time=wall_time,
+                            eta=eta,
+                            current_time=datetime.datetime.fromtimestamp(
+                                toc,
+                                tz=datetime.timezone.utc,
+                            ).astimezone(),
                         )
                     )
                     # the first training time is not accurate