Sampler: add ConsumedStepsTracker to track consumed samples across data-parallel groups

jayhenry · jayhenry · commit adb12e8b19eb · 2026-04-03T10:50:50.000Z
diff --git a/xtuner/v1/datasets/consumed_steps.py b/xtuner/v1/datasets/consumed_steps.py
@@ -0,0 +1,65 @@
+"""Track consumed samples for checkpointing; aggregate across DP only (not
+SP/TP)."""
+
+from __future__ import annotations
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import DeviceMesh
+
+
+def reduce_sum_across_dp_group(dp_mesh: DeviceMesh | None, local_value: int) -> int:
+    """Sum ``local_value`` over ``dp_mesh``'s process group with an ``all_reduce`` and return it as an ``int``.
+
+    Returns ``int(local_value)`` with no communication when ``dp_mesh`` is ``None`` or has size <= 1, or when
+    ``torch.distributed`` is unavailable or uninitialized. Reducing over the DP group rather than the world group
+    avoids counting ranks that differ only in SP/TP, which see identical data batches, more than once.
+    """
+    if dp_mesh is None or dp_mesh.size() <= 1:
+        return int(local_value)
+    if not dist.is_available() or not dist.is_initialized():
+        return int(local_value)
+    if torch.cuda.is_available():  # NOTE(review): keyed on CUDA availability, not on the group's backend — confirm
+        device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    else:
+        device = torch.device("cpu")
+    tensor = torch.tensor([local_value], dtype=torch.int64, device=device)
+    dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=dp_mesh.get_group())  # in-place sum across the DP group
+    return int(tensor.item())
+
+
+class ConsumedStepsTracker:
+    """Tracks a consumed count as ``_init_steps`` (total restored at resume) plus ``_local_steps`` (recorded on this
+    rank since then); ``total_for_checkpoint`` adds ``_init_steps`` to the DP-group sum of ``_local_steps``."""
+
+    __slots__ = ("_dp_mesh", "_init_steps", "_local_steps")
+
+    def __init__(self, dp_mesh: DeviceMesh | None) -> None:
+        self._dp_mesh = dp_mesh
+        self._init_steps = 0  # global total restored via set_init_from_checkpoint; 0 for a fresh run
+        self._local_steps = 0  # this rank's accumulation since construction or the last restore
+
+    def record(self, n: int) -> None:
+        self._local_steps += int(n)  # rank-local only; summed across DP in total_for_checkpoint
+
+    def set_init_from_checkpoint(self, total: int) -> None:
+        """Set the restored global total to ``total`` and reset this rank's local count to zero."""
+        self._init_steps = int(total)
+        self._local_steps = 0
+
+    def total_for_checkpoint(self) -> int:
+        """Return ``_init_steps`` plus the DP-group sum of every rank's ``_local_steps``. The sum is an all-reduce when
+        the DP mesh has more than one rank, so all ranks of that group must call this together."""
+        return self._init_steps + reduce_sum_across_dp_group(self._dp_mesh, self._local_steps)
+
+
+def apply_old_ckpt_init_steps(sampler: object, sampler_state: dict, train_state_total: int | None) -> None:
+    """Seed ``sampler._consumed`` with ``train_state_total`` when ``sampler_state`` has no ``total_consumed_steps``
+    value. No-op when ``train_state_total`` is ``None``, the key is set, or the sampler lacks ``_consumed``."""
+    if train_state_total is None:
+        return
+    if sampler_state.get("total_consumed_steps") is not None:  # checkpoint already carries its own total
+        return
+    consumed: ConsumedStepsTracker | None = getattr(sampler, "_consumed", None)
+    if consumed is not None:
+        consumed.set_init_from_checkpoint(train_state_total)  # also resets the tracker's local count to 0
diff --git a/xtuner/v1/datasets/dataloader.py b/xtuner/v1/datasets/dataloader.py
@@ -5,6 +5,10 @@
 
 from xtuner.v1.datasets.collator import ColateItem
 from xtuner.v1.datasets.resume import get_dataloader_state, load_dataloader_state
+from xtuner.v1.utils import get_logger
+
+
+logger = get_logger()
 
 
 class BaseDataloader(ABC):
@@ -16,10 +20,10 @@ class BaseDataloader(ABC):
     """
 
     @abstractmethod
-    def load_state_dict(self, state_dict: dict) -> None: ...
+    def load_state_dict(self, state_dict: dict, train_state_total_consumed_samples: int | None = None) -> None: ...
 
     @abstractmethod
-    def get_state_dict(self, consumed_samples: int) -> dict: ...
+    def get_state_dict(self, consumed_samples: int = -1) -> dict: ...
 
     @abstractmethod
     def __iter__(self) -> Iterator[list[ColateItem]]: ...
@@ -33,13 +37,36 @@ class Dataloader(torch.utils.data.DataLoader, BaseDataloader):
     implement.
     """
 
-    def load_state_dict(self, state_dict: dict) -> None:
-        load_dataloader_state(self, state_dict)
+    def load_state_dict(
+        self,
+        state_dict: dict,
+        train_state_total_consumed_samples: int | None = None,
+    ) -> None:
+        load_dataloader_state(
+            self,
+            state_dict,
+            train_state_total_consumed_samples=train_state_total_consumed_samples,
+        )
 
-    def get_state_dict(self, consumed_samples: int) -> dict:
+    def get_state_dict(self, consumed_samples: int = -1) -> dict:
+        if consumed_samples != -1:
+            logger.warning(
+                "Dataloader.get_state_dict(consumed_samples=...) is deprecated; use the default (-1). "
+                "Consumed samples are tracked on the sampler."
+            )
         dataloader_state = get_dataloader_state(self, consumed_samples)
         return cast(dict, dataloader_state)
 
+    def record_consumed_samples(self, n: int) -> None:
+        if hasattr(self.sampler, "record_consumed_samples"):  # silently a no-op for samplers without tracking
+            self.sampler.record_consumed_samples(n)
+
+    def get_total_consumed_samples(self) -> int:
+        sampler = self.sampler
+        if hasattr(sampler, "get_total_consumed_steps"):  # sampler API is named "steps"; value is returned as-is
+            return int(sampler.get_total_consumed_steps())
+        return 0  # sampler does not expose a consumed total
+
     # __iter__ is inherited from torch.utils.data.DataLoader
 
     # Streaming dataloader may not have `set_epoch` and `__len__` method, so we add here.
diff --git a/xtuner/v1/datasets/preset_sampler.py b/xtuner/v1/datasets/preset_sampler.py
@@ -22,6 +22,7 @@
 
 from xtuner.v1.utils import get_logger
 
+from .consumed_steps import ConsumedStepsTracker
 from .preset_pack import PresetPackDataset
 
 
@@ -116,6 +117,7 @@ def __init__(
         else:
             self.rank = 0
             self.world_size = 1
+        self._consumed = ConsumedStepsTracker(dp_mesh)
 
         self.dataset = dataset
         self.global_batch_size = global_batch_size
@@ -170,19 +172,35 @@ def __len__(self) -> int:
     def set_epoch(self, epoch: int) -> None:
         self.epoch = epoch
 
-    def get_state_dict(self, step: int) -> dict:
+    def record_consumed_samples(self, n: int) -> None:
+        self._consumed.record(n)  # rank-local accumulation on the ConsumedStepsTracker; no communication
+
+    def get_total_consumed_steps(self) -> int:
+        return self._consumed.total_for_checkpoint()  # restored total + DP-group sum of local counts
+
+    def get_state_dict(self, step: int | None = None) -> dict:
         # Same convention as :class:`LengthGroupedSampler`: ``step`` is the global pack offset
         # (modulo ``total_size``) into ``global_order``, shared across all ranks in the checkpoint.
-        global_step = step % self.total_size
+        if step is None:
+            total_consumed = self._consumed.total_for_checkpoint()
+        else:
+            total_consumed = int(step)
+        global_step = total_consumed % self.total_size
         return {
             "epoch": self.epoch,
             "step": global_step,
+            "total_consumed_steps": total_consumed,
             "world_size": self.world_size,
             "num_samples": self.num_samples,
             "total_size": self.total_size,
         }
 
     def load_state_dict(self, state_dict: dict) -> None:
+        tc = state_dict.get("total_consumed_steps")
+        if tc is not None:
+            self._consumed.set_init_from_checkpoint(int(tc))
+        else:
+            self._consumed.set_init_from_checkpoint(0)
         if self.world_size != state_dict.get("world_size"):
             logger.warning(
                 f"PresetSampler: world_size mismatch: checkpoint has "
@@ -191,5 +209,4 @@ def load_state_dict(self, state_dict: dict) -> None:
             )
 
         self.epoch = state_dict["epoch"]
-        global_step = int(state_dict["step"])
-        self.step = global_step
+        self.step = int(state_dict["step"])
diff --git a/xtuner/v1/datasets/resume.py b/xtuner/v1/datasets/resume.py
@@ -3,6 +3,7 @@
 
 from xtuner.v1.utils import get_logger
 
+from .consumed_steps import apply_old_ckpt_init_steps
 from .packing import ExpandSoftPackDataset, _LegacySoftPackDataset
 from .sampler import LengthGroupedSampler, ParallelSampler
 
@@ -15,15 +16,21 @@ class DataloaderState(TypedDict):
     dataset: dict
 
 
-def get_dataloader_state(dataloader: DataLoader, consumed_samples: int) -> DataloaderState:
+def get_dataloader_state(dataloader: DataLoader, consumed_samples: int = -1) -> DataloaderState:
     sampler: ParallelSampler | LengthGroupedSampler = dataloader.sampler  # type: ignore[assignment]
     dataset: ExpandSoftPackDataset | _LegacySoftPackDataset = dataloader.dataset  # type: ignore[assignment]
     dataloader_state = DataloaderState(sampler={}, dataset={})
 
     if not hasattr(sampler, "load_state_dict") or not hasattr(sampler, "get_state_dict"):
         logger.warning(f"Resuming from {type(sampler)} is risky.")
-    else:
+    elif consumed_samples != -1:
+        logger.warning(
+            "Passing consumed_samples to get_dataloader_state is deprecated; "
+            "consumed sample totals are tracked on the sampler. Use the default consumed_samples=-1."
+        )
         dataloader_state["sampler"].update(sampler.get_state_dict(step=consumed_samples))
+    else:
+        dataloader_state["sampler"].update(sampler.get_state_dict())
 
     if not hasattr(dataset, "load_state_dict") or not hasattr(dataset, "get_state_dict"):
         logger.warning(f"Resuming from {type(dataset)} is risky.")
@@ -33,7 +40,11 @@ def get_dataloader_state(dataloader: DataLoader, consumed_samples: int) -> Datal
     return dataloader_state
 
 
-def load_dataloader_state(dataloader: DataLoader, state: dict):
+def load_dataloader_state(
+    dataloader: DataLoader,
+    state: dict,
+    train_state_total_consumed_samples: int | None = None,
+):
     sampler = dataloader.sampler
     dataset = dataloader.dataset
 
@@ -44,6 +55,7 @@ def load_dataloader_state(dataloader: DataLoader, state: dict):
 
     if hasattr(sampler, "load_state_dict"):
         sampler.load_state_dict(state["sampler"])
+        apply_old_ckpt_init_steps(sampler, state["sampler"], train_state_total_consumed_samples)
 
     # If the dataset records the training progress, we also restore it.
     if hasattr(dataset, "load_state_dict"):
diff --git a/xtuner/v1/datasets/sampler.py b/xtuner/v1/datasets/sampler.py
@@ -12,6 +12,7 @@
 
 from xtuner.v1.utils import get_logger
 
+from .consumed_steps import ConsumedStepsTracker
 from .jsonl import JsonlDataset
 from .packing import MLLMPretrainHybridPackDataset, _LegacySoftPackDataset
 from .preset_pack import PresetPackDataset
@@ -84,6 +85,7 @@ def __init__(
         self.epoch = 0
         self.step = 0
         self.round_up = round_up
+        self._consumed = ConsumedStepsTracker(dp_mesh)
 
         if self.round_up:
             self.num_samples = math.ceil(len(self.dataset) / global_batch_size) * global_batch_size // world_size
@@ -131,12 +133,23 @@ def set_epoch(self, epoch: int) -> None:
         """
         self.epoch = epoch
 
+    def record_consumed_samples(self, n: int) -> None:
+        self._consumed.record(n)  # rank-local accumulation on the ConsumedStepsTracker; no communication
+
+    def get_total_consumed_steps(self) -> int:
+        return self._consumed.total_for_checkpoint()  # restored total + DP-group sum of local counts
+
     def load_state_dict(self, state_dict) -> None:
         """Load the sampler state.
 
         Args:
             state_dict (dict): The state of the sampler.
         """
+        tc = state_dict.get("total_consumed_steps")
+        if tc is not None:
+            self._consumed.set_init_from_checkpoint(int(tc))
+        else:
+            self._consumed.set_init_from_checkpoint(0)
         self.epoch = state_dict["epoch"]
         self.step = state_dict["step"]
 
@@ -146,12 +159,17 @@ def load_state_dict(self, state_dict) -> None:
                 f"is different from the current shuffle ({self.shuffle})."
             )
 
-    def get_state_dict(self, step: int):
+    def get_state_dict(self, step: int | None = None):
         # Attention! Do not set self.step here, or it will cause the next __iter__ to get less samples.
-        step = step % self.total_size
+        if step is None:
+            total_consumed = self._consumed.total_for_checkpoint()
+        else:
+            total_consumed = int(step)
+        step_mod = total_consumed % self.total_size
         return {
             "epoch": self.epoch,
-            "step": step,
+            "step": step_mod,
+            "total_consumed_steps": total_consumed,
             "world_size": self.world_size,
             "shuffle": self.shuffle,
             "round_up": self.round_up,
@@ -233,6 +251,7 @@ def __init__(
         assert isinstance(self.max_lengths, (list, tuple, Column, np.ndarray))
 
         self.global_batch_size = global_batch_size
+        self._consumed = ConsumedStepsTracker(dp_mesh)
 
     def __iter__(self) -> Iterator[int]:
         """Iterate the indices."""
@@ -275,12 +294,23 @@ def set_epoch(self, epoch: int) -> None:
         """
         self.epoch = epoch
 
+    def record_consumed_samples(self, n: int) -> None:
+        self._consumed.record(n)  # rank-local accumulation on the ConsumedStepsTracker; no communication
+
+    def get_total_consumed_steps(self) -> int:
+        return self._consumed.total_for_checkpoint()  # restored total + DP-group sum of local counts
+
     def load_state_dict(self, state_dict: dict) -> None:
         """Load the sampler state.
 
         Args:
             state_dict (dict): The state of the sampler.
         """
+        tc = state_dict.get("total_consumed_steps")
+        if tc is not None:
+            self._consumed.set_init_from_checkpoint(int(tc))
+        else:
+            self._consumed.set_init_from_checkpoint(0)
         self.epoch = state_dict["epoch"]
         self.step = state_dict["step"]
 
@@ -298,17 +328,22 @@ def load_state_dict(self, state_dict: dict) -> None:
             )
             self.group_size = origin_group_size
 
-    def get_state_dict(self, step: int):
+    def get_state_dict(self, step: int | None = None):
         """Get the sampler state dict.
 
         Returns:
             dict: The state of the sampler.
         """
         # Attention! Do not set self.step here, or it will cause the next __iter__ to get less samples.
-        step = step % self.total_size
+        if step is None:
+            total_consumed = self._consumed.total_for_checkpoint()
+        else:
+            total_consumed = int(step)
+        step_mod = total_consumed % self.total_size
         return {
             "epoch": self.epoch,
-            "step": step,
+            "step": step_mod,
+            "total_consumed_steps": total_consumed,
             "world_size": self.world_size,
             "round_up": self.round_up,
             "num_samples": self.num_samples,
diff --git a/xtuner/v1/train/trainer.py b/xtuner/v1/train/trainer.py