Merge remote-tracking branch 'origin/main' into type-annotations

thomaschhh · thomaschhh · commit ed78497933ab · 2024-11-18T12:55:53.000Z
diff --git a/.github/workflows/tests_full.yml b/.github/workflows/tests_full.yml
@@ -23,7 +23,7 @@ jobs:
         sudo apt-get update
         sudo apt-get install curl -y       # required by coveralls
         sudo apt-get install git -y
-        python -m pip install torch
+        python -m pip install torch~=2.4.1
         python -m pip install --upgrade pip setuptools wheel
         export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
         python -m pip install -e .[tests]
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ Currently, the flash attention dependency cannot be installed without torch bein
 Until the flash attention developers fix this, we have to run
 
 ```sh
-pip install torch
+pip install torch~=2.4.1
 ```
 beforehand.
 
@@ -75,7 +75,7 @@ pip install -e .
 To install Modalities via pip, run
 
 ```sh
-pip install torch
+pip install torch~=2.4.1
 pip install modalities
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod
 readme = "README.md"
 dependencies = [
     "numpy<2.0",
-    "torch>=2.3",
+    "torch~=2.4.1",
     "packaging",
     "tqdm",
     "pyyaml",
diff --git a/src/modalities/checkpointing/checkpoint_saving_strategies.py b/src/modalities/checkpointing/checkpoint_saving_strategies.py
@@ -1,3 +1,4 @@
+import dataclasses
 from abc import ABC, abstractmethod
 from typing import Optional
 
@@ -71,15 +72,15 @@ def get_checkpoint_instruction(
         save_current = True
 
         if self.k > 0:
-            self.saved_step_checkpoints = [training_progress] + self.saved_step_checkpoints
+            self.saved_step_checkpoints = [dataclasses.replace(training_progress)] + self.saved_step_checkpoints
             if len(self.saved_step_checkpoints) > self.k:
                 # Delete oldest checkpoint
                 checkpoints_to_delete = [self.saved_step_checkpoints[-1]]
                 self.saved_step_checkpoints = self.saved_step_checkpoints[:-1]
         elif self.k == 0:
             save_current = False
         elif self.k == -1:
-            self.saved_step_checkpoints = [training_progress] + self.saved_step_checkpoints
+            self.saved_step_checkpoints = [dataclasses.replace(training_progress)] + self.saved_step_checkpoints
 
         return CheckpointingInstruction(save_current=save_current, checkpoints_to_delete=checkpoints_to_delete)
 
diff --git a/src/modalities/dataloader/dataloader.py b/src/modalities/dataloader/dataloader.py
@@ -1,7 +1,12 @@
 from typing import Iterable, Optional
 
 from torch.utils.data import Dataset, DistributedSampler, Sampler
-from torch.utils.data.dataloader import DataLoader, T_co, _collate_fn_t, _worker_init_fn_t
+from torch.utils.data.dataloader import DataLoader, _collate_fn_t, _worker_init_fn_t
+
+try:  # torch <= 2.4
+    from torch.utils.data.dataloader import T_co
+except ImportError:  # torch >= 2.5
+    from torch.utils.data.dataloader import _T_co as T_co
 
 from modalities.dataloader.samplers import ResumableBatchSampler
 
diff --git a/tests/checkpointing/test_checkpoint_strategies.py b/tests/checkpointing/test_checkpoint_strategies.py
@@ -12,7 +12,7 @@
         # k value is 0. No deletion of checkpoints.
         (0, [], [], False),
         # k value is 2, but there is currently only one checkpoint. Hence, no deletion.
-        (2, [1], [], True),
+        (2, [TrainingProgress(1, 1, 20, 20)], [], True),
         # k value is -1, therefore we want to keep all checkpoints without any deletion
         (
             -1,
@@ -25,12 +25,21 @@
 def test_checkpoint_strategy_k(
     k: int, saved_instances: list[TrainingProgress], checkpoints_to_delete: list[int], save_current: bool
 ) -> None:
+    num_seen_steps_current_run = 10
     training_progress = TrainingProgress(
-        num_seen_steps_current_run=10, num_seen_tokens_current_run=10, num_target_steps=20, num_target_tokens=40
+        num_seen_steps_current_run=num_seen_steps_current_run,
+        num_seen_tokens_current_run=10,
+        num_target_steps=20,
+        num_target_tokens=40,
     )
     checkpoint_strategy = SaveKMostRecentCheckpointsStrategy(k=k)
     checkpoint_strategy.saved_step_checkpoints = saved_instances
     checkpoint_instruction = checkpoint_strategy.get_checkpoint_instruction(training_progress=training_progress)
 
     assert checkpoint_instruction.checkpoints_to_delete == checkpoints_to_delete
     assert checkpoint_instruction.save_current == save_current
+
+    # make sure that modifying the training progress externally does not affect saved_step_checkpoints
+    if k != 0 and save_current:
+        training_progress.num_seen_steps_current_run = 100
+        assert checkpoint_strategy.saved_step_checkpoints[0].num_seen_steps_current_run == num_seen_steps_current_run