Commit 9253ad1

refactor: Applied formatters.
1 parent e3d75c7 commit 9253ad1

5 files changed, 15 additions & 11 deletions
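All five files below were touched only mechanically: long calls re-wrapped across lines, a stray blank first line and one comment's spacing fixed, and one unused import dropped. The wrap point of roughly 120 characters suggests a formatter such as black or ruff configured with line-length 120, though the commit message does not name the tool.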

src/modalities/evaluator.py

Lines changed: 3 additions & 1 deletion
@@ -124,7 +124,9 @@ def evaluate(
             cumulated_loss[0] += batch_loss.item()  # sum up batch loss
             cumulated_loss[1] += 1
             batch_length_tensor = torch.tensor(len(batch)).to(device)
-            throughput_aggregator.add_value(key=ThroughputAggregationKeys.NUM_SAMPLES, value=batch_length_tensor)
+            throughput_aggregator.add_value(
+                key=ThroughputAggregationKeys.NUM_SAMPLES, value=batch_length_tensor
+            )

             Evaluator._publish_progress(
                 progress_publisher=self.progress_publisher,
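For readers without the full source: the aggregator exercised here is a keyed accumulator that the evaluation loop feeds with per-batch sample counts. A minimal sketch of such an interface, with hypothetical internals (only add_value(key=..., value=...) and ThroughputAggregationKeys.NUM_SAMPLES are visible in this diff):

from collections import defaultdict
from enum import Enum

import torch


class ThroughputAggregationKeys(Enum):
    NUM_SAMPLES = "num_samples"


class ThroughputAggregator:
    """Keyed accumulator; hypothetical reconstruction, not modalities' actual class."""

    def __init__(self) -> None:
        self._totals = defaultdict(lambda: torch.tensor(0))

    def add_value(self, key: ThroughputAggregationKeys, value: torch.Tensor) -> None:
        # Sum per-key values, e.g. the number of samples processed per step.
        self._totals[key] = self._totals[key] + value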

src/modalities/utils/mfu.py

Lines changed: 1 addition & 4 deletions
@@ -156,10 +156,7 @@ def __init__(
         wrapped_model: FSDPX,
         device_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
     ):
-        self._num_params = get_total_number_of_trainable_parameters(
-            model=wrapped_model,
-            device_mesh=device_mesh
-        )
+        self._num_params = get_total_number_of_trainable_parameters(model=wrapped_model, device_mesh=device_mesh)
         self._n_layer = n_layer
         self._sequence_length = sequence_length
         self._n_embd = n_embd
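The constructor caches exactly the quantities needed for a theoretical FLOPs-per-token estimate. A minimal sketch of the standard approximation (the PaLM-style 6N + 12·L·H·Q term; whether mfu.py uses precisely this variant is not visible in this diff):

def theoretical_flops_per_token(num_params: int, n_layer: int, n_embd: int, sequence_length: int) -> int:
    # 6 * N: forward + backward matmul FLOPs per parameter per token.
    # 12 * L * H * Q: attention-score FLOPs, which scale with sequence
    # length rather than with parameter count.
    return 6 * num_params + 12 * n_layer * n_embd * sequence_length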

tests/fsdp2_parallelization/test_full_and_hybrid_sharding.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
 import torch.multiprocessing as mp
 import yaml
 from pydantic import BaseModel
-from torch.distributed.fsdp import FSDPModule as FSDP2

 from modalities.__main__ import Main
 from modalities.config.config import ProcessGroupBackendType

tests/test_torch_compile.py

Lines changed: 6 additions & 3 deletions
@@ -1,4 +1,3 @@
-
 import copy

 import pytest
@@ -68,8 +67,12 @@ def test_get_compiled_model_compiles_blocks(gpt2_model):
     result_model = ModelFactory.get_compiled_model(gpt2_model, block_names, fullgraph=True)

     assert len(result_model.transformer.h) == 4, "Should still have four blocks"
-    for i, (original_block_idx, new_block_idx) in enumerate(zip(original_model.transformer.h, result_model.transformer.h)):
-        assert result_model.transformer.h[new_block_idx] is not original_model.transformer.h[original_block_idx], f"Block {i} should be a compiled version"
+    for i, (original_block_idx, new_block_idx) in enumerate(
+        zip(original_model.transformer.h, result_model.transformer.h)
+    ):
+        assert (
+            result_model.transformer.h[new_block_idx] is not original_model.transformer.h[original_block_idx]
+        ), f"Block {i} should be a compiled version"
         assert isinstance(result_model.transformer.h[new_block_idx], nn.Module), f"Block {i} should be an nn.Module"
     assert result_model.transformer.wte is original_wte, "Embedding layer should remain unchanged"
     assert result_model.transformer.lm_head is original_lm_head, "LM head should remain unchanged"
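The assertions above pin down block-wise compilation: every transformer block must be swapped for a compiled counterpart (its object identity changes), while the shared embedding and LM head must survive untouched. A sketch of how such a factory could work (the dotted-path lookup and function name are assumptions, not ModelFactory's actual implementation):

import torch
import torch.nn as nn


def compile_blocks(model: nn.Module, block_names: list[str], fullgraph: bool = False) -> nn.Module:
    """Hypothetical sketch: compile each child of the named ModuleLists in place."""
    for name in block_names:
        container = model.get_submodule(name)  # e.g. "transformer.h"
        for idx, block in enumerate(container):
            # torch.compile returns a wrapper module, so object identity
            # changes per block, which the `is not` assertions rely on.
            container[idx] = torch.compile(block, fullgraph=fullgraph)
    return model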

tests/utils/test_mfu.py

Lines changed: 5 additions & 2 deletions
@@ -313,7 +313,10 @@ def test_get_theoretical_flops_per_token(
         assert theoretical_flops_per_token == expected_theoretical_flops_per_token

     @staticmethod
-    @pytest.mark.skipif(torch.cuda.device_count() < 2 or not torch.cuda.get_device_name().startswith("NVIDIA A100"), reason="This test requires 2 A100 GPUs.")
+    @pytest.mark.skipif(
+        torch.cuda.device_count() < 2 or not torch.cuda.get_device_name().startswith("NVIDIA A100"),
+        reason="This test requires 2 A100 GPUs.",
+    )
     @pytest.mark.parametrize(
         "rdvz_port, relative_config_path, num_samples_per_second_per_gpu, expected_mfu",
         [
@@ -339,7 +342,7 @@ def test_compute_mfu(
         TestMFU._save_yaml_config(config_file_path=tmp_config_file_path, config=config_updated)

         # run the test in a distributed environment
-        world_size = 2 #torch.cuda.device_count()
+        world_size = 2  # torch.cuda.device_count()
         num_samples_per_second = num_samples_per_second_per_gpu * world_size
         mp.spawn(
             TestMFU._test_compute_mfu_thread,
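For context, the MFU figure the spawned workers compute boils down to achieved FLOP/s divided by aggregate peak FLOP/s, which is why the skipif above pins the hardware to A100s (dense bf16 peak of 312 TFLOP/s). A minimal sketch; the helper name and signature are illustrative, not modalities' API:

A100_BF16_PEAK_FLOPS = 312e12  # per-GPU dense bf16 peak


def compute_mfu(num_samples_per_second: float, sequence_length: int,
                flops_per_token: float, world_size: int) -> float:
    # Achieved FLOP/s across all ranks over the job's aggregate peak FLOP/s.
    achieved = num_samples_per_second * sequence_length * flops_per_token
    return achieved / (world_size * A100_BF16_PEAK_FLOPS)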
