deepmodeling · njzjz · Apr 25, 2026 · Apr 23, 2026 · Apr 25, 2026
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -103,9 +103,13 @@
     get_optimizer_state_dict,
     set_optimizer_state_dict,
 )
-from torch.distributed.fsdp import (
-    fully_shard,
-)
+
+try:
+    from torch.distributed.fsdp import (
+        fully_shard,
+    )
+except ImportError:  # PyTorch < 2.6 lacks FSDP2 fully_shard; checked before use
+    fully_shard = None  # type: ignore[assignment]
 from torch.distributed.optim import (
     ZeroRedundancyOptimizer,
 )
@@ -853,6 +857,15 @@ def single_model_finetune(
         if self.is_distributed:
             torch.cuda.set_device(LOCAL_RANK)
             if self.zero_stage >= 2:
+                if fully_shard is None:
+                    raise RuntimeError(
+                        "training.zero_stage>=2 requires FSDP2, which is only "
+                        "available in PyTorch >= 2.6 "
+                        "(``torch.distributed.fsdp.fully_shard``). "
+                        f"Current PyTorch is {torch.__version__}. "
+                        "Please upgrade PyTorch, or set training.zero_stage "
+                        "to 0 or 1 to stay on the DDP / ZeRO-1 path."
+                    )
                 # FSDP2 does NOT broadcast params (unlike DDP constructor).
                 # Ensure all ranks share identical weights before sharding.
                 for p in self.wrapper.parameters():

diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
@@ -3948,6 +3948,7 @@ def training_args(
         "but reduces optimizer memory to 1/N per GPU. "
         "2: FSDP2 stage-2, shards optimizer states and gradients; same communication "
         "volume as stage-1 but further reduces gradient memory to 1/N per GPU. "
+        "Stages 2 and 3 require FSDP2, which is available in PyTorch >= 2.6. "
         "Note: FSDP2 introduces DTensor dispatch overhead that can slow down "
         "models with many small layers; use torch.compile to mitigate. "
         "3: FSDP2 stage-3, shards parameters as well; maximum memory savings but "