Add model weight modification guard to ensure DCP checkpoint correctness.

cspades · cspades · commit af7362a3bd71 · 2026-03-12T08:25:53.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/tests/pytorch/distributed/run_fsdp2_model.py b/tests/pytorch/distributed/run_fsdp2_model.py
@@ -548,6 +548,22 @@ def _train(args):
         loss.backward()
         optimizer.step()
 
+    # Verify model weights have diverged from the original
+    # model state after extra training steps.
+    s_post_train = model.state_dict()
+    for key in s1.keys() & s_post_train.keys():
+        if key.endswith("_extra_state"):
+            continue
+        v1 = s1[key]
+        if isinstance(v1, DTensor):
+            v1 = v1.to_local()
+        v_pt = s_post_train[key]
+        if isinstance(v_pt, DTensor):
+            v_pt = v_pt.to_local()
+        assert not torch.allclose(v1, v_pt), (
+            f"[{key}] Model weights should have changed after extra training steps"
+        )
+
     # Load the checkpoint.
     state_dict = {"app": AppState(model=model, optimizer=optimizer)}
     torch.distributed.checkpoint.load(state_dict=state_dict, checkpoint_id=str(CKPT_DIR))
diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py
@@ -98,9 +98,6 @@ def test_distributed(fp8_init, sharding_dims, fp_recipe, layer_type):
             f"Insufficient devices ({NUM_PROCS}) to test sharding configuration: {sharding_dims}"
         )
 
-    if fp_recipe in ("Float8BlockScaling", "NVFP4BlockScaling") and fp8_init:
-        pytest.xfail(f"{fp_recipe} + fp8_init: test_fp8_fsdp2_allgather is currently failing.")
-
     _run_test(fp8_init, sharding_dims, fp_recipe, layer_type)
 
 
diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
@@ -403,7 +403,7 @@ def untyped_storage(self) -> torch.UntypedStorage:
         data = self._rowwise_data if self._rowwise_data is not None else self._columnwise_data
         if data is not None:
             return data.untyped_storage()
-        return torch.UntypedStorage(0, device=self.device)
+        return self._default_storage
 
     @classmethod
     def __torch_dispatch__(cls, func, types, args, kwargs=None):
diff --git a/transformer_engine/pytorch/tensor/storage/float8_blockwise_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/float8_blockwise_tensor_storage.py
@@ -35,6 +35,8 @@ class Float8BlockwiseQTensorStorage(QuantizedTensorStorage):
     _rowwise_scale_inv: Optional[torch.Tensor]
     _columnwise_scale_inv: Optional[torch.Tensor]
     _is_2D_scaled: bool
+    # Default storage of 1 byte.
+    _default_storage: torch.UntypedStorage
 
     def __new__(
         cls,
@@ -61,6 +63,7 @@ def __new__(
         instance._rowwise_scale_inv = rowwise_scale_inv
         instance._columnwise_scale_inv = columnwise_scale_inv
         instance._is_2D_scaled = is_2D_scaled
+        instance._default_storage = torch.UntypedStorage(1)
 
         return instance
 

Original file line number	Diff line number	Diff line change
`@@ -98,9 +98,6 @@ def test_distributed(fp8_init, sharding_dims, fp_recipe, layer_type):`
`98`	`98`	`f"Insufficient devices ({NUM_PROCS}) to test sharding configuration: {sharding_dims}"`
`99`	`99`	`)`
`100`	`100`
`101`		`- if fp_recipe in ("Float8BlockScaling", "NVFP4BlockScaling") and fp8_init:`
`102`		`- pytest.xfail(f"{fp_recipe} + fp8_init: test_fp8_fsdp2_allgather is currently failing.")`
`103`		`-`
`104`	`101`	`_run_test(fp8_init, sharding_dims, fp_recipe, layer_type)`
`105`	`102`
`106`	`103`