Update storage-related xFails.

cspades · cspades · commit 15df86fcdaf1 · 2026-03-16T12:15:07.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py
@@ -133,10 +133,9 @@ def _run_allgather_test(sharding_dims, recipe):
 )
 def test_fp8_fsdp2_allgather(sharding_dims, fp_recipe):
     """Verify FSDP2 FP8 all-gather matches a manual dequantize-then-gather reference."""
-    if fp_recipe in ("Float8BlockScaling", "NVFP4BlockScaling"):
+    if fp_recipe == "NVFP4BlockScaling":
         pytest.xfail(
-            f"{fp_recipe}: block-scaled quantization formats are not supported by the "
-            "FP8 FSDP2 all-gather correctness test."
+            f"{fp_recipe}: NVFP4 FSDP2 all-gather hooks need to be implemented."
         )
 
     parallel_size = math.prod(x for x in sharding_dims if x != 0)
@@ -181,20 +180,7 @@ def test_fsdp2_fused_adam_fp8_master_weights(fp_recipe):
 
 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
 def test_fsdp2_fused_adam_fp8_master_weights_no_meta(fp_recipe):
-    """FusedAdam(master_weights=True) + FSDP2 + quantized_model_init (CUDA init, no meta device).
-
-    Block-scaling QuantizedTensors (MXFP8, Float8Blockwise, NVFP4) are wrapper
-    subclasses with data_ptr() == 0.  Without meta-device init, FSDP2's
-    reset_sharded_param() crashes with 'invalid python storage'.
-    Per-tensor FP8 (DelayedScaling, Float8CurrentScaling) works because
-    Float8Tensor's storage is accessible.
-    """
-    if fp_recipe in ("MXFP8BlockScaling", "Float8BlockScaling", "NVFP4BlockScaling"):
-        pytest.xfail(
-            f"{fp_recipe}: FSDP2 without meta-device init crashes on block-scaling "
-            "QuantizedTensor wrapper subclasses (data_ptr() == 0). "
-            "Use device='meta' + reset_parameters() after sharding."
-        )
+    """FusedAdam(master_weights=True) + FSDP2 + quantized_model_init (CUDA init, no meta device)."""
     _run_fused_adam_test("fused_adam_fp8_master_weights_no_meta", fp_recipe)
 
 
@@ -232,8 +218,8 @@ def test_fsdp2_dcp_output_parity(fp_recipe):
 
     if fp_recipe == "NVFP4BlockScaling":
         pytest.xfail(
-            "NVFP4BlockScaling: DCP load_state_dict triggers reset_sharded_param() "
-            "which calls data_ptr() on NVFP4Tensor wrapper subclass with invalid storage"
+            "NVFP4BlockScaling: Failing parity tests with DCP. Snippet: \n"
+            "Fresh model loaded from DCP checkpoint produces different output."
         )
 
     if fp_recipe == "Float8BlockScaling" and torch.cuda.get_device_capability()[0] == 12:
@@ -261,8 +247,8 @@ def test_fsdp2_dcp_output_parity_async(fp_recipe):
 
     if fp_recipe == "NVFP4BlockScaling":
         pytest.xfail(
-            "NVFP4BlockScaling: DCP load_state_dict triggers reset_sharded_param() "
-            "which calls data_ptr() on NVFP4Tensor wrapper subclass with invalid storage"
+            "NVFP4BlockScaling: Failing parity tests with DCP. Snippet: \n"
+            "Fresh model loaded from DCP checkpoint produces different output."
         )
 
     if fp_recipe == "Float8BlockScaling":
diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
@@ -10,6 +10,7 @@
 from typing import Any, Optional, Tuple, Union
 
 import torch
+from torch.distributed.tensor import DTensor
 
 import transformer_engine_torch as tex
 from transformer_engine_torch import DType as TE_DType
@@ -726,6 +727,11 @@ def fsdp_post_all_gather(
             # columnwise_data is (K, full_M), logical shape is (full_M, K)
             data_shape = (columnwise_data.shape[1], columnwise_data.shape[0])
 
+        if isinstance(out, DTensor):
+            # out.to_local() is not supported with Torch Dispatch,
+            # for quantized tensors with _transpose usage.
+            out = out._local_tensor
+
         if out is not None:
             # Update existing tensor in-place (subsequent iterations)
             out._rowwise_data = rowwise_data