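Lint: fix always-true tuple asserts in set_device_mesh, warn when weight_mesh is passed to modules that do not use it, dedent docstring bullet lists, and drop unused loop variables.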

cspades · cspades · commit 925d0227fa50 · 2026-03-03T10:14:13.000-08:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -575,14 +575,12 @@ def set_device_mesh(
         weight_mesh : Optional[DeviceMesh]
             Not used for DotProductAttention as there are no quantized weights.
         """
+        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py
@@ -201,10 +201,10 @@ class MultiheadAttention(torch.nn.Module):
             parameters and if the DTensor DeviceMesh includes dimensions that do not
             shard weights, such as in the case of HSDP (DP-Replicate x DP-Shard).
             For example:
-                - device_mesh["dp"] for FSDP.
-                - device_mesh["dp_cp"] if using CP ranks in FSDP.
-                - device_mesh["tp"] if using TP.
-                - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
+            - device_mesh["dp"] for FSDP.
+            - device_mesh["dp_cp"] if using CP ranks in FSDP.
+            - device_mesh["tp"] if using TP.
+            - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
 
     Optimization parameters
     -----------------------
@@ -641,12 +641,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
@@ -655,7 +652,7 @@ def set_device_mesh(
             # Iterate through child sub-modules without deep recursion.
             # Automatically detects TransformerEngine TP modules and
             # the capability to call this method at any level.
-            for name, child in self.named_children():
+            for child in self.children():
                 if hasattr(child, "set_device_mesh"):
                     child.set_device_mesh(tp_mesh, weight_mesh)
 
diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
@@ -858,12 +858,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
diff --git a/transformer_engine/pytorch/module/layernorm.py b/transformer_engine/pytorch/module/layernorm.py
@@ -168,6 +168,7 @@ def set_device_mesh(
             Quantized DTensor parameters are currently not supported for FusibleOperation(s),
             and this mesh is not used.
         """
+        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Construct TP-Replicate DTensors. Used to shim non-TP parameters for compatibility
             # with DTensor parameters in TP layers to support DTensor operations.
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -1112,10 +1112,10 @@ class LayerNormLinear(TransformerEngineBaseModule):
             parameters and if the DTensor DeviceMesh includes dimensions that do not
             shard weights, such as in the case of HSDP (DP-Replicate x DP-Shard).
             For example:
-                - device_mesh["dp"] for FSDP.
-                - device_mesh["dp_cp"] if using CP ranks in FSDP.
-                - device_mesh["tp"] if using TP.
-                - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
+            - device_mesh["dp"] for FSDP.
+            - device_mesh["dp_cp"] if using CP ranks in FSDP.
+            - device_mesh["tp"] if using TP.
+            - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
 
     Optimization parameters
     -----------------------
@@ -1492,12 +1492,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -1734,10 +1734,10 @@ class LayerNormMLP(TransformerEngineBaseModule):
             parameters and if the DTensor DeviceMesh includes dimensions that do not
             shard weights, such as in the case of HSDP (DP-Replicate x DP-Shard).
             For example:
-                - device_mesh["dp"] for FSDP.
-                - device_mesh["dp_cp"] if using CP ranks in FSDP.
-                - device_mesh["tp"] if using TP.
-                - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
+            - device_mesh["dp"] for FSDP.
+            - device_mesh["dp_cp"] if using CP ranks in FSDP.
+            - device_mesh["tp"] if using TP.
+            - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
 
     Optimization parameters
     -----------------------
@@ -2058,12 +2058,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
@@ -1047,10 +1047,10 @@ class Linear(TransformerEngineBaseModule):
             parameters and if the DTensor DeviceMesh includes dimensions that do not
             shard weights, such as in the case of HSDP (DP-Replicate x DP-Shard).
             For example:
-                - device_mesh["dp"] for FSDP.
-                - device_mesh["dp_cp"] if using CP ranks in FSDP.
-                - device_mesh["tp"] if using TP.
-                - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
+            - device_mesh["dp"] for FSDP.
+            - device_mesh["dp_cp"] if using CP ranks in FSDP.
+            - device_mesh["tp"] if using TP.
+            - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
 
     Optimization parameters
     -----------------------
@@ -1383,12 +1383,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
diff --git a/transformer_engine/pytorch/module/rmsnorm.py b/transformer_engine/pytorch/module/rmsnorm.py
@@ -171,6 +171,7 @@ def set_device_mesh(
             Quantized DTensor parameters are currently not supported for FusibleOperation(s),
             and this mesh is not used.
         """
+        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Construct TP-Replicate DTensors. Used to shim non-TP parameters for compatibility
             # with DTensor parameters in TP layers to support DTensor operations.
diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py
@@ -258,10 +258,10 @@ class TransformerLayer(torch.nn.Module):
             parameters and if the DTensor DeviceMesh includes dimensions that do not
             shard weights, such as in the case of HSDP (DP-Replicate x DP-Shard).
             For example:
-                - device_mesh["dp"] for FSDP.
-                - device_mesh["dp_cp"] if using CP ranks in FSDP.
-                - device_mesh["tp"] if using TP.
-                - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
+            - device_mesh["dp"] for FSDP.
+            - device_mesh["dp_cp"] if using CP ranks in FSDP.
+            - device_mesh["tp"] if using TP.
+            - device_mesh["dp_cp_tp"] if strided-sharding with FSDP-TP.
 
     Optimization parameters
     -----------------------
@@ -629,12 +629,9 @@ def set_device_mesh(
         """
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
-            assert (
-                tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(),
-                (
-                    f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
-                    f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
-                ),
+            assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
+                f"TransformerEngine {self.__class__.__name__} TP init size ({self.tp_size}) "
+                f"does not match the size of the provided TP DeviceMesh ({tp_mesh.size()})."
             )
             # Set the tensor parallel group from the mesh.
             self.set_tensor_parallel_group(tp_mesh.get_group())
@@ -643,7 +640,7 @@ def set_device_mesh(
             # Iterate through child sub-modules without deep recursion.
             # Automatically detects TransformerEngine TP modules and
             # the capability to call this method at any level.
-            for name, child in self.named_children():
+            for child in self.children():
                 if hasattr(child, "set_device_mesh"):
                     child.set_device_mesh(tp_mesh, weight_mesh)