Reapply "step1: all to all"

iProzd · iProzd · commit e9c71d02fcfe · 2026-05-18T17:22:54.000+08:00
This reverts commit 64bfcc6.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -44,6 +44,7 @@ ______________________________________________________________________
 1. **复用 DPA3 代码的方式**：**copy 一份过来改**，不要 `from deepmd-kit-moe.xxx import ...`。需要参考时去 `/mnt/data_nas/zhangd/claude_space/deepmd-kit-moe` 找。详见子 agent `dpa3-ref-searcher`。
 1. **每个 Step 必须配套 UT**：不写 UT 不能进下一个 Step。UT 通过 → 才能集成。详见 `SPEC.md` §6 的测试矩阵。
 1. **多卡 UT 用 torchrun 跑**：模板见 skill `multi-gpu-test-template`。
+1. **代码风格检查**：每个 Step 完成后必须运行 `ruff check` 并修复所有问题，然后重新验证测试通过。Ruff 路径：`/root/miniconda3/bin/ruff`。
 
 ______________________________________________________________________
 
diff --git a/deepmd/pt/model/descriptor/sezm_nn/moe/__init__.py b/deepmd/pt/model/descriptor/sezm_nn/moe/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""SeZM MoE (Mixture-of-Experts) modules for Expert Parallelism + Data Parallelism.
+
+This package implements MoE components for the SeZM descriptor:
+- Communication primitives (A2A with second-order derivatives)
+- Router (top-k gating)
+- Expert collections (routing + shared experts)
+- MoE convolution layer (replaces SO2 linear stack)
+"""
+
+from __future__ import (
+    annotations,
+)
+
+__all__ = []  # Nothing is re-exported here; import submodules (e.g. a2a_ops) directly.
diff --git a/deepmd/pt/model/descriptor/sezm_nn/moe/a2a_ops.py b/deepmd/pt/model/descriptor/sezm_nn/moe/a2a_ops.py
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Differentiable All-to-All communication operators for SeZM MoE Expert Parallelism.
+
+Provides `_AllToAllDouble`, a recursive autograd Function whose backward
+calls `.apply()` again, creating a fresh autograd node so that
+`create_graph=True` (required for force -> virial second derivatives)
+works correctly to arbitrary order.
+
+Public API
+----------
+all_to_all_differentiable(x, send_splits, recv_splits, group)
+    When *group* is ``None`` (single-GPU / no EP), returns *x* unchanged.
+    Otherwise dispatches through ``_AllToAllDouble``.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    Any,
+)
+
+import torch
+import torch.distributed as dist
+from torch.autograd import (
+    Function,
+)
+
+
+def _a2a_raw(
+    x: torch.Tensor,
+    send_splits: list[int],
+    recv_splits: list[int],
+    group: dist.ProcessGroup,
+) -> torch.Tensor:
+    """Exchange row blocks of *x* between ranks, outside of autograd.
+
+    Parameters
+    ----------
+    x : Tensor
+        Tensor to scatter; it is made contiguous before being handed over.
+    send_splits : list[int]
+        Passed as ``input_split_sizes``: rows of *x* going to each rank.
+    recv_splits : list[int]
+        Passed as ``output_split_sizes``: rows arriving from each rank.
+    group : ProcessGroup
+        Process group the collective runs on.
+
+    Returns
+    -------
+    Tensor
+        Fresh ``(sum(recv_splits), *x.shape[1:])`` tensor, dtype/device of *x*.
+    """
+    recv_shape = (sum(recv_splits), *x.shape[1:])
+    recv_buf = torch.empty(recv_shape, device=x.device, dtype=x.dtype)
+    dist.all_to_all_single(
+        recv_buf,
+        x.contiguous(),
+        input_split_sizes=send_splits,
+        output_split_sizes=recv_splits,
+        group=group,
+    )
+    return recv_buf
+
+
+class _AllToAllDouble(Function):
+    """All-to-All autograd node that supports higher-order differentiation.
+
+    ``backward`` does not invoke the raw collective directly; it goes through
+    ``_AllToAllDouble.apply`` with the send/recv splits exchanged.  The
+    gradient is therefore produced by a fresh autograd node, so a graph
+    recorded with ``create_graph=True`` can be differentiated once more
+    across the communication boundary.
+
+    All ranks must issue their A2A calls in the same order; per the original
+    author, SeZM's layer-sequential structure guarantees this.
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        x: torch.Tensor,
+        send_splits: list[int],
+        recv_splits: list[int],
+        group: dist.ProcessGroup,
+    ) -> torch.Tensor:
+        # Remember the routing so backward can run the mirrored exchange.
+        ctx.send_splits, ctx.recv_splits = send_splits, recv_splits
+        ctx.group = group
+        return _a2a_raw(x, send_splits, recv_splits, group)
+
+    @staticmethod
+    def backward(
+        ctx: Any, grad_output: torch.Tensor
+    ) -> tuple[torch.Tensor, None, None, None]:
+        # What this rank received in forward is what it sends back now, so
+        # the split lists trade places.  Going through .apply() instead of
+        # the raw op keeps the result on the graph for double backward.
+        mirrored = (ctx.recv_splits, ctx.send_splits)
+        grad_x = _AllToAllDouble.apply(
+            grad_output,
+            *mirrored,
+            ctx.group,
+        )
+        return grad_x, None, None, None
+
+
+def all_to_all_differentiable(
+    x: torch.Tensor,
+    send_splits: list[int],
+    recv_splits: list[int],
+    group: dist.ProcessGroup | None,
+) -> torch.Tensor:
+    """Run a differentiable All-to-All, or pass *x* through without a group.
+
+    Parameters
+    ----------
+    x : Tensor
+        Tensor to exchange.
+    send_splits : list[int]
+        Row counts sent to each rank; unused when *group* is ``None``.
+    recv_splits : list[int]
+        Row counts received from each rank; unused when *group* is ``None``.
+    group : ProcessGroup or None
+        Communication group, or ``None`` (single-GPU / no EP) to skip it.
+
+    Returns
+    -------
+    Tensor
+        ``_AllToAllDouble`` output, or *x* itself (same object) without a group.
+    """
+    if group is not None:
+        return _AllToAllDouble.apply(x, send_splits, recv_splits, group)
+    # Nothing to exchange: hand the input back so gradients flow unchanged.
+    return x
diff --git a/source/tests/pt/test_sezm_moe_a2a.py b/source/tests/pt/test_sezm_moe_a2a.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Unit tests for SeZM MoE All-to-All communication primitive (single-GPU)."""
+
+import unittest
+
+import torch
+
+from deepmd.pt.model.descriptor.sezm_nn.moe.a2a_ops import (
+    all_to_all_differentiable,
+)
+
+
+class TestAllToAllSingleGPU(unittest.TestCase):
+    """CPU tests for the ``group=None`` (identity) path of the A2A wrapper.
+
+    ``_AllToAllDouble`` is only reached with a real group; it is not run here.
+    """
+
+    def test_single_gpu_passthrough(self):
+        """group=None should return x unchanged with gradients flowing through."""
+        x = torch.randn(10, 8, requires_grad=True, device="cpu")
+        send_splits = [3, 3, 4]
+        recv_splits = [2, 5, 3]
+
+        out = all_to_all_differentiable(x, send_splits, recv_splits, group=None)
+
+        # Identity at the object level: no copy is made.
+        self.assertIs(out, x, "group=None should return input tensor unchanged")
+
+        # d(sum(x))/dx == 1 for every element.
+        out.sum().backward()
+        self.assertIsNotNone(x.grad, "Gradient should flow through when group=None")
+        torch.testing.assert_close(
+            x.grad,
+            torch.ones_like(x),
+            msg="Gradient should be all ones for sum() loss",
+        )
+
+    def test_shape_preservation(self):
+        """Output keeps the input shape for 2-D, 3-D and 4-D tensors."""
+        # Every case has sum(send_splits) == sum(recv_splits) == shape[0];
+        # that is what makes the recv-based expectation valid for group=None.
+        test_cases = [
+            ((10, 8), [3, 3, 4], [2, 5, 3]),
+            ((15, 16, 32), [5, 5, 5], [4, 6, 5]),
+            ((8, 4, 4, 64), [2, 3, 3], [3, 2, 3]),
+        ]
+
+        for shape, send_splits, recv_splits in test_cases:
+            with self.subTest(shape=shape):
+                x = torch.randn(*shape, device="cpu")
+                out = all_to_all_differentiable(x, send_splits, recv_splits, group=None)
+
+                expected_shape = (sum(recv_splits), *shape[1:])
+                self.assertEqual(
+                    out.shape,
+                    expected_shape,
+                    f"Output shape mismatch for input shape {shape}",
+                )
+
+    def test_first_backward(self):
+        """First-order gradient of sum(out**2) must equal 2 * x."""
+        x = torch.randn(10, 8, requires_grad=True, device="cpu")
+        send_splits = [3, 3, 4]
+        recv_splits = [2, 5, 3]
+
+        out = all_to_all_differentiable(x, send_splits, recv_splits, group=None)
+        loss = (out**2).sum()
+        loss.backward()
+
+        self.assertIsNotNone(x.grad, "Gradient should exist after backward")
+        # out is x itself, so the exact gradient is known; a bare "non-zero"
+        # check would also accept a wrong gradient.
+        torch.testing.assert_close(x.grad, 2 * x.detach())
+
+    def test_second_backward(self):
+        """Double backward through create_graph=True must give 8 * x."""
+        x = torch.randn(10, 8, requires_grad=True, device="cpu")
+        send_splits = [3, 3, 4]
+        recv_splits = [2, 5, 3]
+
+        # First forward
+        out = all_to_all_differentiable(x, send_splits, recv_splits, group=None)
+        loss = (out**2).sum()
+
+        # First backward, keeping the graph so grad_x stays differentiable.
+        (grad_x,) = torch.autograd.grad(loss, x, create_graph=True, retain_graph=True)
+
+        self.assertIsNotNone(grad_x, "First-order gradient should exist")
+        self.assertTrue(
+            grad_x.requires_grad, "First-order gradient should require grad"
+        )
+        torch.testing.assert_close(grad_x.detach(), 2 * x.detach())
+
+        # Second backward: loss2 = sum((2x)**2) = 4 * sum(x**2), so d/dx = 8x.
+        loss2 = (grad_x**2).sum()
+        loss2.backward()
+
+        self.assertIsNotNone(x.grad, "Second-order gradient should exist")
+        torch.testing.assert_close(x.grad, 8 * x.detach())
+
+    def test_gradgradcheck_fp64(self):
+        """torch.autograd.gradgradcheck should pass in fp64."""
+        # Keep the tensor small: gradgradcheck is expensive.
+        x = torch.randn(6, 4, dtype=torch.float64, requires_grad=True, device="cpu")
+        send_splits = [2, 2, 2]
+        recv_splits = [1, 3, 2]
+
+        def func(inp):
+            return all_to_all_differentiable(inp, send_splits, recv_splits, group=None)
+
+        result = torch.autograd.gradgradcheck(
+            func, x, eps=1e-6, atol=1e-4, rtol=1e-3, raise_exception=False
+        )
+
+        self.assertTrue(
+            result, "gradgradcheck failed: second-order derivatives are incorrect"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/source/tests/pt/test_sezm_moe_a2a_multigpu.py b/source/tests/pt/test_sezm_moe_a2a_multigpu.py