step1b: fix ut

iProzd · iProzd · commit 2c165af6a867 · 2026-05-18T17:56:14.000+08:00
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -63,6 +63,7 @@ ______________________________________________________________________
 1. **每个 Step 必须配套 UT**：不写 UT 不能进下一个 Step。UT 通过 → 才能集成。详见 `SPEC.md` §6 的测试矩阵。
 1. **多卡 UT 用 torchrun 跑**：模板见 skill `multi-gpu-test-template`。
 1. **代码风格检查**：每个 Step 完成后必须运行 `ruff check` 并修复所有问题，然后重新验证测试通过。Ruff 路径：`/root/miniconda3/bin/ruff`。
+1. **多卡 UT 默认 4/8 GPU**：除非测试目标明确只适合 2 GPU smoke test，否则多卡 UT 必须至少覆盖 4 GPU；当前环境有 8 张 GPU 时，Step 验收必须优先跑 8 GPU。报告测试结果时写明实际 `torchrun --nproc_per_node` 数量和 backend。
 
 ______________________________________________________________________
 
diff --git a/PROGRESS.md b/PROGRESS.md
@@ -29,13 +29,9 @@ This file tracks implementation and validation status. `SPEC.md` remains the des
 - Single-process Step 1 tests: PASS
   - Command used: `pytest source/tests/pt/test_sezm_moe_a2a.py -q`
   - Result: 5 tests passed, 3 subtests passed
-- Multi-process Step 1 smoke test: PASS
-  - Runner: Cursor `multi-gpu-tester` subagent
-  - Command shape: `torchrun --nproc_per_node=2 ... source/tests/pt/test_sezm_moe_a2a_multigpu.py`
-  - Result: 4 tests passed, no hang
-- Multi-process Step 1 4-process test: PASS
-  - Command shape: `torchrun --nproc_per_node=4 ... source/tests/pt/test_sezm_moe_a2a_multigpu.py`
-  - Result: 4 tests passed on all ranks, no hang
+- Multi-process Step 1 8-rank CUDA/NCCL test: PASS
+  - Command shape: `torchrun --nproc_per_node=8 ... source/tests/pt/test_sezm_moe_a2a_multigpu.py`
+  - Result: 6 tests passed on all 8 ranks, no hang
 - Step 1 ruff check: PASS
   - Command used: `/root/miniconda3/bin/ruff check deepmd/pt/model/descriptor/sezm_nn/moe/a2a_ops.py source/tests/pt/test_sezm_moe_a2a.py source/tests/pt/test_sezm_moe_a2a_multigpu.py`
 - DPA3 reference subagent smoke test: PASS
@@ -48,6 +44,7 @@ This file tracks implementation and validation status. `SPEC.md` remains the des
 - `pytest` 9.0.3 is installed in `/mnt/data_nas/zhangd/conda_env/torch-modern`.
 - `/root/miniconda3/bin/ruff` is available and reports `ruff 0.15.6`.
 - Existing Step 1 tests are runnable via `pytest`, `unittest`, and standalone `torchrun`.
+- Multi-rank tests use CUDA/NCCL when CUDA is available and fall back to CPU/Gloo only when CUDA is unavailable.
 
 ## Not Started
 
diff --git a/SPEC.md b/SPEC.md
@@ -673,6 +673,13 @@ ______________________________________________________________________
 
 每个测试都对应一个 pytest 文件。命名：`test_sezm_moe_<topic>.py` 和 `test_sezm_moe_<topic>_multigpu.py`。
 
+### 多卡 UT 的 GPU 数量规则
+
+- 多卡 UT 必须与单卡 UT 分开写成独立文件：`test_sezm_moe_<topic>.py` 与 `test_sezm_moe_<topic>_multigpu.py`。
+- 除非测试目标明确只是 2 GPU smoke test，否则多卡 UT 至少覆盖 4 GPU。
+- 当前开发环境有 8 张 GPU 时，Step 验收必须优先跑 8 GPU，并在报告中写明 `torchrun --nproc_per_node`、backend（NCCL/Gloo）和通过的 rank 数。
+- 对 A2A、梯度同步、checkpoint resharding、二阶导不死锁等跨 rank 行为，2 GPU 结果只能作为 smoke test，不能替代 4/8 GPU 验收。
+
 ______________________________________________________________________
 
 ## 8. 配置 schema
diff --git a/source/tests/pt/test_sezm_moe_a2a.py b/source/tests/pt/test_sezm_moe_a2a.py
@@ -98,9 +98,10 @@ def test_second_backward(self):
             "Second-order gradient should contain non-zero values",
         )
 
-    def test_gradgradcheck_fp64(self):
-        """torch.autograd.gradgradcheck should pass in fp64."""
-        # Use smaller tensors for gradgradcheck (it's expensive)
+    def test_short_circuit_gradgradcheck_fp64(self):
+        """group=None short-circuit should pass gradgradcheck in fp64."""
+        # This verifies the single-process passthrough path only.  The real
+        # _AllToAllDouble gradgradcheck lives in the multi-GPU test file.
         x = torch.randn(6, 4, dtype=torch.float64, requires_grad=True, device="cpu")
         send_splits = [2, 2, 2]
         recv_splits = [1, 3, 2]
diff --git a/source/tests/pt/test_sezm_moe_a2a_multigpu.py b/source/tests/pt/test_sezm_moe_a2a_multigpu.py
@@ -4,6 +4,7 @@
 Run with:
     torchrun --nproc_per_node=2 source/tests/pt/test_sezm_moe_a2a_multigpu.py
     torchrun --nproc_per_node=4 source/tests/pt/test_sezm_moe_a2a_multigpu.py
+    torchrun --nproc_per_node=8 source/tests/pt/test_sezm_moe_a2a_multigpu.py
 """
 
 import unittest
@@ -19,20 +20,52 @@
 def setup_dist():
     """Initialize distributed environment."""
     if not dist.is_initialized():
-        dist.init_process_group(backend="gloo")
+        backend = "nccl" if torch.cuda.is_available() else "gloo"
+        dist.init_process_group(backend=backend)
     rank = dist.get_rank()
     world_size = dist.get_world_size()
-    # Use CPU for multi-GPU tests (gloo backend)
-    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+    else:
+        device = torch.device("cpu")
     return rank, world_size, device
 
 
 def cleanup_dist():
     """Clean up distributed environment."""
     if dist.is_initialized():
+        dist.barrier()
         dist.destroy_process_group()
 
 
+def make_cyclic_splits(rank, world_size):
+    """Return (send, recv) split lists in 1..5 with send[i][j] == recv[j][i]."""
+    send_splits = [((rank + 2 * peer) % 5) + 1 for peer in range(world_size)]
+    recv_splits = [((peer + 2 * rank) % 5) + 1 for peer in range(world_size)]
+    return send_splits, recv_splits
+
+
+def make_encoded_input(rank, send_splits, device):
+    """Build float64 rows [rank, peer, row_id], grouped by target peer in order."""
+    rows = []
+    for peer, count in enumerate(send_splits):
+        for row_id in range(count):
+            rows.append([float(rank), float(peer), float(row_id)])
+    return torch.tensor(rows, dtype=torch.float64, device=device)
+
+
+def make_expected_encoded_output(rank, world_size, device):
+    """Return the rows this rank receives: [source, rank, row_id] by source rank."""
+    rows = []
+    for source_rank in range(world_size):
+        source_send_splits, _ = make_cyclic_splits(source_rank, world_size)
+        count = source_send_splits[rank]  # rows source_rank sends to this rank
+        for row_id in range(count):
+            rows.append([float(source_rank), float(rank), float(row_id)])
+    return torch.tensor(rows, dtype=torch.float64, device=device)
+
+
 class TestAllToAllMultiGPU(unittest.TestCase):
     """Multi-GPU tests for _AllToAllDouble communication primitive."""
 
@@ -44,44 +77,19 @@ def setUpClass(cls):
 
     @classmethod
     def tearDownClass(cls):
-        """Clean up distributed environment."""
-        cleanup_dist()
+        """Keep the process group alive until run_tests aggregates results."""
 
-    def test_forward_shape(self):
-        """Forward pass should produce correct output shape across ranks."""
-        # Each rank sends different amounts
-        # Constraint: rank i's send_splits[j] == rank j's recv_splits[i]
-        if self.world_size == 2:
-            send_splits = [3, 5] if self.rank == 0 else [2, 6]
-            recv_splits = [3, 2] if self.rank == 0 else [5, 6]
-        elif self.world_size == 4:
-            # Matrix: send[i][j] = recv[j][i]
-            # rank 0 sends: [2, 3, 1, 4] -> rank 0 recvs: [2, 5, 3, 7]
-            # rank 1 sends: [5, 2, 4, 3] -> rank 1 recvs: [3, 2, 6, 4]
-            # rank 2 sends: [3, 6, 1, 2] -> rank 2 recvs: [1, 4, 1, 5]
-            # rank 3 sends: [7, 4, 5, 1] -> rank 3 recvs: [4, 3, 2, 1]
-            if self.rank == 0:
-                send_splits = [2, 3, 1, 4]
-                recv_splits = [2, 5, 3, 7]
-            elif self.rank == 1:
-                send_splits = [5, 2, 4, 3]
-                recv_splits = [3, 2, 6, 4]
-            elif self.rank == 2:
-                send_splits = [3, 6, 1, 2]
-                recv_splits = [1, 4, 1, 5]
-            else:  # rank 3
-                send_splits = [7, 4, 5, 1]
-                recv_splits = [4, 3, 2, 1]
-        else:
-            self.skipTest(f"Test not configured for world_size={self.world_size}")
+    def test_forward_values_and_shape(self):
+        """Forward pass should move the correct rows across ranks."""
+        send_splits, recv_splits = make_cyclic_splits(self.rank, self.world_size)
 
         total_send = sum(send_splits)
         total_recv = sum(recv_splits)
 
-        x = torch.randn(total_send, 8, device=self.device, requires_grad=True)
+        x = make_encoded_input(self.rank, send_splits, self.device).requires_grad_(True)
         out = all_to_all_differentiable(x, send_splits, recv_splits, self.group)
+        expected = make_expected_encoded_output(self.rank, self.world_size, self.device)
 
-        # Check output shape
         self.assertEqual(
             out.shape[0],
             total_recv,
@@ -92,20 +100,17 @@ def test_forward_shape(self):
             x.shape[1:],
             f"Rank {self.rank}: trailing dimensions should be preserved",
         )
+        torch.testing.assert_close(out, expected)
 
     def test_backward_no_deadlock(self):
         """Backward pass should not deadlock."""
-        if self.world_size == 2:
-            send_splits = [4, 4]
-            recv_splits = [4, 4]
-        elif self.world_size == 4:
-            send_splits = [2, 2, 2, 2]
-            recv_splits = [2, 2, 2, 2]
-        else:
-            self.skipTest(f"Test not configured for world_size={self.world_size}")
+        send_splits = [2] * self.world_size
+        recv_splits = [2] * self.world_size
 
         total_send = sum(send_splits)
-        x = torch.randn(total_send, 8, device=self.device, requires_grad=True)
+        x = torch.randn(
+            total_send, 8, device=self.device, dtype=torch.float64, requires_grad=True
+        )
 
         out = all_to_all_differentiable(x, send_splits, recv_splits, self.group)
         loss = (out**2).sum()
@@ -120,17 +125,13 @@ def test_backward_no_deadlock(self):
 
     def test_second_backward_no_deadlock(self):
         """Second backward (create_graph=True) should not deadlock."""
-        if self.world_size == 2:
-            send_splits = [3, 3]
-            recv_splits = [3, 3]
-        elif self.world_size == 4:
-            send_splits = [2, 2, 2, 2]
-            recv_splits = [2, 2, 2, 2]
-        else:
-            self.skipTest(f"Test not configured for world_size={self.world_size}")
+        send_splits = [2] * self.world_size
+        recv_splits = [2] * self.world_size
 
         total_send = sum(send_splits)
-        x = torch.randn(total_send, 8, device=self.device, requires_grad=True)
+        x = torch.randn(
+            total_send, 8, device=self.device, dtype=torch.float64, requires_grad=True
+        )
 
         # First forward
         out = all_to_all_differentiable(x, send_splits, recv_splits, self.group)
@@ -157,36 +158,19 @@ def test_second_backward_no_deadlock(self):
 
     def test_asymmetric_splits(self):
         """Test with asymmetric send/recv splits across ranks."""
-        # Constraint: rank i's send_splits[j] == rank j's recv_splits[i]
-        if self.world_size == 2:
-            # Rank 0 sends more to rank 1, rank 1 sends more to rank 0
-            send_splits = [2, 6] if self.rank == 0 else [5, 3]
-            recv_splits = [2, 5] if self.rank == 0 else [6, 3]
-        elif self.world_size == 4:
-            # Matrix: send[i][j] = recv[j][i]
-            # rank 0 sends: [1, 2, 3, 4] -> rank 0 recvs: [1, 3, 2, 4]
-            # rank 1 sends: [3, 2, 1, 4] -> rank 1 recvs: [2, 2, 3, 3]
-            # rank 2 sends: [2, 3, 4, 1] -> rank 2 recvs: [3, 1, 4, 2]
-            # rank 3 sends: [4, 3, 2, 1] -> rank 3 recvs: [4, 4, 1, 1]
-            if self.rank == 0:
-                send_splits = [1, 2, 3, 4]
-                recv_splits = [1, 3, 2, 4]
-            elif self.rank == 1:
-                send_splits = [3, 2, 1, 4]
-                recv_splits = [2, 2, 3, 3]
-            elif self.rank == 2:
-                send_splits = [2, 3, 4, 1]
-                recv_splits = [3, 1, 4, 2]
-            else:  # rank 3
-                send_splits = [4, 3, 2, 1]
-                recv_splits = [4, 4, 1, 1]
-        else:
-            self.skipTest(f"Test not configured for world_size={self.world_size}")
+        send_splits, recv_splits = make_cyclic_splits(self.rank, self.world_size)
+        self.assertNotEqual(
+            send_splits,
+            recv_splits,
+            f"Rank {self.rank}: split pattern should be asymmetric",
+        )
 
         total_send = sum(send_splits)
         total_recv = sum(recv_splits)
 
-        x = torch.randn(total_send, 16, device=self.device, requires_grad=True)
+        x = torch.randn(
+            total_send, 16, device=self.device, dtype=torch.float64, requires_grad=True
+        )
         out = all_to_all_differentiable(x, send_splits, recv_splits, self.group)
 
         # Check shape
@@ -198,12 +182,73 @@ def test_asymmetric_splits(self):
         loss.backward()
         self.assertIsNotNone(x.grad)
 
+    def test_three_layer_second_backward_no_deadlock(self):
+        """Chain three A2A calls and backprop through the first-order gradient."""
+        send_splits = [1] * self.world_size  # one row to every rank
+        recv_splits = [1] * self.world_size
+        x = torch.randn(
+            self.world_size,
+            4,
+            dtype=torch.float64,
+            device=self.device,
+            requires_grad=True,
+        )
+
+        y = x
+        for _ in range(3):
+            y = all_to_all_differentiable(y, send_splits, recv_splits, self.group)
+
+        loss = (y**2).sum()
+        (grad_x,) = torch.autograd.grad(loss, x, create_graph=True, retain_graph=True)
+        (grad_x**2).sum().backward()  # second backward, through grad_x's graph
+        self.assertIsNotNone(x.grad, f"Rank {self.rank}: second-order grad missing")
+        self.assertTrue(
+            (x.grad.abs() > 1e-6).any(),
+            f"Rank {self.rank}: second-order grad should be non-zero",
+        )
+
+    def test_gradgradcheck_fp64_world_group(self):
+        """Run fp64 gradgradcheck on the A2A op with self.group, one row per peer."""
+        torch.manual_seed(20260518)  # identical seed on every rank
+        if self.device.type == "cuda":
+            torch.cuda.manual_seed_all(20260518)
+
+        send_splits = [1] * self.world_size
+        recv_splits = [1] * self.world_size
+        x = torch.randn(
+            self.world_size,
+            2,
+            dtype=torch.float64,
+            device=self.device,
+            requires_grad=True,
+        )
+
+        def func(inp):
+            out = all_to_all_differentiable(
+                inp, send_splits, recv_splits, group=self.group
+            )
+            # Keep only the row this rank sent to itself: the checked output
+            # then depends only on this rank's own input, never on a peer's.
+            return out.narrow(0, self.rank, 1)
+
+        result = torch.autograd.gradgradcheck(
+            func,
+            (x,),
+            eps=1e-6,
+            atol=1e-4,
+            raise_exception=False,
+        )
+        self.assertTrue(
+            result,
+            f"Rank {self.rank}: distributed gradgradcheck failed",
+        )
+
 
 def run_tests():
     """Run all tests and report results."""
     import sys
 
-    rank, world_size, _ = setup_dist()
+    rank, world_size, device = setup_dist()
 
     # Only rank 0 prints header
     if rank == 0:
@@ -217,18 +262,20 @@ def run_tests():
     result = runner.run(suite)
 
     # Synchronize results across ranks (before cleanup)
-    success = torch.tensor([1 if result.wasSuccessful() else 0], dtype=torch.int32)
+    success = torch.tensor(
+        [1 if result.wasSuccessful() else 0], dtype=torch.int32, device=device
+    )
     if dist.is_initialized():
         dist.all_reduce(success, op=dist.ReduceOp.MIN)
 
         if rank == 0:
             if success.item() == 1:
                 sys.stdout.write(f"\n{'=' * 70}\n")
-                sys.stdout.write(f"✓ All tests passed on all {world_size} ranks\n")
+                sys.stdout.write(f"PASS: all tests passed on all {world_size} ranks\n")
                 sys.stdout.write(f"{'=' * 70}\n\n")
             else:
                 sys.stdout.write(f"\n{'=' * 70}\n")
-                sys.stdout.write("✗ Tests failed on at least one rank\n")
+                sys.stdout.write("FAIL: tests failed on at least one rank\n")
                 sys.stdout.write(f"{'=' * 70}\n\n")
 
         cleanup_dist()