deepmodeling
diff --git a/‎deepmd/pt/model/descriptor/sezm.py‎
Lines changed: 1 addition & 1 deletion b/‎deepmd/pt/model/descriptor/sezm.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎deepmd/pt/model/descriptor/sezm_nn/edge_cache.py‎
Lines changed: 5 additions & 5 deletions b/‎deepmd/pt/model/descriptor/sezm_nn/edge_cache.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎deepmd/pt/model/model/sezm_model.py‎
Lines changed: 47 additions & 62 deletions b/‎deepmd/pt/model/model/sezm_model.py‎
Lines changed: 47 additions & 62 deletions
@@ -952,7 +952,7 @@ def forward(
         extended_coord = extended_coord.to(self.compute_dtype)
         nf, nloc, nnei = nlist.shape
         nall = extended_coord.shape[1]
-        n_nodes = int(nf * nloc)
+        n_nodes = nf * nloc
         charge_spin = self._canonicalize_charge_spin(
             charge_spin,
             nf=nf,
 
@@ -290,7 +290,7 @@ def build_edge_cache(
         Per-edge cache.
     """
     nf, nloc, nnei = nlist.shape
-    n_nodes = int(nf * nloc)
+    n_nodes = nf * nloc
 
     # === Step 1. Force fp32+ for geometry ===
     geom_dtype = get_promoted_dtype(extended_coord.dtype)
@@ -492,10 +492,10 @@ def build_edge_cache_from_edges(
     edge_type_feat = edge_type_feat * edge_keep_f.to(dtype=edge_type_feat.dtype)
 
     # === Step 6. Source Freeze Propagation Gate (optional) ===
-    # The sparse-edge path packs one dummy masked edge per frame so the
-    # compiled graph sees a statically non-empty tensor. ``edge_keep_f``
-    # rewrites any such slot to ``w=1`` inside ``compute_edge_src_gate``,
-    # keeping the product reduction unaffected by padding.
+    # The sparse-edge path packs masked dummy edges so the compiled graph sees
+    # a statically non-empty, non-singular edge tensor. ``edge_keep_f`` rewrites
+    # any such slot to ``w=1`` inside ``compute_edge_src_gate``, keeping the
+    # product reduction unaffected by padding.
     edge_src_gate: torch.Tensor | None = None
     if bridging_switch is not None:
         with nvtx_range("src_gate"):
 
@@ -87,11 +87,12 @@
 function:
 
 * ``core_compute`` rebuilds a compact, GPU-friendly edge list from the
-  padded DeePMD neighbor list (``build_edge_list_from_nlist``), with a
-  single masked dummy edge appended so the edge tensor is never empty
-  (NOTE 10).  Edge vectors come from ``index_select`` on the extended
-  coordinate tensor, which keeps the gradient path back to coordinates
-  explicit and safe under symbolic shapes (NOTE 11).
+  padded DeePMD neighbor list (``build_edge_list_from_nlist``), with
+  masked dummy edges appended so the edge tensor has a non-singular
+  symbolic lower bound (NOTE 10).  Edge vectors come from
+  ``index_select`` on the extended coordinate tensor, which keeps the
+  gradient path back to coordinates explicit and safe under symbolic
+  shapes (NOTE 11).
 * The SeZM descriptor consumes the edge list and produces per-atom
   features.
 * The fitting network predicts per-atom energy; ``apply_out_stat`` adds
@@ -240,24 +241,22 @@
       cudagraphs capture autograd metadata only once.  Higher-order
       gradients need fresh metadata per call, so cudagraphs would feed
       stale autograd state into the second backward.
-* ``max_fusion_size`` -- mode-dependent
+* ``max_fusion_size=8``
       Caps kernel fusion complexity so Inductor's scheduler does not
       time out on the large edge-level reductions inside the
-      descriptor when nsel is big.  Training uses ``64`` (the long-
-      standing default, observed stable on every training run so far);
-      inference uses the tighter ``8`` to dodge the Triton lowering
-      failure described by the next bullet.
-* ``triton.persistent_reductions=False`` -- inference only
+      descriptor when nsel is big.  The deliberately small cap keeps
+      both training and inference fusions small enough for Triton IR
+      generation on GPU backends that are sensitive to large dynamic
+      edge graphs.
+* ``triton.persistent_reductions=False``
       Inductor's persistent-reduction scheduler fuses a ``sum`` with
       *all* neighbouring pointwise ops (``tanh_backward``, ``pow``,
       ``exp``, ``mul``, ``select``, ``slice``, ``view`` ...) into one
-      ``triton_per_fused_...`` kernel.  On the graph emitted by
-      inference (``create_graph=False``, no double-detach stripping,
-      different fused topology than training) this kernel hits Triton
-      bug ``PassManager::run failed`` inside ``make_ttgir``.  Training
-      never produces the same fused shape and does not benefit from
-      disabling the optimisation, so the flag is left on for training
-      to preserve kernel quality.
+      ``triton_per_fused_...`` kernel.  On SeZM's dynamic edge graph
+      this can hit Triton bug ``PassManager::run failed`` inside
+      ``make_ttgir``.  Disabling it forces the reduction into its own
+      kernel before either training or inference can form the
+      pathological fused IR.
 * ``triton.mix_order_reduction=False``
       Workaround for PyTorch <=2.11 bugs pytorch/pytorch#174379,
       #178080, #179494.  All three manifest only under data-dependent
@@ -324,17 +323,20 @@
 In eval mode we merely detach; no ``create_graph`` is requested, so the
 compiled kernel never has to build a backward graph.
 
-NOTE 10 -- Tail dummy edge
---------------------------
+NOTE 10 -- Tail dummy edges
+---------------------------
 
-``build_edge_list_from_nlist`` appends exactly one masked edge at the
-end of every batch.  Real edge compaction happens via
+``build_edge_list_from_nlist`` appends two masked edges at the end of
+every batch.  Real edge compaction happens via
 ``torch.nonzero(valid_mask)``, whose output length is data-dependent
 and can be zero in sparse or single-type systems.  make_fx cannot trace
 an "if n_edges == 0: skip" branch symbolically; without the dummy it
 would fall back to concrete shape specialization and break
-``dynamic=True``.  The dummy's ``edge_mask`` is ``False`` so it
-contributes exactly zero to every downstream sum or gather.
+``dynamic=True``.  A pair of dummy slots also gives Inductor's batched
+matmul lowering a static ``E >= 2`` edge-axis bound, avoiding
+data-dependent layout guards on ``E == 1``.  Each dummy's ``edge_mask``
+is ``False`` so it contributes exactly zero to every downstream sum or
+gather.
 
 NOTE 11 -- ``index_select`` for coordinate gradients
 ----------------------------------------------------
@@ -1690,44 +1692,23 @@ def compute_fn(
         # fresh graph is cheap and a segfault is fatal.
         traced = _rebuild_graph_module(traced)
 
-        # NOTE: Inductor options are mode-dependent.  Training has been
-        # running cleanly with ``max_fusion_size=64`` for a while, so we
-        # keep that path untouched to avoid destabilising it.  Inference
-        # (``self.training is False``) has shown a Triton
-        # ``make_ttgir`` / ``PassManager::run failed`` on the fused
-        # per-reduction kernel
-        # ``triton_per_fused_clone_exp_mul_pow_select_slice_sum_tanh_...``;
-        # the kernel itself is fine, but the *fused* IR is too big /
-        # too complex for Triton's lowering pipeline on this version.
-        # So inference:
-        #   * disables ``triton.persistent_reductions`` -- persistent
-        #     reduction is what lets Inductor pull a ``sum`` together
-        #     with all surrounding pointwise ops (including the
-        #     activation-backward pointwise chain) into one
-        #     ``per_fused_...`` kernel; turning it off forces the sum
-        #     to emit its own kernel and stops the pathological fuse.
-        #   * tightens ``max_fusion_size`` from 64 to 8, so even
-        #     non-persistent fusions stay small enough for Triton IR
-        #     generation to succeed.
-        # Training does not hit this path in practice (different graph
-        # topology under ``create_graph=True``), so we keep the looser
-        # options there to preserve kernel quality.
+        # NOTE: Conservative Inductor options keep SeZM's dynamic edge
+        # graph from forming overly large Triton reduction kernels
+        # (``make_ttgir`` / ``PassManager::run failed``) on some
+        # GPU/Triton combinations.
         compile_options: dict[str, Any] = {
             "max_autotune": False,
             "shape_padding": True,
             "epilogue_fusion": False,
             "triton.cudagraphs": False,
+            "max_fusion_size": 8,
+            "triton.persistent_reductions": False,
             # NOTE: ``mix_order_reduction`` hits multiple bugs under
             # data-dependent symbolic shapes on PyTorch <=2.11
             # (pytorch/pytorch#174379, #178080, #179494) -- our edge
             # count is exactly that kind of shape.
             "triton.mix_order_reduction": False,
         }
-        if self.training:
-            compile_options["max_fusion_size"] = 64
-        else:
-            compile_options["max_fusion_size"] = 8
-            compile_options["triton.persistent_reductions"] = False
         try:
             from torch._inductor import config as inductor_config
 
@@ -1979,9 +1960,10 @@ def build_edge_list_from_nlist(
         Build a compact edge list from DeePMD padded neighbor list.
 
         Edge vectors are computed via ``index_select`` on ``extended_coord``
-        so they remain differentiable w.r.t. the input coordinates.  One
-        masked dummy edge is always appended to avoid data-dependent empty-edge
-        branches that ``make_fx`` cannot trace.
+        so they remain differentiable w.r.t. the input coordinates.  Two
+        masked dummy edges are always appended.  This avoids both the
+        data-dependent empty-edge branches that ``make_fx`` cannot trace and the
+        singular edge-axis guards in Inductor's batched matmul lowering.
 
         Parameters
         ----------
@@ -1995,11 +1977,11 @@ def build_edge_list_from_nlist(
         Returns
         -------
         edge_index
-            Edge indices with shape (2, E+1) where E is valid edge count.
+            Edge indices with shape (2, E+2) where E is valid edge count.
         edge_vec
-            Edge vectors with shape (E+1, 3).
+            Edge vectors with shape (E+2, 3).
         edge_mask
-            Boolean mask with shape (E+1,).  The trailing element is ``False``.
+            Boolean mask with shape (E+2,).  The two trailing elements are ``False``.
         """
         nf, nloc, nsel = nlist.shape
         n_actual = nf * nloc
@@ -2051,19 +2033,22 @@ def build_edge_list_from_nlist(
 
         valid_idx = torch.nonzero(edge_mask_actual, as_tuple=False).flatten()
 
-        # === Step 3. Compact edges + append one masked dummy ===
-        # NOTE: Always append exactly one masked dummy edge.
+        # === Step 3. Compact edges + append masked dummies ===
+        # NOTE: Always append two masked dummy edges.
         # ``torch.nonzero(edge_mask_actual)`` produces a data-dependent
         # number of valid edges, which can be zero on sparse or
         # single-type systems.  make_fx cannot trace an
         # ``if n_edges == 0: skip`` branch symbolically; without the
         # dummy it would fall back to concrete shape specialisation and
-        # break ``torch.compile(dynamic=True)`` for later batches.  The
+        # break ``torch.compile(dynamic=True)`` for later batches.  Two
+        # dummy edges keep the symbolic edge axis statically above one,
+        # which avoids Inductor bmm layout guards on ``E == 1``.  Each
         # dummy edge copies entry 0 (any in-range index is fine) and
         # carries ``edge_mask=False`` so every downstream sum, gather
         # or scatter ignores it.
+        dummy_count = 2
         padded_idx = torch.cat(
-            [valid_idx, torch.zeros(1, dtype=torch.long, device=device)]
+            [valid_idx, torch.zeros(dummy_count, dtype=torch.long, device=device)]
         )
         src_sel = src_actual.index_select(0, padded_idx)
         dst_sel = dst_actual.index_select(0, padded_idx)
@@ -2072,7 +2057,7 @@ def build_edge_list_from_nlist(
         edge_mask = torch.cat(
             [
                 torch.ones(valid_idx.shape[0], dtype=torch.bool, device=device),
-                torch.zeros(1, dtype=torch.bool, device=device),
+                torch.zeros(dummy_count, dtype=torch.bool, device=device),
             ]
         )
         return edge_index, edge_vec_sel, edge_mask