diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py index 597f8ea006..fb77838b4c 100644 --- a/deepmd/dpmodel/model/make_model.py +++ b/deepmd/dpmodel/model/make_model.py @@ -614,7 +614,13 @@ def _format_nlist( axis=-1, ) - if n_nnei > nnei or extra_nlist_sort: + # Order matters for torch.export: Python evaluates `or` left-to-right + # with short-circuit. When `extra_nlist_sort=True` (Python bool) is + # on the left, the right-hand `n_nnei > nnei` is not evaluated, so no + # symbolic guard is registered on the dynamic `n_nnei` dimension. + # Swapping the operands would force the SymInt comparison to run and + # emit an `_assert_scalar` node in the exported graph. + if extra_nlist_sort or n_nnei > nnei: n_nf, n_nloc, n_nnei = nlist.shape # make a copy before revise m_real_nei = nlist >= 0 diff --git a/deepmd/entrypoints/convert_backend.py b/deepmd/entrypoints/convert_backend.py index a8cf20c6b3..e5cd51b386 100644 --- a/deepmd/entrypoints/convert_backend.py +++ b/deepmd/entrypoints/convert_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import logging from typing import ( Any, ) @@ -7,11 +8,14 @@ Backend, ) +log = logging.getLogger(__name__) + def convert_backend( *, # Enforce keyword-only arguments INPUT: str, OUTPUT: str, + atomic_virial: bool = False, **kwargs: Any, ) -> None: """Convert a model file from one backend to another. @@ -20,12 +24,31 @@ def convert_backend( ---------- INPUT : str The input model file. - INPUT : str + OUTPUT : str The output model file. + atomic_virial : bool + If True, export .pt2/.pte models with per-atom virial correction. + This adds ~2.5x inference cost. Default False. Silently ignored + (with a warning) for backends that don't support the flag. 
""" inp_backend: Backend = Backend.detect_backend_by_model(INPUT)() out_backend: Backend = Backend.detect_backend_by_model(OUTPUT)() inp_hook = inp_backend.serialize_hook out_hook = out_backend.deserialize_hook data = inp_hook(INPUT) - out_hook(OUTPUT, data) + # Forward atomic_virial to pt_expt deserialize_to_file if applicable; + # warn and skip the flag for backends that don't accept it so that + # scripts passing --atomic-virial indiscriminately don't break. + import inspect + + sig = inspect.signature(out_hook) + if "do_atomic_virial" in sig.parameters: + out_hook(OUTPUT, data, do_atomic_virial=atomic_virial) + else: + if atomic_virial: + log.warning( + "--atomic-virial is only meaningful for pt_expt .pt2/.pte " + "outputs; ignoring it for output backend %s", + out_backend.name, + ) + out_hook(OUTPUT, data) diff --git a/deepmd/main.py b/deepmd/main.py index 3afcda8b4a..bf59dfdad5 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -915,6 +915,15 @@ def main_parser() -> argparse.ArgumentParser: ) parser_convert_backend.add_argument("INPUT", help="The input model file.") parser_convert_backend.add_argument("OUTPUT", help="The output model file.") + parser_convert_backend.add_argument( + "--atomic-virial", + action="store_true", + default=False, + help="Export .pt2/.pte models with per-atom virial correction. " + "This adds ~2.5x inference cost but is required for " + "LAMMPS compute/atom virial output. 
" + "Ignored (with a warning) for other output backends.", + ) # * show model ****************************************************************** parser_show = subparsers.add_parser( diff --git a/deepmd/pt_expt/model/dipole_model.py b/deepmd/pt_expt/model/dipole_model.py index 79ae26024e..4b0e570ecb 100644 --- a/deepmd/pt_expt/model/dipole_model.py +++ b/deepmd/pt_expt/model/dipole_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -16,6 +17,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -137,6 +139,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -147,6 +150,13 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # See make_model.py for the rationale of the pad + monkeypatch. 
+ _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced diff --git a/deepmd/pt_expt/model/dos_model.py b/deepmd/pt_expt/model/dos_model.py index 2e69d90ab3..219c22e753 100644 --- a/deepmd/pt_expt/model/dos_model.py +++ b/deepmd/pt_expt/model/dos_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -16,6 +17,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -117,6 +119,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -127,6 +130,13 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # See make_model.py for the rationale of the pad + monkeypatch. 
+ _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced diff --git a/deepmd/pt_expt/model/dp_linear_model.py b/deepmd/pt_expt/model/dp_linear_model.py index 46790c877e..0ac75659b0 100644 --- a/deepmd/pt_expt/model/dp_linear_model.py +++ b/deepmd/pt_expt/model/dp_linear_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -19,6 +20,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -142,6 +144,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -152,9 +155,16 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # See make_model.py for the rationale of the pad + monkeypatch. 
+ _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced @classmethod def update_sel( diff --git a/deepmd/pt_expt/model/dp_zbl_model.py b/deepmd/pt_expt/model/dp_zbl_model.py index b7f164114b..baa30c4ce0 100644 --- a/deepmd/pt_expt/model/dp_zbl_model.py +++ b/deepmd/pt_expt/model/dp_zbl_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -16,6 +17,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -139,6 +141,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -149,6 +152,15 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # Force `_format_nlist`'s sort branch into the compiled graph so the + # exported model tolerates oversized nlists at runtime — see + # make_model.py for the full rationale. 
+ _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py index 4bd9792420..b28b81ffb1 100644 --- a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import math +import types from typing import ( Any, ) @@ -28,6 +29,28 @@ ) +def _pad_nlist_for_export(nlist: torch.Tensor) -> torch.Tensor: + """Append a single ``-1`` column to ``nlist`` for export-time tracing. + + Used inside ``forward_common_lower_exportable`` (and its spin counterpart) + so that ``_format_nlist``'s terminal slice ``ret[..., :nnei]`` truncates + to a statically sized output. Without the extra column, torch.export + cannot prove the ``ret.shape[-1] == nnei`` assertion at trace time and + would specialise the dynamic ``nnei`` dim to the sample value. + + Combined with the short-circuit order in ``_format_nlist`` + (``extra_nlist_sort`` on the left) and the ``need_sorted_nlist_for_lower`` + override during tracing, this keeps the compiled graph's ``nnei`` axis + fully dynamic and free of symbolic shape guards. 
+ """ + pad = -torch.ones( + (*nlist.shape[:2], 1), + dtype=nlist.dtype, + device=nlist.device, + ) + return torch.cat([nlist, pad], dim=-1) + + def _cal_hessian_ext( model: Any, kk: str, @@ -346,6 +369,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_common_lower( extended_coord, extended_atype, @@ -356,13 +380,26 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, - extended_atype, - nlist, - mapping, - fparam, - aparam, + # Force `_format_nlist`'s sort branch into the compiled graph so the + # exported model tolerates oversized nlists at runtime (LAMMPS builds + # nlists with rcut+skin). Combined with the short-circuit order in + # `_format_nlist`, no symbolic guard on the dynamic `nnei` axis is + # emitted. + _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType( + lambda self: True, model ) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, + extended_atype, + nlist, + mapping, + fparam, + aparam, + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced return CM diff --git a/deepmd/pt_expt/model/polar_model.py b/deepmd/pt_expt/model/polar_model.py index d421bb76a4..dd6b1c5d0f 100644 --- a/deepmd/pt_expt/model/polar_model.py +++ b/deepmd/pt_expt/model/polar_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -16,6 +17,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -117,6 +119,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -127,6 +130,13 @@ def fn( 
do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # See make_model.py for the rationale of the pad + monkeypatch. + _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced diff --git a/deepmd/pt_expt/model/property_model.py b/deepmd/pt_expt/model/property_model.py index 72a327fb03..223f8e5d78 100644 --- a/deepmd/pt_expt/model/property_model.py +++ b/deepmd/pt_expt/model/property_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -16,6 +17,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -124,6 +126,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_lower( extended_coord, extended_atype, @@ -134,6 +137,13 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, extended_atype, nlist, mapping, fparam, aparam - ) + # See make_model.py for the rationale of the pad + monkeypatch. 
+ _orig_need_sort = model.need_sorted_nlist_for_lower + model.need_sorted_nlist_for_lower = types.MethodType(lambda self: True, model) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, extended_atype, nlist, mapping, fparam, aparam + ) + finally: + model.need_sorted_nlist_for_lower = _orig_need_sort + return traced diff --git a/deepmd/pt_expt/model/spin_model.py b/deepmd/pt_expt/model/spin_model.py index 70f41f0701..e69ee29f5a 100644 --- a/deepmd/pt_expt/model/spin_model.py +++ b/deepmd/pt_expt/model/spin_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import types from typing import ( Any, ) @@ -17,6 +18,7 @@ ) from .make_model import ( + _pad_nlist_for_export, make_model, ) from .model import ( @@ -96,6 +98,7 @@ def fn( aparam: torch.Tensor | None, ) -> dict[str, torch.Tensor]: extended_coord = extended_coord.detach().requires_grad_(True) + nlist = _pad_nlist_for_export(nlist) return model.forward_common_lower( extended_coord, extended_atype, @@ -107,15 +110,30 @@ def fn( do_atomic_virial=do_atomic_virial, ) - return make_fx(fn, **make_fx_kwargs)( - extended_coord, - extended_atype, - extended_spin, - nlist, - mapping, - fparam, - aparam, + # Force the sort branch of `_format_nlist` into the compiled graph by + # overriding `need_sorted_nlist_for_lower` on the backbone (which is + # where `call_common_lower` reads it). Short-circuit `or` in + # `_format_nlist` then skips the symbolic `n_nnei > nnei` comparison, + # so no spurious shape guard is emitted. See make_model.py for the + # non-spin counterpart. 
+ backbone = model.backbone_model + _orig_need_sort = backbone.need_sorted_nlist_for_lower + backbone.need_sorted_nlist_for_lower = types.MethodType( + lambda self: True, backbone ) + try: + traced = make_fx(fn, **make_fx_kwargs)( + extended_coord, + extended_atype, + extended_spin, + nlist, + mapping, + fparam, + aparam, + ) + finally: + backbone.need_sorted_nlist_for_lower = _orig_need_sort + return traced def forward_common_lower( self, *args: Any, **kwargs: Any diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index f59c397525..04cdedd6cf 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -17,7 +17,7 @@ def _strip_shape_assertions(graph_module: torch.nn.Module) -> None: - """Remove shape-guard assertion nodes from a spin model's exported graph. + """Neutralise shape-guard assertion nodes in a spin model's exported graph. ``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape relationships discovered during tracing. For the spin model, the atom- @@ -32,8 +32,12 @@ def _strip_shape_assertions(graph_module: torch.nn.Module) -> None: so filtering by message content is not reliable. Since ``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape guards into these deferred assertions, and the only shape relationships in - the spin model involve nall/nloc, removing all of them is safe in this + the spin model involve nall/nloc, neutralising all of them is safe in this context. + + We replace each assertion's condition with ``True`` rather than erasing the + node; erasing nodes can disturb the FX graph structure and produce NaN + gradients on some Python/torch versions. 
""" graph = graph_module.graph for node in list(graph.nodes): @@ -41,8 +45,7 @@ def _strip_shape_assertions(graph_module: torch.nn.Module) -> None: node.op == "call_function" and node.target is torch.ops.aten._assert_scalar.default ): - graph.erase_node(node) - graph.eliminate_dead_code() + node.args = (True, node.args[1]) graph_module.recompile() @@ -178,11 +181,12 @@ def _make_sample_inputs( def _build_dynamic_shapes( *sample_inputs: torch.Tensor | None, has_spin: bool = False, + model_nnei: int = 1, ) -> tuple: """Build dynamic shape specifications for torch.export. - Marks nframes, nloc and nall as dynamic dimensions so the exported - program handles arbitrary frame and atom counts. + Marks nframes, nloc, nall and nnei as dynamic dimensions so the exported + program handles arbitrary frame, atom and neighbor counts. Parameters ---------- @@ -190,12 +194,15 @@ def _build_dynamic_shapes( Sample inputs: either 6 tensors (non-spin) or 7 tensors (spin). has_spin : bool Whether the inputs include an extended_spin tensor. + model_nnei : int + The model's sum(sel). Used as the min for the dynamic nnei dim. Returns a tuple (not dict) to match positional args of the make_fx traced module, whose arg names may have suffixes like ``_1``. 
""" nframes_dim = torch.export.Dim("nframes", min=1) nall_dim = torch.export.Dim("nall", min=1) nloc_dim = torch.export.Dim("nloc", min=1) + nnei_dim = torch.export.Dim("nnei", min=max(1, model_nnei)) if has_spin: # (ext_coord, ext_atype, ext_spin, nlist, mapping, fparam, aparam) @@ -205,7 +212,11 @@ def _build_dynamic_shapes( {0: nframes_dim, 1: nall_dim}, # extended_coord: (nframes, nall, 3) {0: nframes_dim, 1: nall_dim}, # extended_atype: (nframes, nall) {0: nframes_dim, 1: nall_dim}, # extended_spin: (nframes, nall, 3) - {0: nframes_dim, 1: nloc_dim}, # nlist: (nframes, nloc, nnei) + { + 0: nframes_dim, + 1: nloc_dim, + 2: nnei_dim, + }, # nlist: (nframes, nloc, nnei) — nnei is dynamic {0: nframes_dim, 1: nall_dim}, # mapping: (nframes, nall) {0: nframes_dim} if fparam is not None else None, # fparam {0: nframes_dim, 1: nloc_dim} if aparam is not None else None, # aparam @@ -217,7 +228,11 @@ def _build_dynamic_shapes( return ( {0: nframes_dim, 1: nall_dim}, # extended_coord: (nframes, nall, 3) {0: nframes_dim, 1: nall_dim}, # extended_atype: (nframes, nall) - {0: nframes_dim, 1: nloc_dim}, # nlist: (nframes, nloc, nnei) + { + 0: nframes_dim, + 1: nloc_dim, + 2: nnei_dim, + }, # nlist: (nframes, nloc, nnei) — nnei is dynamic {0: nframes_dim, 1: nall_dim}, # mapping: (nframes, nall) {0: nframes_dim} if fparam is not None else None, # fparam {0: nframes_dim, 1: nloc_dim} if aparam is not None else None, # aparam @@ -257,6 +272,7 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: "type_map": model.get_type_map(), "rcut": model.get_rcut(), "sel": model.get_sel(), + "nnei": sum(model.get_sel()), "dim_fparam": model.get_dim_fparam(), "dim_aparam": model.get_dim_aparam(), "mixed_types": model.mixed_types(), @@ -336,6 +352,7 @@ def deserialize_to_file( model_file: str, data: dict, model_json_override: dict | None = None, + do_atomic_virial: bool = False, ) -> None: """Deserialize a dictionary to a .pte or .pt2 model file. 
@@ -356,16 +373,24 @@ def deserialize_to_file( If provided, this dict is stored in model.json instead of ``data``. Used by ``dp compress`` to store the compressed model dict while tracing the uncompressed model (make_fx cannot trace custom ops). + do_atomic_virial : bool + If True, export with per-atom virial correction (3 extra backward + passes, ~2.5x slower). Default False for best performance. """ if model_file.endswith(".pt2"): - _deserialize_to_file_pt2(model_file, data, model_json_override) + _deserialize_to_file_pt2( + model_file, data, model_json_override, do_atomic_virial + ) else: - _deserialize_to_file_pte(model_file, data, model_json_override) + _deserialize_to_file_pte( + model_file, data, model_json_override, do_atomic_virial + ) def _trace_and_export( data: dict, model_json_override: dict | None = None, + do_atomic_virial: bool = False, ) -> tuple: """Common logic: build model, trace, export. @@ -447,7 +472,7 @@ def _trace_and_export( mapping_t, fparam=fparam, aparam=aparam, - do_atomic_virial=True, + do_atomic_virial=do_atomic_virial, tracing_mode="symbolic", _allow_non_fake_inputs=True, ) @@ -463,7 +488,7 @@ def _trace_and_export( mapping_t, fparam=fparam, aparam=aparam, - do_atomic_virial=True, + do_atomic_virial=do_atomic_virial, tracing_mode="symbolic", _allow_non_fake_inputs=True, ) @@ -477,7 +502,9 @@ def _trace_and_export( # graph. Exporting on CPU keeps devices consistent; we move the # ExportedProgram to the target device afterwards via the official # move_to_device_pass (avoids FakeTensor device-propagation errors). - dynamic_shapes = _build_dynamic_shapes(*sample_inputs, has_spin=is_spin) + dynamic_shapes = _build_dynamic_shapes( + *sample_inputs, has_spin=is_spin, model_nnei=sum(model.get_sel()) + ) exported = torch.export.export( traced, sample_inputs, @@ -487,13 +514,13 @@ def _trace_and_export( ) if is_spin: - # torch.export re-introduces shape-guard assertions even when - # the make_fx graph has none. 
The spin model's atom-doubling - # logic creates slice patterns that depend on (nall - nloc), - # producing guards like Ne(nall, nloc). These guards are - # spurious: the model is correct when nall == nloc (NoPBC). - # Strip them from the exported graph so the model can be - # used with any valid nall >= nloc. + # The spin model's atom-doubling slice patterns depend on + # (nall - nloc), producing guards like Ne(nall, nloc). These are + # spurious — the model is correct when nall == nloc (NoPBC). + # Non-spin models don't emit shape guards because the short-circuit + # order in `_format_nlist` (dpmodel) keeps the dynamic `nnei` axis + # free of symbolic comparisons when `extra_nlist_sort=True` + # (see `forward_common_lower_exportable` in pt_expt/model/make_model.py). _strip_shape_assertions(exported.graph_module) # 7. Move the exported program to the target device if needed. @@ -504,7 +531,10 @@ def _trace_and_export( exported = move_to_device_pass(exported, target_device) - # 8. Prepare JSON-serializable model dict + # 8. Record export-time config in metadata + metadata["do_atomic_virial"] = do_atomic_virial + + # 9. Prepare JSON-serializable model dict json_source = model_json_override if model_json_override is not None else data data_for_json = deepcopy(json_source) data_for_json = _numpy_to_json_serializable(data_for_json) @@ -516,10 +546,11 @@ def _deserialize_to_file_pte( model_file: str, data: dict, model_json_override: dict | None = None, + do_atomic_virial: bool = False, ) -> None: """Deserialize a dictionary to a .pte model file.""" exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override + data, model_json_override, do_atomic_virial ) model_def_script = data.get("model_def_script") or {} @@ -537,6 +568,7 @@ def _deserialize_to_file_pt2( model_file: str, data: dict, model_json_override: dict | None = None, + do_atomic_virial: bool = False, ) -> None: """Deserialize a dictionary to a .pt2 model file (AOTInductor). 
@@ -551,11 +583,31 @@ def _deserialize_to_file_pt2( ) exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override + data, model_json_override, do_atomic_virial ) - # Compile via AOTInductor into a .pt2 package - aoti_compile_and_package(exported, package_path=model_file) + # On CUDA, aggressive kernel fusion (default realize_opcount_threshold=30) + # causes NaN in the backward pass (force/virial) of attention-based + # descriptors (DPA1, DPA2). Setting threshold=0 prevents fusion and + # avoids the NaN. Only applied on CUDA; CPU compilation is unaffected. + # + # NOTE: `torch._inductor.config` is a process-wide singleton. The + # save/restore pattern here is NOT thread-safe — concurrent AOTInductor + # compilations from multiple threads would race on this global. Callers + # must serialise `.pt2` exports if running under a thread pool. Processes + # are fine (each has its own inductor config). + import torch._inductor.config as _inductor_config + + import deepmd.pt_expt.utils.env as _env + + is_cuda = _env.DEVICE.type == "cuda" + saved_threshold = _inductor_config.realize_opcount_threshold + if is_cuda: + _inductor_config.realize_opcount_threshold = 0 + try: + aoti_compile_and_package(exported, package_path=model_file) + finally: + _inductor_config.realize_opcount_threshold = saved_threshold # Embed metadata into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index 0d42324d24..1bcf44a885 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -208,9 +208,11 @@ class DeepPotPTExpt : public DeepPotBackend { bool gpu_enabled; std::vector type_map; std::vector output_keys; // sorted internal output key names - bool mixed_types; - std::vector sel; + bool do_atomic_virial; // whether model was exported with atomic virial corr + int nnei; // expected nlist nnei dimension (= 
sum(sel)) NeighborListData nlist_data; + at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) + at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) std::unique_ptr loader; /** diff --git a/source/api_cc/include/DeepSpinPTExpt.h b/source/api_cc/include/DeepSpinPTExpt.h index af108c7690..47b38767d4 100644 --- a/source/api_cc/include/DeepSpinPTExpt.h +++ b/source/api_cc/include/DeepSpinPTExpt.h @@ -183,9 +183,11 @@ class DeepSpinPTExpt : public DeepSpinBackend { bool gpu_enabled; std::vector type_map; std::vector output_keys; - bool mixed_types; - std::vector sel; + bool do_atomic_virial; // whether model was exported with atomic virial corr + int nnei; // expected nlist nnei dimension (= sum(sel)) NeighborListData nlist_data; + at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) + at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) std::unique_ptr loader; std::vector run_model(const torch::Tensor& coord, diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index a888d29758..61b823ae62 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -101,6 +101,45 @@ inline void build_comm_dict_with_virtual_atoms( remapped_sendnum.data(), remapped_recvnum.data()); } +/** + * @brief Flatten a jagged neighbor list into a [1, nloc, nnei] tensor. + * + * Each row in @p data may have a different number of neighbors. Short rows + * are padded with -1. The output width is max(min_nnei, max_row_length). + * No truncation or distance sorting is done — the model's format_nlist + * handles that on-device. + * + * If @p min_nnei is 0 (the default used by the .pth callers) and every row + * is empty (no atom has any neighbor — fully-dissociated system), the + * output shape is [1, nloc, 0]. PyTorch accepts zero-sized dimensions, and + * the eager `_format_nlist` pads it back up to sum(sel). 
.pt2 callers + * always pass @p min_nnei = sum(sel) > 0, so the output width is at least + * sum(sel) for them. + * + * @param data Jagged neighbor list: data[i] holds neighbor indices + * for local atom i. + * @param min_nnei Minimum width of the nnei dimension. For .pt2 models + * this should be sum(sel) from the model metadata, because + * torch.export marks nnei >= sum(sel) as a dynamic constraint. + * For .pth models 0 (the default) is fine. + */ +inline torch::Tensor createNlistTensor( + const std::vector>& data, int min_nnei = 0) { + int nloc = static_cast(data.size()); + int nnei = min_nnei; + for (int ii = 0; ii < nloc; ++ii) { + nnei = std::max(nnei, static_cast(data[ii].size())); + } + std::vector flat_data(static_cast(nloc) * nnei, -1); + for (int ii = 0; ii < nloc; ++ii) { + for (size_t jj = 0; jj < data[ii].size(); ++jj) { + flat_data[static_cast(ii) * nnei + jj] = data[ii][jj]; + } + } + torch::Tensor flat_tensor = torch::tensor(flat_data, torch::kInt32); + return flat_tensor.view({1, nloc, nnei}); +} + } // namespace deepmd #endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index fc09949691..d69dbb8f82 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -32,22 +32,6 @@ void DeepPotPT::translate_error(std::function f) { } } -torch::Tensor createNlistTensor(const std::vector>& data) { - size_t total_size = 0; - for (const auto& row : data) { - total_size += row.size(); - } - std::vector flat_data; - flat_data.reserve(total_size); - for (const auto& row : data) { - flat_data.insert(flat_data.end(), row.begin(), row.end()); - } - - torch::Tensor flat_tensor = torch::tensor(flat_data, torch::kInt32); - int nloc = data.size(); - int nnei = nloc > 0 ? 
total_size / nloc : 0; - return flat_tensor.view({1, nloc, nnei}); -} DeepPotPT::DeepPotPT() : inited(false) {} DeepPotPT::DeepPotPT(const std::string& model, const int& gpu_rank, diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index c1f3d9d674..db099ed464 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -12,12 +12,12 @@ #include "SimulationRegion.h" #include "common.h" +#include "commonPT.h" #include "commonPTExpt.h" #include "device.h" #include "errors.h" #include "neighbor_list.h" -using deepmd::ptexpt::buildTypeSortedNlist; using deepmd::ptexpt::parse_json; using deepmd::ptexpt::read_zip_entry; @@ -93,7 +93,6 @@ void DeepPotPTExpt::init(const std::string& model, ntypes = static_cast(metadata["type_map"].as_array().size()); dfparam = metadata["dim_fparam"].as_int(); daparam = metadata["dim_aparam"].as_int(); - mixed_types = metadata["mixed_types"].as_bool(); aparam_nall = false; // pt_expt models use nloc for aparam if (metadata.obj_val.count("has_default_fparam")) { has_default_fparam_ = metadata["has_default_fparam"].as_bool(); @@ -120,16 +119,29 @@ void DeepPotPTExpt::init(const std::string& model, } } + if (metadata.obj_val.count("do_atomic_virial")) { + do_atomic_virial = metadata["do_atomic_virial"].as_bool(); + } else { + // Older models without this field were exported with do_atomic_virial=True + do_atomic_virial = true; + } + + // Read expected nnei (= sum(sel)) — the .pt2 graph has this dimension static. 
+ if (metadata.obj_val.count("nnei")) { + nnei = metadata["nnei"].as_int(); + } else { + // Fallback: compute from sel array + nnei = 0; + for (const auto& v : metadata["sel"].as_array()) { + nnei += v.as_int(); + } + } + type_map.clear(); for (const auto& v : metadata["type_map"].as_array()) { type_map.push_back(v.as_string()); } - sel.clear(); - for (const auto& v : metadata["sel"].as_array()) { - sel.push_back(v.as_int()); - } - // Parse output keys from metadata output_keys.clear(); for (const auto& v : metadata["output_keys"].as_array()) { @@ -211,6 +223,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, const std::vector& fparam, const std::vector& aparam, const bool atomic) { + // Fail fast before allocating any tensors: refuse to run if the caller + // asked for atomic virial but the .pt2 was exported without it. + if (atomic && !do_atomic_virial) { + throw deepmd::deepmd_exception( + "Atomic virial was requested (e.g. by LAMMPS compute */atom/virial) " + "but this .pt2 model was exported without it (metadata field " + "do_atomic_virial=False). Atomic virial adds ~2.5x inference cost " + "and is off by default for .pt2. To enable it, regenerate with: " + "dp convert-backend --atomic-virial INPUT.pth OUTPUT.pt2"); + } torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); @@ -252,40 +274,39 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); + // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition, + // atom exchange between subdomains), so `ago > 0` implies the cached + // mapping and nlist tensors are still valid. Rebuild only on ago==0. 
if (ago == 0) { nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); nlist_data.padding(); - } - // Build type-sorted, sel-limited nlist expected by the .pt2 model - at::Tensor firstneigh_tensor = - buildTypeSortedNlist(nlist_data.jlist, coord_d, datype, sel, nloc, - mixed_types) - .to(device); - // Build mapping tensor. - // NOTE: must .clone() because the local vector goes out of scope before - // run_model is called, and torch::from_blob does not copy the data. - at::Tensor mapping_tensor; - if (lmp_list.mapping) { - std::vector mapping(nall_real); - for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; - } - mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) - .clone() - .to(device); - } else { - // Default identity mapping for local atoms - std::vector mapping(nall_real); - for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = ii; + // Rebuild mapping tensor + if (lmp_list.mapping) { + std::vector mapping(nall_real); + for (int ii = 0; ii < nall_real; ii++) { + mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; + } + mapping_tensor = + torch::from_blob(mapping.data(), {1, nall_real}, int_option) + .clone() + .to(device); + } else { + // Default identity mapping for local atoms + std::vector mapping(nall_real); + for (int ii = 0; ii < nall_real; ii++) { + mapping[ii] = ii; + } + mapping_tensor = + torch::from_blob(mapping.data(), {1, nall_real}, int_option) + .clone() + .to(device); } - mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) - .clone() - .to(device); + + // Flatten raw nlist — the .pt2 model sorts by distance on-device. 
+ firstneigh_tensor = + createNlistTensor(nlist_data.jlist, nnei).to(torch::kInt64).to(device); } // Build fparam/aparam tensors (cast to float64 for the model) @@ -434,6 +455,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, const std::vector& fparam, const std::vector& aparam, const bool atomic) { + // Fail fast before allocating any tensors (same check as the nlist + // overload — see its comment). + if (atomic && !do_atomic_virial) { + throw deepmd::deepmd_exception( + "Atomic virial was requested (e.g. by LAMMPS compute */atom/virial) " + "but this .pt2 model was exported without it (metadata field " + "do_atomic_virial=False). Atomic virial adds ~2.5x inference cost " + "and is off by default for .pt2. To enable it, regenerate with: " + "dp convert-backend --atomic-virial INPUT.pth OUTPUT.pt2"); + } int natoms = atype.size(); int nframes = coord.size() / (natoms * 3); if (nframes > 1) { @@ -518,8 +549,6 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, ncell, ext_stt, ext_end, region, ncell); } - // 3. Build type-sorted, sel-limited nlist (uses double coords for distances) - // 4. Convert to tensors (always float64 for .pt2 model) // NOTE: must .clone() because from_blob does not copy data, and the local // vectors would go out of scope before run_model completes. @@ -532,10 +561,9 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, torch::from_blob(atype_64.data(), {1, nall}, int_options) .clone() .to(device); + // Flatten raw nlist — the .pt2 model sorts by distance on-device. 
at::Tensor nlist_tensor = - buildTypeSortedNlist(nlist_raw, coord_cpy_d, atype_cpy, sel, nloc, - mixed_types) - .to(device); + createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device); std::vector mapping_64(mapping_vec.begin(), mapping_vec.end()); at::Tensor mapping_tensor = torch::from_blob(mapping_64.data(), {1, nall}, int_options) diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc index e7fdb0a6a5..5add377045 100644 --- a/source/api_cc/src/DeepSpinPT.cc +++ b/source/api_cc/src/DeepSpinPT.cc @@ -31,22 +31,6 @@ void DeepSpinPT::translate_error(std::function f) { } } -torch::Tensor createNlistTensor2(const std::vector>& data) { - size_t total_size = 0; - for (const auto& row : data) { - total_size += row.size(); - } - std::vector flat_data; - flat_data.reserve(total_size); - for (const auto& row : data) { - flat_data.insert(flat_data.end(), row.begin(), row.end()); - } - - torch::Tensor flat_tensor = torch::tensor(flat_data, torch::kInt32); - int nloc = data.size(); - int nnei = nloc > 0 ? 
total_size / nloc : 0; - return flat_tensor.view({1, nloc, nnei}); -} DeepSpinPT::DeepSpinPT() : inited(false) {} DeepSpinPT::DeepSpinPT(const std::string& model, const int& gpu_rank, @@ -209,7 +193,7 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener, .to(device); } } - at::Tensor firstneigh = createNlistTensor2(nlist_data.jlist); + at::Tensor firstneigh = createNlistTensor(nlist_data.jlist); firstneigh_tensor = firstneigh.to(torch::kInt64).to(device); bool do_atom_virial_tensor = atomic; c10::optional fparam_tensor; diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index ae4ef423ed..dcd7df55a4 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -12,12 +12,12 @@ #include "SimulationRegion.h" #include "common.h" +#include "commonPT.h" #include "commonPTExpt.h" #include "device.h" #include "errors.h" #include "neighbor_list.h" -using deepmd::ptexpt::buildTypeSortedNlist; using deepmd::ptexpt::parse_json; using deepmd::ptexpt::read_zip_entry; @@ -93,7 +93,6 @@ void DeepSpinPTExpt::init(const std::string& model, ntypes = static_cast(metadata["type_map"].as_array().size()); dfparam = metadata["dim_fparam"].as_int(); daparam = metadata["dim_aparam"].as_int(); - mixed_types = metadata["mixed_types"].as_bool(); aparam_nall = false; // Spin-specific metadata @@ -133,16 +132,29 @@ void DeepSpinPTExpt::init(const std::string& model, } } + if (metadata.obj_val.count("do_atomic_virial")) { + do_atomic_virial = metadata["do_atomic_virial"].as_bool(); + } else { + // Older models without this field were exported with do_atomic_virial=True + do_atomic_virial = true; + } + + // Read expected nnei (= sum(sel)) — the .pt2 graph has this dimension static. 
+ if (metadata.obj_val.count("nnei")) { + nnei = metadata["nnei"].as_int(); + } else { + // Fallback: compute from sel array + nnei = 0; + for (const auto& v : metadata["sel"].as_array()) { + nnei += v.as_int(); + } + } + type_map.clear(); for (const auto& v : metadata["type_map"].as_array()) { type_map.push_back(v.as_string()); } - sel.clear(); - for (const auto& v : metadata["sel"].as_array()) { - sel.push_back(v.as_int()); - } - output_keys.clear(); for (const auto& v : metadata["output_keys"].as_array()) { output_keys.push_back(v.as_string()); @@ -230,6 +242,15 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, const std::vector& fparam, const std::vector& aparam, const bool atomic) { + // Fail fast before allocating any tensors. + if (atomic && !do_atomic_virial) { + throw deepmd::deepmd_exception( + "Atomic virial was requested (e.g. by LAMMPS compute */atom/virial) " + "but this .pt2 model was exported without it (metadata field " + "do_atomic_virial=False). Atomic virial adds ~2.5x inference cost " + "and is off by default for .pt2. To enable it, regenerate with: " + "dp convert-backend --atomic-virial INPUT.pth OUTPUT.pt2"); + } torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); @@ -281,36 +302,38 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); + // LAMMPS sets ago=0 on every nlist rebuild, so ago>0 implies the cached + // mapping and nlist tensors are still valid — see DeepPotPTExpt.cc for + // the same rationale. 
if (ago == 0) { nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); nlist_data.padding(); - } - at::Tensor firstneigh_tensor = - buildTypeSortedNlist(nlist_data.jlist, coord_d, datype, sel, nloc, - mixed_types) - .to(device); - // Build mapping tensor - at::Tensor mapping_tensor; - if (lmp_list.mapping) { - std::vector mapping(nall_real); - for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; - } - mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) - .clone() - .to(device); - } else { - std::vector mapping(nall_real); - for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = ii; + // Rebuild mapping tensor + if (lmp_list.mapping) { + std::vector mapping(nall_real); + for (int ii = 0; ii < nall_real; ii++) { + mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; + } + mapping_tensor = + torch::from_blob(mapping.data(), {1, nall_real}, int_option) + .clone() + .to(device); + } else { + std::vector mapping(nall_real); + for (int ii = 0; ii < nall_real; ii++) { + mapping[ii] = ii; + } + mapping_tensor = + torch::from_blob(mapping.data(), {1, nall_real}, int_option) + .clone() + .to(device); } - mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) - .clone() - .to(device); + + // Flatten raw nlist — the .pt2 model sorts by distance on-device. + firstneigh_tensor = + createNlistTensor(nlist_data.jlist, nnei).to(torch::kInt64).to(device); } // Build fparam/aparam tensors @@ -475,6 +498,15 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, const std::vector& fparam, const std::vector& aparam, const bool atomic) { + // Fail fast before allocating any tensors. + if (atomic && !do_atomic_virial) { + throw deepmd::deepmd_exception( + "Atomic virial was requested (e.g. by LAMMPS compute */atom/virial) " + "but this .pt2 model was exported without it (metadata field " + "do_atomic_virial=False). 
Atomic virial adds ~2.5x inference cost " + "and is off by default for .pt2. To enable it, regenerate with: " + "dp convert-backend --atomic-virial INPUT.pth OUTPUT.pt2"); + } int natoms = atype.size(); torch::Device device(torch::kCUDA, gpu_id); @@ -580,10 +612,9 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, torch::from_blob(atype_64.data(), {1, nall}, int_options) .clone() .to(device); + // Flatten raw nlist — the .pt2 model sorts by distance on-device. at::Tensor nlist_tensor = - buildTypeSortedNlist(nlist_raw, coord_cpy_d, atype_cpy, sel, nloc, - mixed_types) - .to(device); + createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device); std::vector mapping_64(mapping_vec.begin(), mapping_vec.end()); at::Tensor mapping_tensor = torch::from_blob(mapping_64.data(), {1, nall}, int_options) diff --git a/source/api_cc/src/DeepSpinTF.cc b/source/api_cc/src/DeepSpinTF.cc index 7f569ea231..091fca64c2 100644 --- a/source/api_cc/src/DeepSpinTF.cc +++ b/source/api_cc/src/DeepSpinTF.cc @@ -1121,7 +1121,9 @@ void DeepSpinTF::extend(int& extend_inum, extend_firstneigh.resize(extend_nloc); extend_numneigh.resize(extend_nloc); for (int ii = 0; ii < extend_nloc; ii++) { - extend_firstneigh[ii] = &extend_neigh[ii][0]; + // `&vec[0]` is undefined behaviour for empty vectors; use + // vector::data() instead. See convert_nlist for the same fix. + extend_firstneigh[ii] = extend_neigh[ii].data(); extend_numneigh[ii] = extend_neigh[ii].size(); } diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 1ad1a5c97b..7aae726242 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -276,13 +276,21 @@ void deepmd::NeighborListData::copy_from_nlist(const InputNlist& inlist, int inum = natoms >= 0 ? 
natoms : inlist.inum; ilist.resize(inum); jlist.resize(inum); - memcpy(&ilist[0], inlist.ilist, inum * sizeof(int)); + if (inum > 0) { + memcpy(&ilist[0], inlist.ilist, inum * sizeof(int)); + } for (int ii = 0; ii < inum; ++ii) { int jnum = inlist.numneigh[ii]; jlist[ii].resize(jnum); - memcpy(&jlist[ii][0], inlist.firstneigh[ii], jnum * sizeof(int)); - for (int jj = 0; jj < jnum; ++jj) { - jlist[ii][jj] &= inlist.mask; + // Guard against empty jlist[ii]: `&vec[0]` is undefined behaviour for + // empty vectors and libstdc++ debug mode asserts on it. This happens + // when a subdomain's local atoms legitimately have zero neighbours + // within cutoff (e.g. under spatial partitioning). + if (jnum > 0) { + memcpy(&jlist[ii][0], inlist.firstneigh[ii], jnum * sizeof(int)); + for (int jj = 0; jj < jnum; ++jj) { + jlist[ii][jj] &= inlist.mask; + } } } } @@ -295,13 +303,13 @@ void deepmd::NeighborListData::shuffle(const AtomMap& map) { void deepmd::NeighborListData::shuffle(const std::vector& fwd_map) { int nloc = fwd_map.size(); for (unsigned ii = 0; ii < ilist.size(); ++ii) { - if (ilist[ii] < nloc) { + if (ilist[ii] >= 0 && ilist[ii] < nloc) { ilist[ii] = fwd_map[ilist[ii]]; } } for (unsigned ii = 0; ii < jlist.size(); ++ii) { for (unsigned jj = 0; jj < jlist[ii].size(); ++jj) { - if (jlist[ii][jj] < nloc) { + if (jlist[ii][jj] >= 0 && jlist[ii][jj] < nloc) { jlist[ii][jj] = fwd_map[jlist[ii][jj]]; } } @@ -353,12 +361,16 @@ void deepmd::NeighborListData::make_inlist(InputNlist& inlist) { firstneigh.resize(nloc); for (int ii = 0; ii < nloc; ++ii) { numneigh[ii] = jlist[ii].size(); - firstneigh[ii] = &jlist[ii][0]; + // `&vec[0]` is undefined behaviour for empty vectors (libstdc++ + // debug mode asserts on it). When numneigh[ii] is 0 the pointer is + // never dereferenced; use vector::data() which is well-defined for + // empty vectors. Mirrors the fix in convert_nlist. 
+ firstneigh[ii] = jlist[ii].data(); } inlist.inum = nloc; - inlist.ilist = &ilist[0]; - inlist.numneigh = &numneigh[0]; - inlist.firstneigh = &firstneigh[0]; + inlist.ilist = ilist.data(); + inlist.numneigh = numneigh.data(); + inlist.firstneigh = firstneigh.data(); } #ifdef BUILD_TENSORFLOW diff --git a/source/api_cc/src/commonPTExpt.h b/source/api_cc/src/commonPTExpt.h index 7dd02d09a9..ddc8ad5014 100644 --- a/source/api_cc/src/commonPTExpt.h +++ b/source/api_cc/src/commonPTExpt.h @@ -438,101 +438,11 @@ inline std::string read_zip_entry(const std::string& zip_path, } // ============================================================================ -// Build type-sorted, sel-limited neighbor list tensor. +// Create raw neighbor list tensor. +// The .pt2 compiled graph already contains format_nlist (distance sort + +// truncation) and an internal +1 pad that guarantees the sort branch fires. +// The C++ side just flattens the jagged nlist into a rectangular tensor. // ============================================================================ -/** - * @brief Convert a raw neighbor list to the sel-limited format expected by the - * pt_expt model. - * - * For non-mixed-type models (distinguish_types=true): the nlist has shape - * (nframes, nloc, sum(sel)), where the first sel[0] entries are neighbors of - * type 0, the next sel[1] are type 1, etc. Within each type group neighbors - * are sorted by distance (ascending). - * - * For mixed-type models (distinguish_types=false): all neighbors go into a - * single group sorted by distance, truncated to sum(sel). - * - * Missing slots are filled with -1. 
- */ -template -inline torch::Tensor buildTypeSortedNlist( - const std::vector>& raw_nlist, - const std::vector& coord_ext, - const std::vector& atype_ext, - const std::vector& sel, - int nloc, - bool mixed_types) { - int nsel = 0; - for (auto s : sel) { - nsel += s; - } - int ntypes = sel.size(); - std::vector result(static_cast(nloc) * nsel, -1); - - for (int ii = 0; ii < nloc; ++ii) { - const auto& neighbors = raw_nlist[ii]; - VALUETYPE xi = coord_ext[ii * 3 + 0]; - VALUETYPE yi = coord_ext[ii * 3 + 1]; - VALUETYPE zi = coord_ext[ii * 3 + 2]; - int offset = ii * nsel; - - if (mixed_types) { - std::vector> all_neighbors; - for (int jj : neighbors) { - if (jj < 0) { - continue; - } - int jtype = atype_ext[jj]; - if (jtype < 0) { - continue; - } - VALUETYPE dx = coord_ext[jj * 3 + 0] - xi; - VALUETYPE dy = coord_ext[jj * 3 + 1] - yi; - VALUETYPE dz = coord_ext[jj * 3 + 2] - zi; - VALUETYPE rr = dx * dx + dy * dy + dz * dz; - all_neighbors.emplace_back(rr, jj); - } - std::sort(all_neighbors.begin(), all_neighbors.end()); - int count = std::min(static_cast(all_neighbors.size()), nsel); - for (int kk = 0; kk < count; ++kk) { - result[offset + kk] = all_neighbors[kk].second; - } - } else { - std::vector>> by_type(ntypes); - for (int jj : neighbors) { - if (jj < 0) { - continue; - } - int jtype = atype_ext[jj]; - if (jtype < 0 || jtype >= ntypes) { - continue; - } - VALUETYPE dx = coord_ext[jj * 3 + 0] - xi; - VALUETYPE dy = coord_ext[jj * 3 + 1] - yi; - VALUETYPE dz = coord_ext[jj * 3 + 2] - zi; - VALUETYPE rr = dx * dx + dy * dy + dz * dz; - by_type[jtype].emplace_back(rr, jj); - } - int col = 0; - for (int tt = 0; tt < ntypes; ++tt) { - auto& group = by_type[tt]; - std::sort(group.begin(), group.end()); - int count = std::min(static_cast(group.size()), sel[tt]); - for (int kk = 0; kk < count; ++kk) { - result[offset + col + kk] = group[kk].second; - } - col += sel[tt]; - } - } - } - - torch::Tensor tensor = - torch::from_blob(result.data(), {1, nloc, nsel}, - 
torch::TensorOptions().dtype(torch::kInt64)) - .clone(); - return tensor; -} - } // namespace ptexpt } // namespace deepmd diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index a3e7d067f7..a812f776fc 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -11,6 +11,7 @@ if(ENABLE_TENSORFLOW) endif() if(ENABLE_PYTORCH) target_compile_definitions(runUnitTests_cc PRIVATE BUILD_PYTORCH) + target_link_libraries(runUnitTests_cc "${TORCH_LIBRARIES}") endif() if(ENABLE_JAX) target_compile_definitions(runUnitTests_cc PRIVATE BUILD_JAX) diff --git a/source/api_cc/tests/test_deeppot_dpa_ptexpt_spin.cc b/source/api_cc/tests/test_deeppot_dpa_ptexpt_spin.cc index 27c1836aa1..5f5ae7a6b6 100644 --- a/source/api_cc/tests/test_deeppot_dpa_ptexpt_spin.cc +++ b/source/api_cc/tests/test_deeppot_dpa_ptexpt_spin.cc @@ -8,12 +8,13 @@ #include #include "DeepSpin.h" +#include "DeepSpinPTExpt.h" #include "neighbor_list.h" #include "test_utils.h" // Spin models need relaxed epsilon #undef EPSILON -#define EPSILON (std::is_same::value ? 1e-6 : 1e-1) +#define EPSILON (std::is_same::value ? 
1e-10 : 1e-4) // ============================================================================ // PBC test fixture @@ -52,7 +53,7 @@ class TestInferDeepSpinDpaPtExpt : public ::testing::Test { GTEST_SKIP() << "Skipping: " << model_path << " not found."; } } -#ifndef BUILD_PYTORCH +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT_SPIN GTEST_SKIP() << "Skip because PyTorch support is not enabled."; #endif dp.init(model_path); @@ -238,7 +239,7 @@ class TestInferDeepSpinDpaPtExptNopbc : public ::testing::Test { GTEST_SKIP() << "Skipping: " << model_path << " not found."; } } -#ifndef BUILD_PYTORCH +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT_SPIN GTEST_SKIP() << "Skip because PyTorch support is not enabled."; #endif dp.init(model_path); @@ -470,3 +471,103 @@ TYPED_TEST(TestInferDeepSpinDpaPtExptNopbc, cpu_lmp_nlist_atomic) { EXPECT_LT(fabs(atom_vir[ii] - expected_atom_v[ii]), EPSILON); } } + +TYPED_TEST(TestInferDeepSpinDpaPtExptNopbc, cpu_lmp_nlist_oversized) { + using VALUETYPE = TypeParam; + const std::vector& coord = this->coord; + const std::vector& spin = this->spin; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + std::vector& expected_fm = this->expected_fm; + std::vector& expected_tot_v = this->expected_tot_v; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + deepmd::DeepSpin& dp = this->dp; + double ener; + std::vector force, force_mag, virial; + + std::vector > nlist_data = { + {1, 2, 3, 4, 5}, {0, 2, 3, 4, 5}, {0, 1, 3, 4, 5}, + {0, 1, 2, 4, 5}, {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}}; + // Pad with extra -1 entries and shuffle to create oversized nlist + std::vector > nlist_oversized; + _pad_shuffle_nlist(nlist_oversized, nlist_data, 50); + std::vector ilist(natoms), numneigh(natoms); + std::vector firstneigh(natoms); + deepmd::InputNlist inlist(natoms, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_oversized); + dp.compute(ener, force, 
force_mag, virial, coord, spin, atype, box, 0, inlist, + 0); + + EXPECT_EQ(force.size(), natoms * 3); + EXPECT_EQ(force_mag.size(), natoms * 3); + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + EXPECT_LT(fabs(force_mag[ii] - expected_fm[ii]), EPSILON); + } + EXPECT_FALSE(virial.empty()) << "Virial should not be empty"; + EXPECT_EQ(virial.size(), 9); + for (int ii = 0; ii < 3 * 3; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } +} + +// Sanity check that the shuffle in `_pad_shuffle_nlist` actually redistributes +// real neighbors away from the front of each row — see the non-spin version +// in test_deeppot_ptexpt.cc for the rationale. At least one prediction +// (energy / force / force_mag / virial) must deviate from the reference by +// more than EPSILON when the nlist is truncated to a single column. +TYPED_TEST(TestInferDeepSpinDpaPtExptNopbc, + cpu_lmp_nlist_oversized_shuffle_sanity) { + using VALUETYPE = TypeParam; + const std::vector& coord = this->coord; + const std::vector& spin = this->spin; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + std::vector& expected_fm = this->expected_fm; + std::vector& expected_tot_v = this->expected_tot_v; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + deepmd::DeepSpin& dp = this->dp; + double ener; + std::vector force, force_mag, virial; + + std::vector > nlist_data = { + {1, 2, 3, 4, 5}, {0, 2, 3, 4, 5}, {0, 1, 3, 4, 5}, + {0, 1, 2, 4, 5}, {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}}; + std::vector > nlist_oversized; + _pad_shuffle_nlist(nlist_oversized, nlist_data, 50); + for (auto& row : nlist_oversized) { + row.resize(1); + } + std::vector ilist(natoms), numneigh(natoms); + std::vector firstneigh(natoms); + deepmd::InputNlist inlist(natoms, &ilist[0], &numneigh[0], &firstneigh[0]); + 
convert_nlist(inlist, nlist_oversized); + dp.compute(ener, force, force_mag, virial, coord, spin, atype, box, 0, inlist, + 0); + + // See test_deeppot_ptexpt.cc for the rationale of the separate negative- + // direction threshold. For the spin nopbc test the strongest signal + // is in force_mag (max magnitude ~0.97); 1e-3 is well below all + // expected signal magnitudes and well above the float precision floor. + static constexpr double SANITY_MIN_DEV = 1e-3; + bool any_deviates = fabs(ener - expected_tot_e) > SANITY_MIN_DEV; + for (int ii = 0; ii < natoms * 3 && !any_deviates; ++ii) { + any_deviates = fabs(force[ii] - expected_f[ii]) > SANITY_MIN_DEV; + } + for (int ii = 0; ii < natoms * 3 && !any_deviates; ++ii) { + any_deviates = fabs(force_mag[ii] - expected_fm[ii]) > SANITY_MIN_DEV; + } + for (int ii = 0; ii < 9 && !any_deviates; ++ii) { + any_deviates = fabs(virial[ii] - expected_tot_v[ii]) > SANITY_MIN_DEV; + } + EXPECT_TRUE(any_deviates) + << "Every prediction stayed within SANITY_MIN_DEV after single-column " + "truncation of the shuffled oversized nlist — the shuffle appears " + "to have kept real neighbors at the front. 
The oversized test " + "above may be a tautology; increase n_extra in _pad_shuffle_nlist."; +} diff --git a/source/api_cc/tests/test_deeppot_ptexpt.cc b/source/api_cc/tests/test_deeppot_ptexpt.cc index cfde173a82..201724a725 100644 --- a/source/api_cc/tests/test_deeppot_ptexpt.cc +++ b/source/api_cc/tests/test_deeppot_ptexpt.cc @@ -356,6 +356,166 @@ TYPED_TEST(TestInferDeepPotAPtExpt, cpu_lmp_nlist_2rc) { } } +TYPED_TEST(TestInferDeepPotAPtExpt, cpu_lmp_nlist_oversized) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector > nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, + atype, box, rc); + // Pad with extra -1 entries and shuffle to create oversized nlist + std::vector > nlist_oversized; + _pad_shuffle_nlist(nlist_oversized, nlist_data, 200); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_oversized); + + double ener; + std::vector force_(nall * 3, 0.0), virial(9, 0.0); + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 0); + std::vector force; + _fold_back(force, force_, mapping, nloc, nall, 3); + + EXPECT_EQ(force.size(), natoms * 3); + EXPECT_EQ(virial.size(), 9); + + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 3 * 3; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), 
EPSILON); + } +} + +// Sanity check for the oversized-nlist test above: the Python counterpart +// asserts that naïvely truncating the shuffled nlist (without distance +// sorting) produces a DIFFERENT prediction, proving that the shuffle did +// move real neighbors past the sum(sel) boundary. The C++ API always +// sorts on-device, so we verify the equivalent property by truncating the +// shuffled nlist to a single column: with the shuffle ratio (~5 real +// entries out of 205), almost every row ends up holding a -1, and the +// model's prediction must deviate from the reference on at least one of +// {energy, force, virial}. If every prediction still matches the +// reference within EPSILON, the shuffle is not meaningful and the +// oversized test above is a tautology. +TYPED_TEST(TestInferDeepPotAPtExpt, cpu_lmp_nlist_oversized_shuffle_sanity) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + std::vector& expected_tot_v = this->expected_tot_v; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + deepmd::DeepPot& dp = this->dp; + float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector > nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, + atype, box, rc); + std::vector > nlist_oversized; + _pad_shuffle_nlist(nlist_oversized, nlist_data, 200); + for (auto& row : nlist_oversized) { + row.resize(1); + } + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_oversized); + + double ener; + std::vector force_(nall * 3, 0.0), virial(9, 0.0); + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 0); + std::vector force; + 
_fold_back(force, force_, mapping, nloc, nall, 3); + + // Threshold for the negative-direction check is intentionally separate + // from EPSILON. EPSILON is a numerical-precision tolerance for positive + // assertions ("model matches reference"); here we want a physical-signal + // tolerance ("broken model produces a meaningful deviation"). 1e-3 is + // 100x below the smallest expected force magnitude (~1e-1) and well + // above the float precision noise floor (~1e-5), so it cleanly + // distinguishes "actually broken" from "noise". + static constexpr double SANITY_MIN_DEV = 1e-3; + bool any_deviates = fabs(ener - expected_tot_e) > SANITY_MIN_DEV; + for (int ii = 0; ii < natoms * 3 && !any_deviates; ++ii) { + any_deviates = fabs(force[ii] - expected_f[ii]) > SANITY_MIN_DEV; + } + for (int ii = 0; ii < 9 && !any_deviates; ++ii) { + any_deviates = fabs(virial[ii] - expected_tot_v[ii]) > SANITY_MIN_DEV; + } + EXPECT_TRUE(any_deviates) + << "Every prediction (energy/force/virial) stayed within " + "SANITY_MIN_DEV after single-column truncation of the shuffled " + "oversized nlist — the shuffle appears to have kept real neighbors " + "at the front. The oversized test above may be a tautology; " + "increase n_extra in _pad_shuffle_nlist."; +} + +// Edge case: a subdomain whose every local atom has zero neighbors within +// cutoff (e.g. under aggressive spatial partitioning on a small/sparse +// subdomain, or at the start of a simulation before atoms settle in). +// The C++ side builds an InputNlist with empty rows, which feeds +// `createNlistTensor` with min_nnei=sum(sel); the compiled .pt2 graph must +// then run cleanly on an all-`-1` nlist and produce a sensible (finite, +// interaction-free) result. Verifies both that the code path doesn't +// crash and that the forces/virial collapse to zero (no interactions). 
+TYPED_TEST(TestInferDeepPotAPtExpt, cpu_lmp_nlist_empty_subdomain) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  int& natoms = this->natoms;
+  deepmd::DeepPot& dp = this->dp;
+
+  // Pass coord/atype as-is; the model sees them, but `dp.compute` only
+  // uses the provided InputNlist for neighbor information.
+  int nall = natoms;
+  std::vector<std::vector<int> > nlist_data(natoms);  // every row empty
+  std::vector<int> ilist(natoms), numneigh(natoms);
+  std::vector<int*> firstneigh(natoms);
+  deepmd::InputNlist inlist(natoms, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+
+  double ener;
+  std::vector<VALUETYPE> force(nall * 3, 0.0), virial(9, 0.0);
+  // Must not throw: zero-neighbor input is legal and expected under some
+  // spatial-partitioning configurations.
+  ASSERT_NO_THROW(
+      dp.compute(ener, force, virial, coord, atype, box, 0, inlist, 0));
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+  EXPECT_TRUE(std::isfinite(ener));
+  // With no neighbors, interaction forces and virial must be exactly zero
+  // (the descriptor sees only -1 entries, so pair contributions vanish).
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_TRUE(std::isfinite(force[ii]));
+    EXPECT_LT(fabs(force[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_TRUE(std::isfinite(virial[ii]));
+    EXPECT_LT(fabs(virial[ii]), EPSILON);
+  }
+}
+
 TYPED_TEST(TestInferDeepPotAPtExpt, cpu_lmp_nlist_type_sel) {
   using VALUETYPE = TypeParam;
   std::vector<VALUETYPE>& coord = this->coord;
@@ -421,6 +581,52 @@ TYPED_TEST(TestInferDeepPotAPtExpt, print_summary) {
   dp.print_summary("");
 }
 
+// Regression test for the fail-fast guard hoisted in commit c80db58d.
+// `deeppot_sea_no_atomic_virial.pt2` is a copy of deeppot_sea.pt2 with
+// the do_atomic_virial=false flag patched into its metadata.json.
+// Calling compute() with atomic=true on this model must throw before
+// any tensors are allocated.
+TYPED_TEST(TestInferDeepPotAPtExpt, cpu_atomic_throws_when_disabled) {
+  using VALUETYPE = TypeParam;
+  deepmd::DeepPot dp_no_av;
+  ASSERT_NO_THROW(
+      dp_no_av.init("../../tests/infer/deeppot_sea_no_atomic_virial.pt2"));
+
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  int& natoms = this->natoms;
+
+  // Build an LMP-style nlist so we exercise the nlist-overload of
+  // compute(); the no-nlist overload has the same guard but is
+  // covered by symmetry.
+  float rc = dp_no_av.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+               atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+
+  double ener;
+  std::vector<VALUETYPE> force(nall * 3, 0.0), virial(9, 0.0), atom_ener,
+      atom_vir;
+  // atomic=true => guard must trip and throw deepmd_exception.
+  EXPECT_THROW(
+      dp_no_av.compute(ener, force, virial, atom_ener, atom_vir, coord_cpy,
+                       atype_cpy, box, nall - nloc, inlist, 0),
+      deepmd::deepmd_exception);
+  // atomic=false on the same model must work normally (sanity check
+  // that the guard fires only when actually requested).
+  EXPECT_NO_THROW(dp_no_av.compute(ener, force, virial, coord_cpy, atype_cpy,
+                                   box, nall - nloc, inlist, 0));
+}
+
 template <class VALUETYPE>
 class TestInferDeepPotAPtExptNoPbc : public ::testing::Test {
  protected:
diff --git a/source/api_cc/tests/test_deepspin_model_devi_ptexpt.cc b/source/api_cc/tests/test_deepspin_model_devi_ptexpt.cc
index e58d9c0f78..ada7c2eba7 100644
--- a/source/api_cc/tests/test_deepspin_model_devi_ptexpt.cc
+++ b/source/api_cc/tests/test_deepspin_model_devi_ptexpt.cc
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "DeepSpin.h"
+#include "DeepSpinPTExpt.h"
 #include "neighbor_list.h"
 #include "test_utils.h"
@@ -46,7 +47,7 @@ class TestInferDeepSpinModeDeviPtExpt : public ::testing::Test {
       GTEST_SKIP() << "Skipping: " << model1_path << " not found.";
     }
   }
-#ifndef BUILD_PYTORCH
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT_SPIN
   GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
 #endif
   dp0.init(model0_path);
diff --git a/source/api_cc/tests/test_neighbor_list_data.cc b/source/api_cc/tests/test_neighbor_list_data.cc
new file mode 100644
index 0000000000..3e5198d30d
--- /dev/null
+++ b/source/api_cc/tests/test_neighbor_list_data.cc
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Backend-agnostic unit tests for deepmd::NeighborListData and the
+// deepmd::convert_nlist helper. Exercises edge cases (empty rows, empty
+// nlist) that surfaced as latent libstdc++-debug-mode UB assertions in
+// production code paths.
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "common.h"
+#include "neighbor_list.h"
+
+namespace deepmd {
+
+// Build a NeighborListData with @p nloc local atoms and zero neighbors per
+// atom. Realistic under aggressive spatial partitioning where a subdomain's
+// every local atom has no neighbors within cutoff.
+TEST(TestNeighborListData, MakeInlistEmptyRows) {
+  NeighborListData data;
+  const int nloc = 4;
+  data.ilist.resize(nloc);
+  for (int ii = 0; ii < nloc; ++ii) {
+    data.ilist[ii] = ii;
+  }
+  data.jlist.resize(nloc);  // every row default-constructed to empty
+
+  // Must not trigger UB ('&vec[0]' on empty vector) under libstdc++ debug.
+  InputNlist inlist;
+  ASSERT_NO_THROW(data.make_inlist(inlist));
+
+  EXPECT_EQ(inlist.inum, nloc);
+  ASSERT_NE(inlist.numneigh, nullptr);
+  ASSERT_NE(inlist.firstneigh, nullptr);
+  for (int ii = 0; ii < nloc; ++ii) {
+    EXPECT_EQ(inlist.numneigh[ii], 0);
+  }
+}
+
+TEST(TestNeighborListData, MakeInlistMixedEmptyAndNonemptyRows) {
+  NeighborListData data;
+  data.ilist = {0, 1, 2};
+  data.jlist.resize(3);
+  // row 0: empty (legitimate edge case)
+  data.jlist[1] = {7, 8};
+  // row 2: empty
+  InputNlist inlist;
+  ASSERT_NO_THROW(data.make_inlist(inlist));
+  EXPECT_EQ(inlist.numneigh[0], 0);
+  EXPECT_EQ(inlist.numneigh[1], 2);
+  EXPECT_EQ(inlist.numneigh[2], 0);
+  // Only the populated row's firstneigh should be dereferenced.
+  EXPECT_EQ(inlist.firstneigh[1][0], 7);
+  EXPECT_EQ(inlist.firstneigh[1][1], 8);
+}
+
+// convert_nlist(jagged) must not dereference an empty row when populating
+// firstneigh. Regression test for the same `&vec[0]` UB pattern fixed in
+// commit 72f95f87.
+TEST(TestNeighborListData, ConvertNlistEmptyRows) {
+  std::vector<std::vector<int>> input = {{}, {}, {}};  // all rows empty
+  std::vector<int> ilist(input.size()), numneigh(input.size());
+  std::vector<int*> firstneigh(input.size());
+  InputNlist out(static_cast<int>(input.size()), ilist.data(), numneigh.data(),
+                 firstneigh.data());
+  ASSERT_NO_THROW(convert_nlist(out, input));
+  EXPECT_EQ(out.inum, 3);
+  for (int ii = 0; ii < 3; ++ii) {
+    EXPECT_EQ(out.numneigh[ii], 0);
+    // firstneigh[ii] may be vector::data()'s sentinel or nullptr — must not
+    // be dereferenced when numneigh[ii] == 0.
+  }
+}
+
+// copy_from_nlist must not dereference an empty source row even when
+// memcpy size is 0. Regression test for the same UB pattern.
+TEST(TestNeighborListData, CopyFromNlistEmptyRows) {
+  // Build an InputNlist with all empty rows.
+  const int nloc = 4;
+  std::vector<int> src_ilist(nloc), src_numneigh(nloc, 0);
+  std::vector<int*> src_firstneigh(nloc, nullptr);
+  for (int ii = 0; ii < nloc; ++ii) {
+    src_ilist[ii] = ii;
+  }
+  InputNlist src(nloc, src_ilist.data(), src_numneigh.data(),
+                 src_firstneigh.data());
+  src.mask = ~0;  // identity mask, must not be applied to absent neighbors
+
+  NeighborListData data;
+  ASSERT_NO_THROW(data.copy_from_nlist(src));
+  EXPECT_EQ(static_cast<int>(data.ilist.size()), nloc);
+  EXPECT_EQ(static_cast<int>(data.jlist.size()), nloc);
+  for (int ii = 0; ii < nloc; ++ii) {
+    EXPECT_TRUE(data.jlist[ii].empty());
+  }
+}
+
+// copy_from_nlist with an empty source list (inum == 0) must not
+// dereference '&ilist[0]' on the empty target ilist.
+TEST(TestNeighborListData, CopyFromNlistInumZero) {
+  InputNlist src;
+  src.inum = 0;
+  src.ilist = nullptr;
+  src.numneigh = nullptr;
+  src.firstneigh = nullptr;
+  src.mask = ~0;
+
+  NeighborListData data;
+  ASSERT_NO_THROW(data.copy_from_nlist(src));
+  EXPECT_TRUE(data.ilist.empty());
+  EXPECT_TRUE(data.jlist.empty());
+}
+
+// Round-trip: convert_nlist(jagged) → copy_from_nlist → make_inlist
+// must preserve both empty and non-empty rows without UB.
+TEST(TestNeighborListData, RoundTripWithEmptyRows) {
+  std::vector<std::vector<int>> input = {{}, {3, 4}, {}, {5}};
+  std::vector<int> ilist(input.size()), numneigh(input.size());
+  std::vector<int*> firstneigh(input.size());
+  InputNlist src(static_cast<int>(input.size()), ilist.data(), numneigh.data(),
+                 firstneigh.data());
+  ASSERT_NO_THROW(convert_nlist(src, input));
+
+  NeighborListData data;
+  ASSERT_NO_THROW(data.copy_from_nlist(src));
+  EXPECT_EQ(static_cast<int>(data.jlist.size()), 4);
+  EXPECT_TRUE(data.jlist[0].empty());
+  EXPECT_EQ(data.jlist[1], (std::vector<int>{3, 4}));
+  EXPECT_TRUE(data.jlist[2].empty());
+  EXPECT_EQ(data.jlist[3], (std::vector<int>{5}));
+
+  InputNlist out;
+  ASSERT_NO_THROW(data.make_inlist(out));
+  EXPECT_EQ(out.numneigh[0], 0);
+  EXPECT_EQ(out.numneigh[1], 2);
+  EXPECT_EQ(out.numneigh[2], 0);
+  EXPECT_EQ(out.numneigh[3], 1);
+}
+
+}  // namespace deepmd
diff --git a/source/api_cc/tests/test_utils.h b/source/api_cc/tests/test_utils.h
index 51f05f9b54..9d5d5a4815 100644
--- a/source/api_cc/tests/test_utils.h
+++ b/source/api_cc/tests/test_utils.h
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #pragma once
 #include <cmath>
+#include <random>
 
 #include "SimulationRegion.h"
 #include "gtest/gtest.h"
@@ -84,6 +85,34 @@ inline void _build_nlist(std::vector<std::vector<int>>& nlist_data,
   coord_cpy.assign(coord_cpy_.begin(), coord_cpy_.end());
 }
 
+/**
+ * @brief Pad each atom's neighbor list with -1 entries and shuffle.
+ *
+ * Mimics the Python test_oversized_nlist approach: append n_extra
+ * padding entries (-1) to each row, then apply a deterministic
+ * permutation so that real neighbors are no longer at the front.
+ * This exercises the model's internal distance-sort + truncation.
+ */
+inline void _pad_shuffle_nlist(std::vector<std::vector<int>>& nlist_out,
+                               const std::vector<std::vector<int>>& nlist_in,
+                               int n_extra,
+                               unsigned int seed = 42) {
+  nlist_out.resize(nlist_in.size());
+  for (size_t ii = 0; ii < nlist_in.size(); ++ii) {
+    // copy original + pad with -1
+    nlist_out[ii] = nlist_in[ii];
+    for (int jj = 0; jj < n_extra; ++jj) {
+      nlist_out[ii].push_back(-1);
+    }
+    // deterministic shuffle (std::minstd_rand for reproducibility)
+    std::minstd_rand rng(seed + static_cast<unsigned int>(ii));
+    for (size_t jj = nlist_out[ii].size() - 1; jj > 0; --jj) {
+      size_t kk = rng() % (jj + 1);
+      std::swap(nlist_out[ii][jj], nlist_out[ii][kk]);
+    }
+  }
+}
+
 template <class VALUETYPE>
 class EnergyModelTest {
  protected:
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index 6723e3de66..3a0d8eb122 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -852,7 +852,11 @@ void deepmd::convert_nlist(InputNlist& to_nlist,
   for (int ii = 0; ii < to_nlist.inum; ++ii) {
     to_nlist.ilist[ii] = ii;
     to_nlist.numneigh[ii] = from_nlist[ii].size();
-    to_nlist.firstneigh[ii] = &from_nlist[ii][0];
+    // `&vec[0]` is undefined behaviour for empty vectors (libstdc++ debug
+    // mode asserts on it). When numneigh[ii] is 0 the pointer is never
+    // dereferenced; use vector::data() which is well-defined for empty
+    // vectors (may return nullptr).
+    to_nlist.firstneigh[ii] = from_nlist[ii].data();
   }
 }
 
diff --git a/source/tests/infer/case.py b/source/tests/infer/case.py
index 828974c6e6..84acc5bad5 100644
--- a/source/tests/infer/case.py
+++ b/source/tests/infer/case.py
@@ -173,7 +173,12 @@ def get_model(self, suffix: str, out_file: str | None = None) -> str:
             out_file = tempfile.NamedTemporaryFile(
                 suffix=suffix, dir=tempdir.name, delete=False, prefix=self.key + "_"
             ).name
-        convert_backend(INPUT=self.filename, OUTPUT=out_file)
+        # For .pte/.pt2, export with atomic virial so tests can verify
+        # per-atom virial against reference values.
+ kwargs: dict = {} + if suffix in (".pte", ".pt2"): + kwargs["atomic_virial"] = True + convert_backend(INPUT=self.filename, OUTPUT=out_file, **kwargs) return out_file diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py index 26e8fc4afd..d587d7e534 100644 --- a/source/tests/infer/gen_dpa1.py +++ b/source/tests/infer/gen_dpa1.py @@ -85,7 +85,7 @@ def main(): pt2_path = os.path.join(base_dir, "deeppot_dpa1.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data)) + pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True) pth_path = os.path.join(base_dir, "deeppot_dpa1.pth") print(f"Exporting to {pth_path} ...") # noqa: T201 diff --git a/source/tests/infer/gen_dpa2.py b/source/tests/infer/gen_dpa2.py index 84b4d314a3..8ce277fcf5 100644 --- a/source/tests/infer/gen_dpa2.py +++ b/source/tests/infer/gen_dpa2.py @@ -108,7 +108,7 @@ def main(): pt2_path = os.path.join(base_dir, "deeppot_dpa2.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data)) + pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True) pth_path = os.path.join(base_dir, "deeppot_dpa2.pth") print(f"Exporting to {pth_path} ...") # noqa: T201 diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py index 322163462d..e5e2d66579 100644 --- a/source/tests/infer/gen_dpa3.py +++ b/source/tests/infer/gen_dpa3.py @@ -86,7 +86,7 @@ def main(): pt2_path = os.path.join(base_dir, "deeppot_dpa3.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data)) + pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True) pth_path = os.path.join(base_dir, "deeppot_dpa3.pth") print(f"Exporting to {pth_path} ...") # noqa: T201 diff --git a/source/tests/infer/gen_fparam_aparam.py b/source/tests/infer/gen_fparam_aparam.py index 
f0d80d5764..13dbe4ffcf 100644 --- a/source/tests/infer/gen_fparam_aparam.py +++ b/source/tests/infer/gen_fparam_aparam.py @@ -67,7 +67,9 @@ def main(): } pt2_default_path = os.path.join(base_dir, "fparam_aparam_default.pt2") print(f"Exporting to {pt2_default_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_default_path, copy.deepcopy(data_default)) + pt_expt_deserialize_to_file( + pt2_default_path, copy.deepcopy(data_default), do_atomic_virial=True + ) # ---- 3. Export fparam_aparam.pt2 and .pth (without default_fparam) ---- config_no_default = copy.deepcopy(config) @@ -84,7 +86,9 @@ def main(): pt2_path = os.path.join(base_dir, "fparam_aparam.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data_no_default)) + pt_expt_deserialize_to_file( + pt2_path, copy.deepcopy(data_no_default), do_atomic_virial=True + ) pth_path = os.path.join(base_dir, "fparam_aparam.pth") pth_exported = False diff --git a/source/tests/infer/gen_model_devi.py b/source/tests/infer/gen_model_devi.py index bdceaa05aa..7e43add02e 100644 --- a/source/tests/infer/gen_model_devi.py +++ b/source/tests/infer/gen_model_devi.py @@ -99,7 +99,9 @@ def main(): } pt2_path = os.path.join(base_dir, f"model_devi_md{idx}.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 - pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data)) + pt_expt_deserialize_to_file( + pt2_path, copy.deepcopy(data), do_atomic_virial=True + ) models.append(pt2_path) print("Export done.") # noqa: T201 diff --git a/source/tests/infer/gen_sea.py b/source/tests/infer/gen_sea.py index 84b78bbb78..02f4e7ee63 100644 --- a/source/tests/infer/gen_sea.py +++ b/source/tests/infer/gen_sea.py @@ -57,10 +57,52 @@ def main(): } print(f"Exporting to {pt2_path} ...") # noqa: T201 - deserialize_to_file(pt2_path, data) + deserialize_to_file(pt2_path, data, do_atomic_virial=True) + + # Produce a variant for regression-testing the C++ "atomic && + # !do_atomic_virial" throw path 
by copying the .pt2 archive and + # flipping the do_atomic_virial flag in its metadata.json — much + # cheaper than running a second AOTInductor compile. The compiled + # graph itself supports atomic virial; only the C++ guard differs. + import shutil + + pt2_no_aviral = os.path.join(base_dir, "deeppot_sea_no_atomic_virial.pt2") + print(f"Patching to {pt2_no_aviral} ...") # noqa: T201 + shutil.copyfile(pt2_path, pt2_no_aviral) + _patch_no_atomic_virial(pt2_no_aviral) print("Done!") # noqa: T201 +def _patch_no_atomic_virial(pt2_path: str) -> None: + """Flip do_atomic_virial=False in the metadata.json of a .pt2 archive. + + The .pt2 is a ZIP archive; the metadata blob lives at + ``extra/metadata.json``. We rewrite the archive with that one entry + replaced and all other entries preserved verbatim. + """ + import json + import zipfile + + metadata_name = "extra/metadata.json" + tmp_path = pt2_path + ".tmp" + # PyTorch .pt2 archives use ZIP_STORED (uncompressed) so that the C++ + # reader (read_zip_entry in commonPTExpt.h) and torch's mmap-based + # tensor loader can read entries without decompression. Preserve + # that on rewrite — using ZIP_DEFLATED would yield bytes the C++ + # reader treats as raw, resulting in JSON parse errors. 
+ with zipfile.ZipFile(pt2_path, "r") as src: + names = src.namelist() + meta = json.loads(src.read(metadata_name).decode("utf-8")) + meta["do_atomic_virial"] = False + with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_STORED) as dst: + for name in names: + if name == metadata_name: + dst.writestr(name, json.dumps(meta)) + else: + dst.writestr(name, src.read(name)) + os.replace(tmp_path, pt2_path) + + if __name__ == "__main__": main() diff --git a/source/tests/infer/gen_spin.py b/source/tests/infer/gen_spin.py index 3053e0ad4f..d37e3207ff 100644 --- a/source/tests/infer/gen_spin.py +++ b/source/tests/infer/gen_spin.py @@ -112,7 +112,7 @@ def main(): convert_backend(INPUT=yaml_path, OUTPUT=pth_path) print(f"Converting to {pt2_path} ...") # noqa: T201 - convert_backend(INPUT=yaml_path, OUTPUT=pt2_path) + convert_backend(INPUT=yaml_path, OUTPUT=pt2_path, atomic_virial=True) print("Export done.") # noqa: T201 diff --git a/source/tests/infer/gen_spin_model_devi.py b/source/tests/infer/gen_spin_model_devi.py index 3dc5240e14..72792205e2 100644 --- a/source/tests/infer/gen_spin_model_devi.py +++ b/source/tests/infer/gen_spin_model_devi.py @@ -116,7 +116,7 @@ def main(): pt2_path = os.path.join(base_dir, f"deeppot_dpa_spin_md{idx}.pt2") pt2_paths.append(pt2_path) print(f"Converting to {pt2_path} ...") # noqa: T201 - convert_backend(INPUT=yaml_path, OUTPUT=pt2_path) + convert_backend(INPUT=yaml_path, OUTPUT=pt2_path, atomic_virial=True) print("Export done.") # noqa: T201 diff --git a/source/tests/pt_expt/export_helpers.py b/source/tests/pt_expt/export_helpers.py index 453b9d0c01..ae4db82ddb 100644 --- a/source/tests/pt_expt/export_helpers.py +++ b/source/tests/pt_expt/export_helpers.py @@ -213,7 +213,7 @@ def model_forward_lower_export_round_trip( tracing_mode="symbolic", _allow_non_fake_inputs=True, ) - dynamic_shapes = _build_dynamic_shapes(*inputs_2f) + dynamic_shapes = _build_dynamic_shapes(*inputs_2f, model_nnei=sum(md_pt.get_sel())) exported_dyn = torch.export.export( 
traced_sym, inputs_2f, diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index 6797fa2c03..f96f08ae28 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -63,11 +63,11 @@ def setUpClass(cls) -> None: cls.model = cls.model.to(torch.float64) cls.model.eval() - # Serialize and save to .pte + # Serialize and save to .pte (with atomic virial for test_dynamic_shapes) cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True) # Create DeepPot for testing cls.dp = DeepPot(cls.tmpfile.name) @@ -273,6 +273,106 @@ def test_dynamic_shapes(self) -> None: err_msg=f"nloc={nloc}, key={key}", ) + def test_oversized_nlist(self) -> None: + """Test that the exported model handles nlist with more neighbors than nnei. + + In LAMMPS, the neighbor list is built with rcut + skin, so atoms + typically have more neighbors than sum(sel). The compiled + format_nlist must sort by distance and truncate correctly. + + The test verifies two things: + + 1. **Correctness**: the exported model with an oversized, shuffled + nlist produces the same results as the eager model (both sort by + distance and keep the closest sum(sel) neighbors). + + 2. **Naive truncation produces wrong results**: simply taking the + first sum(sel) columns of the shuffled nlist (simulating a C++ + implementation that truncates without sorting) gives a different + energy. This proves the distance sort is necessary. 
+ """ + exported = torch.export.load(self.tmpfile.name) + exported_mod = exported.module() + + nnei = sum(self.sel) # model's expected neighbor count + nloc = 5 + ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = _make_sample_inputs( + self.model, nloc=nloc + ) + + # Pad nlist with -1 columns, then shuffle column order so real + # neighbors are interspersed with absent ones beyond column sum(sel). + n_extra = nnei # double the nlist width + nlist_padded = torch.cat( + [ + nlist_t, + -torch.ones( + (*nlist_t.shape[:2], n_extra), + dtype=nlist_t.dtype, + device=nlist_t.device, + ), + ], + dim=-1, + ) + # Shuffle columns: move some real neighbors past sum(sel) boundary. + rng = np.random.default_rng(42) + perm = rng.permutation(nlist_padded.shape[-1]) + nlist_shuffled = nlist_padded[:, :, perm] + assert nlist_shuffled.shape[-1] > nnei + + # --- Part 1: exported model sorts correctly --- + # Reference: eager model with shuffled oversized nlist + ec = ext_coord.detach().requires_grad_(True) + ref_ret = self.model.forward_common_lower( + ec, + ext_atype, + nlist_shuffled, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + ) + + # Exported model with same shuffled oversized nlist + pte_ret = exported_mod( + ext_coord, ext_atype, nlist_shuffled, mapping_t, fparam, aparam + ) + + for key in ("energy", "energy_redu", "energy_derv_r", "energy_derv_c"): + if ref_ret[key] is not None and key in pte_ret: + np.testing.assert_allclose( + ref_ret[key].detach().cpu().numpy(), + pte_ret[key].detach().cpu().numpy(), + rtol=1e-10, + atol=1e-10, + err_msg=f"oversized nlist, key={key}", + ) + + # --- Part 2: naive truncation gives wrong results --- + # Simulate the old C++ bug: truncate shuffled nlist to sum(sel) columns + # without distance sorting. Some close neighbors that were shuffled + # beyond column sum(sel) are lost, producing wrong energy. 
+ nlist_truncated = nlist_shuffled[:, :, :nnei] + ec2 = ext_coord.detach().requires_grad_(True) + trunc_ret = self.model.forward_common_lower( + ec2, + ext_atype, + nlist_truncated, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + ) + # The truncated result MUST differ from the correctly sorted result, + # proving that naive truncation discards real neighbors. + e_ref = ref_ret["energy_redu"].detach().cpu().numpy() + e_trunc = trunc_ret["energy_redu"].detach().cpu().numpy() + assert not np.allclose(e_ref, e_trunc, rtol=1e-10, atol=1e-10), ( + "Naive truncation of shuffled nlist should give different energy, " + "but got the same result. The test data may not have enough " + "neighbors shuffled beyond sum(sel) to trigger the bug." + ) + def test_serialize_round_trip(self) -> None: """Test .pte → serialize_from_file → deserialize → model gives same outputs.""" loaded_data = serialize_from_file(self.tmpfile.name) @@ -547,14 +647,14 @@ def setUpClass(cls) -> None: # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). torch.set_default_device(None) try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) + deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True) finally: torch.set_default_device("cuda:9999999") # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) cls.pte_tmpfile.close() - deserialize_to_file(cls.pte_tmpfile.name, cls.model_data) + deserialize_to_file(cls.pte_tmpfile.name, cls.model_data, do_atomic_virial=True) # Create DeepPot for .pt2 cls.dp = DeepPot(cls.tmpfile.name) @@ -768,6 +868,99 @@ def test_no_pbc(self) -> None: v, ref["virial"].detach().cpu().numpy(), rtol=1e-10, atol=1e-10 ) + def test_oversized_nlist(self) -> None: + """Test that the exported model handles nlist with more neighbors than nnei. + + In LAMMPS, the neighbor list is built with rcut + skin, so atoms + typically have more neighbors than sum(sel). 
The compiled + format_nlist must sort by distance and truncate correctly. + + The test verifies two things: + + 1. **Correctness**: the exported model with an oversized, shuffled + nlist produces the same results as the eager model (both sort by + distance and keep the closest sum(sel) neighbors). + + 2. **Naive truncation produces wrong results**: simply taking the + first sum(sel) columns of the shuffled nlist (simulating a C++ + implementation that truncates without sorting) gives a different + energy. This proves the distance sort is necessary. + """ + exported = torch.export.load(self.pte_tmpfile.name) + exported_mod = exported.module() + + nnei = sum(self.sel) # model's expected neighbor count + nloc = 5 + ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = _make_sample_inputs( + self.model, nloc=nloc + ) + + # Pad nlist with -1 columns, then shuffle column order so real + # neighbors are interspersed with absent ones beyond column sum(sel). + n_extra = nnei # double the nlist width + nlist_padded = torch.cat( + [ + nlist_t, + -torch.ones( + (*nlist_t.shape[:2], n_extra), + dtype=nlist_t.dtype, + device=nlist_t.device, + ), + ], + dim=-1, + ) + # Shuffle columns: move some real neighbors past sum(sel) boundary. 
+ rng = np.random.default_rng(42) + perm = rng.permutation(nlist_padded.shape[-1]) + nlist_shuffled = nlist_padded[:, :, perm] + assert nlist_shuffled.shape[-1] > nnei + + # --- Part 1: exported model sorts correctly --- + ec = ext_coord.detach().requires_grad_(True) + ref_ret = self.model.forward_common_lower( + ec, + ext_atype, + nlist_shuffled, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + ) + + pte_ret = exported_mod( + ext_coord, ext_atype, nlist_shuffled, mapping_t, fparam, aparam + ) + + for key in ("energy", "energy_redu", "energy_derv_r", "energy_derv_c"): + if ref_ret[key] is not None and key in pte_ret: + np.testing.assert_allclose( + ref_ret[key].detach().cpu().numpy(), + pte_ret[key].detach().cpu().numpy(), + rtol=1e-10, + atol=1e-10, + err_msg=f"oversized nlist, key={key}", + ) + + # --- Part 2: naive truncation gives wrong results --- + nlist_truncated = nlist_shuffled[:, :, :nnei] + ec2 = ext_coord.detach().requires_grad_(True) + trunc_ret = self.model.forward_common_lower( + ec2, + ext_atype, + nlist_truncated, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + ) + e_ref = ref_ret["energy_redu"].detach().cpu().numpy() + e_trunc = trunc_ret["energy_redu"].detach().cpu().numpy() + assert not np.allclose(e_ref, e_trunc, rtol=1e-10, atol=1e-10), ( + "Naive truncation of shuffled nlist should give different energy, " + "but got the same result. The test data may not have enough " + "neighbors shuffled beyond sum(sel) to trigger the bug." 
+ ) + def test_serialize_round_trip(self) -> None: """Test .pt2 → serialize_from_file → deserialize → model gives same outputs.""" loaded_data = serialize_from_file(self.tmpfile.name) @@ -1070,7 +1263,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True) cls.dp = DeepPot(cls.tmpfile.name) @@ -1187,14 +1380,14 @@ def setUpClass(cls) -> None: cls.tmpfile.close() torch.set_default_device(None) try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) + deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True) finally: torch.set_default_device("cuda:9999999") # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) cls.pte_tmpfile.close() - deserialize_to_file(cls.pte_tmpfile.name, cls.model_data) + deserialize_to_file(cls.pte_tmpfile.name, cls.model_data, do_atomic_virial=True) cls.dp = DeepPot(cls.tmpfile.name) cls.dp_pte = DeepPot(cls.pte_tmpfile.name) diff --git a/source/tests/pt_expt/model/test_ener_model.py b/source/tests/pt_expt/model/test_ener_model.py index cd520b5cc0..b91653a260 100644 --- a/source/tests/pt_expt/model/test_ener_model.py +++ b/source/tests/pt_expt/model/test_ener_model.py @@ -366,7 +366,7 @@ def test_forward_lower_exportable(self) -> None: _allow_non_fake_inputs=True, ) - dynamic_shapes = _build_dynamic_shapes(*inputs_5f) + dynamic_shapes = _build_dynamic_shapes(*inputs_5f, model_nnei=sum(md.get_sel())) exported_dyn = torch.export.export( traced_sym, inputs_5f, diff --git a/source/tests/pt_expt/model/test_export_pipeline.py b/source/tests/pt_expt/model/test_export_pipeline.py index 8bff3d7130..23e0a62a98 100644 --- a/source/tests/pt_expt/model/test_export_pipeline.py +++ 
b/source/tests/pt_expt/model/test_export_pipeline.py @@ -149,7 +149,13 @@ def test_export_pipeline(self, descriptor_type, with_fparam) -> None: # 6. Export with dynamic shapes (same as dp freeze) dynamic_shapes = _build_dynamic_shapes( - ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam + ext_coord, + ext_atype, + nlist_t, + mapping_t, + fparam, + aparam, + model_nnei=sum(model2.get_sel()), ) exported = torch.export.export( traced, diff --git a/source/tests/pt_expt/model/test_spin_ener_model.py b/source/tests/pt_expt/model/test_spin_ener_model.py index 5dd6dbecf2..f7f96392d3 100644 --- a/source/tests/pt_expt/model/test_spin_ener_model.py +++ b/source/tests/pt_expt/model/test_spin_ener_model.py @@ -359,6 +359,56 @@ def setUp(self) -> None: class TestSpinEnerModelExportable(unittest.TestCase): + @staticmethod + def _build_extended_inputs(model, natoms=6): + """Build extended inputs for spin model export tests.""" + from deepmd.dpmodel.utils import ( + build_neighbor_list, + extend_coord_with_ghosts, + normalize_coord, + ) + + generator = torch.Generator(device="cpu").manual_seed(GLOBAL_SEED) + cell = torch.rand([3, 3], dtype=dtype, device="cpu", generator=generator) + cell = (cell + cell.T) + 5.0 * torch.eye(3, device="cpu") + coord = torch.rand([natoms, 3], dtype=dtype, device="cpu", generator=generator) + coord = torch.matmul(coord, cell) + atype = torch.tensor([0, 0, 1, 0, 1, 1], dtype=torch.int64) + spin = ( + torch.rand([natoms, 3], dtype=dtype, device="cpu", generator=generator) + * 0.5 + ) + + rcut = model.get_rcut() + sel = SPIN_DATA["descriptor"]["sel"] + coord_np = coord.unsqueeze(0).numpy() + atype_np = atype.unsqueeze(0).numpy() + box_np = cell.reshape(1, 9).numpy() + coord_normalized = normalize_coord( + coord_np.reshape(1, natoms, 3), + box_np.reshape(1, 3, 3), + ) + ext_coord, ext_atype, mapping = extend_coord_with_ghosts( + coord_normalized, atype_np, box_np, rcut + ) + nlist = build_neighbor_list( + ext_coord, ext_atype, natoms, rcut, sel, 
distinguish_types=True + ) + ext_coord = ext_coord.reshape(1, -1, 3) + spin_np = spin.unsqueeze(0).numpy() + ext_spin = np.take_along_axis( + spin_np, + np.repeat(mapping[:, :, np.newaxis], 3, axis=2), + axis=1, + ) + + ext_coord_t = torch.tensor(ext_coord, dtype=dtype, device=env.DEVICE) + ext_atype_t = torch.tensor(ext_atype, dtype=torch.int64, device=env.DEVICE) + nlist_t = torch.tensor(nlist, dtype=torch.int64, device=env.DEVICE) + mapping_t = torch.tensor(mapping, dtype=torch.int64, device=env.DEVICE) + ext_spin_t = torch.tensor(ext_spin, dtype=dtype, device=env.DEVICE) + return ext_coord_t, ext_atype_t, ext_spin_t, nlist_t, mapping_t, sel + def test_forward_lower_exportable(self) -> None: """Test that SpinEnergyModel.forward_lower_exportable works with make_fx and torch.export.""" from deepmd.dpmodel.utils import ( @@ -538,6 +588,80 @@ def test_forward_lower_exportable(self) -> None: err_msg=f"loaded vs eager (nf=1): {key}", ) + def test_oversized_nlist(self) -> None: + """Test that the exported spin model handles nlist with more neighbors than nnei. + + Verifies two things: + 1. The exported model with an oversized, shuffled nlist produces the + same results as the eager model. + 2. Naive truncation of the shuffled nlist gives different energy, + proving distance sort is necessary. 
+ """ + model = _make_model() + ext_coord_t, ext_atype_t, ext_spin_t, nlist_t, mapping_t, sel = ( + self._build_extended_inputs(model) + ) + nnei = sum(sel) + + # Pad and shuffle nlist + n_extra = nnei + nlist_padded = torch.cat( + [ + nlist_t, + -torch.ones( + (*nlist_t.shape[:2], n_extra), + dtype=nlist_t.dtype, + device=nlist_t.device, + ), + ], + dim=-1, + ) + rng = np.random.default_rng(42) + perm = rng.permutation(nlist_padded.shape[-1]) + nlist_shuffled = nlist_padded[:, :, perm] + assert nlist_shuffled.shape[-1] > nnei + + output_keys = ("energy", "extended_force", "extended_force_mag", "virial") + + # --- Part 1: exported model sorts correctly --- + traced = model.forward_lower_exportable( + ext_coord_t, + ext_atype_t, + ext_spin_t, + nlist_shuffled, + mapping_t, + ) + ret_traced = traced( + ext_coord_t, ext_atype_t, ext_spin_t, nlist_shuffled, mapping_t, None, None + ) + + ec = ext_coord_t.detach().requires_grad_(True) + ret_eager = model.forward_lower( + ec, ext_atype_t, ext_spin_t, nlist_shuffled, mapping_t + ) + + for key in output_keys: + if ret_eager.get(key) is not None and key in ret_traced: + np.testing.assert_allclose( + ret_eager[key].detach().cpu().numpy(), + ret_traced[key].detach().cpu().numpy(), + rtol=1e-10, + atol=1e-10, + err_msg=f"oversized nlist, key={key}", + ) + + # --- Part 2: naive truncation gives wrong results --- + nlist_truncated = nlist_shuffled[:, :, :nnei] + ec2 = ext_coord_t.detach().requires_grad_(True) + ret_trunc = model.forward_lower( + ec2, ext_atype_t, ext_spin_t, nlist_truncated, mapping_t + ) + e_ref = ret_eager["energy"].detach().cpu().numpy() + e_trunc = ret_trunc["energy"].detach().cpu().numpy() + assert not np.allclose(e_ref, e_trunc, rtol=1e-10, atol=1e-10), ( + "Naive truncation of shuffled nlist should give different energy." + ) + if __name__ == "__main__": unittest.main()