deepmodeling
diff --git a/deepmd/pt/entrypoints/freeze_pt2.py b/deepmd/pt/entrypoints/freeze_pt2.py
Lines changed: 2 additions & 2 deletions
diff --git a/deepmd/pt/model/descriptor/sezm.py b/deepmd/pt/model/descriptor/sezm.py
Lines changed: 17 additions & 11 deletions
diff --git a/deepmd/pt/model/model/sezm_model.py b/deepmd/pt/model/model/sezm_model.py
Lines changed: 4 additions & 1 deletion
@@ -4,8 +4,8 @@
 SeZM relies on a nested ``autograd.grad(create_graph=True)`` inside
 ``fit_output_to_model_output``; TorchScript cannot represent that
 graph, so DPA4 / SeZM checkpoints are routed through AOTInductor instead.
-The output archive layout matches the ``pt_expt`` convention and is
-consumed directly by ``DeepPotPTExpt.cc`` without any C++ change.
+The output archive layout follows the ``pt_expt`` convention, including the
+metadata consumed by ``DeepPotPTExpt.cc`` and ``DeepSpinPTExpt.cc``.
 
 Tracing runs on CPU (``make_fx`` with ``_allow_non_fake_inputs=True``
 is brittle on CUDA because the proxy-tensor dispatcher does not set
 
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-SeZM: The descriptor of smooth equivariant Zone-bridging Model.
+SeZM descriptor: Smooth Equivariant Zone-bridging Model.
 
 PyTorch backend
 
-This implementation is designed around two non-negotiables:
+This implementation is designed around two goals:
 
 1) Conservative forces: the descriptor is computed from differentiable energy.
-2) Speed-first inference: edge geometry and Wigner-D rotation blocks are computed
+2) Efficient inference: edge geometry and Wigner-D rotation blocks are computed
    exactly once per `forward()` and reused by all interaction blocks.
 
 Shared descriptor building blocks are re-exported by `sezm_nn/__init__.py`.
@@ -117,10 +117,11 @@
 
 @BaseDescriptor.register("SeZM")
 @BaseDescriptor.register("sezm")
+@BaseDescriptor.register("DPA4")
 @BaseDescriptor.register("dpa4")
 class DescrptSeZM(BaseDescriptor, nn.Module):
     """
-    SeZM: The descriptor of smooth equivariant Zone-bridging Model for DeePMD-kit.
+    SeZM descriptor.
 
     Execution outline
     -----------------
@@ -242,8 +243,8 @@ class DescrptSeZM(BaseDescriptor, nn.Module):
         - DepthAttnRes: input-dependent query projection
         - EnvironmentInitialEmbedding:
           rbf_proj_layer1/2 and g_layer1/2
-        Attention projections in SO2Convolution
-        (attn_radial_logit_proj, attn_output_gate_proj) are always bias-free.
+        The attention logit and output-gate parameters in the SO(2) convolution
+        are always bias-free.
     layer_scale
         If True, apply learnable LayerScale (init 1e-3) on residual branches:
         - SO(2) branch: per-focus-channel scales `(n_focus, focus_dim)`
@@ -292,9 +293,11 @@ class DescrptSeZM(BaseDescriptor, nn.Module):
         ``True`` only when ``s2_activation[1]=True``. The final ``l=0`` output
         FFN always keeps this user-provided value.
     use_amp
-        If True, use automatic mixed precision (AMP) with bfloat16 on CUDA.
-        This does not provide accelerations under fp32 precision but will decrease
-        the memory usage, while preserving model accuracy.
+        If True, use automatic mixed precision (AMP) with bfloat16 on CUDA
+        during training. This can improve speed and reduce memory usage.
+        Enabling this option is recommended on GPUs with native bfloat16 support.
+        Disable it on GPUs without native bfloat16 support to avoid runtime
+        errors or additional conversion overhead.
     exclude_types
         List of excluded type pairs.
     precision
@@ -1554,8 +1557,11 @@ def _compute_mode_ctx(self, device: torch.device) -> Generator[None, None, None]
         Notes
         -----
         - When `use_amp=True` and the model is in training mode, enables
-          torch.autocast with bfloat16 on CUDA.
-        - Only affects autocast-eligible operations (matmul, conv, etc.).
+          torch.autocast with bfloat16 on CUDA. This can improve speed and
+          reduce memory usage on GPUs with native bfloat16 support.
+          Disable AMP on GPUs without native bfloat16 support to avoid runtime
+          errors or additional conversion overhead.
+        - Only affects autocast-eligible operations.
         - Does nothing during inference (`self.training=False`), on non-CUDA
           devices, or when `use_amp=False`.
 
 
@@ -559,6 +559,7 @@ def _rebuild_graph_module(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
 
 @BaseModel.register("SeZM")
 @BaseModel.register("sezm")
+@BaseModel.register("DPA4")
 @BaseModel.register("dpa4")
 class SeZMModel(DPModelCommon, SeZMModel_):
     """
@@ -570,7 +571,9 @@ class SeZMModel(DPModelCommon, SeZMModel_):
     standard neighbor list and traces the local graph with ``make_fx`` for
     higher-order force training. Evaluation/inference compile usage is
     controlled by the `DP_COMPILE_INFER` environment variable read at model
-    initialization time.
+    initialization time. This compile path is experimental: it requires
+    ``torch==2.11`` and may still expose PyTorch compiler bugs, but it can
+    improve training speed by roughly 2-3x on supported workloads.
     """
 
     model_type = "SeZM"