@@ -90,6 +90,8 @@ def __init__(
         only_angle_gated_mlp: bool = False,
         node_use_rmsnorm: bool = False,
         angle_use_node: bool = True,
+        angle_self_attention: bool = False,
+        angle_self_attention_gate: str = "none",
         activation_function: str = "silu",
         update_style: str = "res_residual",
         update_residual: float = 0.1,
@@ -188,6 +190,8 @@ def __init__(
         self.node_rmsnorm = None

         self.angle_use_node = angle_use_node
+        self.angle_self_attention = angle_self_attention
+        self.angle_self_attention_gate = angle_self_attention_gate

         if self.edge_rbf_dot_self or self.edge_rbf_dot_message:
             self.rbf_mlp = MLPLayer(
@@ -501,6 +505,23 @@ def __init__(
                 )
             )

+        if self.angle_self_attention:
+            self.angle_attention_mlp_in = MLPLayer(
+                self.a_dim,
+                self.a_dim * 3,  # query, key, value
+                precision=precision,
+                seed=child_seed(seed, 21),
+            )
+            self.angle_attention_mlp_out = MLPLayer(
+                self.a_dim,
+                self.a_dim,
+                precision=precision,
+                seed=child_seed(seed, 22),
+            )
+        else:
+            self.angle_attention_mlp_in = None
+            self.angle_attention_mlp_out = None
+
         if self.update_dihedral:
             self.dihedral_dim = self.d_dim + 2 * self.a_dim
             # angle dihedral message
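
A minimal sketch of the shape bookkeeping behind `angle_attention_mlp_in`: it widens the angle embedding to three times `a_dim` so a single projection yields query, key, and value, which `torch.chunk` later splits. `torch.nn.Linear` stands in for `MLPLayer`, and all sizes below are made up:

```python
import torch

a_dim = 32  # assumed angle-embedding width
proj_in = torch.nn.Linear(a_dim, a_dim * 3)   # plays the role of angle_attention_mlp_in
proj_out = torch.nn.Linear(a_dim, a_dim)      # plays the role of angle_attention_mlp_out

# dummy angle embedding: nb x nloc x a_nnei x a_nnei x a_dim
angle_ebd = torch.randn(2, 5, 4, 4, a_dim)
query, key, value = torch.chunk(proj_in(angle_ebd), 3, dim=-1)
print(query.shape)  # torch.Size([2, 5, 4, 4, 32]) -- same width as the input embedding
```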
@@ -1581,6 +1602,63 @@ def forward(
             )
             a_update_list.append(angle_self_update)

+            if self.angle_self_attention:
+                # self-attention on angle_ebd (nb x nloc x a_nnei x a_nnei x a_dim) over the last two neighbor dimensions
+                assert self.angle_attention_mlp_in is not None
+                assert self.angle_attention_mlp_out is not None
+                # nb x nloc x a_nnei x a_nnei x (3 * a_dim)
+                attention_output = self.angle_attention_mlp_in(angle_ebd)
+                # each nb x nloc x a_nnei x a_nnei x a_dim
+                query, key, value = torch.chunk(
+                    attention_output, 3, dim=-1
+                )  # split into query, key, value
+                # nb x nloc x a_nnei x a_nnei x a_nnei
+                attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (
+                    query.size(-1) ** 0.5
+                )  # scaled dot-product attention
+                # smooth: gate scores by the angular switch; out-of-cutoff entries decay to -20
+                attention_scores = (attention_scores + 20.0) * a_sw[
+                    :, :, None, :, None
+                ] * a_sw[:, :, None, None, :] - 20.0
+                # nb x nloc x a_nnei x a_nnei x a_nnei
+                attention_weights = torch.softmax(
+                    attention_scores, dim=-1
+                )  # normalize scores
+                # smooth: gate the normalized weights by the switch as well
+                attention_weights = (
+                    attention_weights
+                    * a_sw[:, :, None, :, None]
+                    * a_sw[:, :, None, None, :]
+                )
+                # optional gates
+                if self.angle_self_attention_gate == "edge":
+                    # nb x nloc x a_nnei x 3
+                    h2_angle = h2[..., : self.a_sel, :]
+                    # normalize
+                    h2_angle = h2_angle / torch.linalg.norm(
+                        h2_angle, dim=-1, keepdim=True
+                    )
+                    # nb x nloc x a_nnei x 3
+                    h2_angle = torch.where(
+                        a_nlist_mask.unsqueeze(-1).expand([-1, -1, -1, 3]),
+                        h2_angle,
+                        0.0,
+                    )
+                    # nb x nloc x a_nnei x a_nnei
+                    h2h2t = torch.matmul(h2_angle, torch.transpose(h2_angle, -1, -2))
+                    # nb x nloc x a_nnei x a_nnei x a_nnei
+                    attention_weights = attention_weights * h2h2t[:, :, None, :, :]
+
+                # nb x nloc x a_nnei x a_nnei x a_dim
+                angle_ebd_attended = torch.matmul(
+                    attention_weights, value
+                )  # apply attention weights to value
+                # nb x nloc x a_nnei x a_nnei x a_dim
+                angle_attention_updated = self.act(
+                    self.angle_attention_mlp_out(angle_ebd_attended)
+                )  # apply attention output layer
+                a_update_list.append(angle_attention_updated)
+
             # dihedral update with fixed sel
             if self.update_dihedral and not self.use_dynamic_sel:
                 assert d_nlist is not None
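
A self-contained sketch of the smoothed angle self-attention in the forward pass, assuming `a_sw` is the per-neighbor smooth switch in [0, 1] and using `torch.nn.Linear` and `silu` as stand-ins for `MLPLayer` and `self.act`; the ±20 shift pushes out-of-cutoff scores to a large negative value so their softmax weights vanish smoothly:

```python
import torch


def angle_self_attention_sketch(
    angle_ebd: torch.Tensor,    # nb x nloc x a_nnei x a_nnei x a_dim
    a_sw: torch.Tensor,         # nb x nloc x a_nnei, smooth switch in [0, 1]
    proj_in: torch.nn.Linear,   # a_dim -> 3 * a_dim, stand-in for angle_attention_mlp_in
    proj_out: torch.nn.Linear,  # a_dim -> a_dim, stand-in for angle_attention_mlp_out
) -> torch.Tensor:
    # one projection yields query, key, value; each nb x nloc x a_nnei x a_nnei x a_dim
    query, key, value = torch.chunk(proj_in(angle_ebd), 3, dim=-1)
    # scaled dot-product scores: nb x nloc x a_nnei x a_nnei x a_nnei
    scores = torch.matmul(query, key.transpose(-2, -1)) / query.size(-1) ** 0.5
    # gate on both neighbor axes; out-of-cutoff entries end up at -20
    gate = a_sw[:, :, None, :, None] * a_sw[:, :, None, None, :]
    scores = (scores + 20.0) * gate - 20.0
    # normalize, then gate the weights again so the update vanishes at the cutoff
    weights = torch.softmax(scores, dim=-1) * gate
    attended = torch.matmul(weights, value)  # nb x nloc x a_nnei x a_nnei x a_dim
    return torch.nn.functional.silu(proj_out(attended))


# toy sizes: nb=2, nloc=3, a_nnei=4, a_dim=8
a_dim = 8
out = angle_self_attention_sketch(
    torch.randn(2, 3, 4, 4, a_dim),
    torch.rand(2, 3, 4),
    torch.nn.Linear(a_dim, 3 * a_dim),
    torch.nn.Linear(a_dim, a_dim),
)
print(out.shape)  # torch.Size([2, 3, 4, 4, 8])
```

The output keeps the nb x nloc x a_nnei x a_nnei x a_dim shape of the input embedding, which is why it can be appended to `a_update_list` alongside the other angle updates.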