Skip to content

Commit 28d688c

Browse files
committed
add use_gated_mlp and gated_mlp_norm
1 parent 12ab9b9 commit 28d688c

6 files changed

Lines changed: 149 additions & 14 deletions

File tree

deepmd/dpmodel/descriptor/dpa3.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ def __init__(
181181
use_dynamic_sel: bool = False,
182182
sel_reduce_factor: float = 10.0,
183183
update_use_layernorm: bool = False,
184+
use_gated_mlp: bool = False,
185+
gated_mlp_norm: str = "none",
184186
) -> None:
185187
self.n_dim = n_dim
186188
self.e_dim = e_dim
@@ -212,6 +214,8 @@ def __init__(
212214
self.use_dynamic_sel = use_dynamic_sel
213215
self.sel_reduce_factor = sel_reduce_factor
214216
self.update_use_layernorm = update_use_layernorm
217+
self.use_gated_mlp = use_gated_mlp
218+
self.gated_mlp_norm = gated_mlp_norm
215219

216220
def __getitem__(self, key: str) -> Any:
217221
if hasattr(self, key):

deepmd/pt/model/descriptor/dpa3.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,10 @@ def init_subclass_params(sub_data: Any, sub_class: Any) -> Any:
166166
use_dynamic_sel=self.repflow_args.use_dynamic_sel,
167167
sel_reduce_factor=self.repflow_args.sel_reduce_factor,
168168
use_loc_mapping=use_loc_mapping,
169+
# the following are newly added parameters
169170
update_use_layernorm=self.repflow_args.update_use_layernorm,
171+
use_gated_mlp=self.repflow_args.use_gated_mlp,
172+
gated_mlp_norm=self.repflow_args.gated_mlp_norm,
170173
exclude_types=exclude_types,
171174
env_protection=env_protection,
172175
precision=precision,

deepmd/pt/model/descriptor/repflow_layer.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
get_residual,
1818
)
1919
from deepmd.pt.model.network.mlp import (
20+
GatedMLP,
2021
MLPLayer,
2122
)
2223
from deepmd.pt.model.network.utils import (
@@ -59,6 +60,8 @@ def __init__(
5960
sel_reduce_factor: float = 10.0,
6061
smooth_edge_update: bool = False,
6162
update_use_layernorm: bool = False,
63+
use_gated_mlp: bool = False,
64+
gated_mlp_norm: str = "none",
6265
activation_function: str = "silu",
6366
update_style: str = "res_residual",
6467
update_residual: float = 0.1,
@@ -99,6 +102,10 @@ def __init__(
99102
self.update_residual = update_residual
100103
self.update_residual_init = update_residual_init
101104
self.update_use_layernorm = update_use_layernorm
105+
self.use_gated_mlp = use_gated_mlp
106+
if self.use_gated_mlp:
107+
assert not optim_update, "Gated MLP does not support optim update!"
108+
self.gated_mlp_norm = gated_mlp_norm
102109
self.a_compress_e_rate = a_compress_e_rate
103110
self.a_compress_use_split = a_compress_use_split
104111
self.precision = precision
@@ -165,13 +172,23 @@ def __init__(
165172
)
166173

167174
# node edge message
168-
self.node_edge_linear = MLPLayer(
169-
self.edge_info_dim,
170-
self.n_multi_edge_message * n_dim,
171-
precision=precision,
172-
seed=child_seed(seed, 4),
173-
trainable=trainable,
174-
)
175+
if not self.use_gated_mlp:
176+
self.node_edge_linear = MLPLayer(
177+
self.edge_info_dim,
178+
self.n_multi_edge_message * n_dim,
179+
precision=precision,
180+
seed=child_seed(seed, 4),
181+
trainable=trainable,
182+
)
183+
else:
184+
self.node_edge_linear = GatedMLP(
185+
self.edge_info_dim,
186+
self.n_multi_edge_message * n_dim,
187+
activation_function=self.activation_function,
188+
norm=self.gated_mlp_norm,
189+
precision=precision,
190+
seed=child_seed(seed, 4),
191+
)
175192
if self.update_style == "res_residual":
176193
for head_index in range(self.n_multi_edge_message):
177194
self.n_residual.append(
@@ -256,13 +273,23 @@ def __init__(
256273
self.a_compress_e_linear = None
257274

258275
# edge angle message
259-
self.edge_angle_linear1 = MLPLayer(
260-
self.angle_dim,
261-
self.e_dim,
262-
precision=precision,
263-
seed=child_seed(seed, 10),
264-
trainable=trainable,
265-
)
276+
if not self.use_gated_mlp:
277+
self.edge_angle_linear1 = MLPLayer(
278+
self.angle_dim,
279+
self.e_dim,
280+
precision=precision,
281+
seed=child_seed(seed, 10),
282+
trainable=trainable,
283+
)
284+
else:
285+
self.edge_angle_linear1 = GatedMLP(
286+
self.angle_dim,
287+
self.e_dim,
288+
activation_function=self.activation_function,
289+
norm=self.gated_mlp_norm,
290+
precision=precision,
291+
seed=child_seed(seed, 10),
292+
)
266293
self.edge_angle_linear2 = MLPLayer(
267294
self.e_dim,
268295
self.e_dim,

deepmd/pt/model/descriptor/repflows.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ def __init__(
221221
sel_reduce_factor: float = 10.0,
222222
use_loc_mapping: bool = True,
223223
update_use_layernorm: bool = False,
224+
use_gated_mlp: bool = False,
225+
gated_mlp_norm: str = "none",
224226
optim_update: bool = True,
225227
seed: Optional[Union[int, list[int]]] = None,
226228
trainable: bool = True,
@@ -287,6 +289,8 @@ def __init__(
287289
self.epsilon = 1e-4
288290
self.seed = seed
289291
self.update_use_layernorm = update_use_layernorm
292+
self.use_gated_mlp = use_gated_mlp
293+
self.gated_mlp_norm = gated_mlp_norm
290294

291295
self.edge_embd = MLPLayer(
292296
1,
@@ -333,6 +337,8 @@ def __init__(
333337
sel_reduce_factor=self.sel_reduce_factor,
334338
smooth_edge_update=self.smooth_edge_update,
335339
update_use_layernorm=self.update_use_layernorm,
340+
use_gated_mlp=self.use_gated_mlp,
341+
gated_mlp_norm=self.gated_mlp_norm,
336342
seed=child_seed(child_seed(seed, 1), ii),
337343
trainable=trainable,
338344
)

deepmd/pt/model/network/mlp.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,89 @@ def check_load_param(ss: str) -> Optional[nn.Parameter]:
280280
return obj
281281

282282

283+
class GatedMLP(nn.Module):
    """Gated MLP block: an elementwise product of a core branch and a
    sigmoid gate branch, each a single linear layer with optional
    normalization. A similar structure is used in CGCNN and M3GNet.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        *,
        activation_function: Optional[str] = None,
        norm: str = "batch",
        bias: bool = True,
        precision: str = DEFAULT_PRECISION,
        seed: Optional[Union[int, list[int]]] = None,
    ) -> None:
        """Initialize a gated MLP.

        Args:
            input_dim (int): the input dimension
            output_dim (int): the output dimension
            activation_function (str, optional): The name of the activation
                function applied to the core branch (resolved by
                ``ActivationFn``). Default = None
            norm (str, optional): The name of the normalization layer applied
                to each branch's linear output. "batch" or "layer" enable
                normalization; None, "none", or an unrecognized name disable
                it (see ``find_normalization``). Default = "batch"
            bias (bool): whether to use bias in each linear layer.
                Default = True
            precision (str): numerical precision forwarded to the linear
                layers. Default = DEFAULT_PRECISION
            seed (int or list[int], optional): random seed forwarded to the
                linear layers. Default = None
        """
        super().__init__()
        # Core branch: linear projection, later passed through the activation.
        self.mlp_core = MLPLayer(
            input_dim,
            output_dim,
            bias=bias,
            precision=precision,
            seed=seed,
        )
        # Gate branch: linear projection, later squashed by a sigmoid.
        # NOTE(review): both branches receive the SAME seed, so their initial
        # weights are identical — confirm this is intended (a child seed per
        # branch would decorrelate them).
        self.mlp_gate = MLPLayer(
            input_dim,
            output_dim,
            bias=bias,
            precision=precision,
            seed=seed,
        )
        # for jit: expose the core branch's parameters under the attribute
        # names TorchScript expects on an MLP-like module
        self.matrix = self.mlp_core.matrix
        self.bias = self.mlp_core.bias
        self.act = ActivationFn(activation_function)
        self.sigmoid = nn.Sigmoid()
        # Separate normalization instances per branch so each learns its own
        # affine parameters / running statistics.
        self.norm1 = find_normalization(name=norm, dim=output_dim)
        self.norm2 = find_normalization(name=norm, dim=output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute ``act(norm1(core(x))) * sigmoid(norm2(gate(x)))``.

        Args:
            x (Tensor): a tensor of shape (batch_size, input_dim)

        Returns
        -------
        Tensor: a tensor of shape (batch_size, output_dim)
        """
        # norm1 and norm2 are created together from the same `norm` name, so
        # checking norm1 alone is sufficient to branch on.
        if self.norm1 is None:
            core = self.act(self.mlp_core(x))
            gate = self.sigmoid(self.mlp_gate(x))
        else:
            core = self.act(self.norm1(self.mlp_core(x)))
            gate = self.sigmoid(self.norm2(self.mlp_gate(x)))
        return core * gate
354+
355+
def find_normalization(name: Optional[str], dim: Optional[int] = None) -> Optional[nn.Module]:
    """Return the normalization module selected by ``name``.

    Parameters
    ----------
    name : str, optional
        Normalization kind, matched case-insensitively: "batch" selects
        ``nn.BatchNorm1d``, "layer" selects ``nn.LayerNorm``. ``None``,
        "none", or any unrecognized name return ``None`` (no normalization).
    dim : int, optional
        Feature dimension passed to the normalization constructor. Required
        when ``name`` selects "batch" or "layer".

    Returns
    -------
    nn.Module or None
        The constructed normalization layer, or ``None`` when disabled.
    """
    if name is None:
        return None
    # Construct lazily: the previous dict-literal form eagerly built BOTH
    # BatchNorm1d(dim) and LayerNorm(dim) on every call — wasted work, and a
    # crash when dim is None even though the caller asked for "none".
    key = name.lower()
    if key == "batch":
        return nn.BatchNorm1d(dim)
    if key == "layer":
        return nn.LayerNorm(dim)
    return None
365+
283366
MLP_ = make_multilayer_network(MLPLayer, nn.Module)
284367

285368

deepmd/utils/argcheck.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,18 @@ def dpa3_repflow_args() -> list[Argument]:
16731673
optional=True,
16741674
default=False,
16751675
),
1676+
Argument(
1677+
"use_gated_mlp",
1678+
bool,
1679+
optional=True,
1680+
default=False,
1681+
),
1682+
Argument(
1683+
"gated_mlp_norm",
1684+
str,
1685+
optional=True,
1686+
default="none",
1687+
),
16761688
]
16771689

16781690

0 commit comments

Comments
 (0)