create custom OP for se_t_tebd

OutisLi · OutisLi · commit d24380489807 · 2025-09-28T17:21:32.000+08:00
diff --git a/deepmd/pt/model/descriptor/se_t_tebd.py b/deepmd/pt/model/descriptor/se_t_tebd.py
@@ -904,6 +904,7 @@ def forward(
             self.rcut_smth,
             protection=self.env_protection,
         )
+        # dmatrix: [1/r, dx/r^2, dy/r^2, dz/r^2], sw: distance weighting
         # nb x nloc x nnei
         exclude_mask = self.emask(nlist, extended_atype)
         nlist = torch.where(exclude_mask != 0, nlist, -1)
@@ -924,11 +925,11 @@ def forward(
         rr = dmatrix
         rr = rr * exclude_mask[:, :, None]
 
-        # nfnl x nt_i x 3
+        # nfnl x nt_i x 3: direction vectors
         rr_i = rr[:, :, 1:]
         # nfnl x nt_j x 3
         rr_j = rr[:, :, 1:]
-        # nfnl x nt_i x nt_j
+        # nfnl x nt_i x nt_j: three-body angular correlations (cos theta_ij)
         env_ij = torch.einsum("ijm,ikm->ijk", rr_i, rr_j)
         # nfnl x nt_i x nt_j x 1
         ss = env_ij.unsqueeze(-1)
@@ -951,18 +952,19 @@ def forward(
             gg = self.filter_layers.networks[0](ss)
         elif self.tebd_input_mode in ["strip"]:
             if self.compress:
-                # Use tabulated computation for the geometric embedding
+                # Tabulated geometric embedding from angular features
+                # using SE_T_TEBD specific function
                 ebd_env_ij = env_ij.view(-1, 1)
-                gg_s_compressed = torch.ops.deepmd.tabulate_fusion_se_t(
+                gg_s = torch.ops.deepmd.tabulate_fusion_se_t_tebd(
                     self.compress_data[0].contiguous(),
                     self.compress_info[0].cpu().contiguous(),
                     ebd_env_ij.contiguous(),
                     env_ij.contiguous(),
                     self.filter_neuron[-1],
                 )[0]
-                # The compressed output is nfnl x ng, need to expand to nfnl x nt_i x nt_j x ng
-                # by replicating across the neighbor dimensions
-                gg_s = gg_s_compressed.view(nfnl, 1, 1, self.filter_neuron[-1]).expand(nfnl, nnei, nnei, self.filter_neuron[-1])
+                # SE_T_TEBD tabulation preserves the full neighbor structure
+                # nfnl x nt_i x nt_j x ng
+                gg_s = gg_s.view(nfnl, nnei, nnei, self.filter_neuron[-1])
             else:
                 # nfnl x nt_i x nt_j x ng
                 gg_s = self.filter_layers.networks[0](ss)
@@ -1010,16 +1012,19 @@ def forward(
             # (nfnl x nt_i x nt_j) x ng
             gg_t = gg_t.reshape(nfnl, nnei, nnei, ng)
             if self.smooth:
+                # Apply distance weighting to type features
                 gg_t = (
                     gg_t
                     * sw.reshape(nfnl, self.nnei, 1, 1)
                     * sw.reshape(nfnl, 1, self.nnei, 1)
                 )
+            # Combine geometric and type embeddings: gg_s * (1 + gg_t)
             # nfnl x nt_i x nt_j x ng
             gg = gg_s * gg_t + gg_s
         else:
             raise NotImplementedError
 
+        # Contract angular correlations with learned features
         # nfnl x ng
         res_ij = torch.einsum("ijk,ijkm->im", env_ij, gg)
         res_ij = res_ij * (1.0 / float(self.nnei) / float(self.nnei))
diff --git a/source/lib/src/gpu/tabulate.cu b/source/lib/src/gpu/tabulate.cu
@@ -630,7 +630,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
   dz_dy[block_idx * last_layer_size + thread_idx] = sum;
 }
 
-template <typename FPTYPE, int MM, int KK>
+template <typename FPTYPE, int MTILE, int KTILE>
 __global__ void tabulate_fusion_se_t_tebd_fifth_order_polynomial(
     FPTYPE* out,
     const FPTYPE* table,
@@ -668,7 +668,7 @@ __global__ void tabulate_fusion_se_t_tebd_fifth_order_polynomial(
   }
 }
 
-template <typename FPTYPE, int MM, int KK>
+template <typename FPTYPE, int MTILE, int KTILE>
 __global__ void tabulate_fusion_se_t_tebd_grad_fifth_order_polynomial(
     FPTYPE* dy_dem_x,
     const FPTYPE* table,
diff --git a/source/op/pt/tabulate_multi_device.cc b/source/op/pt/tabulate_multi_device.cc
@@ -389,8 +389,7 @@ void TabulateFusionSeTTebdGradForward(const torch::Tensor& table_tensor,
                                       const torch::Tensor& em_tensor,
                                       const torch::Tensor& dy_tensor,
                                       const torch::Tensor& descriptor_tensor,
-                                      torch::Tensor& dy_dem_x_tensor,
-                                      torch::Tensor& dy_dem_tensor) {
+                                      torch::Tensor& dy_dem_x_tensor) {
   // check input shape
   if (dy_tensor.dim() != 4) {
     throw std::invalid_argument("Dim of dy_tensor should be 4");
@@ -399,7 +398,6 @@ void TabulateFusionSeTTebdGradForward(const torch::Tensor& table_tensor,
   GetTensorDevice(table_tensor, device);
   // flat the tensors
   FPTYPE* dy_dem_x = dy_dem_x_tensor.view({-1}).data_ptr<FPTYPE>();
-  FPTYPE* dy_dem = dy_dem_tensor.view({-1}).data_ptr<FPTYPE>();
 
   const FPTYPE* table = table_tensor.view({-1}).data_ptr<FPTYPE>();
   const FPTYPE* table_info = table_info_tensor.view({-1}).data_ptr<FPTYPE>();
@@ -430,6 +428,54 @@ void TabulateFusionSeTTebdGradForward(const torch::Tensor& table_tensor,
   }
 }
 
+// Dispatch the se_t_tebd grad-grad tabulation (GPU or CPU) into dz_dy_tensor.
+template <typename FPTYPE>
+void TabulateFusionSeTTebdGradGradForward(const torch::Tensor& table_tensor,
+                                          const torch::Tensor& table_info_tensor,
+                                          const torch::Tensor& em_x_tensor,
+                                          const torch::Tensor& em_tensor,
+                                          const torch::Tensor& dz_dy_dem_x_tensor,
+                                          const torch::Tensor& descriptor_tensor,
+                                          torch::Tensor& dz_dy_tensor) {
+  // reject anything but a 3-dimensional dz_dy_dem_x before touching data
+  if (dz_dy_dem_x_tensor.dim() != 3) {
+    throw std::invalid_argument("Dim of dz_dy_dem_x should be 3");
+  }
+  // query the device from table_tensor; it selects the backend below
+  std::string device;
+  GetTensorDevice(table_tensor, device);
+  // flatten the tensors to raw pointers (dz_dy is the only writable one)
+  FPTYPE* dz_dy = dz_dy_tensor.view({-1}).data_ptr<FPTYPE>();
+  const FPTYPE* table = table_tensor.view({-1}).data_ptr<FPTYPE>();
+  const FPTYPE* table_info = table_info_tensor.view({-1}).data_ptr<FPTYPE>();
+  const FPTYPE* em_x = em_x_tensor.view({-1}).data_ptr<FPTYPE>();
+  const FPTYPE* em = em_tensor.view({-1}).data_ptr<FPTYPE>();
+  const FPTYPE* dz_dy_dem_x = dz_dy_dem_x_tensor.view({-1}).data_ptr<FPTYPE>();
+  const int64_t nloc = em_x_tensor.size(0);
+  const int64_t nnei_i = em_x_tensor.size(1);
+  const int64_t nnei_j = em_x_tensor.size(2);
+  const int64_t last_layer_size = descriptor_tensor.size(3);
+  // compute; on GPU the layer-size limit is validated BEFORE the call
+  if (device == "GPU") {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    TORCH_CHECK(last_layer_size <= 1024,
+                "In the process of model compression, the size of the "
+                "last layer of embedding net must be less than 1024!");
+    deepmd::tabulate_fusion_se_t_tebd_grad_grad_gpu(dz_dy, table, table_info, em_x,
+                                                     em, dz_dy_dem_x, nloc,
+                                                     nnei_i, nnei_j, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  } else if (device == "CPU") {
+    deepmd::tabulate_fusion_se_t_tebd_grad_grad_cpu(dz_dy, table, table_info, em_x,
+                                                     em, dz_dy_dem_x, nloc,
+                                                     nnei_i, nnei_j, last_layer_size);
+  }
+}
+
 template <typename FPTYPE>
 void TabulateFusionSeRForward(const torch::Tensor& table_tensor,
                               const torch::Tensor& table_info_tensor,
@@ -1107,13 +1153,12 @@ class TabulateFusionSeTTebdOp
     torch::Tensor dy_tensor = grad_output[0].contiguous();
     // allocate output tensors
     torch::Tensor dy_dem_x_tensor = torch::zeros_like(em_x_tensor);
-    torch::Tensor dy_dem_tensor = torch::zeros_like(em_tensor);
     // compute
     TabulateFusionSeTTebdGradForward<FPTYPE>(
         table_tensor, table_info_tensor, em_x_tensor, em_tensor, dy_tensor,
-        descriptor_tensor, dy_dem_x_tensor, dy_dem_tensor);
+        descriptor_tensor, dy_dem_x_tensor);
 
-    return {at::Tensor(), at::Tensor(), dy_dem_x_tensor, dy_dem_tensor,
+    return {at::Tensor(), at::Tensor(), dy_dem_x_tensor, at::Tensor(),
             at::Tensor()};
   }
 };

Original file line number	Diff line number	Diff line change
`@@ -630,7 +630,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(`
`630`	`630`	`dz_dy[block_idx * last_layer_size + thread_idx] = sum;`
`631`	`631`	`}`
`632`	`632`
`633`		`-template <typename FPTYPE, int MM, int KK>`
	`633`	`+template <typename FPTYPE, int MTILE, int KTILE>`
`634`	`634`	`__global__ void tabulate_fusion_se_t_tebd_fifth_order_polynomial(`
`635`	`635`	`FPTYPE* out,`
`636`	`636`	`const FPTYPE* table,`
`@@ -668,7 +668,7 @@ __global__ void tabulate_fusion_se_t_tebd_fifth_order_polynomial(`
`668`	`668`	`}`
`669`	`669`	`}`
`670`	`670`
`671`		`-template <typename FPTYPE, int MM, int KK>`
	`671`	`+template <typename FPTYPE, int MTILE, int KTILE>`
`672`	`672`	`__global__ void tabulate_fusion_se_t_tebd_grad_fifth_order_polynomial(`
`673`	`673`	`FPTYPE* dy_dem_x,`
`674`	`674`	`const FPTYPE* table,`