diff --git a/doc/development/pytorch-profiler.md b/doc/development/pytorch-profiler.md new file mode 100644 index 0000000000..308a172cae --- /dev/null +++ b/doc/development/pytorch-profiler.md @@ -0,0 +1,52 @@ +# PyTorch C++ Profiler Integration Test + +This test demonstrates the PyTorch profiler integration with the C++ backend. + +## Usage + +1. Set environment variables: +```bash +export DP_ENABLE_PYTORCH_PROFILER=1 +export DP_PYTORCH_PROFILER_OUTPUT_DIR=./profiler_results +``` + +2. Run your DeepMD-kit C++ application + +3. Check for profiler output in the specified directory: +```bash +# For single-rank or non-MPI usage +ls -la ./profiler_results/pytorch_profiler_trace.json + +# For MPI usage, each rank gets its own file +ls -la ./profiler_results/pytorch_profiler_trace_rank*.json +``` + +For MPI applications, you can use different output directories per rank: +```bash +# Example for rank 0 +export DP_PYTORCH_PROFILER_OUTPUT_DIR=./profiler_results_rank0 +# Example for rank 1 +export DP_PYTORCH_PROFILER_OUTPUT_DIR=./profiler_results_rank1 +``` + +## Environment Variables + +- `DP_ENABLE_PYTORCH_PROFILER`: Set to `1` or `true` to enable profiling +- `DP_PYTORCH_PROFILER_OUTPUT_DIR`: Directory for profiler output (default: `./profiler_output`) + +## Implementation Details + +The profiler uses PyTorch's modern `torch::profiler` API and automatically: +- Creates the output directory if it doesn't exist +- Profiles all forward pass operations in DeepPotPT and DeepSpinPT +- Saves profiling results to a JSON file when the object is destroyed +- Automatically includes MPI rank in filename when MPI is available and initialized + +## Output Files + +- **Single-rank or non-MPI usage**: `pytorch_profiler_trace.json` +- **MPI usage**: `pytorch_profiler_trace_rank{rank}.json` (e.g., `pytorch_profiler_trace_rank0.json`, `pytorch_profiler_trace_rank1.json`) + +This ensures that each MPI rank saves its profiling data to a separate file, preventing conflicts in multi-rank 
simulations. + +This is intended for development and debugging purposes. \ No newline at end of file diff --git a/doc/env.md b/doc/env.md index 4ca7101236..28d777910b 100644 --- a/doc/env.md +++ b/doc/env.md @@ -90,3 +90,19 @@ These environment variables also apply to third-party programs using the C++ int List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows. ::: + +:::{envvar} DP_ENABLE_PYTORCH_PROFILER + +**Choices**: `0`, `1`, `true`; **Default**: `0` + +{{ pytorch_icon }} Enable PyTorch profiler for C++ backend. This is for development purposes. + +::: + +:::{envvar} DP_PYTORCH_PROFILER_OUTPUT_DIR + +**Default**: `./profiler_output` + +{{ pytorch_icon }} Output directory for PyTorch profiler traces when `DP_ENABLE_PYTORCH_PROFILER` is enabled. + +::: diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 90b7c08449..956aafc226 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -49,6 +49,11 @@ set_target_properties( INSTALL_RPATH_USE_LINK_PATH TRUE BUILD_RPATH "$ORIGIN/../op/tf;$ORIGIN/../op/pt;$ORIGIN/../op/pd") target_compile_definitions(${libname} PRIVATE TF_PRIVATE) +find_package(MPI) +if(MPI_FOUND) + target_link_libraries(${libname} PRIVATE MPI::MPI_CXX) + target_compile_definitions(${libname} PRIVATE USE_MPI) +endif() if(CMAKE_TESTING_ENABLED) target_link_libraries(${libname} PRIVATE coverage_config) endif() diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h index 207a13286c..52b92d1c9f 100644 --- a/source/api_cc/include/DeepPotPT.h +++ b/source/api_cc/include/DeepPotPT.h @@ -340,6 +340,12 @@ class DeepPotPT : public DeepPotBackend { at::Tensor firstneigh_tensor; c10::optional mapping_tensor; torch::Dict comm_dict; + // PyTorch profiler + bool profiler_enabled; + std::string profiler_output_dir; +#ifdef BUILD_PYTORCH + std::shared_ptr profiler_result; +#endif 
/** * @brief Translate PyTorch exceptions to the DeePMD-kit exception. * @param[in] f The function to run. diff --git a/source/api_cc/include/DeepSpinPT.h b/source/api_cc/include/DeepSpinPT.h index be4c85d898..b4279b0679 100644 --- a/source/api_cc/include/DeepSpinPT.h +++ b/source/api_cc/include/DeepSpinPT.h @@ -262,6 +262,12 @@ class DeepSpinPT : public DeepSpinBackend { at::Tensor firstneigh_tensor; c10::optional mapping_tensor; torch::Dict comm_dict; + // PyTorch profiler + bool profiler_enabled; + std::string profiler_output_dir; +#ifdef BUILD_PYTORCH + std::shared_ptr profiler_result; +#endif /** * @brief Translate PyTorch exceptions to the DeePMD-kit exception. * @param[in] f The function to run. diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h index 612f699ea4..215bb42030 100644 --- a/source/api_cc/include/common.h +++ b/source/api_cc/include/common.h @@ -163,6 +163,36 @@ void select_map_inv(typename std::vector::iterator out, **/ void get_env_nthreads(int& num_intra_nthreads, int& num_inter_nthreads); +/** + * @brief Get PyTorch profiler configuration from environment variables. + * @param[out] enable_profiler Whether to enable the profiler. Read from +*DP_ENABLE_PYTORCH_PROFILER. + * @param[out] output_dir Output directory for profiler traces. Read from +*DP_PYTORCH_PROFILER_OUTPUT_DIR. + **/ +void get_env_pytorch_profiler(bool& enable_profiler, std::string& output_dir); + +/** + * @brief Get the MPI rank of the calling process. + * @return The rank in MPI_COMM_WORLD when built with MPI support (USE_MPI) and MPI is initialized; -1 otherwise. + **/ +int get_mpi_rank(); + +/** + * @brief Create directories recursively in a cross-platform way. + * @param path The path to create. + * @return true if successful or directory already exists, false otherwise. + **/ +bool create_directories(const std::string& path); + +/** + * @brief Join two path components using platform-appropriate separator. 
+ * @param path1 The first path component. + * @param path2 The second path component. + * @return The joined path. + **/ +std::string join_path(const std::string& path1, const std::string& path2); + /** * @brief Dynamically load OP library. This should be called before loading * graphs. diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 0f3a72b87f..b8c933eb6a 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -46,11 +46,11 @@ torch::Tensor createNlistTensor(const std::vector>& data) { int nnei = nloc > 0 ? total_size / nloc : 0; return flat_tensor.view({1, nloc, nnei}); } -DeepPotPT::DeepPotPT() : inited(false) {} +DeepPotPT::DeepPotPT() : inited(false), profiler_enabled(false) {} DeepPotPT::DeepPotPT(const std::string& model, const int& gpu_rank, const std::string& file_content) - : inited(false) { + : inited(false), profiler_enabled(false) { try { translate_error([&] { init(model, gpu_rank, file_content); }); } catch (...) { @@ -110,6 +110,26 @@ void DeepPotPT::init(const std::string& model, } } + // Initialize PyTorch profiler + get_env_pytorch_profiler(profiler_enabled, profiler_output_dir); + if (profiler_enabled) { +#ifdef BUILD_PYTORCH + // Create output directory if it doesn't exist + if (!create_directories(profiler_output_dir)) { + std::cerr << "Warning: Failed to create profiler output directory: " << profiler_output_dir << std::endl; + } + + std::cout << "PyTorch profiler enabled. 
Output directory: " << profiler_output_dir << std::endl; + // Start profiling using new API + torch::profiler::profile({ + torch::profiler::ProfilerActivity::CPU, + torch::profiler::ProfilerActivity::CUDA, + }, true, true, false); // record_shapes, profile_memory, with_stack +#else + std::cerr << "Warning: PyTorch profiler requested but BUILD_PYTORCH not defined" << std::endl; +#endif + } + auto rcut_ = module.run_method("get_rcut").toDouble(); rcut = static_cast(rcut_); ntypes = module.run_method("get_ntypes").toInt(); @@ -119,7 +139,31 @@ void DeepPotPT::init(const std::string& model, aparam_nall = module.run_method("is_aparam_nall").toBool(); inited = true; } -DeepPotPT::~DeepPotPT() {} +DeepPotPT::~DeepPotPT() { +#ifdef BUILD_PYTORCH + if (profiler_enabled) { + try { + // Save profiler results to file with MPI rank if available + int rank = get_mpi_rank(); + std::string output_file; + if (rank >= 0) { + // MPI is available and initialized, include rank in filename + output_file = join_path(profiler_output_dir, "pytorch_profiler_trace_rank" + std::to_string(rank) + ".json"); + } else { + // MPI not available or not initialized, use original filename + output_file = join_path(profiler_output_dir, "pytorch_profiler_trace.json"); + } + profiler_result = torch::profiler::disableProfiler(); + if (profiler_result) { + profiler_result->save(output_file); + std::cout << "PyTorch profiler results saved to: " << output_file << std::endl; + } + } catch (const std::exception& e) { + std::cerr << "Warning: Failed to save profiler results: " << e.what() << std::endl; + } + } +#endif +} template void DeepPotPT::compute(ENERGYVTYPE& ener, @@ -234,6 +278,14 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, options) .to(device); } + + // Start profiling if enabled +#ifdef BUILD_PYTORCH + if (profiler_enabled && profiler) { + profiler->step(); + } +#endif + c10::Dict outputs = (do_message_passing) ? 
module @@ -383,6 +435,14 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, inputs.push_back(aparam_tensor); bool do_atom_virial_tensor = atomic; inputs.push_back(do_atom_virial_tensor); + + // Start profiling if enabled +#ifdef BUILD_PYTORCH + if (profiler_enabled && profiler) { + profiler->step(); + } +#endif + c10::Dict outputs = module.forward(inputs).toGenericDict(); c10::IValue energy_ = outputs.at("energy"); diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc index 8ccf2fd383..14198f70eb 100644 --- a/source/api_cc/src/DeepSpinPT.cc +++ b/source/api_cc/src/DeepSpinPT.cc @@ -46,11 +46,11 @@ torch::Tensor createNlistTensor2(const std::vector>& data) { int nnei = nloc > 0 ? total_size / nloc : 0; return flat_tensor.view({1, nloc, nnei}); } -DeepSpinPT::DeepSpinPT() : inited(false) {} +DeepSpinPT::DeepSpinPT() : inited(false), profiler_enabled(false) {} DeepSpinPT::DeepSpinPT(const std::string& model, const int& gpu_rank, const std::string& file_content) - : inited(false) { + : inited(false), profiler_enabled(false) { try { translate_error([&] { init(model, gpu_rank, file_content); }); } catch (...) { @@ -110,6 +110,26 @@ void DeepSpinPT::init(const std::string& model, } } + // Initialize PyTorch profiler + get_env_pytorch_profiler(profiler_enabled, profiler_output_dir); + if (profiler_enabled) { +#ifdef BUILD_PYTORCH + // Create output directory if it doesn't exist + if (!create_directories(profiler_output_dir)) { + std::cerr << "Warning: Failed to create profiler output directory: " << profiler_output_dir << std::endl; + } + + std::cout << "PyTorch profiler enabled. 
Output directory: " << profiler_output_dir << std::endl; + // Start profiling using new API + torch::profiler::profile({ + torch::profiler::ProfilerActivity::CPU, + torch::profiler::ProfilerActivity::CUDA, + }, true, true, false); // record_shapes, profile_memory, with_stack +#else + std::cerr << "Warning: PyTorch profiler requested but BUILD_PYTORCH not defined" << std::endl; +#endif + } + auto rcut_ = module.run_method("get_rcut").toDouble(); rcut = static_cast(rcut_); ntypes = module.run_method("get_ntypes").toInt(); @@ -119,7 +139,31 @@ void DeepSpinPT::init(const std::string& model, aparam_nall = module.run_method("is_aparam_nall").toBool(); inited = true; } -DeepSpinPT::~DeepSpinPT() {} +DeepSpinPT::~DeepSpinPT() { +#ifdef BUILD_PYTORCH + if (profiler_enabled) { + try { + // Save profiler results to file with MPI rank if available + int rank = get_mpi_rank(); + std::string output_file; + if (rank >= 0) { + // MPI is available and initialized, include rank in filename + output_file = join_path(profiler_output_dir, "pytorch_profiler_trace_rank" + std::to_string(rank) + ".json"); + } else { + // MPI not available or not initialized, use original filename + output_file = join_path(profiler_output_dir, "pytorch_profiler_trace.json"); + } + profiler_result = torch::profiler::disableProfiler(); + if (profiler_result) { + profiler_result->save(output_file); + std::cout << "PyTorch profiler results saved to: " << output_file << std::endl; + } + } catch (const std::exception& e) { + std::cerr << "Warning: Failed to save profiler results: " << e.what() << std::endl; + } + } +#endif +} template void DeepSpinPT::compute(ENERGYVTYPE& ener, @@ -410,6 +454,14 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener, inputs.push_back(aparam_tensor); bool do_atom_virial_tensor = atomic; inputs.push_back(do_atom_virial_tensor); + + // Start profiling if enabled +#ifdef BUILD_PYTORCH + if (profiler_enabled && profiler) { + profiler->step(); + } +#endif + c10::Dict outputs = 
// ---------------------------------------------------------------------------
// PyTorch profiler helpers (C++ backend).
//
// Behavior is controlled by two environment variables:
//   DP_ENABLE_PYTORCH_PROFILER     : "1" or "true" enables profiling.
//   DP_PYTORCH_PROFILER_OUTPUT_DIR : trace output directory
//                                    (default "./profiler_output").
// ---------------------------------------------------------------------------

// Declarations normally provided by common.h; repeated here (identically) so
// this translation unit is self-contained.
namespace deepmd {
void get_env_pytorch_profiler(bool& enable_profiler, std::string& output_dir);
int get_mpi_rank();
bool create_directories(const std::string& path);
std::string join_path(const std::string& path1, const std::string& path2);
}  // namespace deepmd

/**
 * @brief Read the PyTorch profiler configuration from the environment.
 * @param[out] enable_profiler Set to true iff DP_ENABLE_PYTORCH_PROFILER is
 *             exactly "1" or "true"; false otherwise (including unset).
 * @param[out] output_dir Value of DP_PYTORCH_PROFILER_OUTPUT_DIR, or
 *             "./profiler_output" when the variable is unset or empty.
 **/
void deepmd::get_env_pytorch_profiler(bool& enable_profiler,
                                      std::string& output_dir) {
  enable_profiler = false;
  output_dir = "./profiler_output";  // default directory

  const char* env_enable = std::getenv("DP_ENABLE_PYTORCH_PROFILER");
  if (env_enable != nullptr) {
    const std::string value(env_enable);
    // Only the exact strings "1" and "true" enable the profiler; anything
    // else (including "TRUE", "yes", "0") leaves it disabled.
    enable_profiler = (value == "1" || value == "true");
  }

  const char* env_output_dir = std::getenv("DP_PYTORCH_PROFILER_OUTPUT_DIR");
  if (env_output_dir != nullptr && env_output_dir[0] != '\0') {
    output_dir = env_output_dir;
  }
}

/**
 * @brief Get the MPI rank of the calling process.
 * @return The rank in MPI_COMM_WORLD when compiled with USE_MPI and MPI has
 *         been initialized; -1 when MPI is unavailable at compile time, not
 *         initialized, or the rank query fails.
 **/
int deepmd::get_mpi_rank() {
#ifdef USE_MPI
  int initialized = 0;
  if (MPI_Initialized(&initialized) == MPI_SUCCESS && initialized) {
    int rank = -1;
    if (MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS) {
      return rank;
    }
  }
  return -1;  // MPI compiled in but not initialized (or query failed)
#else
  // MPI not available at compile time.
  return -1;
#endif
}

/**
 * @brief Create a directory and all missing parents (like `mkdir -p`),
 *        in a cross-platform way.
 * @param path The directory path to create.
 * @return true if the directory exists on return (created here or
 *         pre-existing), false on failure or when path is empty.
 **/
bool deepmd::create_directories(const std::string& path) {
  if (path.empty()) {
    return false;
  }

  // Fast path: the path already exists.  Succeed only if it is a directory.
  struct stat st;
  if (stat(path.c_str(), &st) == 0) {
    return S_ISDIR(st.st_mode);
  }

  // Recursively create the parent first (handles both '/' and '\\').
  const size_t pos = path.find_last_of("/\\");
  if (pos != std::string::npos && pos > 0) {
    if (!create_directories(path.substr(0, pos))) {
      return false;
    }
  }

  // Create the leaf directory.
#if defined(_WIN32)
  if (_mkdir(path.c_str()) == 0) {
    return true;
  }
#else
  if (mkdir(path.c_str(), 0755) == 0) {
    return true;
  }
#endif
  // EEXIST may mean another process created the directory concurrently, but
  // it may also mean a regular FILE with this name exists; re-stat so a
  // colliding file is reported as failure instead of success.
  return errno == EEXIST && stat(path.c_str(), &st) == 0 &&
         S_ISDIR(st.st_mode);
}

/**
 * @brief Join two path components with the platform's separator.
 * @param path1 The first path component.
 * @param path2 The second path component.
 * @return The joined path; no separator is inserted when path1 already ends
 *         with one, and an empty component yields the other unchanged.
 **/
std::string deepmd::join_path(const std::string& path1,
                              const std::string& path2) {
  if (path1.empty()) {
    return path2;
  }
  if (path2.empty()) {
    return path1;
  }
#if defined(_WIN32)
  const char sep = '\\';
#else
  const char sep = '/';
#endif
  const char last = path1.back();
  if (last == '/' || last == '\\') {
    return path1 + path2;
  }
  return path1 + sep + path2;
}
deepmd::get_env_pytorch_profiler(enable_profiler, output_dir); + + EXPECT_TRUE(enable_profiler); + EXPECT_EQ(output_dir, "./profiler_output"); +} + +TEST_F(TestPyTorchProfiler, test_profiler_enabled_with_true) { + setenv("DP_ENABLE_PYTORCH_PROFILER", "true", 1); + + bool enable_profiler; + std::string output_dir; + deepmd::get_env_pytorch_profiler(enable_profiler, output_dir); + + EXPECT_TRUE(enable_profiler); + EXPECT_EQ(output_dir, "./profiler_output"); +} + +TEST_F(TestPyTorchProfiler, test_custom_output_dir) { + setenv("DP_ENABLE_PYTORCH_PROFILER", "1", 1); + setenv("DP_PYTORCH_PROFILER_OUTPUT_DIR", "/custom/path", 1); + + bool enable_profiler; + std::string output_dir; + deepmd::get_env_pytorch_profiler(enable_profiler, output_dir); + + EXPECT_TRUE(enable_profiler); + EXPECT_EQ(output_dir, "/custom/path"); +} + +TEST_F(TestPyTorchProfiler, test_profiler_disabled_with_zero) { + setenv("DP_ENABLE_PYTORCH_PROFILER", "0", 1); + + bool enable_profiler; + std::string output_dir; + deepmd::get_env_pytorch_profiler(enable_profiler, output_dir); + + EXPECT_FALSE(enable_profiler); + EXPECT_EQ(output_dir, "./profiler_output"); +} + +TEST_F(TestPyTorchProfiler, test_mpi_rank_detection) { + // Test that MPI rank detection returns valid rank (-1 when MPI not initialized, >= 0 when initialized) + int rank = deepmd::get_mpi_rank(); + EXPECT_GE(rank, -1); // Rank should be -1 (not available) or >= 0 (valid rank) +} \ No newline at end of file