diff --git a/doc/env.md b/doc/env.md
index 4ca7101236..1688e0af9c 100644
--- a/doc/env.md
+++ b/doc/env.md
@@ -88,5 +88,37 @@ These environment variables also apply to third-party programs using the C++ int
 
 **Type**: List of paths, split by `:` on Unix and `;` on Windows
 
 List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows.
+:::
+
+:::{envvar} DP_PROFILER
+
+{{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend.
+
+**Type**: string (output file stem)
+
+**Default**: unset (disabled)
+
+When set to a non-empty value, profiling is enabled for the lifetime of the loaded PyTorch model (e.g. during LAMMPS runs). A JSON trace file is created on finish. The final file name is constructed from the value of `DP_PROFILER` as:
+
+- `{DP_PROFILER}_gpu{GPU_ID}.json` if running on GPU
+- `{DP_PROFILER}.json` if running on CPU
+
+The trace can be examined with the [Chrome trace viewer](https://ui.perfetto.dev/) (alternatively `chrome://tracing`). It includes:
+
+- CPU operator activities
+- CUDA activities (if available)
+
+Example:
+
+```bash
+export DP_PROFILER=result
+mpirun -np 4 lmp -in in.lammps
+# Produces result_gpuX.json, where X is the GPU id used by each MPI rank.
+```
+
+Tips:
+
+- Large runs can generate sizable JSON files; consider limiting the number of MD steps (e.g. to 20).
+- Currently this feature supports only a single process, or multi-process runs where each process uses a distinct GPU on the same node.
 :::
diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h
index 207a13286c..4a06bf012c 100644
--- a/source/api_cc/include/DeepPotPT.h
+++ b/source/api_cc/include/DeepPotPT.h
@@ -340,6 +340,8 @@ class DeepPotPT : public DeepPotBackend {
   at::Tensor firstneigh_tensor;
   c10::optional<torch::Tensor> mapping_tensor;
   torch::Dict<std::string, torch::Tensor> comm_dict;
+  bool profiler_enabled{false};
+  std::string profiler_file;
   /**
    * @brief Translate PyTorch exceptions to the DeePMD-kit exception.
* @param[in] f The function to run. diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 0f3a72b87f..3fdfeeae27 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -2,6 +2,7 @@ #ifdef BUILD_PYTORCH #include "DeepPotPT.h" +#include #include #include @@ -69,13 +70,9 @@ void DeepPotPT::init(const std::string& model, } deepmd::load_op_library(); int gpu_num = torch::cuda::device_count(); - if (gpu_num > 0) { - gpu_id = gpu_rank % gpu_num; - } else { - gpu_id = 0; - } - torch::Device device(torch::kCUDA, gpu_id); + gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0; gpu_enabled = torch::cuda::is_available(); + torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; @@ -86,6 +83,37 @@ void DeepPotPT::init(const std::string& model, std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; } + + // Configure PyTorch profiler + const char* env_profiler = std::getenv("DP_PROFILER"); + if (env_profiler && *env_profiler) { + using torch::profiler::impl::ActivityType; + using torch::profiler::impl::ExperimentalConfig; + using torch::profiler::impl::ProfilerConfig; + using torch::profiler::impl::ProfilerState; + std::set activities{ActivityType::CPU}; + if (gpu_enabled) { + activities.insert(ActivityType::CUDA); + } + profiler_file = std::string(env_profiler); + if (gpu_enabled) { + profiler_file += "_gpu" + std::to_string(gpu_id); + } + profiler_file += ".json"; + ExperimentalConfig exp_cfg; + ProfilerConfig cfg(ProfilerState::KINETO, + false, // report_input_shapes + false, // profile_memory + true, // with_stack + false, // with_flops + true, // with_modules + exp_cfg); + torch::autograd::profiler::prepareProfiler(cfg, activities); + torch::autograd::profiler::enableProfiler(cfg, activities); + std::cout << "PyTorch profiler enabled, output file: " << profiler_file + << std::endl; 
+    profiler_enabled = true;
+  }
   std::unordered_map<std::string, std::string> metadata = {{"type", ""}};
   module = torch::jit::load(model, device, metadata);
   module.eval();
@@ -119,7 +147,17 @@ void DeepPotPT::init(const std::string& model,
   aparam_nall = module.run_method("is_aparam_nall").toBool();
   inited = true;
 }
-DeepPotPT::~DeepPotPT() {}
+
+DeepPotPT::~DeepPotPT() {
+  if (profiler_enabled) {
+    auto result = torch::autograd::profiler::disableProfiler();
+    if (result) {
+      result->save(profiler_file);
+      std::cout << "PyTorch profiler result saved to " << profiler_file
+                << std::endl;
+    }
+  }
+}
 
 template <typename VALUETYPE, typename ENERGYVTYPE>
 void DeepPotPT::compute(ENERGYVTYPE& ener,