Skip to content
32 changes: 32 additions & 0 deletions doc/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,37 @@ These environment variables also apply to third-party programs using the C++ int
**Type**: List of paths, split by `:` on Unix and `;` on Windows

List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows.
:::

:::{envvar} DP_PROFILER

{{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend.

**Type**: string (output file stem)

**Default**: unset (disabled)

When set to a non-empty value, profiling is enabled for the lifetime of the loaded PyTorch model (e.g. during LAMMPS runs). A JSON trace file is created on finish. The final file name is constructed as:

- `<ENV_VALUE>_gpu<ID>.json` if running on GPU
- `<ENV_VALUE>.json` if running on CPU

The trace can be examined with [Chrome trace viewer](https://ui.perfetto.dev/) (alternatively chrome://tracing). It includes:

- CPU operator activities
- CUDA activities (if available)

Example:

```bash
export DP_PROFILER=result
mpirun -np 4 lmp -in in.lammps
# Produces result_gpuX.json, where X is the GPU id used by each MPI rank.
```

Tips:

- Large runs can generate sizable JSON files; consider limiting the number of MD steps (e.g., to around 20).
- Currently, this feature supports only single-process runs, or multi-process runs in which each process uses a distinct GPU on the same node.

:::
2 changes: 2 additions & 0 deletions source/api_cc/include/DeepPotPT.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ class DeepPotPT : public DeepPotBackend {
at::Tensor firstneigh_tensor;
c10::optional<torch::Tensor> mapping_tensor;
torch::Dict<std::string, torch::Tensor> comm_dict;
bool profiler_enabled{false};
std::string profiler_file;
/**
Comment thread
caic99 marked this conversation as resolved.
* @brief Translate PyTorch exceptions to the DeePMD-kit exception.
* @param[in] f The function to run.
Expand Down
52 changes: 45 additions & 7 deletions source/api_cc/src/DeepPotPT.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#ifdef BUILD_PYTORCH
#include "DeepPotPT.h"

#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/jit/runtime/jit_exception.h>

#include <cstdint>
Expand Down Expand Up @@ -69,13 +70,9 @@ void DeepPotPT::init(const std::string& model,
}
deepmd::load_op_library();
int gpu_num = torch::cuda::device_count();
if (gpu_num > 0) {
gpu_id = gpu_rank % gpu_num;
} else {
gpu_id = 0;
}
torch::Device device(torch::kCUDA, gpu_id);
gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
gpu_enabled = torch::cuda::is_available();
torch::Device device(torch::kCUDA, gpu_id);
if (!gpu_enabled) {
device = torch::Device(torch::kCPU);
std::cout << "load model from: " << model << " to cpu " << std::endl;
Expand All @@ -86,6 +83,37 @@ void DeepPotPT::init(const std::string& model,
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
}

// Configure PyTorch profiler
const char* env_profiler = std::getenv("DP_PROFILER");
if (env_profiler && *env_profiler) {
Comment thread
caic99 marked this conversation as resolved.
using torch::profiler::impl::ActivityType;
using torch::profiler::impl::ExperimentalConfig;
using torch::profiler::impl::ProfilerConfig;
using torch::profiler::impl::ProfilerState;
std::set<ActivityType> activities{ActivityType::CPU};
if (gpu_enabled) {
activities.insert(ActivityType::CUDA);
}
profiler_file = std::string(env_profiler);
if (gpu_enabled) {
profiler_file += "_gpu" + std::to_string(gpu_id);
}
profiler_file += ".json";
ExperimentalConfig exp_cfg;
ProfilerConfig cfg(ProfilerState::KINETO,
false, // report_input_shapes
false, // profile_memory
true, // with_stack
false, // with_flops
true, // with_modules
exp_cfg);
torch::autograd::profiler::prepareProfiler(cfg, activities);
torch::autograd::profiler::enableProfiler(cfg, activities);
std::cout << "PyTorch profiler enabled, output file: " << profiler_file
Comment thread
caic99 marked this conversation as resolved.
<< std::endl;
profiler_enabled = true;
}
std::unordered_map<std::string, std::string> metadata = {{"type", ""}};
module = torch::jit::load(model, device, metadata);
module.eval();
Expand Down Expand Up @@ -119,7 +147,17 @@ void DeepPotPT::init(const std::string& model,
aparam_nall = module.run_method("is_aparam_nall").toBool();
inited = true;
}
DeepPotPT::~DeepPotPT() {}

/**
 * @brief Destructor. If the Kineto profiler was enabled via the
 *        DP_PROFILER environment variable during init(), stop profiling
 *        and save the collected trace to `profiler_file`.
 */
DeepPotPT::~DeepPotPT() {
  if (profiler_enabled) {
    // Destructors must not throw; Kineto calls may raise, so guard them.
    try {
      auto result = torch::autograd::profiler::disableProfiler();
      if (result) {
        result->save(profiler_file);
        // Only report success when a trace was actually written.
        std::cout << "PyTorch profiler result saved to " << profiler_file
                  << std::endl;
      } else {
        std::cerr << "PyTorch profiler returned no result; no trace saved to "
                  << profiler_file << std::endl;
      }
    } catch (const std::exception& e) {
      std::cerr << "Failed to save PyTorch profiler result to "
                << profiler_file << ": " << e.what() << std::endl;
    }
  }
}

template <typename VALUETYPE, typename ENERGYVTYPE>
void DeepPotPT::compute(ENERGYVTYPE& ener,
Expand Down