diff --git a/doc/env.md b/doc/env.md
index 4ca7101236..1688e0af9c 100644
--- a/doc/env.md
+++ b/doc/env.md
@@ -88,5 +88,37 @@ These environment variables also apply to third-party programs using the C++ int
 
 **Type**: List of paths, split by `:` on Unix and `;` on Windows
 
 List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows.
+:::
+
+:::{envvar} DP_PROFILER
+
+{{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend.
+
+**Type**: string (output file stem)
+
+**Default**: unset (disabled)
+
+When set to a non-empty value, profiling is enabled for the lifetime of the loaded PyTorch model (e.g. during LAMMPS runs). A JSON trace file is created on finish. The final file name is constructed from the value of `DP_PROFILER` as:
+
+- `{DP_PROFILER}_gpu{GPU_ID}.json` if running on GPU
+- `{DP_PROFILER}.json` if running on CPU
+
+The trace can be examined with the [Chrome trace viewer](https://ui.perfetto.dev/) (alternatively `chrome://tracing`). It includes:
+
+- CPU operator activities
+- CUDA activities (if available)
+
+Example:
+
+```bash
+export DP_PROFILER=result
+mpirun -np 4 lmp -in in.lammps
+# Produces result_gpuX.json, where X is the GPU id used by each MPI rank.
+```
+
+Tips:
+
+- Large runs can generate sizable JSON files; consider limiting the number of MD steps (e.g. to 20).
+- Currently this feature supports only a single process, or multi-process runs where each process uses a distinct GPU on the same node.
 :::
diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h
index 207a13286c..4a06bf012c 100644
--- a/source/api_cc/include/DeepPotPT.h
+++ b/source/api_cc/include/DeepPotPT.h
@@ -340,6 +340,8 @@ class DeepPotPT : public DeepPotBackend {
   at::Tensor firstneigh_tensor;
   c10::optional<torch::Tensor> mapping_tensor;
   torch::Dict<std::string, torch::Tensor> comm_dict;
+  bool profiler_enabled{false};
+  std::string profiler_file;
   /**
    * @brief Translate PyTorch exceptions to the DeePMD-kit exception.
* @param[in] f The function to run. diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 0f3a72b87f..3fdfeeae27 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -2,6 +2,7 @@ #ifdef BUILD_PYTORCH #include "DeepPotPT.h" +#include #include #include @@ -69,13 +70,9 @@ void DeepPotPT::init(const std::string& model, } deepmd::load_op_library(); int gpu_num = torch::cuda::device_count(); - if (gpu_num > 0) { - gpu_id = gpu_rank % gpu_num; - } else { - gpu_id = 0; - } - torch::Device device(torch::kCUDA, gpu_id); + gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0; gpu_enabled = torch::cuda::is_available(); + torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; @@ -86,6 +83,37 @@ void DeepPotPT::init(const std::string& model, std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; } + + // Configure PyTorch profiler + const char* env_profiler = std::getenv("DP_PROFILER"); + if (env_profiler && *env_profiler) { + using torch::profiler::impl::ActivityType; + using torch::profiler::impl::ExperimentalConfig; + using torch::profiler::impl::ProfilerConfig; + using torch::profiler::impl::ProfilerState; + std::set activities{ActivityType::CPU}; + if (gpu_enabled) { + activities.insert(ActivityType::CUDA); + } + profiler_file = std::string(env_profiler); + if (gpu_enabled) { + profiler_file += "_gpu" + std::to_string(gpu_id); + } + profiler_file += ".json"; + ExperimentalConfig exp_cfg; + ProfilerConfig cfg(ProfilerState::KINETO, + false, // report_input_shapes + false, // profile_memory + true, // with_stack + false, // with_flops + true, // with_modules + exp_cfg); + torch::autograd::profiler::prepareProfiler(cfg, activities); + torch::autograd::profiler::enableProfiler(cfg, activities); + std::cout << "PyTorch profiler enabled, output file: " << profiler_file + << std::endl; 
+    profiler_enabled = true;
+  }
   std::unordered_map<std::string, std::string> metadata = {{"type", ""}};
   module = torch::jit::load(model, device, metadata);
   module.eval();
@@ -119,7 +147,17 @@ void DeepPotPT::init(const std::string& model,
   aparam_nall = module.run_method("is_aparam_nall").toBool();
   inited = true;
 }
-DeepPotPT::~DeepPotPT() {}
+
+DeepPotPT::~DeepPotPT() {
+  if (profiler_enabled) {
+    auto result = torch::autograd::profiler::disableProfiler();
+    if (result) {
+      result->save(profiler_file);
+      std::cout << "PyTorch profiler result saved to " << profiler_file
+                << std::endl;
+    }
+  }
+}
 
 template <typename VALUETYPE, typename ENERGYVTYPE>
 void DeepPotPT::compute(ENERGYVTYPE& ener,