fix(pt_expt): fail fast on with-comm artifact errors instead of silently zeroing

Han Wang · Han Wang · commit 7632db894516 · 2026-05-09T17:31:35.000+08:00
Address @iProzd review on PR #5430:
- border_op_export: throw on empty output list rather than returning empty_like(g1), which masked internal kernel bugs as zero outputs.
- DeepPotPTExpt / DeepSpinPTExpt: if the with-comm artifact is declared in metadata but fails to load, keep has_comm_artifact_=true so multi-rank dispatch (nswap>0) throws explicitly. Previously has_comm_artifact_ was reset to false on load failure, making multi-rank runs silently fall through to the single-rank artifact and skip the MPI ghost-embedding exchange.
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
@@ -166,9 +166,10 @@ void DeepPotPTExpt::init(const std::string& model,
   // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``;
   // default to false so old artifacts keep working. If the metadata
   // flag is set but the nested artifact fails to extract or compile,
-  // fall back to single-rank mode rather than aborting init -- the
-  // hard error then surfaces in ``run_model_with_comm()`` only when
-  // multi-rank actually needs it.
+  // keep ``has_comm_artifact_=true`` and let single-rank dispatch
+  // continue working; multi-rank dispatch then fails fast at
+  // ``run_model_with_comm()`` rather than silently dropping the MPI
+  // exchange and producing wrong results.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
@@ -186,11 +187,12 @@ void DeepPotPTExpt::init(const std::string& model,
                           : static_cast<c10::DeviceIndex>(-1));
     } catch (const std::exception& e) {
       std::cerr << "DeepPotPTExpt: failed to load with-comm artifact ("
-                << e.what() << "); falling back to single-rank-only dispatch."
+                << e.what()
+                << "); single-rank inference will still work, but multi-rank "
+                   "LAMMPS dispatch will throw."
                 << std::endl;
       with_comm_tempfile_.reset();
       with_comm_loader.reset();
-      has_comm_artifact_ = false;
     }
   }
 
@@ -244,9 +246,12 @@ std::vector<torch::Tensor> DeepPotPTExpt::run_model_with_comm(
     const std::vector<at::Tensor>& comm_tensors) {
   if (!with_comm_loader) {
     throw deepmd::deepmd_exception(
-        "run_model_with_comm called but the .pt2 file has no with-comm "
-        "artifact. This is a programming error: the caller should check "
-        "has_comm_artifact_ before invoking this path.");
+        "run_model_with_comm called but the with-comm artifact is not "
+        "available. Either the .pt2 file has no with-comm artifact compiled "
+        "(programming error: the caller should check has_comm_artifact_ "
+        "before invoking this path), or the artifact was present in the "
+        ".pt2 metadata but failed to load at init time (see earlier stderr "
+        "log). Multi-rank LAMMPS requires a working with-comm artifact.");
   }
   if (comm_tensors.size() != 8) {
     throw deepmd::deepmd_exception(
@@ -431,6 +436,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   // tensor to gather ghost embeddings from local atoms.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm && !with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank LAMMPS requires the with-comm artifact, but it failed "
+        "to load at init time. See the earlier stderr log for the underlying "
+        "error.");
+  }
   // When NULL-type atoms exist, remapped storage must outlive comm
   // tensors (the int** pointer-array tensor references it).
   std::vector<std::vector<int>> remapped_sendlist;
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -174,7 +174,9 @@ void DeepSpinPTExpt::init(const std::string& model,
 
   // Phase 4: load the optional with-comm artifact for multi-rank GNN
   // spin inference.  Mirrors DeepPotPTExpt; see its init() comment for
-  // the rationale on the try/catch fallback.
+  // the rationale on keeping ``has_comm_artifact_=true`` on load
+  // failure so multi-rank dispatch fails fast rather than silently
+  // dropping the MPI exchange.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
@@ -189,11 +191,12 @@ void DeepSpinPTExpt::init(const std::string& model,
                           : static_cast<c10::DeviceIndex>(-1));
     } catch (const std::exception& e) {
       std::cerr << "DeepSpinPTExpt: failed to load with-comm artifact ("
-                << e.what() << "); falling back to single-rank-only dispatch."
+                << e.what()
+                << "); single-rank inference will still work, but multi-rank "
+                   "LAMMPS dispatch will throw."
                 << std::endl;
       with_comm_tempfile_.reset();
       with_comm_loader.reset();
-      has_comm_artifact_ = false;
     }
   }
 
@@ -249,8 +252,11 @@ std::vector<torch::Tensor> DeepSpinPTExpt::run_model_with_comm(
     const std::vector<at::Tensor>& comm_tensors) {
   if (!with_comm_loader) {
     throw deepmd::deepmd_exception(
-        "DeepSpinPTExpt::run_model_with_comm called but the .pt2 has no "
-        "with-comm artifact.");
+        "DeepSpinPTExpt::run_model_with_comm called but the with-comm "
+        "artifact is not available. Either the .pt2 file has no with-comm "
+        "artifact compiled, or the artifact was present in the .pt2 metadata "
+        "but failed to load at init time (see earlier stderr log). Multi-rank "
+        "LAMMPS requires a working with-comm artifact.");
   }
   if (comm_tensors.size() != 8) {
     throw deepmd::deepmd_exception(
@@ -448,6 +454,12 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener,
   // (pre atom-doubling); the spin override halves them internally.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm && !with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank LAMMPS requires the with-comm artifact, but it failed "
+        "to load at init time. See the earlier stderr log for the underlying "
+        "error.");
+  }
   std::vector<std::vector<int>> remapped_sendlist;
   std::vector<int*> remapped_sendlist_ptrs;
   std::vector<int> remapped_sendnum, remapped_recvnum;
diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
@@ -523,7 +523,12 @@ torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor,
                        communicator_tensor, nlocal_tensor, nghost_tensor);
   // border_op returns {g1_tensor} — a list whose first element aliases
   // g1_tensor. Clone for AOTI graph-output correctness.
-  return out.empty() ? torch::empty_like(g1_tensor) : out[0].clone();
+  if (out.empty()) {
+    throw std::runtime_error(
+        "border_op_export: border_op returned an empty output list, which "
+        "indicates an internal error in the underlying border_op kernel.");
+  }
+  return out[0].clone();
 }
 
 torch::Tensor border_op_backward_export(