
Commit 7783afa

Merge branch 'develop' into o7

2 parents 7a505c7 + 7e4289d

133 files changed: 3642 additions & 1125 deletions


cmake/cupti.cmake

Lines changed: 12 additions & 2 deletions

@@ -5,8 +5,18 @@ endif()
 include(${PROJECT_SOURCE_DIR}/cmake/architecture.cmake)
 
 if(WITH_ROCM)
+  if(EXISTS "${ROCM_PATH}/cuda/extras/CUPTI")
+    set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda")
+  elseif(EXISTS "${ROCM_PATH}/cuda/cuda/extras/CUPTI")
+    set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda/cuda")
+  else()
+    message(
+      FATAL_ERROR
+        "CUPTI not found under ${ROCM_PATH}/cuda/extras/CUPTI or ${ROCM_PATH}/cuda/cuda/extras/CUPTI"
+    )
+  endif()
   set(CUPTI_ROOT
-      "${ROCM_PATH}/cuda/extras/CUPTI"
+      "${ROCM_CUDA_DIR}/extras/CUPTI"
       CACHE PATH "CUPTI ROOT")
 else()
   set(CUPTI_ROOT
@@ -59,7 +69,7 @@ get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
 if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
   set(CUPTI_FOUND ON)
   if(WITH_ROCM)
-    include_directories(${ROCM_PATH}/cuda/include)
+    include_directories(${ROCM_CUDA_DIR}/include)
     add_definitions(-D__CUDA_HIP_PLATFORM_AMD__)
   endif()
 else()

paddle/common/flags.cc

Lines changed: 30 additions & 0 deletions

@@ -230,6 +230,36 @@ PHI_DEFINE_EXPORTED_bool(
     "operator. The autotuning algorithm may be non-deterministic. If "
     "true, the algorithm is deterministic.");
 
+/**
+ * GPU RNG related FLAG
+ * Name: FLAGS_deterministic_rng
+ * Since Version: 3.4
+ * Value Range: bool, default=false
+ * Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
+ * Note: Fix RNG kernel launch config so same seed gives same results
+ * across GPU types.
+ */
+PHI_DEFINE_EXPORTED_bool(
+    deterministic_rng,
+    false,
+    "Enable cross-device RNG consistency by fixing GPU kernel launch "
+    "configuration. When true, RNG kernels use a fixed grid/block size "
+    "so that the same seed produces identical results across GPU types.");
+
+/**
+ * GPU RNG related FLAG
+ * Name: FLAGS_deterministic_rng_grid
+ * Since Version: 3.4
+ * Value Range: int32, default=1024
+ * Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
+ * Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
+ * Cross-device consistency requires the same value on all devices.
+ */
+PHI_DEFINE_EXPORTED_int32(
+    deterministic_rng_grid,
+    1024,
+    "Grid size cap when FLAGS_deterministic_rng is enabled.");
+
 /**
  * CUDA related FLAG
  * Name: FLAGS_embedding_deterministic
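
Usage sketch for the two new flags above (not part of the diff): the flag names and defaults are taken from the comments in the hunk, and paddle.set_flags / paddle.seed are existing Paddle APIs. Whether results actually match across devices also depends on every device using the same grid cap.

import paddle

# A minimal sketch, assuming both flags are honored by the RNG kernels.
paddle.set_flags({
    'FLAGS_deterministic_rng': True,       # fix RNG kernel launch config
    'FLAGS_deterministic_rng_grid': 4096,  # same cap must be set on every device
})
paddle.seed(2024)
x = paddle.rand([8, 8])  # with the flags set, identical across GPU types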

paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc

Lines changed: 18 additions & 0 deletions

@@ -307,6 +307,24 @@ bool AminOpInferSymbolicShape(pir::Operation *op,
                              axis.size() == 0 /*reduce_all*/);
 }
 
+bool AminmaxOpInferSymbolicShape(
+    pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
+  const auto &axis = details::GetVectorAttr(op, "axis");
+  bool keepdim = GetBoolAttr(op, "keepdim");
+  bool reduce_all = axis.size() == 0;
+
+  // ReduceInferDim only sets result(0). We need the same shape for both
+  // outputs, so call it for result(0) then copy to result(1).
+  bool ret =
+      details::ReduceInferDim(op, infer_context, axis, keepdim, reduce_all);
+  if (ret) {
+    const auto &out_shape =
+        infer_context->GetShapeOrDataForValue(op->result(0));
+    infer_context->SetShapeOrDataForValue(op->result(1), out_shape);
+  }
+  return ret;
+}
+
 bool AnyOpInferSymbolicShape(pir::Operation *op,
                             pir::InferSymbolicShapeContext *infer_context) {
  const auto &axis = details::GetVectorAttr(op, "axis");
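
The shape contract this function enforces can be seen from Python (a hedged illustration, not part of the diff): an aminmax-style op reduces both outputs over the same axes, so the two results always share one shape. paddle.amin / paddle.amax stand in here for the two outputs of the new op.

import paddle

x = paddle.rand([2, 3, 4])
# Both outputs reduce over the same axes, so their shapes must match;
# keepdim=True keeps the reduced axes as size-1 dims.
mn = paddle.amin(x, axis=[1], keepdim=True)  # shape [2, 1, 4]
mx = paddle.amax(x, axis=[1], keepdim=True)  # shape [2, 1, 4]
assert mn.shape == mx.shape
# In the symbolic-shape code, axis.size() == 0 means reduce_all:
# both results then collapse to a 0-D tensor.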

paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(AffineGrid)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(All)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Amax)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Amin)
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(Aminmax)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Any)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Argmax)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Argmin)

paddle/fluid/pybind/eager.h

Lines changed: 13 additions & 0 deletions

@@ -36,6 +36,19 @@ typedef struct {
   std::vector<bool> forward_input_tensor_is_duplicable;
   std::vector<bool> forward_output_tensor_is_duplicable;
   std::weak_ptr<egr::GradNodePyLayer> grad_node;
+  // Holds strong references to DenseTensor impls saved via save_for_backward,
+  // preventing _clear_dataptr() from freeing the underlying memory before
+  // backward runs. Lifecycle: born with container (set_container), dies with
+  // the PyLayerObject (PyLayerDealloc).
+  std::vector<std::shared_ptr<phi::TensorBase>> tensor_hold_helper;
+  // Holds strong references to DenseTensor impls captured in Python closures
+  // of the forward function (not recorded via save_for_backward). The
+  // top-level ``closure_obj`` keeps the owning Python Tensor objects alive
+  // and defines the DFS order used by RestoreDenseTensors. Populated by
+  // ctx._hold_tensors(obj); applied by ctx._restore_held_tensors() to
+  // re-install impl_ after _clear_dataptr(). Released in PyLayerDealloc.
+  PyObject* closure_obj;
+  std::vector<std::shared_ptr<phi::TensorBase>> closure_tensor_hold_helper;
 #ifdef PADDLE_WITH_CUDA
   std::vector<egr::ReloadFunctor> reload_functors;
 #endif
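
For orientation, a minimal PyLayer round trip that exercises the path these fields protect (a hedged sketch: PyLayer, ctx.save_for_backward, and ctx.saved_tensor are existing Paddle APIs, while _clear_dataptr() is the internal call the holders guard against):

import paddle
from paddle.autograd import PyLayer

class Square(PyLayer):
    @staticmethod
    def forward(ctx, x):
        # Saving into ctx.container via save_for_backward now also fills
        # tensor_hold_helper with strong refs to the DenseTensor impls.
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    def backward(ctx, grad):
        # Reading the container back re-installs any impl that an
        # intervening _clear_dataptr() nulled out.
        (x,) = ctx.saved_tensor()
        return 2.0 * x * grad

x = paddle.randn([4])
x.stop_gradient = False
y = Square.apply(x)
y.sum().backward()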

paddle/fluid/pybind/eager_py_layer.cc

Lines changed: 185 additions & 14 deletions

@@ -89,6 +89,11 @@ PyObject* PyLayerNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) {
   new (&v->grad_node) std::weak_ptr<egr::GradNodePyLayer>();
   new (&v->forward_input_tensor_is_duplicable) std::vector<bool>();
   new (&v->forward_output_tensor_is_duplicable) std::vector<bool>();
+  new (&v->tensor_hold_helper)
+      std::vector<std::shared_ptr<phi::TensorBase>>();
+  v->closure_obj = nullptr;
+  new (&v->closure_tensor_hold_helper)
+      std::vector<std::shared_ptr<phi::TensorBase>>();
 #ifdef PADDLE_WITH_CUDA
   new (&v->reload_functors) std::vector<egr::ReloadFunctor>();
 #endif
@@ -110,6 +115,10 @@ static void PyLayerDealloc(PyLayerObject* self) {
   self->unpack_hook = nullptr;
   self->forward_input_tensor_is_duplicable.~vector();
   self->forward_output_tensor_is_duplicable.~vector();
+  self->tensor_hold_helper.~vector();
+  Py_XDECREF(self->closure_obj);
+  self->closure_obj = nullptr;
+  self->closure_tensor_hold_helper.~vector();
 #ifdef PADDLE_WITH_CUDA
   self->reload_functors.~vector();
 #endif
@@ -271,8 +280,9 @@ PyObject* pylayer_method_apply(PyObject* cls,
 
   for (int64_t i = inputs_size - 1; i >= 0; --i) {
     PyObject* obj = nullptr;
-    if (i >= args_size) {
-      obj = PyList_GetItem(kwargs_value_list, i - args_size);  // NOLINT
+    if (i >= static_cast<int64_t>(args_size)) {
+      obj = PyList_GetItem(kwargs_value_list,
+                           i - static_cast<int64_t>(args_size));  // NOLINT
     } else {
       obj = PyTuple_GET_ITEM(args, i);
     }
@@ -685,6 +695,62 @@ PyObject* pylayer_method_apply(PyObject* cls,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+// Deep-traverse a PyObject to collect shared_ptr<phi::DenseTensor> for all
+// DenseTensors found (Tensor / Tuple / List / Dict, recursively). Used by
+// tensor_properties_set_container and ctx._hold_tensors to hold strong
+// references so _clear_dataptr() cannot free the underlying allocation
+// before backward. DFS-walks obj and calls fn(tensor) for every Tensor
+// leaf. CollectDenseTensors and RestoreDenseTensors are built on top.
+template <typename Fn>
+static void WalkDenseTensors(PyObject* obj, Fn&& fn) {
+  if (!obj || obj == Py_None) return;
+  if (PyCheckTensor(obj)) {
+    fn(reinterpret_cast<TensorObject*>(obj)->tensor);
+    return;
+  }
+  if (PyTuple_Check(obj)) {
+    Py_ssize_t n = PyTuple_GET_SIZE(obj);
+    for (Py_ssize_t i = 0; i < n; ++i)
+      WalkDenseTensors(PyTuple_GET_ITEM(obj, i), fn);
+    return;
+  }
+  if (PyList_Check(obj)) {
+    Py_ssize_t n = PyList_GET_SIZE(obj);
+    for (Py_ssize_t i = 0; i < n; ++i)
+      WalkDenseTensors(PyList_GET_ITEM(obj, i), fn);
+    return;
+  }
+  if (PyDict_Check(obj)) {
+    PyObject *k = nullptr, *v = nullptr;
+    Py_ssize_t pos = 0;
+    while (PyDict_Next(obj, &pos, &k, &v)) {
+      WalkDenseTensors(v, fn);
+    }
+    return;
+  }
+}
+
+static void CollectDenseTensors(
+    PyObject* obj, std::vector<std::shared_ptr<phi::TensorBase>>* holder) {
+  WalkDenseTensors(obj, [holder](const paddle::Tensor& tensor) {
+    if (tensor.impl()) holder->push_back(tensor.impl());
+  });
+}
+
+// Re-installs impl() for tensors cleared by _clear_dataptr(), using the
+// shared_ptrs stored in holder (same DFS order as CollectDenseTensors).
+static void RestoreDenseTensors(
+    PyObject* obj,
+    const std::vector<std::shared_ptr<phi::TensorBase>>& holder) {
+  size_t idx = 0;
+  WalkDenseTensors(obj, [&holder, &idx](paddle::Tensor& tensor) {
+    if (idx < holder.size()) {
+      if (!tensor.impl()) tensor.set_impl(holder[idx]);
+      ++idx;
+    }
+  });
+}
+
 PyObject* call_unpack_hook(PyLayerObject* self) {
   auto unpack_hook = self->unpack_hook;
   auto packed_value = self->container;
@@ -734,10 +800,16 @@ PyObject* tensor_properties_get_container(PyLayerObject* self, void* closure) {
   }
   if (self->container_be_packed) {
     return call_unpack_hook(self);
-  } else {
-    Py_INCREF(self->container);
-    return self->container;
   }
+  // Re-attach any DenseTensor impls that were freed by _clear_dataptr().
+  // tensor_hold_helper keeps the underlying allocations alive; walk the
+  // container in the same DFS order as CollectDenseTensors and reinstall
+  // impls for tensors whose impl() is currently null.
+  if (!self->tensor_hold_helper.empty()) {
+    RestoreDenseTensors(self->container, self->tensor_hold_helper);
+  }
+  Py_INCREF(self->container);
+  return self->container;
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
@@ -836,11 +908,18 @@ int tensor_properties_set_container(PyLayerObject* self,
                                     void* closure) {
   EAGER_TRY
   if (egr::SavedTensorsHooks::GetInstance().IsEnable()) {
+    // Note 1: when hooks are enabled the tensors are packed; do NOT populate
+    // tensor_hold_helper (the hook system manages tensor lifetimes itself).
     call_pack_hook(self, value);
   } else {
     Py_XINCREF(value);
     Py_XDECREF(self->container);
     self->container = value;
+    // Note 2: deep-traverse value (Tensor / Tuple / List / nested) to hold
+    // strong references to every DenseTensor impl, preventing _clear_dataptr()
+    // from freeing the underlying allocation before backward runs.
+    self->tensor_hold_helper.clear();
+    CollectDenseTensors(value, &self->tensor_hold_helper);
   }
   return 0;
   EAGER_CATCH_AND_THROW_RETURN_NEG
@@ -907,15 +986,107 @@ int tensor_properties_set_grad_in_dtype_consistent(PyLayerObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NEG
 }
 
-PyMethodDef pylayer_methods[] = {{"name",  // NOLINT
-                                  (PyCFunction)(void (*)())pylayer_method_name,
-                                  METH_NOARGS,
-                                  nullptr},
-                                 {"apply",
-                                  (PyCFunction)(void (*)())pylayer_method_apply,
-                                  METH_CLASS | METH_VARARGS | METH_KEYWORDS,
-                                  nullptr},
-                                 {nullptr, nullptr, 0, nullptr}};
+// ctx._pop_saved_impl(tensor)
+// Removes the strong reference held in tensor_hold_helper for the given
+// tensor's underlying DenseTensor, allowing its memory to be freed early
+// (e.g. inside backward when the tensor is no longer needed).
+// The tensor must have a valid impl(), i.e. pass the recovered tensor
+// returned by ctx.saved_tensor(), not the already-cleared one.
+PyObject* pylayer_pop_saved_impl(PyObject* self_, PyObject* args) {
+  EAGER_TRY
+  auto* self = reinterpret_cast<PyLayerObject*>(self_);
+  PyObject* tensor_obj = nullptr;
+  if (!PyArg_ParseTuple(args, "O", &tensor_obj)) {
+    RETURN_PY_NONE;
+  }
+  if (!tensor_obj || !PyCheckTensor(tensor_obj)) {
+    RETURN_PY_NONE;
+  }
+  const auto& tensor = reinterpret_cast<TensorObject*>(tensor_obj)->tensor;
+  if (!tensor.impl() || !tensor.is_dense_tensor()) {
+    RETURN_PY_NONE;
+  }
+  auto* raw = static_cast<phi::DenseTensor*>(tensor.impl().get());
+  for (auto it = self->tensor_hold_helper.begin();
+       it != self->tensor_hold_helper.end();
+       ++it) {
+    if (it->get() == raw) {
+      self->tensor_hold_helper.erase(it);
+      break;
+    }
+  }
+  RETURN_PY_NONE;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+// ctx._hold_tensors(obj)
+// Keep strong refs to the owning container (Py_INCREF'd) and the impl() of
+// every DenseTensor leaf found in obj (Tensor / Tuple / List / Dict).
+// Covers tensors captured via Python closure of the forward function that
+// bypass save_for_backward / container. Skipped when saved_tensors_hooks is
+// enabled (the hook system owns tensor lifetime in that case).
+PyObject* pylayer_hold_tensors(PyObject* self_, PyObject* args) {
+  EAGER_TRY
+  auto* self = reinterpret_cast<PyLayerObject*>(self_);
+  PyObject* obj = nullptr;
+  if (!PyArg_ParseTuple(args, "O", &obj)) {
+    RETURN_PY_NONE;
+  }
+  if (obj && obj != Py_None &&
+      !egr::SavedTensorsHooks::GetInstance().IsEnable()) {
+    Py_INCREF(obj);
+    Py_XDECREF(self->closure_obj);
+    self->closure_obj = obj;
+    self->closure_tensor_hold_helper.clear();
+    CollectDenseTensors(obj, &self->closure_tensor_hold_helper);
+  }
+  RETURN_PY_NONE;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+// ctx._restore_held_tensors()
+// Re-install impl() on any Python Tensor previously registered via
+// _hold_tensors whose impl_ has been nulled by _clear_dataptr(). Typically
+// called at the start of backward before recompute re-runs forward.
+PyObject* pylayer_restore_held_tensors(PyObject* self_, PyObject* /*unused*/) {
+  EAGER_TRY
+  auto* self = reinterpret_cast<PyLayerObject*>(self_);
+  if (self->closure_obj && !self->closure_tensor_hold_helper.empty()) {
+    RestoreDenseTensors(self->closure_obj, self->closure_tensor_hold_helper);
+  }
+  RETURN_PY_NONE;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+PyMethodDef pylayer_methods[] = {
+    {"name",  // NOLINT
+     (PyCFunction)(void (*)())pylayer_method_name,
+     METH_NOARGS,
+     nullptr},
+    {"apply",
+     (PyCFunction)(void (*)())pylayer_method_apply,
+     METH_CLASS | METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+    {"_pop_saved_impl",
+     (PyCFunction)(void (*)())pylayer_pop_saved_impl,
+     METH_VARARGS,
+     "Release the strong reference held for a "
+     "specific DenseTensor saved via "
+     "save_for_backward, allowing its memory to "
+     "be freed early if no other holder exists."},
+    {"_hold_tensors",
+     (PyCFunction)(void (*)())pylayer_hold_tensors,
+     METH_VARARGS,
+     "Deep-traverse the given object (Tensor / tuple / list / dict) and "
+     "keep strong references to every DenseTensor impl found, plus the "
+     "owning Python Tensor object. Used to protect tensors captured in "
+     "Python closures against _clear_dataptr() in pipeline parallel."},
+    {"_restore_held_tensors",
+     (PyCFunction)(void (*)())pylayer_restore_held_tensors,
+     METH_NOARGS,
+     "Reinstall impl() on Python Tensor objects previously registered via "
+     "_hold_tensors, if their impl() has been nulled by _clear_dataptr()."},
+    {nullptr, nullptr, 0, nullptr}};
 
 struct PyGetSetDef pylayer_properties[] { // NOLINT
     {"container",

paddle/phi/CMakeLists.txt

Lines changed: 7 additions & 2 deletions

@@ -372,9 +372,14 @@ if(WITH_CUTLASS)
   )# for memory_efficient_attention.h
 endif()
 # PADDLE_WARP_SIZE: warp size for the target GPU platform.
-# Default 32 (NVIDIA). Override via -DPADDLE_WARP_SIZE=64 for iluvatar (COREX).
+# Default 32 (NVIDIA). ROCm (AMD/Hygon) wavefront size is 64.
+# Override via -DPADDLE_WARP_SIZE for other platforms.
 if(NOT DEFINED PADDLE_WARP_SIZE)
-  set(PADDLE_WARP_SIZE 32)
+  if(WITH_ROCM)
+    set(PADDLE_WARP_SIZE 64)
+  else()
+    set(PADDLE_WARP_SIZE 32)
+  endif()
 endif()
 math(EXPR PADDLE_WARP_MASK "${PADDLE_WARP_SIZE} - 1")
 if(PADDLE_WARP_SIZE EQUAL 64)
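
The mask computed by math(EXPR PADDLE_WARP_MASK ...) works as a lane mask only because warp sizes are powers of two. A small Python check of that arithmetic (illustrative, not part of the diff):

# For power-of-two warp sizes, (warp_size - 1) extracts the lane index
# within a warp, mirroring math(EXPR PADDLE_WARP_MASK "${PADDLE_WARP_SIZE} - 1").
for warp_size in (32, 64):  # NVIDIA warp / ROCm wavefront
    warp_mask = warp_size - 1
    shift = warp_size.bit_length() - 1  # log2(warp_size)
    for tid in (0, 1, warp_size, warp_size + 5):
        assert tid & warp_mask == tid % warp_size  # lane id
        assert tid >> shift == tid // warp_size    # warp id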

paddle/phi/backends/dynload/rocm_driver.cc

Lines changed: 2 additions & 0 deletions

@@ -22,6 +22,8 @@ void* rocm_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
+ROCM_ROUTINE_EACH_VVM(DEFINE_WRAP);
+ROCM_ROUTINE_EACH_GPU_GRAPH(DEFINE_WRAP);
 ROCM_ROUTINE_EACH(DEFINE_WRAP);
 
 bool HasCUDADriver() {
