2 changes: 1 addition & 1 deletion Paddle
Submodule Paddle updated 28 files
+12 −2 cmake/cupti.cmake
+30 −0 paddle/common/flags.cc
+13 −0 paddle/fluid/pybind/eager.h
+185 −14 paddle/fluid/pybind/eager_py_layer.cc
+7 −2 paddle/phi/CMakeLists.txt
+2 −0 paddle/phi/backends/dynload/rocm_driver.cc
+1 −19 paddle/phi/kernels/cpu/elementwise.h
+29 −14 paddle/phi/kernels/funcs/distribution_helper.h
+26 −13 paddle/phi/kernels/funcs/dropout_impl.cu.h
+58 −0 paddle/phi/kernels/funcs/rng_launch_config.h
+24 −11 paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
+10 −4 paddle/phi/kernels/fusion/xpu/fused_rope_utils.h
+2 −1 paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
+2 −1 paddle/phi/kernels/gpu/interpolate_kernel.cu
+3 −2 paddle/phi/kernels/gpu/layer_norm_kernel.cu
+144 −10 paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h
+1 −1 paddle/phi/kernels/stride/matmul_stride_kernel.cu
+15 −5 python/paddle/distributed/fleet/meta_optimizers/muon_sharding_optimizer.py
+129 −31 python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+21 −4 python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+33 −84 python/paddle/distributed/fleet/recompute/recompute.py
+2 −6 python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+72 −55 python/paddle/optimizer/muon.py
+66 −0 python/paddle/tensor/ops.py
+87 −3 test/collective/fleet/hybrid_parallel_sharding_muon_model.py
+46 −0 test/collective/fleet/test_parallel_dygraph_muon.py
+700 −0 test/legacy_test/test_pylayer_clear_dataptr.py
+48 −453 test/legacy_test/test_recompute_with_tuple_input.py
29 changes: 29 additions & 0 deletions backends/metax_gpu/common/flags_declare.cc
@@ -116,6 +116,35 @@ PHI_DEFINE_EXPORTED_bool(use_fast_math,
false,
"Whether to use fast math GPU functions.");

/**
* GPU RNG related FLAG
* Name: FLAGS_deterministic_rng
* Since Version: 3.4
* Value Range: bool, default=false
* Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
 * Note: Fixes the RNG kernel launch configuration so that the same seed
 * produces the same results across GPU types.
*/
PHI_DEFINE_EXPORTED_bool(
deterministic_rng,
false,
"Enable cross-device RNG consistency by fixing GPU kernel launch "
"configuration. When true, RNG kernels use a fixed grid/block size "
"so that the same seed produces identical results across GPU types.");
/**
* GPU RNG related FLAG
* Name: FLAGS_deterministic_rng_grid
* Since Version: 3.4
* Value Range: int32, default=1024
* Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
* Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
* Cross-device consistency requires the same value on all devices.
*/
PHI_DEFINE_EXPORTED_int32(
deterministic_rng_grid,
1024,
"Grid size cap when FLAGS_deterministic_rng is enabled.");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* FlashAttention related FLAG
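The two flags above are meant to be used together: the bool turns the fixed launch configuration on, and the int32 sets the grid cap that must match on every device. A minimal usage sketch, assuming a Paddle build that includes this change; the cross-device reproducibility claim is taken from the flag descriptions above (the cap itself presumably lives in paddle/phi/kernels/funcs/rng_launch_config.h, added in the submodule update), not verified here:

import paddle

# Fix the RNG kernel launch configuration. The grid cap must be set to
# the same value on every device that should produce identical results.
paddle.set_flags({
    'FLAGS_deterministic_rng': True,
    'FLAGS_deterministic_rng_grid': 1024,
})

paddle.seed(2024)
x = paddle.ones([4, 8])
# With the flags set, the same seed should yield the same dropout mask
# across GPU types, per the flag documentation above.
y = paddle.nn.functional.dropout(x, p=0.5)
print(y)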