2 changes: 1 addition & 1 deletion Paddle
Submodule Paddle updated 28 files
+12 −2 cmake/cupti.cmake
+30 −0 paddle/common/flags.cc
+13 −0 paddle/fluid/pybind/eager.h
+185 −14 paddle/fluid/pybind/eager_py_layer.cc
+7 −2 paddle/phi/CMakeLists.txt
+2 −0 paddle/phi/backends/dynload/rocm_driver.cc
+1 −19 paddle/phi/kernels/cpu/elementwise.h
+29 −14 paddle/phi/kernels/funcs/distribution_helper.h
+26 −13 paddle/phi/kernels/funcs/dropout_impl.cu.h
+58 −0 paddle/phi/kernels/funcs/rng_launch_config.h
+24 −11 paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
+10 −4 paddle/phi/kernels/fusion/xpu/fused_rope_utils.h
+2 −1 paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
+2 −1 paddle/phi/kernels/gpu/interpolate_kernel.cu
+3 −2 paddle/phi/kernels/gpu/layer_norm_kernel.cu
+144 −10 paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h
+1 −1 paddle/phi/kernels/stride/matmul_stride_kernel.cu
+15 −5 python/paddle/distributed/fleet/meta_optimizers/muon_sharding_optimizer.py
+129 −31 python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+21 −4 python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+33 −84 python/paddle/distributed/fleet/recompute/recompute.py
+2 −6 python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+72 −55 python/paddle/optimizer/muon.py
+66 −0 python/paddle/tensor/ops.py
+87 −3 test/collective/fleet/hybrid_parallel_sharding_muon_model.py
+46 −0 test/collective/fleet/test_parallel_dygraph_muon.py
+700 −0 test/legacy_test/test_pylayer_clear_dataptr.py
+48 −453 test/legacy_test/test_recompute_with_tuple_input.py
29 changes: 29 additions & 0 deletions backends/metax_gpu/common/flags_declare.cc
@@ -116,6 +116,35 @@ PHI_DEFINE_EXPORTED_bool(use_fast_math,
false,
"Whether to use fast math GPU functions.");

/**
* GPU RNG related FLAG
* Name: FLAGS_deterministic_rng
* Since Version: 3.4
* Value Range: bool, default=false
* Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
 * Note: Fixes the RNG kernel launch configuration so that the same seed
 * produces the same results across GPU types.
*/
PHI_DEFINE_EXPORTED_bool(
deterministic_rng,
false,
"Enable cross-device RNG consistency by fixing GPU kernel launch "
"configuration. When true, RNG kernels use a fixed grid/block size "
"so that the same seed produces identical results across GPU types.");
/**
* GPU RNG related FLAG
* Name: FLAGS_deterministic_rng_grid
* Since Version: 3.4
* Value Range: int32, default=1024
* Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
* Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
* Cross-device consistency requires the same value on all devices.
*/
PHI_DEFINE_EXPORTED_int32(
deterministic_rng_grid,
1024,
"Grid size cap when FLAGS_deterministic_rng is enabled.");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* FlashAttention related FLAG
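The two flags above are meant to be used together: the bool turns the fixed launch configuration on, and the int32 sets the grid cap that must match on every device. A minimal usage sketch, assuming a Paddle build that includes this change; the cross-device reproducibility claim is taken from the flag descriptions above (the cap itself presumably lives in paddle/phi/kernels/funcs/rng_launch_config.h, added in the submodule update), not verified here:

import paddle

# Fix the RNG kernel launch configuration. The grid cap must be set to
# the same value on every device that should produce identical results.
paddle.set_flags({
    'FLAGS_deterministic_rng': True,
    'FLAGS_deterministic_rng_grid': 1024,
})

paddle.seed(2024)
x = paddle.ones([4, 8])
# With the flags set, the same seed should yield the same dropout mask
# across GPU types, per the flag documentation above.
y = paddle.nn.functional.dropout(x, p=0.5)
print(y)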