-
Notifications
You must be signed in to change notification settings - Fork 6k
[Cpp API Compatibility] Align cuda compat #78808
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
acce468
0738877
e863dc8
eaa7c9b
1ffad12
99029cd
7d054b1
68f1047
12753e7
bc8bc55
7c31b5d
d0bcc46
391d5f2
c11df37
6d428d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ | |
|
|
||
| #include <optional> | ||
|
|
||
| #include "paddle/phi/core/platform/cuda_device_guard.h" | ||
| #include "paddle/phi/backends/gpu/gpu_info.h" | ||
|
|
||
| namespace c10::cuda { | ||
|
|
||
|
|
@@ -46,15 +46,13 @@ struct CUDAGuard { | |
|
|
||
| explicit CUDAGuard(DeviceIndex device_index) | ||
| : original_device_(detail::current_cuda_device()), | ||
| current_device_(original_device_), | ||
| guard_() { | ||
| current_device_(original_device_) { | ||
| set_index(device_index); | ||
| } | ||
|
|
||
| explicit CUDAGuard(Device device) | ||
| : original_device_(detail::current_cuda_device()), | ||
| current_device_(original_device_), | ||
| guard_() { | ||
| current_device_(original_device_) { | ||
| set_device(device); | ||
| } | ||
|
|
||
|
|
@@ -63,18 +61,27 @@ struct CUDAGuard { | |
|
|
||
| CUDAGuard(CUDAGuard&& other) = delete; | ||
| CUDAGuard& operator=(CUDAGuard&& other) = delete; | ||
| ~CUDAGuard() = default; | ||
| ~CUDAGuard() { | ||
| // Always restore to original_device_ to handle cases where the device | ||
| // was changed outside of this guard, matching PyTorch semantics. | ||
| phi::backends::gpu::SetDeviceId(static_cast<int>(original_device_.index())); | ||
| } | ||
|
|
||
| void set_device(Device device) { | ||
| current_device_ = detail::normalize_cuda_device(device); | ||
| guard_.SetDevice(current_device_._PD_GetInner()); | ||
| const Device normalized = detail::normalize_cuda_device(device); | ||
| if (normalized.index() != current_device_.index()) { | ||
| phi::backends::gpu::SetDeviceId(static_cast<int>(normalized.index())); | ||
| current_device_ = normalized; | ||
| } | ||
| } | ||
|
|
||
| void reset_device(Device device) { set_device(device); } | ||
|
|
||
| void set_index(DeviceIndex device_index) { | ||
| current_device_ = Device(kCUDA, device_index); | ||
| guard_.SetDeviceIndex(device_index); | ||
| if (current_device_.index() != device_index) { | ||
| phi::backends::gpu::SetDeviceId(static_cast<int>(device_index)); | ||
| current_device_ = Device(kCUDA, device_index); | ||
| } | ||
| } | ||
|
|
||
| Device original_device() const { return original_device_; } | ||
|
|
@@ -84,7 +91,6 @@ struct CUDAGuard { | |
| private: | ||
| Device original_device_; | ||
| Device current_device_; | ||
| paddle::platform::CUDADeviceGuard guard_; | ||
| }; | ||
|
|
||
| struct OptionalCUDAGuard { | ||
|
|
@@ -107,20 +113,24 @@ struct OptionalCUDAGuard { | |
|
|
||
| OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete; | ||
| OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete; | ||
| ~OptionalCUDAGuard() = default; | ||
| ~OptionalCUDAGuard() { reset(); } | ||
|
|
||
| void set_device(Device device) { | ||
| const Device normalized = detail::normalize_cuda_device(device); | ||
| init_if_needed(); | ||
| guard_->SetDevice(normalized._PD_GetInner()); | ||
| if (normalized.index() != current_device_->index()) { | ||
| phi::backends::gpu::SetDeviceId(static_cast<int>(normalized.index())); | ||
| } | ||
| current_device_ = normalized; | ||
| } | ||
|
|
||
| void reset_device(Device device) { set_device(device); } | ||
|
|
||
| void set_index(DeviceIndex device_index) { | ||
| init_if_needed(); | ||
| guard_->SetDeviceIndex(device_index); | ||
| if (device_index != current_device_->index()) { | ||
| phi::backends::gpu::SetDeviceId(static_cast<int>(device_index)); | ||
| } | ||
| current_device_ = Device(kCUDA, device_index); | ||
| } | ||
|
|
||
|
|
@@ -129,23 +139,26 @@ struct OptionalCUDAGuard { | |
| std::optional<Device> current_device() const { return current_device_; } | ||
|
|
||
| void reset() { | ||
| guard_.reset(); | ||
| if (original_device_.has_value()) { | ||
| // Always restore to original_device_ to handle external device changes. | ||
| // This matches PyTorch OptionalDeviceGuard semantics. | ||
| phi::backends::gpu::SetDeviceId( | ||
| static_cast<int>(original_device_->index())); | ||
| } | ||
|
Comment on lines 116 to +147
|
||
| original_device_.reset(); | ||
| current_device_.reset(); | ||
| } | ||
|
|
||
| private: | ||
| void init_if_needed() { | ||
| if (!guard_.has_value()) { | ||
| if (!original_device_.has_value()) { | ||
| original_device_ = detail::current_cuda_device(); | ||
| current_device_ = original_device_; | ||
| guard_.emplace(); | ||
| } | ||
| } | ||
|
|
||
| std::optional<Device> original_device_; | ||
| std::optional<Device> current_device_; | ||
| std::optional<paddle::platform::CUDADeviceGuard> guard_; | ||
| }; | ||
|
|
||
| } // namespace c10::cuda | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 确认一下这个 PR 不影响 ABI 兼容性是么?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 我再验证一下,上次改了cpp文件,再跑一遍 ABI Check
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
可以考虑先把那个 PR 处理下,就不用每次都单独搞一下了,加一下 review 检查,@SigureMo 和 @BingooYang 吧
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. static-check 过了,这个 PR 里相关改动可以 revert 掉了 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,10 @@ | |
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include <c10/cuda/CUDAFunctions.h> | ||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| #include <c10/cuda/CUDAGuard.h> | ||
| #endif | ||
| #include <c10/util/Exception.h> | ||
| #include <torch/cuda.h> | ||
|
|
||
|
|
@@ -25,8 +29,10 @@ c10::DeviceIndex device_count() { | |
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| return phi::backends::gpu::GetGPUDeviceCount(); | ||
| #else | ||
| PADDLE_THROW(common::errors::Unavailable( | ||
| "Paddle is not compiled with CUDA. Cannot visit device count.")); | ||
| // Match PyTorch c10::cuda::device_count(): return 0 in CPU-only builds so | ||
| // that is_available() and the pre-checks of synchronize() degrade gracefully | ||
| // through a single, consistent "No CUDA GPUs are available" error path. | ||
| return 0; | ||
| #endif | ||
| } | ||
|
|
||
|
|
@@ -35,21 +41,20 @@ bool is_available() { return cuda::device_count() > 0; } | |
| void synchronize(int64_t device_index) { | ||
| TORCH_CHECK(is_available(), "No CUDA GPUs are available"); | ||
| auto num_gpus = cuda::device_count(); | ||
| TORCH_CHECK(device_index < 0 || device_index < num_gpus, | ||
| "Device index out of range: ", | ||
| device_index); | ||
| // TODO(yongqiang) need using DeviceGuard | ||
| TORCH_CHECK( | ||
| device_index == -1 || (device_index >= 0 && device_index < num_gpus), | ||
| "Device index out of range: ", | ||
| device_index); | ||
|
Comment on lines 41 to +47
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| paddle::platform::SetDeviceId(device_index); | ||
| #ifdef PADDLE_WITH_HIP | ||
| PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); | ||
| #else | ||
| PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); | ||
| #endif | ||
| #else | ||
| PADDLE_THROW(common::errors::Unavailable( | ||
| "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); | ||
| // Match PyTorch semantics: | ||
| // 1. `device_index == -1` means "current CUDA device". | ||
| // 2. Explicit device synchronization must not leak a changed current device | ||
| // to the caller after returning. | ||
| const c10::cuda::CUDAGuard device_guard(c10::Device( | ||
| c10::DeviceType::CUDA, static_cast<c10::DeviceIndex>(device_index))); | ||
| c10::cuda::device_synchronize(); | ||
| #endif | ||
| // CPU-only builds are already rejected above by the is_available() check. | ||
| } | ||
|
|
||
| } // namespace torch::cuda | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
话说 paddle 的 guard 是有什么问题吗?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#78707 (review) codex review的时候发现的,单测不够充分,或者说当时单测的设计没有发现问题
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
喔喔 是那个问题
不过这里是否可以通过修复 paddle 内的 guard 实现?看起来单纯是 paddle guard 的 bug?这里应该也不会出现修改后导致破坏之前行为的问题?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
应该可以修,因为每次需要SetDevice的时候都调用SetDeviceIndex,导致prev_id_被反复覆盖,实际上应该只在构造CUDADeviceGuard的时候做一次设置prev_id_的操作,这个逻辑应该被拆分出来,之前没有错可能是因为只有一次性调用CUDADeviceGuard的场景,不需要反反复复SetDevice或者SetDeviceIndex
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#55498 这好像是有意设计的,析构的时候不回到原设备,等我再想想
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
啊……有些历史原因这里不能直接复用的话就按照当前方式来就好,看起来不是好解的问题