11/* *
2- * Copyright 2023-2025 , XGBoost Contributors
2+ * Copyright 2023-2026 , XGBoost Contributors
33 */
44#if defined(XGBOOST_USE_NCCL)
55#include < algorithm> // for sort
1010#include < sstream> // for stringstream
1111#include < vector> // for vector
1212
13- #include " ../common/cuda_context.cuh" // for CUDAContext
14- #include " ../common/cuda_rt_utils.h" // for SetDevice
15- #include " ../common/device_helpers.cuh" // for DefaultStream
16- #include " ../common/type.h" // for EraseType
17- #include " comm.cuh" // for NCCLComm
18- #include " comm.h" // for Comm
19- #include " nccl_stub.h" // for NcclStub
20- #include " xgboost/collective/result.h" // for Result
21- #include " xgboost/span.h" // for Span
13+ #include " ../common/cuda_context.cuh" // for CUDAContext
14+ #include " ../common/cuda_rt_utils.h" // for SetDevice, GetUuid, PrintUuid
15+ #include " ../common/type.h" // for EraseType
16+ #include " comm.cuh" // for NCCLComm
17+ #include " comm.h" // for Comm
18+ #include " nccl_stub.h" // for NcclStub
19+ #include " xgboost/collective/result.h" // for Result
20+ #include " xgboost/span.h" // for Span
2221
2322namespace xgboost ::collective {
2423namespace {
@@ -38,23 +37,6 @@ Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared
3837 *pid = id;
3938 return Success ();
4039}
41-
42- inline constexpr std::size_t kUuidLength =
43- sizeof (std::declval<cudaDeviceProp>().uuid) / sizeof (std::uint64_t );
44-
45- void GetCudaUUID (xgboost::common::Span<std::uint64_t , kUuidLength > const & uuid, DeviceOrd device) {
46- cudaDeviceProp prob{};
47- dh::safe_cuda (cudaGetDeviceProperties (&prob, device.ordinal ));
48- std::memcpy (uuid.data (), static_cast <void *>(&(prob.uuid )), sizeof (prob.uuid ));
49- }
50-
51- std::string PrintUUID (xgboost::common::Span<std::uint64_t , kUuidLength > const & uuid) {
52- std::stringstream ss;
53- for (auto v : uuid) {
54- ss << std::hex << v;
55- }
56- return ss.str ();
57- }
5840} // namespace
5941
6042Comm* RabitComm::MakeCUDAVar (Context const * ctx, std::shared_ptr<Coll> pimpl) const {
@@ -76,18 +58,18 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
7658 curt::SetDevice (ctx->Ordinal ());
7759 stub_ = std::make_shared<NcclStub>(nccl_path);
7860
79- std::vector<std:: uint64_t > uuids (root.World () * kUuidLength , 0 );
80- auto s_uuid = xgboost:: common::Span<std:: uint64_t > {uuids.data (), uuids.size ()};
81- auto s_this_uuid = s_uuid.subspan (root.Rank () * kUuidLength , kUuidLength );
82- GetCudaUUID (s_this_uuid, ctx->Device ());
61+ std::vector<unsigned char > uuids (root.World () * curt:: kUuidLength , 0 );
62+ auto s_uuid = common::Span{uuids.data (), uuids.size ()};
63+ auto s_this_uuid = s_uuid.subspan (root.Rank () * curt:: kUuidLength , curt:: kUuidLength );
64+ curt::GetUuid (s_this_uuid, ctx->Ordinal ());
8365
8466 auto rc = pimpl->Allgather (root, common::EraseType (s_uuid));
8567 SafeColl (rc);
8668
87- std::vector<xgboost:: common::Span<std:: uint64_t , kUuidLength >> converted (root.World ());
69+ std::vector<common::Span<unsigned char >> converted (root.World ());
8870 std::size_t j = 0 ;
89- for (size_t i = 0 ; i < uuids.size (); i += kUuidLength ) {
90- converted[j] = s_uuid.subspan (i, kUuidLength );
71+ for (size_t i = 0 ; i < uuids.size (); i += curt:: kUuidLength ) {
72+ converted[j] = s_uuid.subspan (i, curt:: kUuidLength );
9173 j++;
9274 }
9375
@@ -97,7 +79,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
9779
9880 CHECK_EQ (n_uniques, root.World ())
9981 << " Multiple processes within communication group running on same CUDA "
100- << " device is not supported. " << PrintUUID (s_this_uuid) << " \n " ;
82+ << " device is not supported. " << curt::PrintUuid (s_this_uuid) << " \n " ;
10183
10284 rc = std::move (rc) << [&] {
10385 return GetUniqueId (root, this ->stub_ , pimpl, &nccl_unique_id_);
0 commit comments