Skip to content

Commit c3e5782

Browse files
authored
[EM] Support mixed GPU models on SNMG. (dmlc#11998)
- Query C2C info via UUID.
- Drop the specialization for full C2C to support large GPUs.
1 parent 1da9aed commit c3e5782

File tree

7 files changed

+85
-59
lines changed

7 files changed

+85
-59
lines changed

doc/tutorials/external_memory.rst

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -332,11 +332,9 @@ Starting with 3.1, XGBoost introduces an adaptive cache for GPU-based external m
332332
training. The feature helps split the data cache into a host cache and a device cache. By
333333
keeping a portion of the cache on the GPU, we can reduce the amount of data transfer
334334
during training when there's a sufficient amount of GPU memory. The feature can be
335-
controlled by the ``cache_host_ratio`` parameter in the
336-
:py:class:`xgboost.ExtMemQuantileDMatrix`. It is disabled when the device has full C2C
337-
bandwidth since it's not needed there. On devices that with reduced bandwidth or devices
338-
with PCIe connections, unless explicitly specified, the ratio is automatically estimated
339-
based on device memory size and the size of the dataset.
335+
controlled by the ``cache_host_ratio`` parameter of the
336+
:py:class:`xgboost.ExtMemQuantileDMatrix`. Unless explicitly specified, the ratio is
337+
automatically estimated based on device memory size and the size of the dataset.
340338

341339
However, this parameter increases memory fragmentation as XGBoost needs large memory pages
342340
with irregular sizes. As a result, you might see out of memory error after the

src/collective/comm.cu

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2023-2025, XGBoost Contributors
2+
* Copyright 2023-2026, XGBoost Contributors
33
*/
44
#if defined(XGBOOST_USE_NCCL)
55
#include <algorithm> // for sort
@@ -10,15 +10,14 @@
1010
#include <sstream> // for stringstream
1111
#include <vector> // for vector
1212

13-
#include "../common/cuda_context.cuh" // for CUDAContext
14-
#include "../common/cuda_rt_utils.h" // for SetDevice
15-
#include "../common/device_helpers.cuh" // for DefaultStream
16-
#include "../common/type.h" // for EraseType
17-
#include "comm.cuh" // for NCCLComm
18-
#include "comm.h" // for Comm
19-
#include "nccl_stub.h" // for NcclStub
20-
#include "xgboost/collective/result.h" // for Result
21-
#include "xgboost/span.h" // for Span
13+
#include "../common/cuda_context.cuh" // for CUDAContext
14+
#include "../common/cuda_rt_utils.h" // for SetDevice, GetUuid, PrintUuid
15+
#include "../common/type.h" // for EraseType
16+
#include "comm.cuh" // for NCCLComm
17+
#include "comm.h" // for Comm
18+
#include "nccl_stub.h" // for NcclStub
19+
#include "xgboost/collective/result.h" // for Result
20+
#include "xgboost/span.h" // for Span
2221

2322
namespace xgboost::collective {
2423
namespace {
@@ -38,23 +37,6 @@ Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared
3837
*pid = id;
3938
return Success();
4039
}
41-
42-
inline constexpr std::size_t kUuidLength =
43-
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
44-
45-
void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) {
46-
cudaDeviceProp prob{};
47-
dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal));
48-
std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid));
49-
}
50-
51-
std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) {
52-
std::stringstream ss;
53-
for (auto v : uuid) {
54-
ss << std::hex << v;
55-
}
56-
return ss.str();
57-
}
5840
} // namespace
5941

6042
Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
@@ -76,18 +58,18 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
7658
curt::SetDevice(ctx->Ordinal());
7759
stub_ = std::make_shared<NcclStub>(nccl_path);
7860

79-
std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
80-
auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
81-
auto s_this_uuid = s_uuid.subspan(root.Rank() * kUuidLength, kUuidLength);
82-
GetCudaUUID(s_this_uuid, ctx->Device());
61+
std::vector<unsigned char> uuids(root.World() * curt::kUuidLength, 0);
62+
auto s_uuid = common::Span{uuids.data(), uuids.size()};
63+
auto s_this_uuid = s_uuid.subspan(root.Rank() * curt::kUuidLength, curt::kUuidLength);
64+
curt::GetUuid(s_this_uuid, ctx->Ordinal());
8365

8466
auto rc = pimpl->Allgather(root, common::EraseType(s_uuid));
8567
SafeColl(rc);
8668

87-
std::vector<xgboost::common::Span<std::uint64_t, kUuidLength>> converted(root.World());
69+
std::vector<common::Span<unsigned char>> converted(root.World());
8870
std::size_t j = 0;
89-
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
90-
converted[j] = s_uuid.subspan(i, kUuidLength);
71+
for (size_t i = 0; i < uuids.size(); i += curt::kUuidLength) {
72+
converted[j] = s_uuid.subspan(i, curt::kUuidLength);
9173
j++;
9274
}
9375

@@ -97,7 +79,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
9779

9880
CHECK_EQ(n_uniques, root.World())
9981
<< "Multiple processes within communication group running on same CUDA "
100-
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
82+
<< "device is not supported. " << curt::PrintUuid(s_this_uuid) << "\n";
10183

10284
rc = std::move(rc) << [&] {
10385
return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_);

src/common/cuda_dr_utils.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/**
2-
* Copyright 2024-2025, XGBoost contributors
2+
* Copyright 2024-2026, XGBoost contributors
33
*/
44
#if defined(XGBOOST_USE_CUDA)
55
#include "cuda_dr_utils.h"
66

77
#include <algorithm> // for max
8+
#include <array> // for array
89
#include <charconv> // for from_chars
910
#include <cstdint> // for int32_t
1011
#include <cstring> // for memset
@@ -210,9 +211,14 @@ namespace detail {
210211
return -1;
211212
}
212213

214+
std::array<unsigned char, curt::kUuidLength> uuid;
215+
// Select the current GPU to query.
216+
curt::GetUuid(common::Span{uuid.data(), uuid.size()}, curt::CurrentDevice());
217+
auto str_uuid = curt::PrintUuid(common::Span{uuid.data(), uuid.size()});
213218
// See test for example output from smi.
214-
auto cmd = "nvidia-smi c2c -s -i 0"; // Select the first GPU to query.
219+
auto cmd = "nvidia-smi c2c -s -i " + str_uuid;
215220
auto out = common::CmdOutput(StringView{cmd});
221+
LOG(DEBUG) << "c2c:\n" << out << "\n";
216222
auto cnt = detail::GetC2cLinkCountFromSmiImpl(out);
217223
return cnt;
218224
}

src/common/cuda_rt_utils.cc

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
/**
2-
* Copyright 2015-2025, XGBoost Contributors
2+
* Copyright 2015-2026, XGBoost Contributors
33
*/
44
#include "cuda_rt_utils.h"
55

6-
#include "cuda_stream.h" // for StreamRef
6+
#include <cstring> // for memcpy
7+
#include <set> // for set
8+
#include <sstream> // for stringstream
9+
10+
#include "cuda_stream.h" // for StreamRef
11+
#include "xgboost/span.h" // for Span
712

813
#if defined(XGBOOST_USE_CUDA)
914
#include <cuda_runtime_api.h>
1015

1116
#include <algorithm> // for max
12-
#endif // defined(XGBOOST_USE_CUDA)
17+
18+
#endif // defined(XGBOOST_USE_CUDA)
1319

1420
#include <cstddef> // for size_t
1521
#include <cstdint> // for int32_t
@@ -114,6 +120,27 @@ void GetDrVersionGlobal(std::int32_t* major, std::int32_t* minor) {
114120
return !!res;
115121
}
116122

123+
static_assert(kUuidLength == sizeof(std::declval<cudaDeviceProp>().uuid));
124+
125+
void GetUuid(xgboost::common::Span<unsigned char> uuid, std::int32_t device) {
126+
cudaDeviceProp prop{};
127+
dh::safe_cuda(cudaGetDeviceProperties(&prop, device));
128+
std::memcpy(uuid.data(), static_cast<void*>(&(prop.uuid)), kUuidLength);
129+
}
130+
131+
[[nodiscard]] std::string PrintUuid(common::Span<unsigned char const, kUuidLength> uuid) {
132+
std::set<std::size_t> dash_pos{0, 4, 6, 8, 10};
133+
std::stringstream ss;
134+
ss << "GPU";
135+
for (std::size_t i = 0; i < kUuidLength; ++i) {
136+
if (dash_pos.find(i) != dash_pos.cend()) {
137+
ss << "-";
138+
}
139+
ss << std::setw(2) << std::setfill('0') << std::hex << (0xFF & std::uint32_t{uuid[i]});
140+
}
141+
return ss.str();
142+
}
143+
117144
void MemcpyAsync(void* dst, const void* src, std::size_t count, StreamRef stream) {
118145
dh::safe_cuda(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
119146
}
@@ -154,6 +181,10 @@ void SetDevice(std::int32_t device) {
154181

155182
[[nodiscard]] bool MemoryPoolsSupported(std::int32_t) { return false; }
156183

184+
void GetUuid(xgboost::common::Span<unsigned char>, std::int32_t) { common::AssertGPUSupport(); }
185+
186+
[[nodiscard]] std::string PrintUuid(common::Span<unsigned char const, kUuidLength>) { return {}; }
187+
157188
void MemcpyAsync(void*, const void*, std::size_t, StreamRef) { common::AssertGPUSupport(); }
158189

159190
#endif // !defined(XGBOOST_USE_CUDA)

src/common/cuda_rt_utils.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
/**
2-
* Copyright 2024-2025, XGBoost contributors
2+
* Copyright 2024-2026, XGBoost contributors
33
*/
44
#pragma once
55
#include <cstddef> // for size_t
66
#include <cstdint> // for int32_t
7+
#include <string> // for string
78

8-
#include "cuda_stream.h" // for StreamRef
9+
#include "cuda_stream.h" // for StreamRef
10+
#include "xgboost/span.h" // for Span
911

1012
namespace xgboost::curt {
1113
std::int32_t AllVisibleGPUs();
@@ -42,6 +44,12 @@ void GetDrVersionGlobal(std::int32_t* major, std::int32_t* minor);
4244

4345
[[nodiscard]] bool MemoryPoolsSupported(std::int32_t device);
4446

47+
inline constexpr std::size_t kUuidLength = 16;
48+
49+
void GetUuid(common::Span<unsigned char> uuid, std::int32_t device);
50+
51+
[[nodiscard]] std::string PrintUuid(common::Span<unsigned char const, kUuidLength> uuid);
52+
4553
// cudaMemcpyAsync
4654
void MemcpyAsync(void* dst, const void* src, std::size_t count, StreamRef stream);
4755
} // namespace xgboost::curt

src/data/batch_utils.cc

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,6 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
5151
using xgboost::cuda_impl::CachePageRatio;
5252

5353
auto lc = cudr::GetC2cLinkCountFromSmiGlobal();
54-
if (lc >= 10) {
55-
// >= 10, life is easy.
56-
if (CachePageBytesIsAuto(min_cache_page_bytes)) {
57-
min_cache_page_bytes = n_d_bytes * CachePageRatio();
58-
}
59-
if (HostRatioIsAuto(cache_host_ratio)) {
60-
cache_host_ratio = 1.0;
61-
}
62-
return {cache_host_ratio, min_cache_page_bytes};
63-
}
6454

6555
/**
6656
* Configure the min_cache_page_bytes
@@ -96,7 +86,6 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
9686
auto h_cache_nbytes = n_cache_bytes - d_cache_nbytes * 0.85;
9787
cache_host_ratio = static_cast<double>(h_cache_nbytes) / static_cast<double>(n_cache_bytes);
9888
if (lc > 0) {
99-
// 0 < lc < 10, C2C is available, but with reduced link count.
10089
// No need to exceed half in practice.
10190
cache_host_ratio = std::max(cache_host_ratio, 0.5);
10291
}

tests/cpp/common/test_cuda_rt_utils.cu

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
11
/**
2-
* Copyright 2025, XGBoost contributors
2+
* Copyright 2025-2026, XGBoost contributors
33
*/
44

55
#include <gtest/gtest.h>
66

7+
#include <array> // for array
78
#include <cstdint> // for int32_t
89
#include <set> // for set
910

11+
#include "../../../src/common/cuda_rt_utils.h"
1012
#include "../../../src/common/cuda_stream_pool.h"
13+
#include "xgboost/span.h" // for Span
1114

1215
namespace xgboost::curt {
16+
TEST(RtUtils, Uuid) {
17+
std::array<unsigned char, kUuidLength> uuid;
18+
GetUuid(uuid, 0);
19+
auto str = PrintUuid(uuid);
20+
ASSERT_EQ(str.substr(0, 4), "GPU-");
21+
ASSERT_EQ(str.length(), 40);
22+
ASSERT_EQ(str.size(), str.length());
23+
}
24+
1325
TEST(RtUtils, StreamPool) {
1426
auto n_streams = 16;
1527
auto pool = std::make_unique<StreamPool>(n_streams);

0 commit comments

Comments
 (0)