Skip to content

Commit c262543

Browse files
committed
UCT/GDR: Add option for PCIe BAR1 export
1 parent 7561177 commit c262543

7 files changed

Lines changed: 137 additions & 16 deletions

File tree

config/m4/cuda.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ AS_IF([test "x$cuda_checked" != "xyes"],
218218
[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
219219
[[#include <nvml.h>]])
220220
221+
AC_CHECK_DECLS([NVML_FI_DEV_C2C_LINK_COUNT], [], [],
222+
[[#include <nvml.h>]])
221223
222224
# Check for cuda static library
223225
have_cuda_static="no"

config/m4/gdrcopy.m4

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ AS_IF([test "x$with_gdrcopy" != "xno"],
3838
gdrcopy_happy="no"])
3939
], [gdrcopy_happy="no"])
4040
41-
AS_IF([test "x$gdrcopy_happy" = "xyes"],
42-
[AC_CHECK_DECLS([gdr_copy_to_mapping], [], [], [#include "gdrapi.h"])])
41+
AS_IF([test "x$gdrcopy_happy" = "xyes"], [
42+
AC_CHECK_DECLS([gdr_pin_buffer_v2], [], [], [#include "gdrapi.h"])
43+
AC_CHECK_DECLS([gdr_copy_to_mapping], [], [], [#include "gdrapi.h"])
44+
])
4345
4446
CFLAGS="$save_CFLAGS"
4547
CPPFLAGS="$save_CPPFLAGS"

src/uct/cuda/gdr_copy/gdr_copy_md.c

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828
#define UCT_GDR_COPY_RCACHE_OVERHEAD_AUTO 50.0e-9
2929

3030

31+
/*
 * Configuration strings for the uct_gdr_copy_pin_mode_t values, indexed by the
 * enum value. The table is passed to UCS_CONFIG_TYPE_ENUM for the PIN_MODE
 * option and is also used to print the selected mode in trace/error messages.
 * The NULL entry at UCT_GDR_COPY_PIN_MODE_LAST is presumably the end-of-list
 * marker expected by the config parser - TODO confirm.
 */
static const char *uct_gdr_copy_pin_mode_names[] = {
32+
[UCT_GDR_COPY_PIN_MODE_DEFAULT] = "default",
33+
[UCT_GDR_COPY_PIN_MODE_PCIE] = "pcie",
34+
[UCT_GDR_COPY_PIN_MODE_LAST] = NULL
35+
};
36+
3137
typedef struct {
3238
pthread_mutex_t lock;
3339
unsigned refcount;
@@ -57,6 +63,13 @@ static ucs_config_field_t uct_gdr_copy_md_config_table[] = {
5763
{"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
5864
ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME},
5965

66+
{"PIN_MODE", "default",
67+
"Mapping type for CPU access:\n"
68+
" default - Default mapping C2C or PCIe\n"
69+
" pcie - Force a PCIe based mapping (BAR1), may fail at registration.\n",
70+
ucs_offsetof(uct_gdr_copy_md_config_t, pin_mode),
71+
UCS_CONFIG_TYPE_ENUM(uct_gdr_copy_pin_mode_names)},
72+
6073
{"", "RCACHE_PURGE_ON_FORK=n", NULL,
6174
ucs_offsetof(uct_gdr_copy_md_config_t, rcache_config),
6275
UCS_CONFIG_TYPE_TABLE(ucs_config_rcache_table)},
@@ -139,17 +152,26 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
139152
uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
140153
unsigned long d_ptr = ((unsigned long)(char*)address);
141154
ucs_log_level_t log_level;
155+
uint32_t pin_gdr_flags = 0;
142156
int ret;
143157

144158
ucs_assert((address != NULL) && (length != 0));
145159

146160
log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? UCS_LOG_LEVEL_DEBUG :
147161
UCS_LOG_LEVEL_ERROR;
148162

163+
#if HAVE_DECL_GDR_PIN_BUFFER_V2
164+
pin_gdr_flags = (md->pin_mode == UCT_GDR_COPY_PIN_MODE_PCIE) ?
165+
GDR_PIN_FLAG_FORCE_PCIE : GDR_PIN_FLAG_DEFAULT;
166+
ret = gdr_pin_buffer_v2(md->gdrcpy_ctx, d_ptr, length, pin_gdr_flags,
167+
&mem_hndl->mh);
168+
#else
149169
ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mem_hndl->mh);
170+
#endif
150171
if (ret) {
151-
ucs_log(log_level, "gdr_pin_buffer failed. length :%lu ret:%d",
152-
length, ret);
172+
ucs_log(log_level,
173+
"GPU memory pin failed. length :%lu ret:%d pin_flags:%u",
174+
length, ret, pin_gdr_flags);
153175
goto err;
154176
}
155177

@@ -167,9 +189,11 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
167189
goto unmap_buffer;
168190
}
169191

170-
ucs_trace("registered memory:%p..%p length:%lu info.va:0x%"PRIx64" bar_ptr:%p",
192+
ucs_trace("registered memory:%p..%p length:%lu info.va:0x%" PRIx64
193+
" bar_ptr:%p mode:%s",
171194
address, UCS_PTR_BYTE_OFFSET(address, length), length,
172-
mem_hndl->info.va, mem_hndl->bar_ptr);
195+
mem_hndl->info.va, mem_hndl->bar_ptr,
196+
uct_gdr_copy_pin_mode_names[md->pin_mode]);
173197

174198
return UCS_OK;
175199

@@ -468,6 +492,13 @@ uct_gdr_copy_md_create(uct_component_t *component,
468492
uct_gdr_copy_md_t *md;
469493
ucs_status_t status;
470494

495+
#if !HAVE_DECL_GDR_PIN_BUFFER_V2
496+
if (md_config->pin_mode == UCT_GDR_COPY_PIN_MODE_PCIE) {
497+
ucs_error("PCIe pin mode requires GDRCopy with gdr_pin_buffer_v2");
498+
return UCS_ERR_INVALID_PARAM;
499+
}
500+
#endif
501+
471502
md = ucs_malloc(sizeof(*md), "uct_gdr_copy_md_t");
472503
if (md == NULL) {
473504
ucs_error("failed to allocate memory for uct_gdr_copy_md_t");
@@ -478,6 +509,7 @@ uct_gdr_copy_md_create(uct_component_t *component,
478509
md->reg_cost = md_config->uc_reg_cost;
479510
md->super.ops = &uct_gdr_copy_md_ops;
480511
md->rcache = NULL;
512+
md->pin_mode = md_config->pin_mode;
481513

482514
md->gdrcpy_ctx = gdr_open();
483515
if (md->gdrcpy_ctx == NULL) {
@@ -545,6 +577,12 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
545577
md_config->enable_rcache) {
546578
ucs_error("inconsistent gdr_copy rcache enable param");
547579
status = UCS_ERR_INVALID_PARAM;
580+
} else if (uct_gdr_copy_context.md->pin_mode != md_config->pin_mode) {
581+
ucs_error("inconsistent gdr_copy PIN_MODE: shared=%s, opening=%s",
582+
uct_gdr_copy_pin_mode_names
583+
[uct_gdr_copy_context.md->pin_mode],
584+
uct_gdr_copy_pin_mode_names[md_config->pin_mode]);
585+
status = UCS_ERR_INVALID_PARAM;
548586
} else {
549587
status = UCS_OK;
550588
}

src/uct/cuda/gdr_copy/gdr_copy_md.h

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,37 @@
1313
extern uct_component_t uct_gdr_copy_component;
1414

1515

16+
/**
 * Pin mode for GDRCopy GPU memory registration (default vs PCIe-based mapping).
 * The value is selected through the PIN_MODE configuration option of the
 * gdr_copy MD.
 */
17+
typedef enum uct_gdr_copy_pin_mode {
18+
UCT_GDR_COPY_PIN_MODE_DEFAULT = 0, /**< "default": default mapping, C2C or PCIe */
19+
UCT_GDR_COPY_PIN_MODE_PCIE = 1, /**< "pcie": force a PCIe (BAR1) mapping; needs gdr_pin_buffer_v2 */
20+
UCT_GDR_COPY_PIN_MODE_LAST /**< Sentinel after the last mode; has no configuration name */
21+
} uct_gdr_copy_pin_mode_t;
22+
23+
1624
/**
1725
* @brief gdr_copy MD descriptor
1826
*/
1927
typedef struct {
20-
uct_md_t super; /**< Domain info */
21-
gdr_t gdrcpy_ctx; /**< gdr copy context */
22-
ucs_linear_func_t reg_cost; /**< Memory registration cost */
23-
ucs_rcache_t *rcache; /**< Registration cache */
28+
uct_md_t super; /**< Domain info */
29+
gdr_t gdrcpy_ctx; /**< gdr copy context */
30+
ucs_linear_func_t reg_cost; /**< Memory registration cost */
31+
ucs_rcache_t *rcache; /**< Registration cache */
32+
uct_gdr_copy_pin_mode_t pin_mode; /**< Pin mode copied from the PIN_MODE config; translated to GDRCopy pin flags when calling gdr_pin_buffer_v2 */
2433
} uct_gdr_copy_md_t;
2534

2635

2736
/**
2837
* gdr copy domain configuration.
2938
*/
3039
typedef struct uct_gdr_copy_md_config {
31-
uct_md_config_t super;
32-
int shared; /**< Shared MD instance */
33-
int enable_rcache; /**< Enable registration cache */
34-
ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation
35-
without using the cache */
36-
ucs_rcache_config_t rcache_config; /**< Registration cache configuration */
40+
uct_md_config_t super;
41+
int shared; /**< Shared MD instance */
42+
int enable_rcache; /**< Enable registration cache */
43+
ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation
44+
without using the cache */
45+
ucs_rcache_config_t rcache_config; /**< Registration cache configuration */
46+
uct_gdr_copy_pin_mode_t pin_mode; /**< Requested GPU memory pin mode, set by the PIN_MODE option ("default" or "pcie") */
3747
} uct_gdr_copy_md_config_t;
3848

3949

test/gtest/common/mem_buffer.cc

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,37 @@ void mem_buffer::get_bar1_free_size_nvml()
221221
#endif
222222
}
223223

224+
/*
 * Check whether NVML reports at least one C2C link for a GPU.
 *
 * @param gpu_index  Device index passed to nvmlDeviceGetHandleByIndex.
 *
 * @return true if the NVML_FI_DEV_C2C_LINK_COUNT field was read successfully
 *         and its value is greater than zero. false if any NVML call fails,
 *         or if the build has no CUDA support or no declaration of
 *         NVML_FI_DEV_C2C_LINK_COUNT.
 */
bool mem_buffer::cuda_gpu_has_c2c(unsigned gpu_index)
225+
{
226+
#if HAVE_CUDA && HAVE_DECL_NVML_FI_DEV_C2C_LINK_COUNT
227+
bool has_c2c;
228+
nvmlDevice_t device;
229+
nvmlFieldValue_t value = {0};
230+
231+
/* NVML is initialized only for the duration of this query: every exit path
 * after a successful init calls nvmlShutdown() */
if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
232+
return false;
233+
}
234+
235+
if (NVML_CALL(nvmlDeviceGetHandleByIndex(gpu_index, &device)) != UCS_OK) {
236+
NVML_CALL(nvmlShutdown());
237+
return false;
238+
}
239+
240+
/* Query a single field: the C2C link count */
value.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
241+
if (NVML_CALL(nvmlDeviceGetFieldValues(device, 1, &value)) != UCS_OK) {
242+
NVML_CALL(nvmlShutdown());
243+
return false;
244+
}
245+
246+
/* Require both a successful per-field status and a non-zero link count */
has_c2c = (value.nvmlReturn == NVML_SUCCESS) && (value.value.uiVal > 0);
247+
NVML_CALL(nvmlShutdown());
248+
return has_c2c;
249+
#else
250+
/* No NVML C2C query available in this build: report no C2C */
(void)gpu_index;
251+
return false;
252+
#endif
253+
}
254+
224255
void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type, bool async)
225256
{
226257
void *ptr;

test/gtest/common/mem_buffer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ class mem_buffer {
9797
/* Get from NVML BAR1 free size */
9898
static void get_bar1_free_size_nvml();
9999

100+
/* Return true if NVML reports an NVLink-C2C link count > 0 for the CUDA
 * device with the given index (device 0 by default), false otherwise */
101+
static bool cuda_gpu_has_c2c(unsigned gpu_index = 0);
102+
100103
/* Return free memory on the BAR1 / GPU. If GPU is not used
101104
* SIZE_MAX is returned */
102105
static size_t get_bar1_free_size()

test/gtest/uct/test_md.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
* See file LICENSE for terms.
55
*/
66

7+
#ifdef HAVE_CONFIG_H
8+
# include "config.h"
9+
#endif
10+
711
#include "test_md.h"
812

913
#include <common/mem_buffer.h>
@@ -1249,3 +1253,34 @@ UCS_TEST_P(test_cuda, sparse_regions)
12491253
}
12501254

12511255
UCT_MD_INSTANTIATE_TEST_CASE(test_cuda)
1256+
1257+
#if HAVE_DECL_GDR_PIN_BUFFER_V2
1258+
1259+
/* Fixture for gdr_copy-specific MD tests. It adds nothing to test_md; it exists
 * so that these tests are instantiated for the gdr_copy component only. */
class test_gdr_copy : public test_md {
1260+
};
1261+
1262+
/*
 * Register and deregister a 64 KiB CUDA buffer through the MD while running
 * with the "GDR_COPY_PIN_MODE?=pcie" config modifier. The test is skipped when
 * the MD does not report UCT_MD_FLAG_REG, or when no C2C link is detected on
 * the default GPU (index 0).
 */
UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_pcie_pin,
1263+
!check_caps(UCT_MD_FLAG_REG), "GDR_COPY_PIN_MODE?=pcie")
1264+
{
1265+
constexpr size_t size = 65536;
1266+
void *address = NULL;
1267+
uct_mem_h memh;
1268+
ucs_status_t status;
1269+
1270+
/* NOTE(review): presumably forcing PCIe only differs from the default
 * mapping on systems with C2C, hence the skip - confirm */
if (!mem_buffer::cuda_gpu_has_c2c()) {
1271+
UCS_TEST_SKIP_R("Cannot find C2C on the system");
1272+
}
1273+
1274+
alloc_memory(&address, size, NULL, UCS_MEMORY_TYPE_CUDA);
1275+
1276+
/* Registration is expected to succeed with the pcie pin mode */
status = reg_mem(UCT_MD_MEM_ACCESS_ALL, address, size, &memh);
1277+
ASSERT_UCS_OK(status);
1278+
1279+
status = uct_md_mem_dereg(md(), memh);
1280+
ASSERT_UCS_OK(status);
1281+
1282+
free_memory(address, UCS_MEMORY_TYPE_CUDA);
1283+
}
1284+
1285+
_UCT_MD_INSTANTIATE_TEST_CASE(test_gdr_copy, gdr_copy)
1286+
#endif /* HAVE_DECL_GDR_PIN_BUFFER_V2 */

0 commit comments

Comments
 (0)