Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/m4/cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ AS_IF([test "x$cuda_checked" != "xyes"],
[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
[[#include <nvml.h>]])

AC_CHECK_DECLS([NVML_FI_DEV_C2C_LINK_COUNT], [], [],
[[#include <nvml.h>]])

# Check for cuda static library
have_cuda_static="no"
Expand Down
6 changes: 4 additions & 2 deletions config/m4/gdrcopy.m4
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ AS_IF([test "x$with_gdrcopy" != "xno"],
gdrcopy_happy="no"])
], [gdrcopy_happy="no"])

AS_IF([test "x$gdrcopy_happy" = "xyes"],
[AC_CHECK_DECLS([gdr_copy_to_mapping], [], [], [#include "gdrapi.h"])])
AS_IF([test "x$gdrcopy_happy" = "xyes"], [
AC_CHECK_DECLS([gdr_pin_buffer_v2, gdr_copy_to_mapping], [], [],
[#include "gdrapi.h"])
])

CFLAGS="$save_CFLAGS"
CPPFLAGS="$save_CPPFLAGS"
Expand Down
106 changes: 95 additions & 11 deletions src/uct/cuda/gdr_copy/gdr_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,43 @@
#define UCT_GDR_COPY_RCACHE_OVERHEAD_AUTO 50.0e-9


/**
 * Translate the USE_PCIE configuration value into GDRCopy pinning parameters.
 *
 * When gdr_pin_buffer_v2 is available:
 *  - yes      : pin with GDR_PIN_FLAG_FORCE_PCIE, no fallback.
 *  - auto/no  : pin with GDR_PIN_FLAG_DEFAULT, no fallback.
 *  - try      : pin with GDR_PIN_FLAG_FORCE_PCIE and request a fallback.
 *
 * When gdr_pin_buffer_v2 is not available, "yes" is rejected and every other
 * value yields flags 0 with no fallback.
 *
 * @param [in]  use_pcie             Value of the USE_PCIE configuration option.
 * @param [out] pin_gdr_flags_p      Filled with the flags for the first pin
 *                                   attempt.
 * @param [out] pin_pcie_fallback_p  Filled with nonzero if a fallback was
 *                                   requested, zero otherwise.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_PARAM if the value is unknown or
 *         cannot be honored by the GDRCopy library in use. The output
 *         parameters are not written on failure.
 */
static ucs_status_t
uct_gdr_copy_use_pcie_params_get(ucs_ternary_auto_value_t use_pcie,
                                 uint32_t *pin_gdr_flags_p,
                                 int *pin_pcie_fallback_p)
{
#if HAVE_DECL_GDR_PIN_BUFFER_V2
    switch (use_pcie) {
    case UCS_YES:
        *pin_gdr_flags_p     = GDR_PIN_FLAG_FORCE_PCIE;
        *pin_pcie_fallback_p = 0;
        break;
    case UCS_AUTO:
        /* Fallthrough */
    case UCS_NO:
        *pin_gdr_flags_p     = GDR_PIN_FLAG_DEFAULT;
        *pin_pcie_fallback_p = 0;
        break;
    case UCS_TRY:
        *pin_gdr_flags_p     = GDR_PIN_FLAG_FORCE_PCIE;
        *pin_pcie_fallback_p = 1;
        break;
    default:
        return UCS_ERR_INVALID_PARAM;
    }

    return UCS_OK;
#else
    if (use_pcie == UCS_YES) {
        /* The message takes no arguments: passing an extra (and undeclared)
         * identifier here broke builds without gdr_pin_buffer_v2 */
        ucs_error("USE_PCIE=yes requires GDRCopy with gdr_pin_buffer_v2");
        return UCS_ERR_INVALID_PARAM;
    }

    *pin_gdr_flags_p     = 0;
    *pin_pcie_fallback_p = 0;
    return UCS_OK;
#endif
}

typedef struct {
pthread_mutex_t lock;
unsigned refcount;
Expand Down Expand Up @@ -57,6 +94,15 @@ static ucs_config_field_t uct_gdr_copy_md_config_table[] = {
{"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME},

{"USE_PCIE", "auto",
"Mapping type for CPU access:\n"
" auto - default, driver chooses C2C or PCIe\n"
" yes - Force PCIe (BAR1); may fail at registration\n"
" no - default, driver chooses C2C or PCIe\n"
" try - Try PCIe first, fall back to default in case of failure\n",
ucs_offsetof(uct_gdr_copy_md_config_t, use_pcie),
UCS_CONFIG_TYPE_TERNARY_AUTO},

{"", "RCACHE_PURGE_ON_FORK=n", NULL,
ucs_offsetof(uct_gdr_copy_md_config_t, rcache_config),
UCS_CONFIG_TYPE_TABLE(ucs_config_rcache_table)},
Expand Down Expand Up @@ -136,20 +182,35 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
uct_md_h uct_md, void *address, size_t length,
unsigned flags, uct_gdr_copy_mem_t *mem_hndl)
{
uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
unsigned long d_ptr = ((unsigned long)(char*)address);
ucs_log_level_t log_level;
uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
unsigned long d_ptr = ((unsigned long)(char*)address);
ucs_log_level_t log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ?
UCS_LOG_LEVEL_DEBUG :
UCS_LOG_LEVEL_ERROR;
uint32_t pin_flags = 0;
int ret;

ucs_assert((address != NULL) && (length != 0));

log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? UCS_LOG_LEVEL_DEBUG :
UCS_LOG_LEVEL_ERROR;

#if HAVE_DECL_GDR_PIN_BUFFER_V2
pin_flags = md->pin_gdr_flags;
ret = gdr_pin_buffer_v2(md->gdrcpy_ctx, d_ptr, length, pin_flags,
&mem_hndl->mh);
if (ret && (pin_flags != GDR_PIN_FLAG_DEFAULT) && md->pin_pcie_fallback) {
ucs_debug("GPU memory non-default pin failed with length %lu ret %d "
"pin_flags %u, retrying with default pin flag",
length, ret, pin_flags);
pin_flags = GDR_PIN_FLAG_DEFAULT;
ret = gdr_pin_buffer_v2(md->gdrcpy_ctx, d_ptr, length, pin_flags,
&mem_hndl->mh);
}
#else
ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mem_hndl->mh);
#endif
if (ret) {
ucs_log(log_level, "gdr_pin_buffer failed. length :%lu ret:%d",
length, ret);
ucs_log(log_level,
"GPU memory pin failed. length %lu ret %d pin_flags %u",
length, ret, pin_flags);
goto err;
}

Expand All @@ -167,9 +228,10 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
goto unmap_buffer;
}

ucs_trace("registered memory:%p..%p length:%lu info.va:0x%"PRIx64" bar_ptr:%p",
ucs_trace("registered memory %p..%p length %lu info.va 0x%" PRIx64
" bar_ptr %p pin_flags %u",
address, UCS_PTR_BYTE_OFFSET(address, length), length,
mem_hndl->info.va, mem_hndl->bar_ptr);
mem_hndl->info.va, mem_hndl->bar_ptr, pin_flags);

return UCS_OK;

Expand Down Expand Up @@ -478,6 +540,12 @@ uct_gdr_copy_md_create(uct_component_t *component,
md->reg_cost = md_config->uc_reg_cost;
md->super.ops = &uct_gdr_copy_md_ops;
md->rcache = NULL;
status = uct_gdr_copy_use_pcie_params_get(md_config->use_pcie,
&md->pin_gdr_flags,
&md->pin_pcie_fallback);
if (status != UCS_OK) {
goto err_free;
}

md->gdrcpy_ctx = gdr_open();
if (md->gdrcpy_ctx == NULL) {
Expand Down Expand Up @@ -529,6 +597,8 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
ucs_derived_of(config, uct_gdr_copy_md_config_t);
uct_gdr_copy_md_t *md;
ucs_status_t status;
uint32_t new_pin_flags;
int new_pin_fallback;

if (!md_config->shared) {
status = uct_gdr_copy_md_create(component, md_config, &md);
Expand All @@ -546,7 +616,21 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
ucs_error("inconsistent gdr_copy rcache enable param");
status = UCS_ERR_INVALID_PARAM;
} else {
status = UCS_OK;
status = uct_gdr_copy_use_pcie_params_get(md_config->use_pcie,
&new_pin_flags,
&new_pin_fallback);
if (status == UCS_OK) {
if ((uct_gdr_copy_context.md->pin_gdr_flags != new_pin_flags) ||
(uct_gdr_copy_context.md->pin_pcie_fallback !=
new_pin_fallback)) {
ucs_error("inconsistent pin mode: shared pin_flags=%u "
"fallback=%d, opening pin_flags=%u fallback=%d",
uct_gdr_copy_context.md->pin_gdr_flags,
uct_gdr_copy_context.md->pin_pcie_fallback,
new_pin_flags, new_pin_fallback);
status = UCS_ERR_INVALID_PARAM;
}
}
}
} else {
status = uct_gdr_copy_md_create(component, md_config,
Expand Down
23 changes: 13 additions & 10 deletions src/uct/cuda/gdr_copy/gdr_copy_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,26 @@ extern uct_component_t uct_gdr_copy_component;
* @brief gdr_copy MD descriptor
*/
typedef struct {
uct_md_t super; /**< Domain info */
gdr_t gdrcpy_ctx; /**< gdr copy context */
ucs_linear_func_t reg_cost; /**< Memory registration cost */
ucs_rcache_t *rcache; /**< Registration cache */
uct_md_t super; /**< Domain info */
gdr_t gdrcpy_ctx; /**< gdr copy context */
ucs_linear_func_t reg_cost; /**< Memory registration cost */
ucs_rcache_t *rcache; /**< Registration cache */
uint32_t pin_gdr_flags; /**< First gdr_pin_buffer_v2 flags (0 if v2 absent) */
int pin_pcie_fallback; /**< If nonzero, retry pin with default flags on failure */
} uct_gdr_copy_md_t;


/**
* gdr copy domain configuration.
*/
typedef struct uct_gdr_copy_md_config {
uct_md_config_t super;
int shared; /**< Shared MD instance */
int enable_rcache; /**< Enable registration cache */
ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation
without using the cache */
ucs_rcache_config_t rcache_config; /**< Registration cache configuration */
uct_md_config_t super;
int shared; /**< Shared MD instance */
int enable_rcache; /**< Enable registration cache */
ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation
without using the cache */
ucs_rcache_config_t rcache_config; /**< Registration cache configuration */
ucs_ternary_auto_value_t use_pcie; /**< UCS_CONFIG_TYPE_TERNARY_AUTO; see USE_PCIE */
} uct_gdr_copy_md_config_t;


Expand Down
31 changes: 31 additions & 0 deletions test/gtest/common/mem_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,37 @@ void mem_buffer::get_bar1_free_size_nvml()
#endif
}

/*
 * Query NVML field NVML_FI_DEV_C2C_LINK_COUNT for the device at @a gpu_index.
 * Returns true only if every NVML call succeeds, the field query itself
 * reports NVML_SUCCESS and the reported count is greater than zero.
 * Returns false when built without CUDA or without that NVML field.
 */
bool mem_buffer::cuda_gpu_has_c2c(unsigned gpu_index)
{
#if HAVE_CUDA && HAVE_DECL_NVML_FI_DEV_C2C_LINK_COUNT
    bool c2c_present           = false;
    nvmlFieldValue_t field_val = {0};
    nvmlDevice_t nvml_dev;

    if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
        /* NVML was not initialized, so there is nothing to shut down */
        return false;
    }

    if (NVML_CALL(nvmlDeviceGetHandleByIndex(gpu_index, &nvml_dev)) ==
        UCS_OK) {
        field_val.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
        if (NVML_CALL(nvmlDeviceGetFieldValues(nvml_dev, 1, &field_val)) ==
            UCS_OK) {
            c2c_present = (field_val.nvmlReturn == NVML_SUCCESS) &&
                          (field_val.value.uiVal > 0);
        }
    }

    /* Single shutdown point for every path taken after a successful init */
    NVML_CALL(nvmlShutdown());
    return c2c_present;
#else
    (void)gpu_index;
    return false;
#endif
}

void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type, bool async)
{
void *ptr;
Expand Down
3 changes: 3 additions & 0 deletions test/gtest/common/mem_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ class mem_buffer {
/* Get from NVML BAR1 free size */
static void get_bar1_free_size_nvml();

/* Return true if NVML reports an NVLink-C2C link count > 0 for the CUDA device */
static bool cuda_gpu_has_c2c(unsigned gpu_index = 0);

/* Return free memory on the BAR1 / GPU. If GPU is not used
* SIZE_MAX is returned */
static size_t get_bar1_free_size()
Expand Down
59 changes: 59 additions & 0 deletions test/gtest/uct/test_md.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "test_md.h"

#include <common/mem_buffer.h>
Expand Down Expand Up @@ -1249,3 +1253,58 @@ UCS_TEST_P(test_cuda, sparse_regions)
}

UCT_MD_INSTANTIATE_TEST_CASE(test_cuda)

#if HAVE_DECL_GDR_PIN_BUFFER_V2

/* Fixture for gdr_copy memory registration tests */
class test_gdr_copy : public test_md {
protected:
    /*
     * Allocate a 64KB CUDA buffer, register it on the MD with
     * UCT_MD_MEM_ACCESS_ALL, deregister it if registration succeeded, and
     * release the buffer. Returns the registration status.
     */
    ucs_status_t register_mem()
    {
        constexpr size_t length = 65536;
        void *buffer            = NULL;
        uct_mem_h memh;

        alloc_memory(&buffer, length, NULL, UCS_MEMORY_TYPE_CUDA);

        const ucs_status_t reg_status = reg_mem(UCT_MD_MEM_ACCESS_ALL, buffer,
                                                length, &memh);
        if (reg_status == UCS_OK) {
            /* Deregistration result is intentionally ignored */
            (void)uct_md_mem_dereg(md(), memh);
        }

        free_memory(buffer, UCS_MEMORY_TYPE_CUDA);
        return reg_status;
    }
};

/*
 * Registration with GDR_COPY_USE_PCIE set to "auto" must succeed.
 * Skip condition: !check_caps(UCT_MD_FLAG_REG).
 * NOTE(review): the "?=" form presumably marks the config variable as
 * optional - confirm against the test framework.
 */
UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_default_pin,
!check_caps(UCT_MD_FLAG_REG), "GDR_COPY_USE_PCIE?=auto")
{
ASSERT_UCS_OK(register_mem());
}

/*
 * Registration with GDR_COPY_USE_PCIE set to "yes" must succeed.
 * Skip condition: mem_buffer::cuda_gpu_has_c2c() returns false, or
 * !check_caps(UCT_MD_FLAG_REG). In other words, this expectation is only
 * checked when the default GPU reports C2C links.
 */
UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_pcie_pin,
!mem_buffer::cuda_gpu_has_c2c() ||
!check_caps(UCT_MD_FLAG_REG),
"GDR_COPY_USE_PCIE?=yes")
{
ASSERT_UCS_OK(register_mem());
}

/*
 * Registration with GDR_COPY_USE_PCIE set to "yes" must fail with
 * UCS_ERR_IO_ERROR.
 * Skip condition: mem_buffer::cuda_gpu_has_c2c() returns true, or
 * !check_caps(UCT_MD_FLAG_REG). In other words, this expectation is only
 * checked when the default GPU reports no C2C links.
 */
UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_pcie_pin_fail,
mem_buffer::cuda_gpu_has_c2c() ||
!check_caps(UCT_MD_FLAG_REG),
"GDR_COPY_USE_PCIE?=yes")
{
/* Installs wrap_errors_logger for this scope while the expected failure
 * is triggered */
scoped_log_handler slh(wrap_errors_logger);
ASSERT_UCS_STATUS_EQ(UCS_ERR_IO_ERROR, register_mem());
}

/*
 * Registration with GDR_COPY_USE_PCIE set to "try" must succeed.
 * Skip condition: !check_caps(UCT_MD_FLAG_REG). Unlike the "yes" tests,
 * there is no C2C condition here.
 */
UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_try_pcie_pin,
!check_caps(UCT_MD_FLAG_REG), "GDR_COPY_USE_PCIE=try")
{
ASSERT_UCS_OK(register_mem());
}

_UCT_MD_INSTANTIATE_TEST_CASE(test_gdr_copy, gdr_copy)

#endif /* HAVE_DECL_GDR_PIN_BUFFER_V2 */