diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index 2432a61d022..36ad06b3669 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -218,6 +218,8 @@ AS_IF([test "x$cuda_checked" != "xyes"],
                         [AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in
                                         libnvidia-ml. MNNVL support will be disabled.])],
                         [[#include <nvml.h>]])
+         AC_CHECK_DECLS([NVML_FI_DEV_C2C_LINK_COUNT], [], [],
+                        [[#include <nvml.h>]])
 
          # Check for cuda static library
          have_cuda_static="no"
diff --git a/config/m4/gdrcopy.m4 b/config/m4/gdrcopy.m4
index 50241ee97e3..aaec0e0a236 100644
--- a/config/m4/gdrcopy.m4
+++ b/config/m4/gdrcopy.m4
@@ -38,8 +38,10 @@ AS_IF([test "x$with_gdrcopy" != "xno"],
                                gdrcopy_happy="no"])
                        ], [gdrcopy_happy="no"])
 
-          AS_IF([test "x$gdrcopy_happy" = "xyes"],
-                [AC_CHECK_DECLS([gdr_copy_to_mapping], [], [], [#include "gdrapi.h"])])
+          AS_IF([test "x$gdrcopy_happy" = "xyes"], [
+                AC_CHECK_DECLS([gdr_pin_buffer_v2, gdr_copy_to_mapping], [], [],
+                               [#include "gdrapi.h"])
+          ])
 
           CFLAGS="$save_CFLAGS"
           CPPFLAGS="$save_CPPFLAGS"
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c
index 2bc7df58f48..cc2c5006fe5 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_md.c
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c
@@ -28,6 +28,54 @@
 
 #define UCT_GDR_COPY_RCACHE_OVERHEAD_AUTO 50.0e-9
 
+/**
+ * Translate the USE_PCIE configuration value into GDRCopy pin parameters.
+ *
+ * @param [in]  use_pcie             Value of the USE_PCIE configuration field.
+ * @param [out] pin_gdr_flags_p      Flags to pass to the first pin attempt.
+ * @param [out] pin_pcie_fallback_p  Set to 1 when a failed pin should be
+ *                                   retried with default flags, 0 otherwise.
+ *
+ * @return UCS_OK on success, UCS_ERR_INVALID_PARAM if the value is not
+ *         supported by this build.
+ */
+static ucs_status_t
+uct_gdr_copy_use_pcie_params_get(ucs_ternary_auto_value_t use_pcie,
+                                 uint32_t *pin_gdr_flags_p, int *pin_pcie_fallback_p)
+{
+#if HAVE_DECL_GDR_PIN_BUFFER_V2
+    switch (use_pcie) {
+    case UCS_YES:
+        *pin_gdr_flags_p     = GDR_PIN_FLAG_FORCE_PCIE;
+        *pin_pcie_fallback_p = 0;
+        break;
+    case UCS_AUTO:
+        /* Fallthrough */
+    case UCS_NO:
+        *pin_gdr_flags_p     = GDR_PIN_FLAG_DEFAULT;
+        *pin_pcie_fallback_p = 0;
+        break;
+    case UCS_TRY:
+        *pin_gdr_flags_p     = GDR_PIN_FLAG_FORCE_PCIE;
+        *pin_pcie_fallback_p = 1;
+        break;
+    default:
+        return UCS_ERR_INVALID_PARAM;
+    }
+
+    return UCS_OK;
+#else
+    if (use_pcie == UCS_YES) {
+        ucs_error("USE_PCIE=yes requires GDRCopy with gdr_pin_buffer_v2");
+        return UCS_ERR_INVALID_PARAM;
+    }
+
+    *pin_gdr_flags_p     = 0;
+    *pin_pcie_fallback_p = 0;
+    return UCS_OK;
+#endif
+}
+
 typedef struct {
     pthread_mutex_t lock;
     unsigned        refcount;
@@ -57,6 +105,15 @@ static ucs_config_field_t uct_gdr_copy_md_config_table[] = {
     {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
      ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME},
 
+    {"USE_PCIE", "auto",
+     "Mapping type for CPU access:\n"
+     " auto - default, driver chooses C2C or PCIe\n"
+     " yes - Force PCIe (BAR1); may fail at registration\n"
+     " no - default, driver chooses C2C or PCIe\n"
+     " try - Try PCIe first, fall back to default in case of failure\n",
+     ucs_offsetof(uct_gdr_copy_md_config_t, use_pcie),
+     UCS_CONFIG_TYPE_TERNARY_AUTO},
+
     {"", "RCACHE_PURGE_ON_FORK=n", NULL,
      ucs_offsetof(uct_gdr_copy_md_config_t, rcache_config),
      UCS_CONFIG_TYPE_TABLE(ucs_config_rcache_table)},
@@ -136,20 +193,35 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
                  uct_md_h uct_md, void *address, size_t length,
                  unsigned flags, uct_gdr_copy_mem_t *mem_hndl)
 {
-    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
-    unsigned long d_ptr = ((unsigned long)(char*)address);
-    ucs_log_level_t log_level;
+    uct_gdr_copy_md_t *md     = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    unsigned long d_ptr       = ((unsigned long)(char*)address);
+    ucs_log_level_t log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ?
+                                        UCS_LOG_LEVEL_DEBUG :
+                                        UCS_LOG_LEVEL_ERROR;
+    uint32_t pin_flags        = 0;
     int ret;
 
     ucs_assert((address != NULL) && (length != 0));
 
-    log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? UCS_LOG_LEVEL_DEBUG :
-                UCS_LOG_LEVEL_ERROR;
-
+#if HAVE_DECL_GDR_PIN_BUFFER_V2
+    pin_flags = md->pin_gdr_flags;
+    ret       = gdr_pin_buffer_v2(md->gdrcpy_ctx, d_ptr, length, pin_flags,
+                                  &mem_hndl->mh);
+    if (ret && (pin_flags != GDR_PIN_FLAG_DEFAULT) && md->pin_pcie_fallback) {
+        ucs_debug("GPU memory non-default pin failed with length %lu ret %d "
+                  "pin_flags %u, retrying with default pin flag",
+                  length, ret, pin_flags);
+        pin_flags = GDR_PIN_FLAG_DEFAULT;
+        ret       = gdr_pin_buffer_v2(md->gdrcpy_ctx, d_ptr, length, pin_flags,
+                                      &mem_hndl->mh);
+    }
+#else
     ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mem_hndl->mh);
+#endif
     if (ret) {
-        ucs_log(log_level, "gdr_pin_buffer failed. length :%lu ret:%d",
-                length, ret);
+        ucs_log(log_level,
+                "GPU memory pin failed. length %lu ret %d pin_flags %u",
+                length, ret, pin_flags);
         goto err;
     }
 
@@ -167,9 +239,10 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
         goto unmap_buffer;
     }
 
-    ucs_trace("registered memory:%p..%p length:%lu info.va:0x%"PRIx64" bar_ptr:%p",
+    ucs_trace("registered memory %p..%p length %lu info.va 0x%" PRIx64
+              " bar_ptr %p pin_flags %u",
               address, UCS_PTR_BYTE_OFFSET(address, length), length,
-              mem_hndl->info.va, mem_hndl->bar_ptr);
+              mem_hndl->info.va, mem_hndl->bar_ptr, pin_flags);
 
     return UCS_OK;
 
@@ -478,6 +551,12 @@ uct_gdr_copy_md_create(uct_component_t *component,
     md->reg_cost  = md_config->uc_reg_cost;
     md->super.ops = &uct_gdr_copy_md_ops;
     md->rcache    = NULL;
+    status        = uct_gdr_copy_use_pcie_params_get(md_config->use_pcie,
+                                                     &md->pin_gdr_flags,
+                                                     &md->pin_pcie_fallback);
+    if (status != UCS_OK) {
+        goto err_free;
+    }
 
     md->gdrcpy_ctx = gdr_open();
     if (md->gdrcpy_ctx == NULL) {
@@ -529,6 +608,8 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
             ucs_derived_of(config, uct_gdr_copy_md_config_t);
     uct_gdr_copy_md_t *md;
     ucs_status_t status;
+    uint32_t new_pin_flags;
+    int new_pin_fallback;
 
     if (!md_config->shared) {
         status = uct_gdr_copy_md_create(component, md_config, &md);
@@ -546,7 +627,21 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
                 ucs_error("inconsistent gdr_copy rcache enable param");
                 status = UCS_ERR_INVALID_PARAM;
             } else {
-                status = UCS_OK;
+                status = uct_gdr_copy_use_pcie_params_get(md_config->use_pcie,
+                                                          &new_pin_flags,
+                                                          &new_pin_fallback);
+                if (status == UCS_OK) {
+                    if ((uct_gdr_copy_context.md->pin_gdr_flags != new_pin_flags) ||
+                        (uct_gdr_copy_context.md->pin_pcie_fallback !=
+                         new_pin_fallback)) {
+                        ucs_error("inconsistent pin mode: shared pin_flags=%u "
+                                  "fallback=%d, opening pin_flags=%u fallback=%d",
+                                  uct_gdr_copy_context.md->pin_gdr_flags,
+                                  uct_gdr_copy_context.md->pin_pcie_fallback,
+                                  new_pin_flags, new_pin_fallback);
+                        status = UCS_ERR_INVALID_PARAM;
+                    }
+                }
             }
         } else {
             status = uct_gdr_copy_md_create(component, md_config,
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.h b/src/uct/cuda/gdr_copy/gdr_copy_md.h
index 81d57de49d3..e77990c07a6 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_md.h
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.h
@@ -17,10 +17,12 @@ extern uct_component_t uct_gdr_copy_component;
  * @brief gdr_copy MD descriptor
  */
 typedef struct {
-    uct_md_t          super;      /**< Domain info */
-    gdr_t             gdrcpy_ctx; /**< gdr copy context */
-    ucs_linear_func_t reg_cost;   /**< Memory registration cost */
-    ucs_rcache_t      *rcache;    /**< Registration cache */
+    uct_md_t          super;             /**< Domain info */
+    gdr_t             gdrcpy_ctx;        /**< gdr copy context */
+    ucs_linear_func_t reg_cost;          /**< Memory registration cost */
+    ucs_rcache_t      *rcache;           /**< Registration cache */
+    uint32_t          pin_gdr_flags;     /**< First gdr_pin_buffer_v2 flags (0 if v2 absent) */
+    int               pin_pcie_fallback; /**< If nonzero, retry pin with default flags on failure */
 } uct_gdr_copy_md_t;
 
 
@@ -28,12 +30,13 @@ typedef struct {
  * gdr copy domain configuration.
  */
 typedef struct uct_gdr_copy_md_config {
-    uct_md_config_t     super;
-    int                 shared;        /**< Shared MD instance */
-    int                 enable_rcache; /**< Enable registration cache */
-    ucs_linear_func_t   uc_reg_cost;   /**< Memory registration cost estimation
-                                            without using the cache */
-    ucs_rcache_config_t rcache_config; /**< Registration cache configuration */
+    uct_md_config_t          super;
+    int                      shared;        /**< Shared MD instance */
+    int                      enable_rcache; /**< Enable registration cache */
+    ucs_linear_func_t        uc_reg_cost;   /**< Memory registration cost estimation
+                                                 without using the cache */
+    ucs_rcache_config_t      rcache_config; /**< Registration cache configuration */
+    ucs_ternary_auto_value_t use_pcie;      /**< UCS_CONFIG_TYPE_TERNARY_AUTO; see USE_PCIE */
 } uct_gdr_copy_md_config_t;
 
 
diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc
index 812002c436d..a8812db8278 100644
--- a/test/gtest/common/mem_buffer.cc
+++ b/test/gtest/common/mem_buffer.cc
@@ -221,6 +221,37 @@ void mem_buffer::get_bar1_free_size_nvml()
 #endif
 }
 
+bool mem_buffer::cuda_gpu_has_c2c(unsigned gpu_index)
+{
+#if HAVE_CUDA && HAVE_DECL_NVML_FI_DEV_C2C_LINK_COUNT
+    bool has_c2c;
+    nvmlDevice_t device;
+    nvmlFieldValue_t value = {0};
+
+    if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
+        return false;
+    }
+
+    if (NVML_CALL(nvmlDeviceGetHandleByIndex(gpu_index, &device)) != UCS_OK) {
+        NVML_CALL(nvmlShutdown());
+        return false;
+    }
+
+    value.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
+    if (NVML_CALL(nvmlDeviceGetFieldValues(device, 1, &value)) != UCS_OK) {
+        NVML_CALL(nvmlShutdown());
+        return false;
+    }
+
+    has_c2c = (value.nvmlReturn == NVML_SUCCESS) && (value.value.uiVal > 0);
+    NVML_CALL(nvmlShutdown());
+    return has_c2c;
+#else
+    (void)gpu_index;
+    return false;
+#endif
+}
+
 void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type, bool async)
 {
     void *ptr;
diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h
index 4b1c285b2b8..85cc3ce5fc3 100644
--- a/test/gtest/common/mem_buffer.h
+++ b/test/gtest/common/mem_buffer.h
@@ -97,6 +97,9 @@ class mem_buffer {
     /* Get from NVML BAR1 free size */
     static void get_bar1_free_size_nvml();
 
+    /* NVML NVLink-C2C link count > 0 for CUDA device */
+    static bool cuda_gpu_has_c2c(unsigned gpu_index = 0);
+
     /* Return free memory on the BAR1 / GPU. If GPU is not used
      * SIZE_MAX is returned */
     static size_t get_bar1_free_size()
diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc
index fc6d98f255a..f354218dea8 100644
--- a/test/gtest/uct/test_md.cc
+++ b/test/gtest/uct/test_md.cc
@@ -4,6 +4,10 @@
  * See file LICENSE for terms.
  */
 
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
 #include "test_md.h"
 
 #include <common/mem_buffer.h>
@@ -1249,3 +1253,58 @@ UCS_TEST_P(test_cuda, sparse_regions)
 }
 
 UCT_MD_INSTANTIATE_TEST_CASE(test_cuda)
+
+#if HAVE_DECL_GDR_PIN_BUFFER_V2
+
+class test_gdr_copy : public test_md {
+protected:
+    ucs_status_t register_mem()
+    {
+        constexpr size_t size = 65536;
+        void *address         = NULL;
+        uct_mem_h memh;
+        ucs_status_t status;
+
+        alloc_memory(&address, size, NULL, UCS_MEMORY_TYPE_CUDA);
+        status = reg_mem(UCT_MD_MEM_ACCESS_ALL, address, size, &memh);
+        if (status == UCS_OK) {
+            (void)uct_md_mem_dereg(md(), memh);
+        }
+
+        free_memory(address, UCS_MEMORY_TYPE_CUDA);
+        return status;
+    }
+};
+
+UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_default_pin,
+                     !check_caps(UCT_MD_FLAG_REG), "GDR_COPY_USE_PCIE?=auto")
+{
+    ASSERT_UCS_OK(register_mem());
+}
+
+UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_pcie_pin,
+                     !mem_buffer::cuda_gpu_has_c2c() ||
+                     !check_caps(UCT_MD_FLAG_REG),
+                     "GDR_COPY_USE_PCIE?=yes")
+{
+    ASSERT_UCS_OK(register_mem());
+}
+
+UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_pcie_pin_fail,
+                     mem_buffer::cuda_gpu_has_c2c() ||
+                     !check_caps(UCT_MD_FLAG_REG),
+                     "GDR_COPY_USE_PCIE?=yes")
+{
+    scoped_log_handler slh(wrap_errors_logger);
+    ASSERT_UCS_STATUS_EQ(UCS_ERR_IO_ERROR, register_mem());
+}
+
+UCS_TEST_SKIP_COND_P(test_gdr_copy, gdr_copy_reg_cuda_try_pcie_pin,
+                     !check_caps(UCT_MD_FLAG_REG), "GDR_COPY_USE_PCIE=try")
+{
+    ASSERT_UCS_OK(register_mem());
+}
+
+_UCT_MD_INSTANTIATE_TEST_CASE(test_gdr_copy, gdr_copy)
+
+#endif /* HAVE_DECL_GDR_PIN_BUFFER_V2 */