Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 42 additions & 7 deletions src/components/topo/cuda/ucc_sysinfo_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,9 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info)
nvmlPciInfo_t nvml_pci;
nvmlReturn_t nvml_st;
ucc_status_t status;
char uuid_str[NVML_DEVICE_UUID_BUFFER_SIZE];
unsigned int num_nvlinks;
const uint8_t *u;
int n_gpus;
int i;

Expand Down Expand Up @@ -459,9 +462,6 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info)
gpu_info->n_gpus = n_gpus;

for (i = 0; i < n_gpus; i++) {
char uuid_str[NVML_DEVICE_UUID_BUFFER_SIZE];
unsigned int num_nvlinks;

nvml_st = nvmlDeviceGetHandleByIndex(i, &nvml_dev);
if (nvml_st != NVML_SUCCESS) {
ucc_debug("nvmlDeviceGetHandleByIndex failed: %s",
Expand Down Expand Up @@ -490,6 +490,10 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info)
gpu_info->gpus[i].caps = 0;
gpu_info->gpus[i].fabric_clique_id = UCC_GPU_FABRIC_CLIQUE_ID_INVALID;
gpu_info->gpus[i].fabric_partition_id = UCC_GPU_FABRIC_PARTITION_ID_INVALID;
memset(
gpu_info->gpus[i].fabric_cluster_uuid,
0,
UCC_GPU_FABRIC_CLUSTER_UUID_LEN);

num_nvlinks = ucc_sysinfo_cuda_get_nvlink_count(nvml_dev);
if (num_nvlinks > 0) {
Expand All @@ -510,26 +514,57 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info)
{
#ifdef HAVE_NVML_GPU_FABRIC_INFO_V
nvmlGpuFabricInfoV_t fabric_info;

fabric_info.version = nvmlGpuFabricInfo_v2;
nvml_st = nvmlDeviceGetGpuFabricInfoV(nvml_dev, &fabric_info);
#else
nvmlGpuFabricInfo_t fabric_info;

nvml_st = nvmlDeviceGetGpuFabricInfo(nvml_dev, &fabric_info);
#endif
UCC_STATIC_ASSERT(
sizeof(fabric_info.clusterUuid) ==
UCC_GPU_FABRIC_CLUSTER_UUID_LEN);
if (nvml_st == NVML_SUCCESS &&
fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) {
gpu_info->gpus[i].caps |= UCC_GPU_CAP_FABRIC;
gpu_info->gpus[i].fabric_clique_id = fabric_info.cliqueId;
#if defined(HAVE_NVML_GPU_FABRIC_INFO_V) && defined(HAVE_NVML_FABRIC_PARTITION_ID)
gpu_info->gpus[i].fabric_partition_id = fabric_info.partitionId;
#endif
/* Globally-unique fabric id; needed for cross-node match. */
memcpy(
gpu_info->gpus[i].fabric_cluster_uuid,
fabric_info.clusterUuid,
UCC_GPU_FABRIC_CLUSTER_UUID_LEN);
}
}
#endif
ucc_debug("GPU %d: caps=0x%x clique=%llu partition=%u",
i, gpu_info->gpus[i].caps,
(unsigned long long)gpu_info->gpus[i].fabric_clique_id,
(unsigned)gpu_info->gpus[i].fabric_partition_id);
u = gpu_info->gpus[i].fabric_cluster_uuid;
ucc_debug(
"GPU %d: caps=0x%x clique=%llu partition=%u "
"cluster_uuid=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
"%02x%02x-%02x%02x%02x%02x%02x%02x",
i,
gpu_info->gpus[i].caps,
(unsigned long long)gpu_info->gpus[i].fabric_clique_id,
(unsigned)gpu_info->gpus[i].fabric_partition_id,
u[0],
u[1],
u[2],
u[3],
u[4],
u[5],
u[6],
u[7],
u[8],
u[9],
u[10],
u[11],
u[12],
u[13],
u[14],
u[15]);
}

if (num_gpus > UCC_MAX_HOST_GPUS) {
Expand Down
34 changes: 34 additions & 0 deletions src/components/topo/ucc_topo.c
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo)
ucc_device_id_t dev;
uint64_t ref_clique_id;
uint32_t ref_partition_id;
const uint8_t *ref_cluster_uuid;
ucc_rank_t i;

if (size == 0) {
Expand All @@ -584,8 +585,21 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo)
return 0;
}

/* clusterUuid: globally-unique fabric identity. cliqueId/partitionId
* are unique only within a fabric cluster, so two unrelated DGX nodes
* can collide there; clusterUuid is the reliable cross-node check. */
if (!ucc_gpu_fabric_cluster_uuid_is_valid(
host->gpus[dev].fabric_cluster_uuid)) {
ucc_debug(
"nvlink domain check: rank 0 GPU %u has no valid fabric "
"cluster UUID",
(unsigned)dev);
return 0;
}

ref_clique_id = host->gpus[dev].fabric_clique_id;
ref_partition_id = host->gpus[dev].fabric_partition_id;
ref_cluster_uuid = host->gpus[dev].fabric_cluster_uuid;

for (i = 1; i < size; i++) {
if (!ucc_topo_rank_device_info(topo, i, &host, &dev)) {
Expand All @@ -598,6 +612,26 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo)
(unsigned)i, (unsigned)dev);
return 0;
}
if (!ucc_gpu_fabric_cluster_uuid_is_valid(
host->gpus[dev].fabric_cluster_uuid)) {
ucc_debug(
"nvlink domain check: rank %u GPU %u has no valid "
"fabric cluster UUID",
(unsigned)i,
(unsigned)dev);
return 0;
}
if (memcmp(
host->gpus[dev].fabric_cluster_uuid,
ref_cluster_uuid,
UCC_GPU_FABRIC_CLUSTER_UUID_LEN) != 0) {
ucc_debug(
"nvlink domain check: rank %u GPU %u cluster_uuid "
"differs from rank 0",
(unsigned)i,
(unsigned)dev);
return 0;
}
if (host->gpus[dev].fabric_clique_id != ref_clique_id) {
ucc_debug("nvlink domain check: rank %u clique_id=%llu differs "
"from rank 0 clique_id=%llu",
Expand Down
18 changes: 13 additions & 5 deletions src/components/topo/ucc_topo.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,11 +332,19 @@ int ucc_topo_is_nvlink_fully_connected(
* @brief Checks if all ranks in the topology subset share the same
* NVLink fabric domain (required for multinode NVLS/NVLink-SHARP).
*
* Iterates all ranks in topo->set and verifies that every rank has
* UCC_GPU_CAP_FABRIC set and an identical valid fabric_clique_id.
* A fabric_clique_id of UCC_GPU_FABRIC_CLIQUE_ID_INVALID is treated as
* "unknown / not populated" and
* causes the function to return 0 immediately.
* Iterates all ranks in topo->set and verifies that every rank has:
* - UCC_GPU_CAP_FABRIC set
* - a non-all-zero fabric_cluster_uuid that matches rank 0's
* - identical valid fabric_clique_id
* - matching fabric_partition_id when populated (GB200+ NVL)
*
* Any of the following causes the function to return 0:
* - fabric_clique_id == UCC_GPU_FABRIC_CLIQUE_ID_INVALID ("unknown /
* not populated", e.g. NVML reports state != COMPLETED)
* - all-zero fabric_cluster_uuid (NVML reports clusterUuid as zero
* even with state == COMPLETED on standalone fabric-capable nodes;
* the cliqueId/partitionId pair is not globally unique in that case
* and could falsely match across unrelated clusters)
*
* @param [in] topo Pointer to the topology structure.
*
Expand Down
26 changes: 22 additions & 4 deletions src/utils/ucc_proc_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ typedef uint8_t ucc_device_id_t;
#define UCC_MAX_HOST_GPUS 16
#define UCC_MAX_HOST_NICS 16

/* Length of NVML clusterUuid (NVML_GPU_FABRIC_UUID_LEN). */
#define UCC_GPU_FABRIC_CLUSTER_UUID_LEN 16

typedef struct ucc_proc_info {
ucc_host_id_t host_hash;
ucc_socket_id_t socket_id;
Expand Down Expand Up @@ -60,17 +63,32 @@ typedef enum ucc_gpu_cap {
/* Per-GPU record: PCI identity, capability flags and NVLink fabric
 * identifiers (clique, partition and cluster UUID) of a single device.
 * Each member is declared exactly once; member names within a struct must
 * be unique. */
typedef struct ucc_gpu_info {
    ucc_pci_info_t pci;
    /**< Bitmask of ucc_gpu_cap_t flags */
    uint32_t       caps;
    /**< NVLink partition ID for GB200+ NVL sub-fabric partitions.
     * UCC_GPU_FABRIC_PARTITION_ID_INVALID means single partition or
     * not populated (NVML < r525). */
    uint32_t       fabric_partition_id;
    /**< NVSwitch fabric clique ID (UCC_GPU_FABRIC_CLIQUE_ID_INVALID if unknown) */
    uint64_t       fabric_clique_id;
    /**< Globally-unique NVLink fabric cluster UUID (NVML clusterUuid).
     * All-zero means fabric info unavailable. */
    uint8_t        fabric_cluster_uuid[UCC_GPU_FABRIC_CLUSTER_UUID_LEN];
    /**< Hash of GPU UUID for unique identification */
    uint64_t       uuid;
} ucc_gpu_info_t;

/* Validity check for a fabric cluster UUID.
 *
 * Scans the UCC_GPU_FABRIC_CLUSTER_UUID_LEN bytes starting at uuid and
 * returns 1 as soon as a non-zero byte is found; returns 0 when every
 * byte is zero (an all-zero UUID is treated as "not real fabric info"). */
static inline int ucc_gpu_fabric_cluster_uuid_is_valid(const uint8_t *uuid)
{
    const uint8_t *cur = uuid;
    const uint8_t *end = uuid + UCC_GPU_FABRIC_CLUSTER_UUID_LEN;

    while (cur < end) {
        if (*cur) {
            return 1;
        }
        cur++;
    }
    return 0;
}

typedef struct ucc_nic_info {
ucc_pci_info_t pci;
/**< IB port number */
Expand Down
Loading