diff --git a/src/components/topo/cuda/ucc_sysinfo_cuda.c b/src/components/topo/cuda/ucc_sysinfo_cuda.c index 1e0856e5a1..0f16f7ab51 100644 --- a/src/components/topo/cuda/ucc_sysinfo_cuda.c +++ b/src/components/topo/cuda/ucc_sysinfo_cuda.c @@ -419,6 +419,9 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info) nvmlPciInfo_t nvml_pci; nvmlReturn_t nvml_st; ucc_status_t status; + char uuid_str[NVML_DEVICE_UUID_BUFFER_SIZE]; + unsigned int num_nvlinks; + const uint8_t *u; int n_gpus; int i; @@ -459,9 +462,6 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info) gpu_info->n_gpus = n_gpus; for (i = 0; i < n_gpus; i++) { - char uuid_str[NVML_DEVICE_UUID_BUFFER_SIZE]; - unsigned int num_nvlinks; - nvml_st = nvmlDeviceGetHandleByIndex(i, &nvml_dev); if (nvml_st != NVML_SUCCESS) { ucc_debug("nvmlDeviceGetHandleByIndex failed: %s", @@ -490,6 +490,10 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info) gpu_info->gpus[i].caps = 0; gpu_info->gpus[i].fabric_clique_id = UCC_GPU_FABRIC_CLIQUE_ID_INVALID; gpu_info->gpus[i].fabric_partition_id = UCC_GPU_FABRIC_PARTITION_ID_INVALID; + memset( + gpu_info->gpus[i].fabric_cluster_uuid, + 0, + UCC_GPU_FABRIC_CLUSTER_UUID_LEN); num_nvlinks = ucc_sysinfo_cuda_get_nvlink_count(nvml_dev); if (num_nvlinks > 0) { @@ -510,12 +514,17 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info) { #ifdef HAVE_NVML_GPU_FABRIC_INFO_V nvmlGpuFabricInfoV_t fabric_info; + fabric_info.version = nvmlGpuFabricInfo_v2; nvml_st = nvmlDeviceGetGpuFabricInfoV(nvml_dev, &fabric_info); #else nvmlGpuFabricInfo_t fabric_info; + nvml_st = nvmlDeviceGetGpuFabricInfo(nvml_dev, &fabric_info); #endif + UCC_STATIC_ASSERT( + sizeof(fabric_info.clusterUuid) == + UCC_GPU_FABRIC_CLUSTER_UUID_LEN); if (nvml_st == NVML_SUCCESS && fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) { gpu_info->gpus[i].caps |= UCC_GPU_CAP_FABRIC; @@ -523,13 +532,39 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void 
**info, int *n_info) #if defined(HAVE_NVML_GPU_FABRIC_INFO_V) && defined(HAVE_NVML_FABRIC_PARTITION_ID) gpu_info->gpus[i].fabric_partition_id = fabric_info.partitionId; #endif + /* Globally-unique fabric id; needed for cross-node match. */ + memcpy( + gpu_info->gpus[i].fabric_cluster_uuid, + fabric_info.clusterUuid, + UCC_GPU_FABRIC_CLUSTER_UUID_LEN); } } #endif - ucc_debug("GPU %d: caps=0x%x clique=%llu partition=%u", - i, gpu_info->gpus[i].caps, - (unsigned long long)gpu_info->gpus[i].fabric_clique_id, - (unsigned)gpu_info->gpus[i].fabric_partition_id); + u = gpu_info->gpus[i].fabric_cluster_uuid; + ucc_debug( + "GPU %d: caps=0x%x clique=%llu partition=%u " + "cluster_uuid=%02x%02x%02x%02x-%02x%02x-%02x%02x-" + "%02x%02x-%02x%02x%02x%02x%02x%02x", + i, + gpu_info->gpus[i].caps, + (unsigned long long)gpu_info->gpus[i].fabric_clique_id, + (unsigned)gpu_info->gpus[i].fabric_partition_id, + u[0], + u[1], + u[2], + u[3], + u[4], + u[5], + u[6], + u[7], + u[8], + u[9], + u[10], + u[11], + u[12], + u[13], + u[14], + u[15]); } if (num_gpus > UCC_MAX_HOST_GPUS) { diff --git a/src/components/topo/ucc_topo.c b/src/components/topo/ucc_topo.c index a7f1e080d0..761cfaa16c 100644 --- a/src/components/topo/ucc_topo.c +++ b/src/components/topo/ucc_topo.c @@ -563,6 +563,7 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo) ucc_device_id_t dev; uint64_t ref_clique_id; uint32_t ref_partition_id; + const uint8_t *ref_cluster_uuid; ucc_rank_t i; if (size == 0) { @@ -584,8 +585,21 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo) return 0; } + /* clusterUuid: globally-unique fabric identity. cliqueId/partitionId + * are unique only within a fabric cluster, so two unrelated DGX nodes + * can collide there; clusterUuid is the reliable cross-node check. 
*/ + if (!ucc_gpu_fabric_cluster_uuid_is_valid( + host->gpus[dev].fabric_cluster_uuid)) { + ucc_debug( + "nvlink domain check: rank 0 GPU %u has no valid fabric " + "cluster UUID", + (unsigned)dev); + return 0; + } + ref_clique_id = host->gpus[dev].fabric_clique_id; ref_partition_id = host->gpus[dev].fabric_partition_id; + ref_cluster_uuid = host->gpus[dev].fabric_cluster_uuid; for (i = 1; i < size; i++) { if (!ucc_topo_rank_device_info(topo, i, &host, &dev)) { @@ -598,6 +612,26 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo) (unsigned)i, (unsigned)dev); return 0; } + if (!ucc_gpu_fabric_cluster_uuid_is_valid( + host->gpus[dev].fabric_cluster_uuid)) { + ucc_debug( + "nvlink domain check: rank %u GPU %u has no valid " + "fabric cluster UUID", + (unsigned)i, + (unsigned)dev); + return 0; + } + if (memcmp( + host->gpus[dev].fabric_cluster_uuid, + ref_cluster_uuid, + UCC_GPU_FABRIC_CLUSTER_UUID_LEN) != 0) { + ucc_debug( + "nvlink domain check: rank %u GPU %u cluster_uuid " + "differs from rank 0", + (unsigned)i, + (unsigned)dev); + return 0; + } if (host->gpus[dev].fabric_clique_id != ref_clique_id) { ucc_debug("nvlink domain check: rank %u clique_id=%llu differs " "from rank 0 clique_id=%llu", diff --git a/src/components/topo/ucc_topo.h b/src/components/topo/ucc_topo.h index ff0ea3ce8f..6ac8a76eb7 100644 --- a/src/components/topo/ucc_topo.h +++ b/src/components/topo/ucc_topo.h @@ -332,11 +332,19 @@ int ucc_topo_is_nvlink_fully_connected( * @brief Checks if all ranks in the topology subset share the same * NVLink fabric domain (required for multinode NVLS/NVLink-SHARP). * - * Iterates all ranks in topo->set and verifies that every rank has - * UCC_GPU_CAP_FABRIC set and an identical valid fabric_clique_id. - * A fabric_clique_id of UCC_GPU_FABRIC_CLIQUE_ID_INVALID is treated as - * "unknown / not populated" and - * causes the function to return 0 immediately. 
+ * Iterates all ranks in topo->set and verifies that every rank has: + * - UCC_GPU_CAP_FABRIC set + * - a non-all-zero fabric_cluster_uuid that matches rank 0's + * - identical valid fabric_clique_id + * - matching fabric_partition_id when populated (GB200+ NVL) + * + * Any of the following causes the function to return 0: + * - fabric_clique_id == UCC_GPU_FABRIC_CLIQUE_ID_INVALID ("unknown / + * not populated", e.g. NVML reports state != COMPLETED) + * - all-zero fabric_cluster_uuid (NVML reports clusterUuid as zero + * even with state == COMPLETED on standalone fabric-capable nodes; + * the cliqueId/partitionId pair is not globally unique in that case + * and could falsely match across unrelated clusters) * * @param [in] topo Pointer to the topology structure. * diff --git a/src/utils/ucc_proc_info.h b/src/utils/ucc_proc_info.h index ff964ec6e3..d55948a4a1 100644 --- a/src/utils/ucc_proc_info.h +++ b/src/utils/ucc_proc_info.h @@ -28,6 +28,9 @@ typedef uint8_t ucc_device_id_t; #define UCC_MAX_HOST_GPUS 16 #define UCC_MAX_HOST_NICS 16 +/* Length of NVML clusterUuid (NVML_GPU_FABRIC_UUID_LEN). */ +#define UCC_GPU_FABRIC_CLUSTER_UUID_LEN 16 + typedef struct ucc_proc_info { ucc_host_id_t host_hash; ucc_socket_id_t socket_id; @@ -60,17 +63,32 @@ typedef enum ucc_gpu_cap { typedef struct ucc_gpu_info { ucc_pci_info_t pci; /**< Bitmask of ucc_gpu_cap_t flags */ - uint32_t caps; + uint32_t caps; /**< NVLink partition ID for GB200+ NVL sub-fabric partitions. * UCC_GPU_FABRIC_PARTITION_ID_INVALID means single partition or * not populated (NVML < r525). */ - uint32_t fabric_partition_id; + uint32_t fabric_partition_id; /**< NVSwitch fabric clique ID (UCC_GPU_FABRIC_CLIQUE_ID_INVALID if unknown) */ - uint64_t fabric_clique_id; + uint64_t fabric_clique_id; + /**< Globally-unique NVLink fabric cluster UUID (NVML clusterUuid). + * All-zero means fabric info unavailable. 
*/ + uint8_t fabric_cluster_uuid[UCC_GPU_FABRIC_CLUSTER_UUID_LEN]; /**< Hash of GPU UUID for unique identification */ - uint64_t uuid; + uint64_t uuid; } ucc_gpu_info_t; +/* Returns 1 if any of the UCC_GPU_FABRIC_CLUSTER_UUID_LEN bytes at uuid is nonzero (i.e., real fabric info), 0 if all are zero. uuid is dereferenced without a NULL check. */ +static inline int ucc_gpu_fabric_cluster_uuid_is_valid(const uint8_t *uuid) +{ + int i; + for (i = 0; i < UCC_GPU_FABRIC_CLUSTER_UUID_LEN; i++) { + if (uuid[i] != 0) { + return 1; + } + } + return 0; +} + typedef struct ucc_nic_info { ucc_pci_info_t pci; /**< IB port number */