Skip to content

Commit d05fe1d

Browse files
fix: CUDA device PCI bus ID de-dupe causing OOM (the other 3 GPUs were ignored entirely) (ggml-org#22533)
* fix: CUDA device PCI bus ID detection for multi-GPU de-dupe
* HIP, MUSA macros

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
1 parent 0754b7b commit d05fe1d

3 files changed

Lines changed: 4 additions & 2 deletions

File tree

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5431,8 +5431,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
54315431
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
54325432
dev_ctx->description = prop.name;
54335433

5434-
char pci_bus_id[16] = {};
5435-
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
5434+
char pci_bus_id[32] = {};
5435+
CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
54365436
dev_ctx->pci_bus_id = pci_bus_id;
54375437
dev_ctx->op_offload_min_batch_size = min_batch_size;
54385438

ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@
5555
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
5656
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
5757
#define cudaDeviceGetAttribute hipDeviceGetAttribute
58+
#define cudaDeviceGetPCIBusId hipDeviceGetPCIBusId
5859
#define cudaDeviceProp hipDeviceProp_t
5960
#define cudaDeviceSynchronize hipDeviceSynchronize
6061
#define cudaError_t hipError_t

ggml/src/ggml-cuda/vendors/musa.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@
3939
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
4040
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
4141
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
42+
#define cudaDeviceGetPCIBusId musaDeviceGetPCIBusId
4243
#define cudaDeviceProp musaDeviceProp
4344
#define cudaDeviceSynchronize musaDeviceSynchronize
4445
#define cudaError_t musaError_t

0 commit comments

Comments
 (0)