22#include " ggml-impl.h"
33#include " ggml-backend-impl.h"
44
5+ #include " ggml-cuda/allreduce.cuh"
56#include " ggml-cuda/common.cuh"
67#include " ggml-cuda/acc.cuh"
78#include " ggml-cuda/add-id.cuh"
8687
8788static_assert (sizeof (half) == sizeof (ggml_fp16_t ), " wrong fp16 size" );
8889
// Emit `str` through GGML_LOG_WARN at most once for each place the macro is
// expanded: every expansion gets its own function-local std::once_flag.
//
// `str` is pasted into a capture-less lambda, so it must be usable without
// captures (in practice: a string literal).
//
// Note: there must be no whitespace between the macro name and `(`,
// otherwise the preprocessor defines an object-like macro instead.
#define GGML_LOG_WARN_ONCE(str) \
    { static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
92+
8993[[noreturn]]
9094void ggml_cuda_error (const char * stmt, const char * func, const char * file, int line, const char * msg) {
9195 int id = -1 ; // in case cudaGetDevice fails
@@ -1139,79 +1143,53 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
11391143 /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
11401144};
11411145
1142- #ifdef GGML_USE_NCCL
1146+ // Communication context for multi-GPU AllReduce during tensor parallelism.
1147+ //
1148+ // Created once per meta backend instance. Resources for the selected mode
1149+ // (NCCL communicators or the internal AllReduce pipeline) are initialised
1150+ // eagerly during comm_init so any init failure surfaces at startup rather
1151+ // than mid-run.
11431152struct ggml_backend_cuda_comm_context {
1153+ using try_allreduce_fn = bool (*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
1154+
11441155 std::vector<ggml_backend_t > backends;
1145- std::vector<ncclComm_t> comms ;
1156+ std::vector<int > dev_ids ;
11461157
1147- ~ggml_backend_cuda_comm_context () {
1148- for (ncclComm_t comm : comms) {
1149- NCCL_CHECK (ncclCommDestroy (comm));
1150- }
1151- }
1152- };
1153- #endif // GGML_USE_NCCL
1158+ // Set by the init chain (comm_init_{nccl, internal, none}) to one of
1159+ // try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
1160+ // internal needs `ar_pipeline`, butterfly needs nothing. Per-call
1161+ // failures return false; the meta backend's generic implementation then
1162+ // handles that call.
1163+ try_allreduce_fn try_allreduce = nullptr ;
1164+
1165+ ggml_cuda_ar_pipeline * ar_pipeline = nullptr ;
11541166
1155- static void ggml_backend_cuda_comm_free (void * comm_ctx_v) {
11561167#ifdef GGML_USE_NCCL
1157- if (comm_ctx_v == nullptr ) {
1158- return ;
1159- }
1160- ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
1161- delete comm_ctx;
1162- #else
1163- GGML_UNUSED (comm_ctx_v);
1168+ std::vector<ncclComm_t> comms;
11641169#endif // GGML_USE_NCCL
1165- }
11661170
1167- static void * ggml_backend_cuda_comm_init ( ggml_backend_t * backends, size_t n_backends ) {
1171+ ~ggml_backend_cuda_comm_context ( ) {
11681172#ifdef GGML_USE_NCCL
1169- for (size_t i = 0 ; i < n_backends; i++) {
1170- if (!ggml_backend_is_cuda (backends[i])) {
1171- return nullptr ;
1173+ for (ncclComm_t comm : comms) {
1174+ NCCL_CHECK (ncclCommDestroy (comm));
11721175 }
1173- }
1174- ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
1175- std::vector<int > dev_ids;
1176- ret->backends .reserve (n_backends);
1177- dev_ids.reserve (n_backends);
1178- for (size_t i = 0 ; i < n_backends; i++) {
1179- ret->backends .push_back (backends[i]);
1180- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context ;
1181- dev_ids.push_back (cuda_ctx->device );
1182- }
1183-
1184- ret->comms .resize (n_backends);
1185- NCCL_CHECK (ncclCommInitAll (ret->comms .data (), n_backends, dev_ids.data ()));
1186- return ret;
1187- #else
1188- // If NCCL is installed it is used by default for optimal performance.
1189- // However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
1190- // RCCL is disabled by default, users are explicitly opting in.
1191- // Therefore print no warning for RCCL.
1192- #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1193- static bool warning_printed = false ;
1194- if (!warning_printed) {
1195- GGML_LOG_WARN (" %s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n " , __func__);
1196- warning_printed = true ;
1197- }
1198- #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1199- GGML_UNUSED_VARS (backends, n_backends);
1200- return nullptr ;
12011176#endif // GGML_USE_NCCL
1202- }
1177+ ggml_cuda_ar_pipeline_free (ar_pipeline);
1178+ }
1179+ };
12031180
1204- static bool ggml_backend_cuda_comm_allreduce_tensor (void * comm_ctx_v, struct ggml_tensor ** tensors) {
12051181#ifdef GGML_USE_NCCL
1182+ // AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
1183+ // tensors (bandwidth-bound), then converts back to FP32.
1184+ static bool ggml_backend_cuda_comm_allreduce_nccl (
1185+ ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
12061186 const int64_t ne = ggml_nelements (tensors[0 ]);
12071187 // FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
12081188 // This then causes a crash in this function
12091189 if (ne == 0 ) {
12101190 return true ;
12111191 }
12121192
1213- GGML_ASSERT (comm_ctx_v != nullptr );
1214- ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
12151193 const size_t n_backends = comm_ctx->backends .size ();
12161194
12171195 for (size_t i = 0 ; i < n_backends; ++i) {
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
12361214 NCCL_CHECK (ncclAllReduce (tensors[i]->data , tensors[i]->data , ne, ncclFloat, ncclSum, comm_ctx->comms [i], cuda_ctx->stream ()));
12371215 }
12381216 NCCL_CHECK (ncclGroupEnd ());
1239-
12401217 return true ;
12411218 }
12421219
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
12751252 }
12761253
12771254 return true ;
1278- #else
1279- GGML_UNUSED_VARS (comm_ctx_v, tensors);
1255+ }
1256+ #endif // GGML_USE_NCCL
1257+
1258+ // Run the internal AR pipeline. Returns false on unsupported / failed input
1259+ // -- the caller decides whether to abort (env-forced) or fall back silently.
1260+ static bool ggml_backend_cuda_comm_allreduce_internal (
1261+ ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
1262+ GGML_ASSERT (comm_ctx->ar_pipeline != nullptr );
1263+
1264+ const size_t n_backends = comm_ctx->backends .size ();
1265+ GGML_ASSERT (n_backends == 2 );
1266+ GGML_ASSERT (tensors[0 ] != nullptr );
1267+
1268+ const int64_t ne = ggml_nelements (tensors[0 ]);
1269+ const ggml_type type = tensors[0 ]->type ;
1270+
1271+ if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
1272+ GGML_LOG_DEBUG (" %s: internal unsupported: type=%d\n " , __func__, (int ) type);
1273+ return false ;
1274+ }
1275+
1276+ if (ne == 0 ) {
1277+ return true ;
1278+ }
1279+
1280+ for (size_t i = 0 ; i < n_backends; ++i) {
1281+ if (tensors[i] == nullptr ) {
1282+ GGML_LOG_ERROR (" %s: internal failed: tensor[%zu] is null\n " , __func__, i);
1283+ return false ;
1284+ }
1285+ if (ggml_nelements (tensors[i]) != ne || tensors[i]->type != type) {
1286+ GGML_LOG_ERROR (" %s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n " ,
1287+ __func__, i, ggml_nelements (tensors[i]), (int ) tensors[i]->type , ne, (int ) type);
1288+ return false ;
1289+ }
1290+ if (!ggml_is_contiguously_allocated (tensors[i])) {
1291+ GGML_LOG_DEBUG (" %s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n " ,
1292+ __func__, i, ne, ggml_nbytes (tensors[i]),
1293+ (size_t ) ne * ggml_type_size (type) / ggml_blck_size (type), (int ) type);
1294+ return false ;
1295+ }
1296+ if (((uintptr_t ) tensors[i]->data & 0xF ) != 0 ) {
1297+ GGML_LOG_DEBUG (" %s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 " \n " ,
1298+ __func__, i, tensors[i]->data , (int ) type, ne);
1299+ return false ;
1300+ }
1301+ GGML_ASSERT ((ggml_nbytes (tensors[i]) & 0xF ) == 0 );
1302+ }
1303+
1304+ return ggml_cuda_ar_allreduce (comm_ctx->ar_pipeline , comm_ctx->backends .data (), tensors);
1305+ }
1306+
1307+ // ---------------------------------------------------------------------------
1308+ // Per-call dispatch -- three variants, one per backend. Each is set as
1309+ // comm_ctx->try_allreduce by the matching init step. Per-call failure
1310+ // returns false; the meta backend's generic implementation handles that call.
1311+ // ---------------------------------------------------------------------------
1312+
#ifdef GGML_USE_NCCL
// Per-call dispatch target for the NCCL mode: forwards to the NCCL AllReduce
// and passes its result through unchanged.
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const bool reduced = ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
    return reduced;
}
#endif // GGML_USE_NCCL
1319+
1320+ static bool ggml_backend_cuda_comm_try_allreduce_internal (
1321+ ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
1322+ return ggml_backend_cuda_comm_allreduce_internal (comm_ctx, tensors);
1323+ }
1324+
1325+ static bool ggml_backend_cuda_comm_try_allreduce_butterfly (
1326+ ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
12801327 return false ;
1328+ }
1329+
1330+ static void ggml_backend_cuda_comm_free (void * comm_ctx_v) {
1331+ if (comm_ctx_v == nullptr ) {
1332+ return ;
1333+ }
1334+ delete static_cast <ggml_backend_cuda_comm_context *>(comm_ctx_v);
1335+ }
1336+
1337+ // ---------------------------------------------------------------------------
1338+ // Init -- chained nccl -> internal -> none. Each step tries to bring up its
1339+ // resource; on failure it warns and recurses into the next step.
1340+ // ---------------------------------------------------------------------------
1341+ static void ggml_backend_cuda_comm_init_none (ggml_backend_cuda_comm_context * ret) {
1342+ ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
1343+ }
1344+
1345+ static void ggml_backend_cuda_comm_init_internal (ggml_backend_cuda_comm_context * ret) {
1346+ ret->ar_pipeline = ggml_cuda_ar_pipeline_init (ret->dev_ids .data (), ret->dev_ids .size ());
1347+ if (ret->ar_pipeline ) {
1348+ ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
1349+ return ;
1350+ }
1351+
1352+ // Clear sticky CUDA error from the failed init.
1353+ (void ) cudaGetLastError ();
1354+ GGML_LOG_WARN (" internal AllReduce init failed (n_devices != 2?); "
1355+ " falling back to meta-backend butterfly\n " );
1356+ ggml_backend_cuda_comm_init_none (ret);
1357+ }
1358+
1359+ static void ggml_backend_cuda_comm_init_nccl (ggml_backend_cuda_comm_context * ret) {
1360+ #ifdef GGML_USE_NCCL
1361+ const size_t n = ret->dev_ids .size ();
1362+ ret->comms .resize (n);
1363+ ncclResult_t rc = ncclCommInitAll (ret->comms .data (), (int ) n, ret->dev_ids .data ());
1364+ if (rc == ncclSuccess) {
1365+ ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
1366+ return ;
1367+ }
1368+
1369+ ret->comms .clear ();
1370+ GGML_LOG_WARN (" NCCL init failed (%s); falling back to internal AllReduce\n " ,
1371+ ncclGetErrorString (rc));
1372+ #else // GGML_USE_NCCL
1373+ #ifndef GGML_USE_HIP
1374+ GGML_LOG_WARN (" NCCL not compiled in; falling back to internal AllReduce. "
1375+ " Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n " );
1376+ #endif // !GGML_USE_HIP
12811377#endif // GGML_USE_NCCL
1378+
1379+ ggml_backend_cuda_comm_init_internal (ret);
1380+ }
1381+
1382+ // Top-level init. Picks one of the three init paths based on
1383+ // GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
1384+ // any fallback. Unrecognised env values warn and fall through to the
1385+ // platform default.
1386+ static void * ggml_backend_cuda_comm_init (ggml_backend_t * backends, size_t n_backends) {
1387+ for (size_t i = 0 ; i < n_backends; i++) {
1388+ if (!ggml_backend_is_cuda (backends[i])) {
1389+ return nullptr ;
1390+ }
1391+ }
1392+
1393+ auto * ret = new ggml_backend_cuda_comm_context;
1394+ ret->backends .assign (backends, backends + n_backends);
1395+ ret->dev_ids .reserve (n_backends);
1396+ for (size_t i = 0 ; i < n_backends; i++) {
1397+ ret->dev_ids .push_back (static_cast <ggml_backend_cuda_context *>(backends[i]->context )->device );
1398+ }
1399+
1400+ const char * env = getenv (" GGML_CUDA_ALLREDUCE" );
1401+ if (!env) {
1402+ // Platform default: Linux uses NCCL, otherwise (generally Windows) internal
1403+ #if defined(__linux__)
1404+ ggml_backend_cuda_comm_init_nccl (ret);
1405+ #else
1406+ ggml_backend_cuda_comm_init_internal (ret);
1407+ #endif // defined(__linux__)
1408+ } else {
1409+ std::string env_str (env);
1410+ if (env_str == " nccl" ) {
1411+ ggml_backend_cuda_comm_init_nccl (ret);
1412+ } else if (env_str == " internal" ) {
1413+ ggml_backend_cuda_comm_init_internal (ret);
1414+ } else if (env_str == " none" ) {
1415+ ggml_backend_cuda_comm_init_none (ret);
1416+ } else {
1417+ GGML_LOG_WARN (" unknown GGML_CUDA_ALLREDUCE value: %s\n " , env);
1418+ ggml_backend_cuda_comm_init_none (ret);
1419+ }
1420+ }
1421+
1422+ return ret;
1423+ }
1424+
1425+ // Top-level dispatch -- calls the function pointer chosen by comm_init.
1426+ // Returns false to let the meta-backend's butterfly run.
1427+ static bool ggml_backend_cuda_comm_allreduce_tensor (void * comm_ctx_v, struct ggml_tensor ** tensors) {
1428+ if (comm_ctx_v == nullptr ) {
1429+ return false ;
1430+ }
1431+ auto * comm_ctx = static_cast <ggml_backend_cuda_comm_context *>(comm_ctx_v);
1432+ return comm_ctx->try_allreduce (comm_ctx, tensors);
12821433}
12831434
12841435ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type (int main_device, const float * tensor_split) {
0 commit comments