Skip to content

Commit d72653f

Browse files
Merge pull request #512 from janhq/update-dev-from-master-2026-05-11-01-09
Sync master with upstream release b9101
2 parents 1094164 + 389ff61 commit d72653f

17 files changed

Lines changed: 1315 additions & 90 deletions

File tree

common/sampling.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
547547
auto & chain = gsmpl->chain;
548548
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
549549

550+
gsmpl->set_logits(ctx, idx);
551+
550552
// Check if a backend sampler has already sampled a token in which case we
551553
// return that token id directly.
552554
{
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
558560
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
559561
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
560562

561-
// TODO: simplify
562-
gsmpl->cur.resize(1);
563-
gsmpl->cur[0] = { id, 0.0f, 1.0f };
564-
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
563+
for (size_t i = 0; i < cur_p.size; ++i) {
564+
if (cur_p.data[i].id == id) {
565+
cur_p.selected = i;
566+
break;
567+
}
568+
}
565569

566570
return id;
567571
}
568572
}
569573

570-
gsmpl->set_logits(ctx, idx);
571-
572574
// apply reasoning budget first
573575
llama_sampler_apply(rbudget, &cur_p);
574576

ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
55
### GGML Version
66
set(GGML_VERSION_MAJOR 0)
77
set(GGML_VERSION_MINOR 11)
8-
set(GGML_VERSION_PATCH 0)
8+
set(GGML_VERSION_PATCH 1)
99
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
1010

1111
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

ggml/src/ggml-cuda/allreduce.cu

Lines changed: 968 additions & 0 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cuda/allreduce.cuh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma once
2+
3+
#include "common.cuh"
4+
#include "ggml-backend-impl.h"
5+
6+
#include <cstddef>
7+
8+
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
// Only forward-declared here; callers hold it by pointer and never see its layout.
9+
struct ggml_cuda_ar_pipeline;
10+
11+
// Allocate a pipeline for n_devices GPUs.
12+
// devices[] holds the CUDA device IDs in rank order.
13+
// Returns nullptr on allocation failure.
// The returned pointer is released with ggml_cuda_ar_pipeline_free().
14+
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
15+
const int * devices, size_t n_devices);
16+
17+
// Release all resources owned by the pipeline.
// NOTE(review): the comm context destructor in ggml-cuda.cu calls this
// unconditionally, including when no pipeline was ever created, so the
// implementation must accept nullptr -- confirm in allreduce.cu.
18+
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
19+
20+
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
21+
// tensors[i] must live on the device managed by backends[i] and be
22+
// contiguous F32, F16, or BF16.
23+
// Preconditions are checked by the CUDA comm dispatcher before calling this.
24+
// Returns true once the reduction work has been enqueued successfully.
// No device count is passed here; presumably it is the n_devices given to
// ggml_cuda_ar_pipeline_init -- TODO confirm against allreduce.cu.
25+
bool ggml_cuda_ar_allreduce(
26+
ggml_cuda_ar_pipeline * pipeline,
27+
ggml_backend_t * backends,
28+
ggml_tensor ** tensors);
29+

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 208 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "ggml-impl.h"
33
#include "ggml-backend-impl.h"
44

5+
#include "ggml-cuda/allreduce.cuh"
56
#include "ggml-cuda/common.cuh"
67
#include "ggml-cuda/acc.cuh"
78
#include "ggml-cuda/add-id.cuh"
@@ -86,6 +87,9 @@
8687

8788
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
8889

90+
// Emit `str` through GGML_LOG_WARN at most once per expansion site.
// Each expansion owns its own static std::once_flag, so two different call
// sites each get to print once. The body is wrapped in do { } while (0) so the
// macro behaves like a single statement: `if (c) GGML_LOG_WARN_ONCE("x"); else ...`
// compiles, which a bare `{ ... }` body followed by `;` would not.
// `str` must be usable inside a capture-less lambda (e.g. a string literal).
#define GGML_LOG_WARN_ONCE(str) \
    do { \
        static std::once_flag warn_flag; \
        std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); \
    } while (0)
92+
8993
[[noreturn]]
9094
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
9195
int id = -1; // in case cudaGetDevice fails
@@ -1139,79 +1143,53 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
11391143
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
11401144
};
11411145

1142-
#ifdef GGML_USE_NCCL
1146+
// Communication context for multi-GPU AllReduce during tensor parallelism.
1147+
//
1148+
// Created once per meta backend instance. Resources for the selected mode
1149+
// (NCCL communicators or the internal AllReduce pipeline) are initialised
1150+
// eagerly during comm_init so any init failure surfaces at startup rather
1151+
// than mid-run.
11431152
struct ggml_backend_cuda_comm_context {
1153+
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
1154+
11441155
std::vector<ggml_backend_t> backends;
1145-
std::vector<ncclComm_t> comms;
1156+
std::vector<int> dev_ids;
11461157

1147-
~ggml_backend_cuda_comm_context() {
1148-
for (ncclComm_t comm : comms) {
1149-
NCCL_CHECK(ncclCommDestroy(comm));
1150-
}
1151-
}
1152-
};
1153-
#endif // GGML_USE_NCCL
1158+
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
1159+
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
1160+
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
1161+
// failures return false; the meta backend's generic implementation then
1162+
// handles that call.
1163+
try_allreduce_fn try_allreduce = nullptr;
1164+
1165+
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
11541166

1155-
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
11561167
#ifdef GGML_USE_NCCL
1157-
if (comm_ctx_v == nullptr) {
1158-
return;
1159-
}
1160-
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
1161-
delete comm_ctx;
1162-
#else
1163-
GGML_UNUSED(comm_ctx_v);
1168+
std::vector<ncclComm_t> comms;
11641169
#endif // GGML_USE_NCCL
1165-
}
11661170

1167-
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
1171+
~ggml_backend_cuda_comm_context() {
11681172
#ifdef GGML_USE_NCCL
1169-
for (size_t i = 0; i < n_backends; i++) {
1170-
if (!ggml_backend_is_cuda(backends[i])) {
1171-
return nullptr;
1173+
for (ncclComm_t comm : comms) {
1174+
NCCL_CHECK(ncclCommDestroy(comm));
11721175
}
1173-
}
1174-
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
1175-
std::vector<int> dev_ids;
1176-
ret->backends.reserve(n_backends);
1177-
dev_ids.reserve(n_backends);
1178-
for (size_t i = 0; i < n_backends; i++) {
1179-
ret->backends.push_back(backends[i]);
1180-
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
1181-
dev_ids.push_back(cuda_ctx->device);
1182-
}
1183-
1184-
ret->comms.resize(n_backends);
1185-
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
1186-
return ret;
1187-
#else
1188-
// If NCCL is installed it is used by default for optimal performance.
1189-
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
1190-
// RCCL is disabled by default, users are explicitly opting in.
1191-
// Therefore print no warning for RCCL.
1192-
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1193-
static bool warning_printed = false;
1194-
if (!warning_printed) {
1195-
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
1196-
warning_printed = true;
1197-
}
1198-
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1199-
GGML_UNUSED_VARS(backends, n_backends);
1200-
return nullptr;
12011176
#endif // GGML_USE_NCCL
1202-
}
1177+
ggml_cuda_ar_pipeline_free(ar_pipeline);
1178+
}
1179+
};
12031180

1204-
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
12051181
#ifdef GGML_USE_NCCL
1182+
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
1183+
// tensors (bandwidth-bound), then converts back to FP32.
1184+
static bool ggml_backend_cuda_comm_allreduce_nccl(
1185+
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
12061186
const int64_t ne = ggml_nelements(tensors[0]);
12071187
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
12081188
// This then causes a crash in this function
12091189
if (ne == 0) {
12101190
return true;
12111191
}
12121192

1213-
GGML_ASSERT(comm_ctx_v != nullptr);
1214-
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
12151193
const size_t n_backends = comm_ctx->backends.size();
12161194

12171195
for (size_t i = 0; i < n_backends; ++i) {
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
12361214
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
12371215
}
12381216
NCCL_CHECK(ncclGroupEnd());
1239-
12401217
return true;
12411218
}
12421219

@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
12751252
}
12761253

12771254
return true;
1278-
#else
1279-
GGML_UNUSED_VARS(comm_ctx_v, tensors);
1255+
}
1256+
#endif // GGML_USE_NCCL
1257+
1258+
// Run the internal AR pipeline. Returns false on unsupported / failed input
1259+
// -- the caller decides whether to abort (env-forced) or fall back silently.
1260+
static bool ggml_backend_cuda_comm_allreduce_internal(
1261+
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
1262+
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
1263+
1264+
const size_t n_backends = comm_ctx->backends.size();
1265+
GGML_ASSERT(n_backends == 2);
1266+
GGML_ASSERT(tensors[0] != nullptr);
1267+
1268+
const int64_t ne = ggml_nelements(tensors[0]);
1269+
const ggml_type type = tensors[0]->type;
1270+
1271+
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
1272+
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
1273+
return false;
1274+
}
1275+
1276+
if (ne == 0) {
1277+
return true;
1278+
}
1279+
1280+
for (size_t i = 0; i < n_backends; ++i) {
1281+
if (tensors[i] == nullptr) {
1282+
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
1283+
return false;
1284+
}
1285+
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
1286+
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
1287+
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
1288+
return false;
1289+
}
1290+
if (!ggml_is_contiguously_allocated(tensors[i])) {
1291+
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
1292+
__func__, i, ne, ggml_nbytes(tensors[i]),
1293+
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
1294+
return false;
1295+
}
1296+
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
1297+
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
1298+
__func__, i, tensors[i]->data, (int) type, ne);
1299+
return false;
1300+
}
1301+
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
1302+
}
1303+
1304+
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
1305+
}
1306+
1307+
// ---------------------------------------------------------------------------
1308+
// Per-call dispatch -- three variants, one per backend. Each is set as
1309+
// comm_ctx->try_allreduce by the matching init step. Per-call failure
1310+
// returns false; the meta backend's generic implementation handles that call.
1311+
// ---------------------------------------------------------------------------
1312+
1313+
#ifdef GGML_USE_NCCL
1314+
// NCCL flavour of the per-call hook: forwards the request to the NCCL
// AllReduce and reports its result unchanged.
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const bool reduced = ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
    return reduced;
}
1318+
#endif // GGML_USE_NCCL
1319+
1320+
// Internal-pipeline flavour of the per-call hook: forwards the request to the
// internal AllReduce and reports its result unchanged.
static bool ggml_backend_cuda_comm_try_allreduce_internal(
        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const bool reduced = ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
    return reduced;
}
1324+
1325+
// Hook used when no CUDA-side AllReduce is configured: ignores both arguments
// and always declines by returning false.
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
        ggml_backend_cuda_comm_context * /*comm_ctx*/, struct ggml_tensor ** /*tensors*/) {
    constexpr bool handled = false;
    return handled;
}
1329+
1330+
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
1331+
if (comm_ctx_v == nullptr) {
1332+
return;
1333+
}
1334+
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
1335+
}
1336+
1337+
// ---------------------------------------------------------------------------
1338+
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
1339+
// resource; on failure it warns and recurses into the next step.
1340+
// ---------------------------------------------------------------------------
1341+
// Last step of the init chain: allocates nothing and installs the hook that
// always declines.
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
    ggml_backend_cuda_comm_context & ctx = *ret;
    ctx.try_allreduce = &ggml_backend_cuda_comm_try_allreduce_butterfly;
}
1344+
1345+
// Middle step of the init chain: try to create the internal AllReduce
// pipeline for ret->dev_ids. On success the internal hook is installed;
// otherwise a warning is logged and the chain continues with init_none.
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
    const std::vector<int> & devices = ret->dev_ids;
    ret->ar_pipeline = ggml_cuda_ar_pipeline_init(devices.data(), devices.size());

    if (ret->ar_pipeline == nullptr) {
        // Clear sticky CUDA error from the failed init.
        (void) cudaGetLastError();
        GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
                      "falling back to meta-backend butterfly\n");
        ggml_backend_cuda_comm_init_none(ret);
        return;
    }

    ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
}
1358+
1359+
// First step of the init chain: try to bring up one NCCL communicator per
// device in ret->dev_ids.
//
// With GGML_USE_NCCL: on success the NCCL hook is installed and the function
// returns; on failure the communicator vector is cleared (so the context
// destructor has nothing to destroy), a warning is logged, and the chain
// continues with the internal AllReduce.
//
// Without GGML_USE_NCCL: a recompile hint is logged and the chain continues
// with the internal AllReduce. The hint is specific to CUDA builds, so it is
// suppressed for HIP and MUSA builds.
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
#ifdef GGML_USE_NCCL
    const size_t n = ret->dev_ids.size();
    ret->comms.resize(n);
    const ncclResult_t rc = ncclCommInitAll(ret->comms.data(), static_cast<int>(n), ret->dev_ids.data());
    if (rc == ncclSuccess) {
        ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
        return;
    }

    ret->comms.clear();
    GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
                  ncclGetErrorString(rc));
#else // GGML_USE_NCCL
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
                  "Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#endif // GGML_USE_NCCL

    ggml_backend_cuda_comm_init_internal(ret);
}
1381+
1382+
// Top-level init. Picks one of the three init paths based on
1383+
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
1384+
// any fallback. Unrecognised env values warn and fall through to the
1385+
// platform default.
1386+
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
1387+
for (size_t i = 0; i < n_backends; i++) {
1388+
if (!ggml_backend_is_cuda(backends[i])) {
1389+
return nullptr;
1390+
}
1391+
}
1392+
1393+
auto * ret = new ggml_backend_cuda_comm_context;
1394+
ret->backends.assign(backends, backends + n_backends);
1395+
ret->dev_ids.reserve(n_backends);
1396+
for (size_t i = 0; i < n_backends; i++) {
1397+
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
1398+
}
1399+
1400+
const char * env = getenv("GGML_CUDA_ALLREDUCE");
1401+
if (!env) {
1402+
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
1403+
#if defined(__linux__)
1404+
ggml_backend_cuda_comm_init_nccl(ret);
1405+
#else
1406+
ggml_backend_cuda_comm_init_internal(ret);
1407+
#endif // defined(__linux__)
1408+
} else {
1409+
std::string env_str(env);
1410+
if (env_str == "nccl") {
1411+
ggml_backend_cuda_comm_init_nccl(ret);
1412+
} else if (env_str == "internal") {
1413+
ggml_backend_cuda_comm_init_internal(ret);
1414+
} else if (env_str == "none") {
1415+
ggml_backend_cuda_comm_init_none(ret);
1416+
} else {
1417+
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
1418+
ggml_backend_cuda_comm_init_none(ret);
1419+
}
1420+
}
1421+
1422+
return ret;
1423+
}
1424+
1425+
// Top-level dispatch -- calls the function pointer chosen by comm_init.
1426+
// Returns false to let the meta-backend's butterfly run.
1427+
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
1428+
if (comm_ctx_v == nullptr) {
1429+
return false;
1430+
}
1431+
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
1432+
return comm_ctx->try_allreduce(comm_ctx, tensors);
12821433
}
12831434

12841435
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {

0 commit comments

Comments
 (0)