Skip to content

Commit ada982b

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/vulkan.Dockerfile # benches/dgx-spark/dgx-spark.md # scripts/bench-models.sh
2 parents 157fac7 + 3795cc1 commit ada982b

9 files changed

Lines changed: 120 additions & 12 deletions

File tree

ggml/src/ggml-metal/ggml-metal-device.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,26 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows(ggml_me
176176
return res;
177177
}
178178

179+
// Look up, or compile on first use, the pipeline that encodes the diag op.
//
// The kernel name is "kernel_diag_<type name of op->src[0]>" and the pipeline is
// cached in `lib` under "<kernel name>_n=<op->src[0]->ne[0]>".
//
// lib: library that is queried for the cached pipeline and used to compile it
// op:  the diag node; only op->src[0] (type and ne[0]) is read here
//
// Returns the pipeline with nsg set to 1 and smem set to 0.
//
// NOTE(review): nullptr is passed as the last argument of the compile call, so from
// this function alone it is not clear what differs between the pipelines cached for
// different values of n - confirm that a per-n cache key is intended.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag(ggml_metal_library_t lib, const ggml_tensor * op) {
    const ggml_tensor * src0 = op->src[0];

    // formatted with "%d" below, hence the conversion to int
    const int ne00 = src0->ne[0];

    char kernel_name[256];
    char cache_key[256];

    snprintf(kernel_name, sizeof(kernel_name), "kernel_diag_%s", ggml_type_name(src0->type));
    snprintf(cache_key,   sizeof(cache_key),   "%s_n=%d", kernel_name, ne00);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, cache_key);
    if (!res.pipeline) {
        // nothing cached under this key yet
        res = ggml_metal_library_compile_pipeline(lib, kernel_name, cache_key, nullptr);
    }

    res.nsg  = 1;
    res.smem = 0;

    return res;
}
198+
179199
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_metal_library_t lib, ggml_type tsrc) {
180200
char base[256];
181201
char name[256];

ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_1d
108108
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
109109
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
110110
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
111+
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag (ggml_metal_library_t lib, const struct ggml_tensor * op);
111112
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
112113
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
113114
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1158,8 +1158,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
11581158
return has_simdgroup_reduction;
11591159
case GGML_OP_RWKV_WKV6:
11601160
case GGML_OP_RWKV_WKV7:
1161-
case GGML_OP_SOLVE_TRI:
11621161
return true;
1162+
case GGML_OP_SOLVE_TRI:
11631163
case GGML_OP_MUL_MAT:
11641164
case GGML_OP_MUL_MAT_ID:
11651165
return has_simdgroup_reduction;
@@ -1241,6 +1241,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
12411241
return false;
12421242
};
12431243
}
1244+
case GGML_OP_DIAG:
1245+
return true;
12441246
case GGML_OP_OPT_STEP_ADAMW:
12451247
case GGML_OP_OPT_STEP_SGD:
12461248
return has_simdgroup_reduction;

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,25 @@ typedef struct {
792792
uint64_t nb3;
793793
} ggml_metal_kargs_set_rows;
794794

795+
// Arguments of the diag kernel.
//
// The host fills this struct from the op's source (op->src[0]) and destination (op)
// tensors and hands it to the kernel as raw bytes, so the field order and types must
// stay exactly as they are.
typedef struct {
    // source tensor: ne[0..3]
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    // source tensor: nb[0..3] (the kernel adds these to a char pointer, i.e. byte offsets)
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    // destination tensor: ne[0..3]
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    // destination tensor: nb[0..3] (byte offsets, as above)
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
} ggml_metal_kargs_diag;
813+
795814
typedef struct {
796815
int64_t ne00;
797816
int64_t ne01;

ggml/src/ggml-metal/ggml-metal-ops.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
361361
{
362362
n_fuse = ggml_metal_op_set_rows(ctx, idx);
363363
} break;
364+
case GGML_OP_DIAG:
365+
{
366+
n_fuse = ggml_metal_op_diag(ctx, idx);
367+
} break;
364368
case GGML_OP_L2_NORM:
365369
{
366370
n_fuse = ggml_metal_op_l2_norm(ctx, idx);
@@ -1259,6 +1263,48 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
12591263
return 1;
12601264
}
12611265

1266+
// Encode the diag op for graph node `idx`.
//
// Binds, in order: the kernel arguments (index 0), the buffer of op->src[0] (index 1)
// and the buffer of the node itself (index 2), then dispatches a grid of
// ne1 x ne2 x ne3 threadgroups with 32 x 1 x 1 threads each, where ne1..ne3 are
// taken from the destination tensor.
//
// Always returns 1.
int ggml_metal_op_diag(ggml_metal_op_t ctx, int idx) {
    ggml_tensor * op = ctx->node(idx);

    ggml_metal_library_t lib = ctx->lib;
    ggml_metal_encoder_t enc = ctx->enc;

    // ne00..ne03 / nb00..nb03 describe the source, ne0..ne3 / nb0..nb3 the destination
    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);

    auto pipeline = ggml_metal_library_get_pipeline_diag(lib, op);

    ggml_metal_kargs_diag kargs = {
        /*.ne00 =*/ ne00,
        /*.ne01 =*/ ne01,
        /*.ne02 =*/ ne02,
        /*.ne03 =*/ ne03,
        /*.nb00 =*/ nb00,
        /*.nb01 =*/ nb01,
        /*.nb02 =*/ nb02,
        /*.nb03 =*/ nb03,
        /*.ne0  =*/ ne0,
        /*.ne1  =*/ ne1,
        /*.ne2  =*/ ne2,
        /*.ne3  =*/ ne3,
        /*.nb0  =*/ nb0,
        /*.nb1  =*/ nb1,
        /*.nb2  =*/ nb2,
        /*.nb3  =*/ nb3,
    };

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &kargs, sizeof(kargs), 0);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);

    // one threadgroup per destination row
    ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, 32, 1, 1);

    return 1;
}
1307+
12621308
int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
12631309
ggml_tensor * op = ctx->node(idx);
12641310

ggml/src/ggml-metal/ggml-metal-ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ int ggml_metal_op_sum_rows (ggml_metal_op_t ctx, int idx);
5656
int ggml_metal_op_cumsum (ggml_metal_op_t ctx, int idx);
5757
int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx);
5858
int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx);
59+
int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx);
5960
int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
6061
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
6162
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8815,6 +8815,26 @@ kernel void kernel_set_rows_f(
88158815
}
88168816
}
88178817

8818+
// Write one row of the diag result (F32).
//
// Each threadgroup handles the destination row selected by its grid position
// (x -> i1, y -> i2, z -> i3). Within the row, element i0 receives src0[i0] when
// i0 == i1 and 0.0f otherwise.
//
// args: source/destination ne and nb values (nb are byte offsets)
// src0: source data; the row for (i2, i3) is read
// dst:  destination data; the row for (i1, i2, i3) is written
//
// NOTE(review): elements inside a row are addressed as packed floats, i.e. this
// assumes nb00 == nb0 == sizeof(float) - confirm against the callers.
// NOTE(review): the loop strides by N_SIMDWIDTH, which presumably matches the
// threadgroup size chosen by the host - confirm.
kernel void kernel_diag_f32(
        constant ggml_metal_kargs_diag & args,
        device   const char * src0,
        device         char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort  tiitg[[thread_index_in_threadgroup]]) {
    constexpr short NW = N_SIMDWIDTH;

    const int32_t i3 = tgpig.z;
    const int32_t i2 = tgpig.y;
    const int32_t i1 = tgpig.x;

    device const float * src0_ptr = (device const float *)(src0 +               i2*args.nb02 + i3*args.nb03);
    // the destination row must be located with the destination stride (nb1), not the
    // source stride (nb01); the two only coincide when both tensors are contiguous
    device       float * dst_ptr  = (device       float *)(dst  + i1*args.nb1 + i2*args.nb2  + i3*args.nb3);

    for (int i0 = tiitg; i0 < args.ne0; i0 += NW) {
        dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f;
    }
}
8837+
88188838
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
88198839
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
88208840

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3220,9 +3220,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
32203220
const uint32_t D_lsb = D ^ (D & (D-1));
32213221
uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
32223222

3223-
// Nvidia prefers shared memory use to load large tiles of K
3223+
// Nvidia prefers shared memory use to load large tiles of K.
3224+
// Switch to loading from global memory when it would use too much shared memory.
32243225
// AMD prefers loading K directly from global memory
3225-
const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA ? 1 : 0;
3226+
const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0;
32263227

32273228
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem};
32283229
};
@@ -5590,9 +5591,9 @@ static void ggml_vk_instance_init() {
55905591
// Check if there are two physical devices corresponding to the same GPU
55915592
// This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux),
55925593
// see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication.
5593-
// However, for MoltenVK on macOS, multiple GPUs on the same card may report the same UUID,
5594-
// see https://github.com/KhronosGroup/MoltenVK/issues/2683. Until this is fixed, we'll only deduplicate
5595-
// when drivers differ (same driver + same UUID = likely different GPUs)
5594+
// MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards,
5595+
// see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new
5596+
// drivers are MoltenVK
55965597
auto old_device = std::find_if(
55975598
vk_instance.device_indices.begin(),
55985599
vk_instance.device_indices.end(),
@@ -5609,11 +5610,9 @@ static void ggml_vk_instance_init() {
56095610
old_id.deviceLUIDValid && new_id.deviceLUIDValid &&
56105611
std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID))
56115612
);
5613+
bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk);
56125614

5613-
// Only deduplicate if same UUID AND different drivers
5614-
// (same driver + same UUID on MoltenVK = likely different GPUs on multi-GPU card)
5615-
bool different_driver = (old_driver.driverID != new_driver.driverID);
5616-
return same_uuid && different_driver;
5615+
return same_uuid && !both_molten_vk;
56175616
}
56185617
);
56195618
if (old_device == vk_instance.device_indices.end()) {
@@ -8450,7 +8449,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
84508449
const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
84518450
const uint32_t sfsh = Bc * sfshstride * acctype;
84528451

8453-
const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA;
8452+
const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256;
84548453
const uint32_t kshstride = (k_load_shmem ? hsk_pad : MatBr) / 4 + 2;
84558454
const uint32_t vsh_stride = MatBc / 4 * row_split;
84568455
const uint32_t ksh = ((kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)) * f16vec4;

vendor/cpp-httplib/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
3939
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
4040

4141
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
42-
set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version")
42+
set(BORINGSSL_VERSION "0.20260204.0" CACHE STRING "BoringSSL version")
4343

4444
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
4545

0 commit comments

Comments
 (0)