Skip to content

Commit 4e44a5f

Browse files
committed
Merge branch 'eso_b6387' into crokeso
2 parents 34393de + bb15126 commit 4e44a5f

23 files changed

Lines changed: 1038 additions & 140 deletions

File tree

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2459,7 +2459,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24592459
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
24602460
add_opt(common_arg(
24612461
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
2462-
"number of layers to store in VRAM",
2462+
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
24632463
[](common_params & params, int value) {
24642464
params.n_gpu_layers = value;
24652465
if (!llama_supports_gpu_offload()) {

common/chat.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1624,7 +1624,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
16241624
// If thinking_forced_open, then we capture the </think> tag in the grammar,
16251625
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
16261626
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1627-
"(\\s*"
1627+
"\\s*("
16281628
"(?:<tool_call>"
16291629
"|<function"
16301630
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"

examples/model-conversion/scripts/utils/curl-embedding-server.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/usr/bin/env bash
22
curl --request POST \
33
--url http://localhost:8080/embedding \
44
--header "Content-Type: application/json" \

ggml/include/ggml.h

Lines changed: 50 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -622,6 +622,7 @@ extern "C" {
622622
GGML_OP_CONV_TRANSPOSE_1D,
623623
GGML_OP_IM2COL,
624624
GGML_OP_IM2COL_BACK,
625+
GGML_OP_IM2COL_3D,
625626
GGML_OP_CONV_2D,
626627
GGML_OP_CONV_3D,
627628
GGML_OP_CONV_2D_DW,
@@ -1996,6 +1997,41 @@ extern "C" {
19961997
int d0, // dilation dimension 0
19971998
int d1); // dilation dimension 1
19981999

2000+
// 3D variant of im2col.
// Layouts below are the ones documented on the CPU kernel (ggml-cpu/ops.cpp);
// assumes a ends up as src[0] (kernel) and b as src[1] (image) - TODO confirm in ggml.c
// a:      kernel [OC*IC, KD, KH, KW]
// b:      image  [N*IC, ID, IH, IW]
// result:        [N*OD, OH, OW, IC * KD * KH * KW]
// IC is the input channel count; the kernel uses it to split the fused N*IC and OC*IC dimensions.
// dst_type: the CPU kernel handles GGML_TYPE_F16 and GGML_TYPE_F32 results and aborts on any other type.
GGML_API struct ggml_tensor * ggml_im2col_3d(
2001+
struct ggml_context * ctx,
2002+
struct ggml_tensor * a,
2003+
struct ggml_tensor * b,
2004+
int64_t IC,
2005+
int s0, // stride width
2006+
int s1, // stride height
2007+
int s2, // stride depth
2008+
int p0, // padding width
2009+
int p1, // padding height
2010+
int p2, // padding depth
2011+
int d0, // dilation width
2012+
int d1, // dilation height
2013+
int d2, // dilation depth
2014+
enum ggml_type dst_type);
2015+
2016+
// 3D convolution of image b with kernel a.
// a: kernel [OC*IC, KD, KH, KW]
2017+
// b: image [N*IC, ID, IH, IW]
2018+
// result: [N*OC, OD, OH, OW]
2019+
// IC: input channel count; needed because channels are fused with OC (in a) and with N (in b).
// note: the entry point previously named ggml_conv_3d (kernel [KW, KH, KD, IC * OC]) is declared
// as ggml_conv_3d_direct further down in this header.
GGML_API struct ggml_tensor * ggml_conv_3d(
2020+
struct ggml_context * ctx,
2021+
struct ggml_tensor * a,
2022+
struct ggml_tensor * b,
2023+
int64_t IC,
2024+
int s0, // stride width
2025+
int s1, // stride height
2026+
int s2, // stride depth
2027+
int p0, // padding width
2028+
int p1, // padding height
2029+
int p2, // padding depth
2030+
int d0, // dilation width
2031+
int d1, // dilation height
2032+
int d2 // dilation depth
2033+
);
2034+
19992035
// kernel size is a->ne[0] x a->ne[1]
20002036
// stride is equal to kernel size
20012037
// padding is zero
@@ -2067,7 +2103,7 @@ extern "C" {
20672103
int d0, // dilation dimension 0
20682104
int d1); // dilation dimension 1
20692105

2070-
GGML_API struct ggml_tensor * ggml_conv_3d(
2106+
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
20712107
struct ggml_context * ctx,
20722108
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
20732109
struct ggml_tensor * b, // input [W, H, D, C * N]
@@ -2174,6 +2210,19 @@ extern "C" {
21742210
int p2,
21752211
int p3);
21762212

2213+
// pad tensor a on both sides of each dimension:
// lpN elements are added before and rpN elements after the data along dimension N.
// In the CPU F32 kernel, dst index i along dimension N reads src index (i - lpN) when
// lpN <= i < ne_N - rpN and is written as 0 otherwise.
// assumes the arguments are stored in op_params in declaration order (lp0, rp0, ..., lp3, rp3) - TODO confirm in ggml.c
GGML_API struct ggml_tensor * ggml_pad_ext(
2214+
struct ggml_context * ctx,
2215+
struct ggml_tensor * a,
2216+
int lp0,
2217+
int rp0,
2218+
int lp1,
2219+
int rp1,
2220+
int lp2,
2221+
int rp2,
2222+
int lp3,
2223+
int rp3
2224+
);
2225+
21772226
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
21782227
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
21792228
struct ggml_context * ctx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3396,6 +3396,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
33963396
{
33973397
ggml_compute_forward_im2col_back_f32(params, tensor);
33983398
} break;
3399+
case GGML_OP_IM2COL_3D:
3400+
{
3401+
ggml_compute_forward_im2col_3d(params, tensor);
3402+
} break;
33993403
case GGML_OP_CONV_2D:
34003404
{
34013405
ggml_compute_forward_conv_2d(params, tensor);
@@ -3820,6 +3824,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
38203824
} break;
38213825
case GGML_OP_IM2COL:
38223826
case GGML_OP_IM2COL_BACK:
3827+
case GGML_OP_IM2COL_3D:
38233828
case GGML_OP_CONV_2D:
38243829
case GGML_OP_CONV_3D:
38253830
case GGML_OP_CONV_2D_DW:

ggml/src/ggml-cpu/ops.cpp

Lines changed: 218 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7798,6 +7798,209 @@ void ggml_compute_forward_im2col_back_f32(
77987798
}
77997799
}
78007800

7801+
7802+
// ggml_compute_forward_im2col_3d_f16
7803+
// src0: kernel [OC*IC, KD, KH, KW]
7804+
// src1: image [N*IC, ID, IH, IW]
7805+
// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
7806+
static void ggml_compute_forward_im2col_3d_f16(
7807+
const ggml_compute_params * params,
7808+
ggml_tensor * dst) {
7809+
7810+
const ggml_tensor * src0 = dst->src[0];
7811+
const ggml_tensor * src1 = dst->src[1];
7812+
7813+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
7814+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
7815+
GGML_ASSERT( dst->type == GGML_TYPE_F16);
7816+
7817+
GGML_TENSOR_BINARY_OP_LOCALS;
7818+
7819+
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
7820+
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
7821+
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
7822+
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
7823+
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
7824+
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
7825+
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
7826+
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
7827+
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
7828+
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
7829+
7830+
7831+
const int ith = params->ith;
7832+
const int nth = params->nth;
7833+
7834+
const int64_t N = ne13 / IC;
7835+
const int64_t ID = ne12;
7836+
const int64_t IH = ne11;
7837+
const int64_t IW = ne10;
7838+
7839+
const int64_t OC = ne03 / IC;
7840+
GGML_UNUSED(OC);
7841+
const int64_t KD = ne02;
7842+
const int64_t KH = ne01;
7843+
const int64_t KW = ne00;
7844+
7845+
const int64_t OD = ne3 / N;
7846+
const int64_t OH = ne2;
7847+
const int64_t OW = ne1;
7848+
const int64_t OH_OW = OH*OW;
7849+
const int64_t KD_KH_KW = KD*KH*KW;
7850+
const int64_t KH_KW = KH*KW;
7851+
const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
7852+
7853+
GGML_ASSERT(nb10 == sizeof(float));
7854+
7855+
// im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
7856+
{
7857+
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
7858+
7859+
for (int64_t in = 0; in < N; in++) {
7860+
for (int64_t iod = 0; iod < OD; iod++) {
7861+
for (int64_t ioh = 0; ioh < OH; ioh++) {
7862+
for (int64_t iow = 0; iow < OW; iow++) {
7863+
for (int64_t iic = ith; iic < IC; iic += nth) {
7864+
7865+
// micro kernel
7866+
ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
7867+
const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
7868+
7869+
for (int64_t ikd = 0; ikd < KD; ikd++) {
7870+
for (int64_t ikh = 0; ikh < KH; ikh++) {
7871+
for (int64_t ikw = 0; ikw < KW; ikw++) {
7872+
const int64_t iiw = iow*s0 + ikw*d0 - p0;
7873+
const int64_t iih = ioh*s1 + ikh*d1 - p1;
7874+
const int64_t iid = iod*s2 + ikd*d2 - p2;
7875+
7876+
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
7877+
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
7878+
} else {
7879+
const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
7880+
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
7881+
}
7882+
}
7883+
}
7884+
}
7885+
}
7886+
}
7887+
}
7888+
}
7889+
}
7890+
}
7891+
}
7892+
7893+
// ggml_compute_forward_im2col_3d_f32
7894+
// src0: kernel [OC*IC, KD, KH, KW]
7895+
// src1: image [N*IC, ID, IH, IW]
7896+
// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
7897+
static void ggml_compute_forward_im2col_3d_f32(
7898+
const ggml_compute_params * params,
7899+
ggml_tensor * dst) {
7900+
7901+
const ggml_tensor * src0 = dst->src[0];
7902+
const ggml_tensor * src1 = dst->src[1];
7903+
7904+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
7905+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
7906+
7907+
GGML_TENSOR_BINARY_OP_LOCALS;
7908+
7909+
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
7910+
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
7911+
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
7912+
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
7913+
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
7914+
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
7915+
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
7916+
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
7917+
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
7918+
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
7919+
7920+
7921+
const int ith = params->ith;
7922+
const int nth = params->nth;
7923+
7924+
const int64_t N = ne13 / IC;
7925+
const int64_t ID = ne12;
7926+
const int64_t IH = ne11;
7927+
const int64_t IW = ne10;
7928+
7929+
const int64_t OC = ne03 / IC;
7930+
GGML_UNUSED(OC);
7931+
const int64_t KD = ne02;
7932+
const int64_t KH = ne01;
7933+
const int64_t KW = ne00;
7934+
7935+
const int64_t OD = ne3 / N;
7936+
const int64_t OH = ne2;
7937+
const int64_t OW = ne1;
7938+
7939+
const int64_t OH_OW = OH*OW;
7940+
const int64_t KD_KH_KW = KD*KH*KW;
7941+
const int64_t KH_KW = KH*KW;
7942+
const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
7943+
7944+
GGML_ASSERT(nb10 == sizeof(float));
7945+
7946+
// im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
7947+
{
7948+
float * const wdata = (float *) dst->data;
7949+
7950+
for (int64_t in = 0; in < N; in++) {
7951+
for (int64_t iod = 0; iod < OD; iod++) {
7952+
for (int64_t ioh = 0; ioh < OH; ioh++) {
7953+
for (int64_t iow = 0; iow < OW; iow++) {
7954+
for (int64_t iic = ith; iic < IC; iic += nth) {
7955+
7956+
// micro kernel
7957+
float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
7958+
const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
7959+
7960+
for (int64_t ikd = 0; ikd < KD; ikd++) {
7961+
for (int64_t ikh = 0; ikh < KH; ikh++) {
7962+
for (int64_t ikw = 0; ikw < KW; ikw++) {
7963+
const int64_t iiw = iow*s0 + ikw*d0 - p0;
7964+
const int64_t iih = ioh*s1 + ikh*d1 - p1;
7965+
const int64_t iid = iod*s2 + ikd*d2 - p2;
7966+
7967+
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
7968+
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
7969+
} else {
7970+
const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
7971+
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
7972+
}
7973+
}
7974+
}
7975+
}
7976+
}
7977+
}
7978+
}
7979+
}
7980+
}
7981+
}
7982+
}
7983+
7984+
7985+
void ggml_compute_forward_im2col_3d(
7986+
const ggml_compute_params * params,
7987+
ggml_tensor * dst) {
7988+
switch (dst->type) {
7989+
case GGML_TYPE_F16:
7990+
{
7991+
ggml_compute_forward_im2col_3d_f16(params, dst);
7992+
} break;
7993+
case GGML_TYPE_F32:
7994+
{
7995+
ggml_compute_forward_im2col_3d_f32(params, dst);
7996+
} break;
7997+
default:
7998+
{
7999+
GGML_ABORT("fatal error");
8000+
}
8001+
}
8002+
}
8003+
78018004
static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
78028005
void * a, void * b, float * c) {
78038006
const ggml_type_traits * traits = ggml_get_type_traits(type);
@@ -8785,6 +8988,15 @@ static void ggml_compute_forward_pad_f32(
87858988
GGML_TENSOR_UNARY_OP_LOCALS
87868989

87878990
float * dst_ptr = (float *) dst->data;
8991+
const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
8992+
const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
8993+
const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
8994+
const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
8995+
const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
8996+
const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
8997+
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
8998+
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
8999+
87889000

87899001
// TODO: optimize
87909002

@@ -8793,10 +9005,12 @@ static void ggml_compute_forward_pad_f32(
87939005
for (int64_t i0 = 0; i0 < ne0; ++i0) {
87949006
for (int64_t i3 = 0; i3 < ne3; ++i3) {
87959007
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
8796-
8797-
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
8798-
8799-
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
9008+
if ((i0 >= lp0 && i0 < ne0 - rp0) \
9009+
&& (i1 >= lp1 && i1 < ne1 - rp1) \
9010+
&& (i2 >= lp2 && i2 < ne2 - rp2) \
9011+
&& (i3 >= lp3 && i3 < ne3 - rp3)) {
9012+
const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
9013+
const float * src_ptr = (const float *)((char *) src0->data + src_idx);
88009014
dst_ptr[dst_idx] = *src_ptr;
88019015
} else {
88029016
dst_ptr[dst_idx] = 0;

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
7272
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7373
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7474
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
75+
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7576
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7677
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7778
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

0 commit comments

Comments
 (0)