Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 220 additions & 4 deletions ggml/src/ggml-blas/ggml-blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <future>
#include <vector>
#include <cstring>
#include <memory>

#if defined(GGML_BLAS_USE_ACCELERATE)
# include <Accelerate/Accelerate.h>
Expand All @@ -27,10 +28,223 @@ struct ggml_backend_blas_context {
#endif
};

struct ggml_backend_blas_tensor_extra {
float * dequantized = nullptr;
size_t size_bytes = 0;
bool owns_data = false;
};

struct ggml_backend_blas_buffer_context {
ggml_backend_buffer_t host_buffer = nullptr;
std::vector<std::unique_ptr<ggml_backend_blas_tensor_extra>> extras;
};

static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index);
static ggml_backend_reg_t ggml_backend_blas_reg(void);

static void ggml_backend_blas_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_blas_buffer_context * ctx = (ggml_backend_blas_buffer_context *) buffer->context;

if (ctx == nullptr) {
return;
}

for (auto & extra : ctx->extras) {
if (extra && extra->owns_data) {
ggml_aligned_free(extra->dequantized);
}
}

if (ctx->host_buffer) {
ggml_backend_buffer_free(ctx->host_buffer);
}

delete ctx;
}

static void * ggml_backend_blas_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_blas_buffer_context * ctx = (ggml_backend_blas_buffer_context *) buffer->context;
return ggml_backend_buffer_get_base(ctx->host_buffer);
}

static enum ggml_status ggml_backend_blas_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_blas_buffer_context * ctx = (ggml_backend_blas_buffer_context *) buffer->context;

if (tensor->view_src != NULL) {
auto * src_extra = (ggml_backend_blas_tensor_extra *) tensor->view_src->extra;
if (src_extra != nullptr && src_extra->dequantized != nullptr) {
auto extra = std::make_unique<ggml_backend_blas_tensor_extra>();
GGML_ASSERT(tensor->view_offs % ggml_type_size(tensor->type) == 0);
const size_t elem_offset = tensor->view_offs / ggml_type_size(tensor->type);
extra->dequantized = src_extra->dequantized + elem_offset;
const size_t byte_offset = elem_offset * sizeof(float);
extra->size_bytes = src_extra->size_bytes > byte_offset ? src_extra->size_bytes - byte_offset : 0;
extra->owns_data = false;
tensor->extra = extra.get();
ctx->extras.push_back(std::move(extra));
} else {
tensor->extra = tensor->view_src->extra;
}
return GGML_STATUS_SUCCESS;
}

const bool can_pre_dequantize = buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
tensor->type != GGML_TYPE_F32 && ggml_is_contiguous(tensor) && ggml_get_type_traits(tensor->type)->to_float != nullptr;

std::unique_ptr<ggml_backend_blas_tensor_extra> extra;
if (can_pre_dequantize) {
extra = std::make_unique<ggml_backend_blas_tensor_extra>();
extra->size_bytes = ggml_nelements(tensor) * sizeof(float);
extra->dequantized = (float *) ggml_aligned_malloc(extra->size_bytes);

if (extra->dequantized == nullptr) {
return GGML_STATUS_ALLOC_FAILED;
}

extra->owns_data = true;
tensor->extra = extra.get();
}

if (ctx->host_buffer->iface.init_tensor) {
GGML_CHECK(ctx->host_buffer->iface.init_tensor(ctx->host_buffer, tensor) == GGML_STATUS_SUCCESS);
}

if (extra) {
ctx->extras.push_back(std::move(extra));
}

return GGML_STATUS_SUCCESS;
}

static void ggml_backend_blas_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *) tensor->data + offset, value, size);

ggml_backend_blas_tensor_extra * extra = (ggml_backend_blas_tensor_extra *) tensor->extra;
if (extra != nullptr && extra->dequantized != nullptr && offset == 0 && size == ggml_nbytes(tensor)) {
memset(extra->dequantized, value, extra->size_bytes);
}

GGML_UNUSED(buffer);
}

static void ggml_backend_blas_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *) tensor->data + offset, data, size);

ggml_backend_blas_tensor_extra * extra = (ggml_backend_blas_tensor_extra *) tensor->extra;
if (extra != nullptr && extra->dequantized != nullptr) {
const auto * type_traits = ggml_get_type_traits(tensor->type);
GGML_ASSERT(type_traits->to_float != nullptr);

const size_t type_size = ggml_type_size(tensor->type);
GGML_ASSERT(offset % type_size == 0);
GGML_ASSERT(size % type_size == 0);

const size_t elem_offset = offset / type_size;
const size_t elem_count = size / type_size;

type_traits->to_float((const char *) tensor->data + offset, extra->dequantized + elem_offset, elem_count);
}

GGML_UNUSED(buffer);
}

static void ggml_backend_blas_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *) tensor->data + offset, size);

GGML_UNUSED(buffer);
}

static void ggml_backend_blas_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_blas_buffer_context * ctx = (ggml_backend_blas_buffer_context *) buffer->context;
memset(ggml_backend_buffer_get_base(ctx->host_buffer), value, ggml_backend_buffer_get_size(ctx->host_buffer));
}

static void ggml_backend_blas_buffer_reset(ggml_backend_buffer_t buffer) {
ggml_backend_blas_buffer_context * ctx = (ggml_backend_blas_buffer_context *) buffer->context;

for (auto & extra : ctx->extras) {
if (extra && extra->owns_data) {
ggml_aligned_free(extra->dequantized);
}
}
ctx->extras.clear();

if (ctx->host_buffer->iface.reset) {
ctx->host_buffer->iface.reset(ctx->host_buffer);
}
}

static ggml_backend_buffer_i ggml_backend_blas_buffer_i = {
/* .free_buffer = */ ggml_backend_blas_buffer_free_buffer,
/* .get_base = */ ggml_backend_blas_buffer_get_base,
/* .init_tensor = */ ggml_backend_blas_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_blas_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_blas_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_blas_buffer_get_tensor,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_blas_buffer_clear,
/* .reset = */ ggml_backend_blas_buffer_reset,
};

static const char * ggml_backend_blas_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "BLAS";

GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_blas_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_blas_buffer_context * ctx = new ggml_backend_blas_buffer_context();
ctx->host_buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);

if (ctx->host_buffer == nullptr) {
delete ctx;
return nullptr;
}

return ggml_backend_buffer_init(buft, ggml_backend_blas_buffer_i, ctx, size);
}

static size_t ggml_backend_blas_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_get_alignment(ggml_backend_cpu_buffer_type());

GGML_UNUSED(buft);
}

static bool ggml_backend_blas_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(ggml_backend_cpu_buffer_type());

GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_blas = {
/* .iface = */ {
/* .get_name = */ ggml_backend_blas_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_blas_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_blas_buffer_type_get_alignment,
/* .get_max_size = */ NULL,
/* .get_alloc_size = */ NULL,
/* .is_host = */ ggml_backend_blas_buffer_type_is_host,
},
/* .device = */ nullptr,
/* .context = */ NULL,
};

if (ggml_backend_buffer_type_blas.device == nullptr) {
ggml_backend_buffer_type_blas.device = ggml_backend_blas_reg_get_device(ggml_backend_blas_reg(), 0);
}

return &ggml_backend_buffer_type_blas;
}

static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];

const ggml_backend_blas_tensor_extra * extra = (ggml_backend_blas_tensor_extra *) src0->extra;

const bool has_pre_dequant = extra != nullptr && extra->dequantized != nullptr;

GGML_TENSOR_BINARY_OP_LOCALS

const enum ggml_type type = src0->type;
Expand All @@ -55,7 +269,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int64_t r3 = ne13/ne03;

const int64_t ne_plane = ne01*ne00;
const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
const size_t desired_wsize = (type == GGML_TYPE_F32 || has_pre_dequant) ? 0 : ne03*ne02*ne_plane*sizeof(float);

if (ctx->work_size < desired_wsize) {
ctx->work_data.reset(new char[desired_wsize]);
Expand All @@ -64,7 +278,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
void * wdata = ctx->work_data.get();

// convert src0 to float
if (type != GGML_TYPE_F32) {
if (!has_pre_dequant && type != GGML_TYPE_F32) {
const auto * type_traits = ggml_get_type_traits(type);
ggml_to_float_t const to_float = type_traits->to_float;

Expand Down Expand Up @@ -136,7 +350,9 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

if (type != GGML_TYPE_F32) {
if (has_pre_dequant) {
x = extra->dequantized + (i03*ne02 + i02)*ne_plane;
} else if (type != GGML_TYPE_F32) {
x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
}

Expand Down Expand Up @@ -373,7 +589,7 @@ static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t d
}

static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
return ggml_backend_blas_buffer_type();

GGML_UNUSED(dev);
}
Expand Down
Loading