Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. This requires an Intel Xe2+ GPU such as BMG or newer and supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. |

## Design Rule

Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ struct sycl_device_info {
size_t total_vram;
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
optimize_feature opt_feature;
bool usm_system_support; // support for USM system allocations
};


Expand Down
62 changes: 52 additions & 10 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@
#include "ggml-sycl/sycl_hw.hpp"


#define MEM_SIZE_2M 0x00200000
#define MEM_SIZE_1G 0x40000000

static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
Expand All @@ -64,7 +67,7 @@ int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_enable_flash_attention = 1;

int g_ggml_sycl_usm_system = 0;

static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
Expand Down Expand Up @@ -101,6 +104,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
info.devices[i].smpbo = prop.get_local_mem_size();
info.devices[i].warp_size = WARP_SIZE;
info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations);

info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
Expand Down Expand Up @@ -224,6 +228,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention = 0;
#endif

g_ggml_sycl_usm_system = get_sycl_env("GGML_SYCL_USM_SYSTEM", 0);

GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");

GGML_LOG_INFO("Build with Macros:\n");
Expand Down Expand Up @@ -270,6 +276,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention);
#endif

GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system);

/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
Expand Down Expand Up @@ -354,9 +362,10 @@ struct ggml_backend_sycl_buffer_context {
std::string name;
optimize_feature opt_feature;
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
bool is_usm_system;

ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
device(device), dev_ptr(dev_ptr), stream(stream) {
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) :
device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) {
check_allow_gpu_index(device);
name = (GGML_SYCL_NAME + std::to_string(device));
opt_feature = ggml_sycl_info().devices[device].opt_feature;
Expand All @@ -365,7 +374,10 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
if (is_usm_system)
free(dev_ptr);
else
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
}

//release extra used by tensors
Expand Down Expand Up @@ -641,22 +653,52 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
return ctx->name.c_str();
}

static bool check_usm_system(int device, size_t size) {
bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G;

if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) {
GGML_LOG_INFO("Device does not support USM system allocations\n");
use_usm_system = false;
}

return use_usm_system;
}

static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try {
ggml_check_sycl();

ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_sycl_set_device(buft_ctx->device);
const queue_ptr stream = buft_ctx->stream;
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
/*
Alignment below ensures best performance. While in theory it could lead to
wasting memory, this is acceptable because in practice only few buffers are
allocated and even less exceed the minimum size accepted here for USM system
allocations.
*/
size_t alignment = MEM_SIZE_2M;
size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size);

void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
if (use_usm_system) {
dev_ptr = (void *)aligned_alloc(alignment, aligned_size);
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
} else {
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
}
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
catch (sycl::exception const &exc) {
Expand Down