diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 7fb78eae370..6a3ebebfdcf 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -705,6 +705,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. This requires an Intel Xe2+ GPU such as BMG or newer and supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. |
## Design Rule
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index fcb0db99c6b..01b02524b1c 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -218,6 +218,7 @@ struct sycl_device_info {
size_t total_vram;
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
optimize_feature opt_feature;
+ bool usm_system_support; // support for USM system allocations
};
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 2ec1421841b..6907a59bac4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -56,6 +56,9 @@
#include "ggml-sycl/sycl_hw.hpp"
+#define MEM_SIZE_2M 0x00200000
+#define MEM_SIZE_1G 0x40000000
+
static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
@@ -64,7 +67,7 @@ int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_enable_flash_attention = 1;
-
+int g_ggml_sycl_usm_system = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -101,6 +104,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
info.devices[i].smpbo = prop.get_local_mem_size();
info.devices[i].warp_size = WARP_SIZE;
+ info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations);
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
@@ -224,6 +228,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention = 0;
#endif
+ g_ggml_sycl_usm_system = get_sycl_env("GGML_SYCL_USM_SYSTEM", 0);
+
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Build with Macros:\n");
@@ -270,6 +276,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention);
#endif
+ GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system);
+
/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
@@ -354,9 +362,10 @@ struct ggml_backend_sycl_buffer_context {
std::string name;
optimize_feature opt_feature;
std::vector tensor_extras;
+ bool is_usm_system;
- ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
- device(device), dev_ptr(dev_ptr), stream(stream) {
+ ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) :
+ device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) {
check_allow_gpu_index(device);
name = (GGML_SYCL_NAME + std::to_string(device));
opt_feature = ggml_sycl_info().devices[device].opt_feature;
@@ -365,7 +374,10 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
+ if (is_usm_system)
+ free(dev_ptr);
+ else
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
}
//release extra used by tensors
@@ -641,22 +653,52 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
return ctx->name.c_str();
}
+static bool check_usm_system(int device, size_t size) {
+ bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G;
+
+ if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) {
+ GGML_LOG_INFO("Device does not support USM system allocations\n");
+ use_usm_system = false;
+ }
+
+ return use_usm_system;
+}
+
static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try {
+ ggml_check_sycl();
+
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_sycl_set_device(buft_ctx->device);
const queue_ptr stream = buft_ctx->stream;
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
+ /*
+ Alignment below ensures best performance. While in theory it could lead to
+ wasting memory, this is acceptable because in practice only few buffers are
+ allocated and even less exceed the minimum size accepted here for USM system
+ allocations.
+ */
+ size_t alignment = MEM_SIZE_2M;
+ size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
+ bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size);
void * dev_ptr;
- SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
- size, *stream)));
- if (!dev_ptr) {
- GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
- return nullptr;
+ if (use_usm_system) {
+ dev_ptr = (void *)aligned_alloc(alignment, aligned_size);
+ if (!dev_ptr) {
+ GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
+ return nullptr;
+ }
+ } else {
+ SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
+ size, *stream)));
+ if (!dev_ptr) {
+ GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+ return nullptr;
+ }
}
- ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+ ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
catch (sycl::exception const &exc) {