@@ -333,6 +333,19 @@ void gemv_4bit_inference_fp32(
333333
334334#endif
335335
#if BUILD_XPU
// Lazily-constructed default SYCL queue shared by the XPU paged-memory
// entry points below.
// SYCL USM (Unified Shared Memory) mirrors CUDA managed memory:
//   cudaMallocManaged    -> sycl::malloc_shared (zeMemAllocShared)
//   cudaMemPrefetchAsync -> sycl::queue::prefetch (zeCommandListAppendMemoryPrefetch)
// Level Zero has no cudaPeekAtLastError analogue; each L0 call returns a
// ze_result_t, and SYCL (which wraps L0) reports errors via exceptions.
static sycl::queue& xpu_default_queue() {
    // In-order queue on the default GPU, constructed thread-safely on
    // first use (C++11 magic static).
    static sycl::queue queue{sycl::gpu_selector_v, sycl::property::queue::in_order{}};
    return queue;
}
#endif
348+
336349extern " C" {
337350#if BUILD_CUDA || BUILD_HIP
338351
@@ -687,6 +700,55 @@ void cgemv_4bit_inference_fp32(
687700 gemv_4bit_inference_fp32 (m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize, stream);
688701}
689702
703+ // XPU Paged Memory Support using SYCL USM (Unified Shared Memory)
704+ // Equivalent CUDA APIs -> SYCL/Level Zero APIs:
705+ // cudaMallocManaged -> sycl::malloc_shared / zeMemAllocShared
706+ // cudaMemPrefetchAsync -> sycl::queue::prefetch / zeCommandListAppendMemoryPrefetch
707+ // cudaPeekAtLastError -> N/A (SYCL uses exceptions; L0 returns ze_result_t per call)
708+
709+ void * cget_managed_ptr (size_t bytes) {
710+ try {
711+ auto & q = xpu_default_queue ();
712+ void * ptr = sycl::malloc_shared (bytes, q);
713+ if (ptr == nullptr ) {
714+ fprintf (stderr, " XPU Error: sycl::malloc_shared returned nullptr for %zu bytes\n " , bytes);
715+ }
716+ return ptr;
717+ } catch (const sycl::exception& e) {
718+ fprintf (stderr, " XPU SYCL Error in cget_managed_ptr: %s\n " , e.what ());
719+ return nullptr ;
720+ }
721+ }
722+
723+ void cprefetch (void * ptr, size_t bytes, int device) {
724+ // device == -1 means prefetch to host; for SYCL we skip in that case
725+ // since SYCL prefetch targets the device associated with the queue.
726+ if (device < 0 )
727+ return ;
728+ try {
729+ auto & q = xpu_default_queue ();
730+ q.prefetch (ptr, bytes);
731+ } catch (const sycl::exception& e) {
732+ fprintf (stderr, " XPU Warning: sycl::queue::prefetch failed: %s\n " , e.what ());
733+ }
734+ }
735+
736+ void cfill_fp32 (float * A, float * B, float value, long n) {
737+ try {
738+ auto & q = xpu_default_queue ();
739+ q.fill (A, value, static_cast <size_t >(n)).wait ();
740+ } catch (const sycl::exception& e) {
741+ fprintf (stderr, " XPU Error in cfill_fp32: %s\n " , e.what ());
742+ }
743+ }
744+
// Fill the first n bytes of A with `value` on the host.
//
// Uses host-side memset instead of sycl::queue::fill<unsigned char>, which
// segfaults on certain Intel GPU drivers (e.g. Max 1550). USM shared memory
// is host-accessible, so memset on it works directly.
//
// @param A     Destination USM buffer (n bytes).
// @param B     Unused — presumably kept so the signature mirrors cfill_fp32
//              and the CUDA/HIP entry point; TODO confirm callers.
// @param value Fill byte.
// @param n     Byte count; non-positive counts are a no-op.
void cfill_uint8(unsigned char* A, unsigned char* B, unsigned char value, long n) {
    (void)B; // intentionally unused, see above
    // Guard: a negative `long` cast to size_t would make memset overrun the
    // buffer catastrophically; treat n <= 0 (and null A) as "nothing to do".
    if (A == nullptr || n <= 0)
        return;
    memset(A, value, static_cast<size_t>(n));
}
751+
690752#endif
691753
692754void cquantize_blockwise_cpu_fp32 (
0 commit comments