From 66deab6b47da1772f60b76b4b33c17734e5b026b Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Mon, 20 Apr 2026 19:55:53 +0200 Subject: [PATCH 1/2] [SYCL] Clarify cache_config as a kernel launch property Update the sycl_ext_intel_cache_config specification to state explicitly that cache_config is a runtime kernel launch property: it must be passed to the kernel invocation via launch_config and must not be embedded in a kernel functor via get(properties_tag). It looks like property-list overloads for single_task/parallel_for are deprecated, so I have removed those from examples in the extension doc. Instead added examples with launch_config. Also fixed line wrapping to 80 columns. Updated e2e test accordingly and added test cases for the enqueue free functions (parallel_for, nd_launch) from sycl_ext_oneapi_enqueue_functions with a launch_config (including free function kernel case). Kept coverage for deprecated APIs (just removed embeddings into kernel type). --- .../sycl_ext_intel_cache_config.asciidoc | 158 +++++++++++----- sycl/test-e2e/Properties/cache_config.cpp | 179 +++++++++++++----- 2 files changed, 237 insertions(+), 100 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc index 349cce03e73aa..c3180c2c3a7a7 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc @@ -20,7 +20,7 @@ == Notice [%hardbreaks] -Copyright (c) 2023-2023 Intel Corporation. All rights reserved. +Copyright (c) 2023-2026 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -43,7 +43,8 @@ This extension is written against the SYCL 2020 specification, Revision 6 and the following extensions: - link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] -- link:sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] +- link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] +- link:../experimental/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] == Status @@ -57,9 +58,10 @@ specification.* == Overview There are devices where the same hardware resources are used for shared local -memory (SLM) and L1 data cache. Developers may want more L1 data cache or SLM based -on their application. This extension adds runtime kernel property `cache_config` -which provides a way to set the preferred cache configuration for a kernel. +memory (SLM) and L1 data cache. Developers may want more L1 data cache or SLM +based on their application. This extension adds a kernel launch property +`cache_config` which provides a way to set the preferred cache configuration +for a kernel invocation. === Feature Test Macro @@ -80,15 +82,27 @@ supports. === Introduction -This extension introduces new kernel property that can be applied to kernels -using the mechanism defined in link:sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties]. +This extension introduces a new kernel launch property that specifies the +preferred cache configuration for a single kernel invocation. The recommended +way to apply the property is to pass it through a `launch_config` object to +one of the enqueue free functions defined in +link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[ +sycl_ext_oneapi_enqueue_functions] (such as `parallel_for` or `nd_launch`). +The property is applicable both to kernels expressed as SYCL lambdas/function +objects and to free function kernels defined by +link:../experimental/sycl_ext_oneapi_free_function_kernels.asciidoc[ +sycl_ext_oneapi_free_function_kernels]. + +Because `cache_config` is a launch-time property rather than a compile-time +kernel property, it is not embedded into the kernel's type and the cache +configuration can be chosen independently for each invocation of the same +kernel. === Cache Config Property ```c++ namespace sycl::ext::intel::experimental { - enum class cache_config_enum : /*unspecified*/ { large_slm, large_data }; inline constexpr cache_config_enum large_slm = @@ -105,71 +119,114 @@ struct cache_config { ``` The `cache_config` property provides a way to set the preferred cache -configuration for a kernel. The following values are supported: +configuration for a kernel invocation. The following values are supported: - * `large_slm`: Prefer having larger shared local memory and smaller L1 data cache. - In this case driver will ensure that all workgroups will have enough - SLM to run. + * `large_slm`: Prefer having larger shared local memory and smaller L1 data + cache. In this case the driver will ensure that all workgroups + will have enough SLM to run. - * `large_data`: Prefer having larger L1 data cache and smaller shared local memory. - In this case SLM size may be shrinked (which may result in workgroups - spawning as there will be not enough SLM to handle multiple workgroups) - but L1 data cache will be bigger. There may be rare use cases when this - is beneficial. + * `large_data`: Prefer having larger L1 data cache and smaller shared local + memory. In this case SLM size may be shrunk (which may result + in workgroups spawning as there will be not enough SLM to + handle multiple workgroups) but L1 data cache will be bigger. + There may be rare use cases when this is beneficial. -These property may be passed to any kernel invocation function (e.g. -`parallel_for`) via the properties parameter. At most, only one of these -values may be passed to any single kernel invocation function. +The `cache_config` property must be passed to a kernel invocation through a +`launch_config` object. At most one `cache_config` value may be passed to +any single kernel invocation. Backends that do not support this extension may accept and ignore this property. -=== Adding a Property List to a Kernel Launch +=== Usage Examples -A simple example of using this extension is shown below. +Simple examples of using this extension are shown below. -The example assumes that the kernel will benefit from large SLM and hence uses the property -`cache_config_large_slm`: +==== parallel_for and nd_launch ```c++ -using namespace sycl::ext::intel::experimental; -{ - ... - properties kernel_properties{cache_config{large_slm}}; - - q.single_task(kernel_properties, [=] { - *a = *b + *c; - }).wait(); +namespace syclex = sycl::ext::oneapi::experimental; +namespace intelex = sycl::ext::intel::experimental; + +sycl::queue q; +syclex::properties props{intelex::cache_config{intelex::large_slm}}; + +// Range-based launch. +syclex::parallel_for(q, + syclex::launch_config{sycl::range<1>{1024}, props}, + [=](sycl::item<1> i) { /* kernel code */ }); + +// nd_range-based launch. +syclex::nd_launch(q, + syclex::launch_config{sycl::nd_range<1>{{1024}, {64}}, props}, + [=](sycl::nd_item<1> i) { /* kernel code */ }); + +q.wait(); +``` + +==== Free function kernels + +For a kernel declared with the free function kernel syntax, the property is +passed to `nd_launch` via the same `launch_config` mechanism: + +```c++ +namespace syclex = sycl::ext::oneapi::experimental; +namespace intelex = sycl::ext::intel::experimental; + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::nd_range_kernel<1>)) +void my_kernel(float *ptr) { /* kernel code */ } + +int main() { + sycl::queue q; + float *ptr = sycl::malloc_shared(1024, q); + + syclex::launch_config cfg{ + sycl::nd_range<1>{{1024}, {64}}, + syclex::properties{intelex::cache_config{intelex::large_slm}} + }; + syclex::nd_launch(q, cfg, syclex::kernel_function, ptr); + + q.wait(); } ``` -=== Embedding Property into a Kernel +==== Different cache configurations for different launches of the same kernel -The example below shows how the kernel from the previous section could be -rewritten to leverage an embedded property list (see link:sycl_ext_oneapi_kernel_properties.asciidoc#embedding-properties-into-a-kernel[embedding-properties-into-a-kernel]): +Because `cache_config` is a runtime property, different invocations of the +same kernel can use different cache configurations. ```c++ -using namespace sycl::ext::intel::experimental; -struct KernelFunctor { +namespace syclex = sycl::ext::oneapi::experimental; +namespace intelex = sycl::ext::intel::experimental; - KernelFunctor(int* a, int* b, int* c) : a(a), b(b), c(c) {} +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::nd_range_kernel<1>)) +void my_kernel(float *ptr) { /* kernel code */ } - void operator()() const { - *a = *b + *c; - } +int main() { + sycl::queue q; + sycl::context ctxt = q.get_context(); + float *ptr = sycl::malloc_shared(1024, q); - auto get(properties_tag) const { - return properties{cache_config{large_slm}}; - } + auto bundle = + syclex::get_kernel_bundle(ctxt); + sycl::kernel k = bundle.ext_oneapi_get_kernel(); - int* a; - int* b; - int* c; -}; + sycl::nd_range<1> ndr{{1024}, {64}}; -... + // First launch with large SLM. + syclex::nd_launch(q, + syclex::launch_config{ndr, + syclex::properties{intelex::cache_config{intelex::large_slm}}}, + k, ptr); -q.single_task(KernelFunctor{a, b, c}).wait(); + // Second launch of the same kernel with large data cache. + syclex::nd_launch(q, + syclex::launch_config{ndr, + syclex::properties{intelex::cache_config{intelex::large_data}}}, + k, ptr); + + q.wait(); +} ``` == Revision History @@ -180,4 +237,5 @@ q.single_task(KernelFunctor{a, b, c}).wait(); |======================================== |Rev|Date|Author|Changes |1|2022-03-01|Artur Gainullin|*Initial public working draft* +|2|2026-04-24|Artur Gainullin|*Clarify `cache_config` as a kernel launch property; remove embedding via `get(properties_tag)`* |======================================== diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp index 68d8906c38b0e..26224a0da189c 100644 --- a/sycl/test-e2e/Properties/cache_config.cpp +++ b/sycl/test-e2e/Properties/cache_config.cpp @@ -1,47 +1,60 @@ -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-usm_shared_allocations // UNSUPPORTED: windows && gpu-intel-gen12 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/21556 -// RUN: %{build} -o %t.out -// RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s +// RUN: %{build} -Wno-deprecated-declarations -o %t.out +// RUN: env SYCL_UR_TRACE=-1 UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s + +// This test verifies that the `cache_config` runtime launch property from +// sycl_ext_intel_cache_config is correctly propagated to the UR (observed +// via the `urKernelSetExecInfo` trace) across the main +// supported kernel invocation paths: +// * `sycl_ext_oneapi_enqueue_functions` `parallel_for` and `nd_launch`. +// * `sycl_ext_oneapi_free_function_kernels` via `nd_launch`. +// * A negative case: the property is not set when no property is passed. +// * Deprecated APIs: SYCL `queue::single_task`, `queue::parallel_for` and +// `queue::parallel_for_work_group` with a property list. #include #include +#include +#include +#include #include #include +#include using namespace sycl; -using namespace sycl::ext::intel::experimental; -using namespace sycl::ext::oneapi::experimental; +namespace syclext = sycl::ext::oneapi; +namespace syclexp = sycl::ext::oneapi::experimental; +namespace intelexp = sycl::ext::intel::experimental; struct KernelFunctor { void operator()() const {} - auto get(properties_tag) const { return properties{cache_config(large_slm)}; } }; struct KernelFunctorND { void operator()(nd_item<2> i) const {} - auto get(properties_tag) const { return properties{cache_config(large_slm)}; } }; struct NegativeKernelFunctor { void operator()(nd_item<2> i) const {} - auto get(properties_tag) const { return properties{}; } }; struct RangeKernelFunctor { void operator()(id<2> i) const {} - auto get(properties_tag) const { return properties{cache_config(large_slm)}; } }; struct WorkGroupFunctor { void operator()(group<1> g) const { g.parallel_for_work_item([&](h_item<1>) {}); } - auto get(properties_tag) const { return properties{cache_config(large_slm)}; } }; +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void free_function_kernel(int *) {} + template struct ReductionKernelFunctor { T1 mInput_values; ReductionKernelFunctor(T1 &Input_values) : mInput_values(Input_values) {} @@ -49,36 +62,119 @@ template struct ReductionKernelFunctor { template void operator()(id<1> idx, sumT &sum) const { sum += mInput_values[idx]; } - auto get(properties_tag) const { return properties{cache_config(large_slm)}; } }; int main() { - sycl::property_list q_prop{sycl::property::queue::in_order()}; - queue q{q_prop}; + sycl::queue q{sycl::property::queue::in_order{}}; + + syclexp::properties large_slm_props{ + intelexp::cache_config{intelexp::large_slm}}; + syclexp::properties large_data_props{ + intelexp::cache_config{intelexp::large_data}}; + + // Recommended APIs with launch_config. + + // CHECK: enqueue_functions::parallel_for with launch_config + large_slm + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM + std::cout << "enqueue_functions::parallel_for with launch_config + large_slm" + << std::endl; + syclexp::parallel_for( + q, syclexp::launch_config{sycl::range<1>{16}, large_slm_props}, + [=](sycl::item<1>) {}); + q.wait(); + + // CHECK: enqueue_functions::nd_launch with launch_config + large_data + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_DATA + std::cout << "enqueue_functions::nd_launch with launch_config + large_data" + << std::endl; + syclexp::nd_launch( + q, + syclexp::launch_config{ + sycl::nd_range<1>{sycl::range<1>{16}, sycl::range<1>{4}}, + large_data_props}, + [=](sycl::nd_item<1>) {}); + q.wait(); + + // CHECK: free function kernel via nd_launch + large_slm + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM + std::cout << "free function kernel via nd_launch + large_slm" << std::endl; + int *ptr = sycl::malloc_shared(16, q); + syclexp::nd_launch( + q, + syclexp::launch_config{ + sycl::nd_range<1>{sycl::range<1>{16}, sycl::range<1>{4}}, + large_slm_props}, + syclexp::kernel_function, ptr); + q.wait(); + + // Same kernel launched twice with different cache_config values. + // CHECK: kernel_bundle: same kernel with large_slm then large_data + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_DATA + std::cout << "kernel_bundle: same kernel with large_slm then large_data" + << std::endl; + { + sycl::context ctxt = q.get_context(); + auto bundle = + syclexp::get_kernel_bundle(ctxt); + sycl::kernel k = bundle.ext_oneapi_get_kernel(); + sycl::nd_range<1> ndr{sycl::range<1>{16}, sycl::range<1>{4}}; + + syclexp::nd_launch(q, syclexp::launch_config{ndr, large_slm_props}, k, ptr); + syclexp::nd_launch(q, syclexp::launch_config{ndr, large_data_props}, k, + ptr); + q.wait(); + } + sycl::free(ptr, q); + + // Negative case: launch with no cache_config property. + // CHECK: negative case with no cache_config + // CHECK-NOT: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG + std::cout << "negative case with no cache_config" << std::endl; + syclexp::nd_launch(q, + syclexp::launch_config{sycl::nd_range<1>{ + sycl::range<1>{16}, sycl::range<1>{4}}}, + [=](sycl::nd_item<1>) {}); + q.wait(); + + // Depracated APIs. - sycl::ext::oneapi::experimental::properties properties{ - cache_config(large_slm)}; + // CHECK: KernelFunctor single_task + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM + std::cout << "KernelFunctor single_task" << std::endl; + q.single_task(large_slm_props, KernelFunctor{}).wait(); + + // CHECK: KernelFunctor parallel_for + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_DATA + std::cout << "KernelFunctor parallel_for" << std::endl; + q.parallel_for(nd_range<2>{range<2>(4, 4), range<2>(2, 2)}, large_data_props, + KernelFunctorND{}) + .wait(); // CHECK: parallel_for with sycl::range - // CHECK: zeKernelSetCacheConfig + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_DATA std::cout << "parallel_for with sycl::range" << std::endl; - q.parallel_for(range<2>{16, 16}, RangeKernelFunctor{}).wait(); + q.parallel_for(range<2>{16, 16}, large_data_props, RangeKernelFunctor{}) + .wait(); // CHECK: parallel_for_work_group(range, func) - // CHECK: zeKernelSetCacheConfig + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM std::cout << "parallel_for_work_group(range, func)" << std::endl; q.submit([&](handler &cgh) { - cgh.parallel_for_work_group(range<1>(8), - WorkGroupFunctor{}); - }); + cgh.parallel_for_work_group(range<1>(8), large_slm_props, + WorkGroupFunctor{}); + }).wait(); + ; // CHECK: parallel_for_work_group(range, range, func) - // CHECK: zeKernelSetCacheConfig + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM std::cout << "parallel_for_work_group(range, range, func)" << std::endl; q.submit([&](handler &cgh) { - cgh.parallel_for_work_group( - range<1>(8), range<1>(4), WorkGroupFunctor{}); - }); + cgh.parallel_for_work_group( + range<1>(8), range<1>(4), large_slm_props, WorkGroupFunctor{}); + }).wait(); + ; buffer values_buf{1024}; { @@ -90,35 +186,18 @@ int main() { buffer sum_buf{&sum_result, 1}; // CHECK: parallel_for with reduction - // CHECK: zeKernelSetCacheConfig + // CHECK: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG_LARGE_SLM std::cout << "parallel_for with reduction" << std::endl; q.submit([&](handler &cgh) { - auto input_values = values_buf.get_access(cgh); - auto sum_reduction = reduction(sum_buf, cgh, plus<>()); - cgh.parallel_for(range<1>{1024}, sum_reduction, - ReductionKernelFunctor(input_values)); - }); - - // CHECK: KernelFunctor single_task - // CHECK: zeKernelSetCacheConfig - std::cout << "KernelFunctor single_task" << std::endl; - q.single_task(KernelFunctor{}).wait(); - - // CHECK: KernelFunctor parallel_for - // CHECK: zeKernelSetCacheConfig - std::cout << "KernelFunctor parallel_for" << std::endl; - q.parallel_for(nd_range<2>{range<2>(4, 4), range<2>(2, 2)}, KernelFunctorND{}) - .wait(); - - // CHECK: negative parallel_for with sycl::nd_range - // CHECK-NOT: zeKernelSetCacheConfig - std::cout << "negative parallel_for with sycl::nd_range" << std::endl; - q.parallel_for(nd_range<2>{range<2>(4, 4), range<2>(2, 2)}, - NegativeKernelFunctor{}) - .wait(); + auto input_values = values_buf.get_access(cgh); + auto sum_reduction = reduction(sum_buf, cgh, std::plus<>()); + cgh.parallel_for(range<1>{1024}, large_slm_props, sum_reduction, + ReductionKernelFunctor(input_values)); + }).wait(); + ; // CHECK: negative parallel_for with KernelFunctor - // CHECK-NOT: zeKernelSetCacheConfig + // CHECK-NOT: urKernelSetExecInfo{{.*}}UR_KERNEL_CACHE_CONFIG std::cout << "negative parallel_for with KernelFunctor" << std::endl; q.parallel_for(nd_range<2>{range<2>(4, 4), range<2>(2, 2)}, NegativeKernelFunctor{}) From c98be83cdeef03f78299252158a42dd715b836aa Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Mon, 27 Apr 2026 12:12:09 -0700 Subject: [PATCH 2/2] Address review --- .../sycl_ext_intel_cache_config.asciidoc | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc index c3180c2c3a7a7..3b4807512c7d2 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_cache_config.asciidoc @@ -20,7 +20,7 @@ == Notice [%hardbreaks] -Copyright (c) 2023-2026 Intel Corporation. All rights reserved. +Copyright (c) 2023 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -44,7 +44,6 @@ the following extensions: - link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] - link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] -- link:../experimental/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] == Status @@ -145,23 +144,27 @@ Simple examples of using this extension are shown below. ==== parallel_for and nd_launch ```c++ +#include + namespace syclex = sycl::ext::oneapi::experimental; namespace intelex = sycl::ext::intel::experimental; -sycl::queue q; -syclex::properties props{intelex::cache_config{intelex::large_slm}}; +int main() { + sycl::queue q; + syclex::properties props{intelex::cache_config{intelex::large_slm}}; -// Range-based launch. -syclex::parallel_for(q, - syclex::launch_config{sycl::range<1>{1024}, props}, - [=](sycl::item<1> i) { /* kernel code */ }); + // Range-based launch. + syclex::parallel_for(q, + syclex::launch_config{sycl::range<1>{1024}, props}, + [=](sycl::item<1> i) { /* kernel code */ }); -// nd_range-based launch. -syclex::nd_launch(q, - syclex::launch_config{sycl::nd_range<1>{{1024}, {64}}, props}, - [=](sycl::nd_item<1> i) { /* kernel code */ }); + // nd_range-based launch. + syclex::nd_launch(q, + syclex::launch_config{sycl::nd_range<1>{{1024}, {64}}, props}, + [=](sycl::nd_item<1> i) { /* kernel code */ }); -q.wait(); + q.wait(); +} ``` ==== Free function kernels @@ -170,6 +173,8 @@ For a kernel declared with the free function kernel syntax, the property is passed to `nd_launch` via the same `launch_config` mechanism: ```c++ +#include + namespace syclex = sycl::ext::oneapi::experimental; namespace intelex = sycl::ext::intel::experimental; @@ -196,6 +201,8 @@ Because `cache_config` is a runtime property, different invocations of the same kernel can use different cache configurations. ```c++ +#include + namespace syclex = sycl::ext::oneapi::experimental; namespace intelex = sycl::ext::intel::experimental;