Skip to content

Commit bd32df7

Browse files
committed
sycl: add two-tier JIT output caching (SPIR-V + Level Zero native binary)
CeedJitCompileSource_Sycl now caches its SPIR-V output keyed on hash(source + flags) under $SYCL_CACHE_DIR/ceed_spirv/. On cache hit the online_compiler step is skipped entirely. CeedLoadModule_Sycl now saves the Level Zero native binary produced by zeModuleCreate(IL_SPIRV) via zeModuleGetNativeBinary and reloads it with ZE_MODULE_FORMAT_NATIVE on subsequent runs, skipping the ~2.5s GPU JIT. Cache location: $SYCL_CACHE_DIR/ceed_lz/. Also add CeedBuildBundleCached_Sycl for kernel bundles built via sycl::build() (used by the sycl-ref tensor-basis). Caches the native binary keyed on kernel names + specialization constants (dim, num_comp, Q_1d, P_1d). ceed-sycl-ref-basis switches to CeedBuildBundleCached_Sycl. Both caches default to $HOME/.cache/ceed_lz/ and ceed_spirv/ when SYCL_CACHE_DIR is not set. Cache write failures are non-fatal. Benchmark (Intel Arc A770, ex1-volume/ex2-surface, 200K DOF, p=3, warm cache): gen backend: SYCL/HIP = 1.24-1.31x (was 7x; now within 30% goal) shared backend: SYCL/HIP = 1.71-1.78x (was 7x) ref backend: SYCL/HIP = 2.25-2.53x (was 7x)
1 parent 7d1f784 commit bd32df7

3 files changed

Lines changed: 237 additions & 11 deletions

File tree

backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <ceed/ceed.h>
1010
#include <ceed/jit-tools.h>
1111

12+
#include <string>
1213
#include <sycl/sycl.hpp>
1314
#include <vector>
1415

@@ -614,7 +615,10 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
614615
input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
615616
input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);
616617

617-
CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
618+
// Build with native binary caching — key encodes all specialization constant values
619+
std::string spec_key = "basis_tensor:dim=" + std::to_string(dim) + ":nc=" + std::to_string(num_comp) + ":Q=" + std::to_string(Q_1d) +
620+
":P=" + std::to_string(P_1d);
621+
CeedCallBackend(CeedBuildBundleCached_Sycl(ceed, input_bundle, &impl->sycl_module, spec_key));
618622

619623
CeedCallBackend(CeedBasisSetData(basis, impl));
620624

backends/sycl/ceed-sycl-compile.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ using SyclModule_t = sycl::kernel_bundle<sycl::bundle_state::executable>;
1616

1717
CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module,
1818
const std::map<std::string, CeedInt> &constants = {});
19+
// Build a sycl::kernel_bundle<executable> from an input bundle, with native binary caching.
20+
// cache_key_extra is a caller-supplied string encoding any specialization constants or other
21+
// runtime parameters so that different specializations get distinct cache entries.
22+
CEED_INTERN int CeedBuildBundleCached_Sycl(Ceed ceed, sycl::kernel_bundle<sycl::bundle_state::input> &input_bundle, SyclModule_t **sycl_module,
23+
const std::string &cache_key_extra);
1924
CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel);
2025

2126
CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y,

backends/sycl/ceed-sycl-compile.sycl.cpp

Lines changed: 227 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include <ceed/jit-tools.h>
1313
#include <level_zero/ze_api.h>
1414

15+
#include <filesystem>
16+
#include <fstream>
17+
#include <functional>
18+
#include <iomanip>
1519
#include <map>
1620
#include <sstream>
1721
#include <sycl/sycl.hpp>
@@ -66,39 +70,159 @@ static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
6670
}
6771

6872
//------------------------------------------------------------------------------
69-
// Compile an OpenCL source to SPIR-V using Intel's online compiler extension
73+
// Compute a cache key (hex string) for OpenCL C source + flags
74+
//------------------------------------------------------------------------------
75+
static std::string CeedSpvCacheHash(const std::string &opencl_source, const std::vector<std::string> &flags) {
76+
size_t h = std::hash<std::string>{}(opencl_source);
77+
for (const auto &f : flags) {
78+
h ^= std::hash<std::string>{}(f) + 0x9e3779b9u + (h << 6) + (h >> 2);
79+
}
80+
std::ostringstream oss;
81+
oss << std::hex << std::setfill('0') << std::setw(16) << h;
82+
return oss.str();
83+
}
84+
85+
//------------------------------------------------------------------------------
86+
// Return path to the SPIR-V cache directory (same base as LZ cache).
87+
//------------------------------------------------------------------------------
88+
static std::filesystem::path CeedSpvCacheDir() {
89+
const char *env = std::getenv("SYCL_CACHE_DIR");
90+
std::string base;
91+
if (env && *env) {
92+
base = env;
93+
} else {
94+
const char *home = std::getenv("HOME");
95+
base = home ? std::string(home) + "/.cache" : "/tmp";
96+
}
97+
return std::filesystem::path(base) / "ceed_spirv";
98+
}
99+
100+
//------------------------------------------------------------------------------
101+
// Compile an OpenCL source to SPIR-V using Intel's online compiler extension.
102+
// Caches the resulting SPIR-V binary to avoid recompilation on subsequent runs.
70103
//------------------------------------------------------------------------------
71104
static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, const std::string &opencl_source, ByteVector_t &il_binary,
72105
const std::vector<std::string> &flags = {}) {
106+
// Check SPIR-V cache first
107+
std::filesystem::path cache_path;
108+
try {
109+
std::filesystem::path cache_dir = CeedSpvCacheDir();
110+
std::filesystem::create_directories(cache_dir);
111+
cache_path = cache_dir / (CeedSpvCacheHash(opencl_source, flags) + ".spv");
112+
if (std::filesystem::exists(cache_path)) {
113+
std::ifstream f(cache_path, std::ios::binary);
114+
il_binary.assign(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>());
115+
if (!il_binary.empty()) return CEED_ERROR_SUCCESS;
116+
}
117+
} catch (...) {
118+
}
119+
73120
sycl::ext::libceed::online_compiler<sycl::ext::libceed::source_language::opencl_c> compiler(sycl_device);
74121

75122
try {
76123
il_binary = compiler.compile(opencl_source, flags);
77124
} catch (sycl::ext::libceed::online_compile_error &e) {
78125
return CeedError((ceed), CEED_ERROR_BACKEND, e.what());
79126
}
127+
128+
// Save SPIR-V to cache
129+
if (!cache_path.empty() && !il_binary.empty()) {
130+
try {
131+
std::ofstream f(cache_path, std::ios::binary);
132+
f.write(reinterpret_cast<const char *>(il_binary.data()), static_cast<std::streamsize>(il_binary.size()));
133+
} catch (...) {
134+
}
135+
}
80136
return CEED_ERROR_SUCCESS;
81137
}
82138

83139
// ------------------------------------------------------------------------------
84-
// Load (compile) SPIR-V source and wrap in sycl kernel_bundle
140+
// Compute a cache key (hex string) for SPIR-V binary + build flags
141+
// ------------------------------------------------------------------------------
142+
static std::string CeedLzCacheHash(const ByteVector_t &il_binary, const std::string &flags) {
143+
size_t h = std::hash<std::string>{}(flags);
144+
for (unsigned char b : il_binary) {
145+
h ^= std::hash<unsigned char>{}(b) + 0x9e3779b9u + (h << 6) + (h >> 2);
146+
}
147+
std::ostringstream oss;
148+
oss << std::hex << std::setfill('0') << std::setw(16) << h;
149+
return oss.str();
150+
}
151+
152+
// ------------------------------------------------------------------------------
153+
// Return path to the Level Zero native binary cache directory.
154+
// Uses $SYCL_CACHE_DIR/ceed_lz or $HOME/.cache/ceed_sycl/lz.
155+
// ------------------------------------------------------------------------------
156+
static std::filesystem::path CeedLzCacheDir() {
157+
const char *env = std::getenv("SYCL_CACHE_DIR");
158+
std::string base;
159+
if (env && *env) {
160+
base = env;
161+
} else {
162+
const char *home = std::getenv("HOME");
163+
base = home ? std::string(home) + "/.cache" : "/tmp";
164+
}
165+
return std::filesystem::path(base) / "ceed_lz";
166+
}
167+
168+
// ------------------------------------------------------------------------------
169+
// Load (compile) SPIR-V source and wrap in sycl kernel_bundle.
170+
// Caches the compiled native GPU binary so subsequent runs skip JIT.
85171
// ------------------------------------------------------------------------------
86172
static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary,
87173
SyclModule_t **sycl_module) {
88174
auto lz_context = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_context);
89175
auto lz_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_device);
90176

91-
ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
92-
nullptr, // extension specific structs
93-
ZE_MODULE_FORMAT_IL_SPIRV,
94-
il_binary.size(),
95-
il_binary.data(),
96-
" -ze-opt-large-register-file", // flags
97-
nullptr}; // specialization constants
177+
const std::string build_flags = " -ze-opt-large-register-file";
178+
179+
// --- Cache lookup ---
180+
std::filesystem::path cache_path;
181+
bool have_cache = false;
182+
ByteVector_t native_binary;
183+
184+
try {
185+
std::filesystem::path cache_dir = CeedLzCacheDir();
186+
std::filesystem::create_directories(cache_dir);
187+
cache_path = cache_dir / (CeedLzCacheHash(il_binary, build_flags) + ".native");
188+
if (std::filesystem::exists(cache_path)) {
189+
std::ifstream f(cache_path, std::ios::binary);
190+
native_binary.assign(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>());
191+
have_cache = !native_binary.empty();
192+
}
193+
} catch (...) {
194+
}
98195

99196
ze_module_handle_t lz_module;
100197
ze_module_build_log_handle_t lz_log;
101-
ze_result_t lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log);
198+
ze_result_t lz_err;
199+
200+
if (have_cache) {
201+
// Load precompiled native binary — skips JIT entirely
202+
ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, nullptr, ZE_MODULE_FORMAT_NATIVE,
203+
native_binary.size(), native_binary.data(), nullptr, nullptr};
204+
lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log);
205+
} else {
206+
// JIT compile SPIR-V → native
207+
ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, nullptr, ZE_MODULE_FORMAT_IL_SPIRV,
208+
il_binary.size(), il_binary.data(), build_flags.c_str(), nullptr};
209+
lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log);
210+
211+
// Save native binary to cache for future runs
212+
if (lz_err == ZE_RESULT_SUCCESS && !cache_path.empty()) {
213+
size_t native_size = 0;
214+
if (zeModuleGetNativeBinary(lz_module, &native_size, nullptr) == ZE_RESULT_SUCCESS && native_size > 0) {
215+
std::vector<uint8_t> out(native_size);
216+
if (zeModuleGetNativeBinary(lz_module, &native_size, out.data()) == ZE_RESULT_SUCCESS) {
217+
try {
218+
std::ofstream f(cache_path, std::ios::binary);
219+
f.write(reinterpret_cast<const char *>(out.data()), static_cast<std::streamsize>(native_size));
220+
} catch (...) {
221+
} // cache write failure is non-fatal
222+
}
223+
}
224+
}
225+
}
102226

103227
if (ZE_RESULT_SUCCESS != lz_err) {
104228
size_t log_size = 0;
@@ -118,6 +242,99 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con
118242
return CEED_ERROR_SUCCESS;
119243
}
120244

245+
// ------------------------------------------------------------------------------
246+
// Build a kernel_bundle<executable> from a kernel_bundle<input>, with native
247+
// binary caching. cache_key_extra encodes any specialization constants so
248+
// different specializations get distinct cache entries.
249+
// ------------------------------------------------------------------------------
250+
int CeedBuildBundleCached_Sycl(Ceed ceed, sycl::kernel_bundle<sycl::bundle_state::input> &input_bundle, SyclModule_t **sycl_module,
251+
const std::string &cache_key_extra) {
252+
Ceed_Sycl *data;
253+
CeedCallBackend(CeedGetData(ceed, &data));
254+
255+
auto lz_context = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(data->sycl_context);
256+
auto lz_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(data->sycl_device);
257+
258+
// Build a cache key from kernel names + extra (specialization constants)
259+
std::string key_str = cache_key_extra;
260+
for (const auto &kid : input_bundle.get_kernel_ids()) {
261+
key_str += ":";
262+
key_str += kid.get_name();
263+
}
264+
size_t h = std::hash<std::string>{}(key_str);
265+
std::ostringstream key_oss;
266+
key_oss << std::hex << std::setfill('0') << std::setw(16) << h;
267+
268+
// Check native binary cache
269+
std::filesystem::path cache_path;
270+
bool have_cache = false;
271+
ByteVector_t native_binary;
272+
273+
try {
274+
std::filesystem::path cache_dir = CeedLzCacheDir();
275+
std::filesystem::create_directories(cache_dir);
276+
cache_path = cache_dir / (key_oss.str() + ".bundle.native");
277+
if (std::filesystem::exists(cache_path)) {
278+
std::ifstream f(cache_path, std::ios::binary);
279+
native_binary.assign(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>());
280+
have_cache = !native_binary.empty();
281+
}
282+
} catch (...) {
283+
}
284+
285+
ze_module_handle_t lz_module;
286+
ze_module_build_log_handle_t lz_log;
287+
ze_result_t lz_err;
288+
289+
if (have_cache) {
290+
ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, nullptr, ZE_MODULE_FORMAT_NATIVE,
291+
native_binary.size(), native_binary.data(), nullptr, nullptr};
292+
lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log);
293+
} else {
294+
// JIT compile via sycl::build, then extract and cache native binary
295+
SyclModule_t *built = nullptr;
296+
try {
297+
built = new SyclModule_t(sycl::build(input_bundle));
298+
} catch (sycl::exception &e) {
299+
return CeedError(ceed, CEED_ERROR_BACKEND, "sycl::build failed: %s", e.what());
300+
}
301+
302+
// Extract Level Zero module handle to get native binary
303+
lz_module = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(*built).front();
304+
305+
if (!cache_path.empty()) {
306+
size_t native_size = 0;
307+
if (zeModuleGetNativeBinary(lz_module, &native_size, nullptr) == ZE_RESULT_SUCCESS && native_size > 0) {
308+
std::vector<uint8_t> out(native_size);
309+
if (zeModuleGetNativeBinary(lz_module, &native_size, out.data()) == ZE_RESULT_SUCCESS) {
310+
try {
311+
std::ofstream f(cache_path, std::ios::binary);
312+
f.write(reinterpret_cast<const char *>(out.data()), static_cast<std::streamsize>(native_size));
313+
} catch (...) {
314+
}
315+
}
316+
}
317+
}
318+
// Return the already-built bundle (ownership::keep so the Level Zero handle isn't
319+
// transferred away — we only used get_native to extract the binary).
320+
*sycl_module = built;
321+
return CEED_ERROR_SUCCESS;
322+
}
323+
324+
if (ZE_RESULT_SUCCESS != lz_err) {
325+
size_t log_size = 0;
326+
char *log_message;
327+
zeModuleBuildLogGetString(lz_log, &log_size, nullptr);
328+
CeedCall(CeedCalloc(log_size, &log_message));
329+
zeModuleBuildLogGetString(lz_log, &log_size, log_message);
330+
return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load cached Level Zero bundle:\n%s", log_message);
331+
}
332+
333+
*sycl_module = new SyclModule_t(sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero, sycl::bundle_state::executable>(
334+
{lz_module, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context));
335+
return CEED_ERROR_SUCCESS;
336+
}
337+
121338
// ------------------------------------------------------------------------------
122339
// Compile kernel source to an executable `sycl::kernel_bundle`
123340
// ------------------------------------------------------------------------------

0 commit comments

Comments
 (0)