Skip to content

Commit 2986ee2

Browse files
authored
Merge pull request #1957 from CHIP-SPV/sycl-fixes
sycl: fix several correctness bugs in sycl-ref backend
2 parents e163d6c + 691a550 commit 2986ee2

6 files changed

Lines changed: 38 additions & 11 deletions

File tree

backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
9797
const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
9898
CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
9999

100+
// Prevent race: idle work items (i >= writeLen) must not overwrite
101+
// s_buffer_1 while active work items still read it from the previous comp.
102+
sycl::group_barrier(work_group);
103+
100104
for (CeedInt k = i; k < u_size; k += group_size) {
101105
s_buffer_1[k] = cur_u[k];
102106
}
@@ -105,9 +109,10 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
105109
CeedInt post = 1;
106110

107111
for (CeedInt d = 0; d < dim; d++) {
108-
// Use older version of sycl workgroup barrier for performance reasons
109-
// Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
110-
// sycl::group_barrier(work_group);
112+
// Full work-group barrier with local-only fence: s_buffer_1/2 are SLM
113+
// (local_accessor), so local_space is sufficient and avoids the cost of
114+
// a global memory fence. Do not replace with a sub-group barrier —
115+
// work_group_size (= Q) can exceed the hardware sub-group size.
111116
work_item.barrier(sycl::access::fence_space::local_space);
112117

113118
pre /= P;
@@ -206,9 +211,10 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
206211
CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
207212

208213
for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
209-
// Use older version of sycl workgroup barrier for performance reasons
210-
// Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
211-
// sycl::group_barrier(work_group);
214+
// Full work-group barrier with local-only fence: s_buffer_1/2 are SLM
215+
// (local_accessor), so local_space is sufficient and avoids the cost of
216+
// a global memory fence. Do not replace with a sub-group barrier —
217+
// work_group_size (= Q) can exceed the hardware sub-group size.
212218
work_item.barrier(sycl::access::fence_space::local_space);
213219

214220
pre /= P;

backends/sycl-ref/ceed-sycl-vector.sycl.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,6 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) {
690690
CeedCallBackend(CeedVectorGetData(vec, &impl));
691691
CeedCallBackend(CeedGetData(ceed, &data));
692692

693-
// Wait for all work to finish before freeing memory
694693
CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
695694
CeedCallSycl(ceed, sycl::free(impl->d_array_owned, data->sycl_context));
696695
CeedCallSycl(ceed, sycl::free(impl->reduction_norm, data->sycl_context));

backends/sycl/ceed-sycl-common.sycl.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ int CeedDestroy_Sycl(Ceed ceed) {
8282
Ceed_Sycl *data;
8383

8484
CeedCallBackend(CeedGetData(ceed, &data));
85+
// CeedCalloc allocates without calling constructors; explicitly run destructors
86+
// before freeing so the sycl::queue destructor waits for pending GPU work.
87+
data->~Ceed_Sycl();
8588
CeedCallBackend(CeedFree(&data));
8689
return CEED_ERROR_SUCCESS;
8790
}

backends/sycl/ceed-sycl-compile.sycl.cpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,26 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc
6060
//------------------------------------------------------------------------------
6161
// TODO: Add architecture flags, optimization flags
6262
//------------------------------------------------------------------------------
63-
static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
63+
static inline int CeedJitGetFlags_Sycl(Ceed ceed, std::vector<std::string> &flags) {
6464
flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int"), std::string("-DCEED_RUNNING_JIT_PASS=1")};
65+
// Add JIT source root directories as -I include paths
66+
{
67+
const char **jit_source_dirs;
68+
CeedInt num_jit_source_dirs;
69+
70+
CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
71+
for (CeedInt i = 0; i < num_jit_source_dirs; i++) flags.push_back(std::string("-I") + jit_source_dirs[i]);
72+
CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
73+
}
74+
// Add user JIT defines as -D flags
75+
{
76+
const char **jit_defines;
77+
CeedInt num_jit_defines;
78+
79+
CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
80+
for (CeedInt i = 0; i < num_jit_defines; i++) flags.push_back(std::string("-D") + jit_defines[i]);
81+
CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
82+
}
6583
return CEED_ERROR_SUCCESS;
6684
}
6785

@@ -129,7 +147,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule
129147

130148
CeedCallBackend(CeedGetData(ceed, &data));
131149
CeedCallBackend(CeedJitAddDefinitions_Sycl(ceed, kernel_source, jit_source, constants));
132-
CeedCallBackend(CeedJitGetFlags_Sycl(flags));
150+
CeedCallBackend(CeedJitGetFlags_Sycl(ceed, flags));
133151
CeedCallBackend(CeedJitCompileSource_Sycl(ceed, data->sycl_device, jit_source, il_binary, flags));
134152
CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module));
135153
return CEED_ERROR_SUCCESS;

examples/ceed/ex1-volume-f-c.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
/// libCEED Q-function for building quadrature data for a mass operator
1111
CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
12-
long long int *build_data = (long long int *)ctx;
12+
const long *build_data = (const long *)ctx;
1313

1414
// in[0] is Jacobians with shape [dim, dim, Q]
1515
// in[1] is quadrature weights with shape [1, Q]

interface/ceed.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,8 @@ int CeedDestroy(Ceed *ceed) {
16791679
CeedCall(CeedFree(&(*ceed)->obj_delegates));
16801680
}
16811681

1682+
CeedCall(CeedWorkVectorsDestroy(*ceed));
1683+
16821684
if ((*ceed)->Destroy) CeedCall((*ceed)->Destroy(*ceed));
16831685

16841686
for (CeedInt i = 0; i < (*ceed)->num_jit_source_roots; i++) {
@@ -1699,7 +1701,6 @@ int CeedDestroy(Ceed *ceed) {
16991701
CeedCall(CeedFree(&(*ceed)->f_offsets));
17001702
CeedCall(CeedFree(&(*ceed)->resource));
17011703
CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
1702-
CeedCall(CeedWorkVectorsDestroy(*ceed));
17031704
CeedCall(CeedObjectDestroy_Private(&(*ceed)->obj));
17041705
CeedCall(CeedFree(ceed));
17051706
return CEED_ERROR_SUCCESS;

0 commit comments

Comments
 (0)