Skip to content

Commit e3e943c

Browse files
committed
Merge branch 'main' into cl_khr_unified_svm
2 parents f136294 + d583736 commit e3e943c

6 files changed

Lines changed: 133 additions & 34 deletions

File tree

.github/workflows/build.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
check:
1919
runs-on: ubuntu-latest
2020
steps:
21-
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
21+
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
2222

2323
- name: Check Tabs
2424
run: |
@@ -39,7 +39,7 @@ jobs:
3939
runs-on: ${{matrix.os}}
4040

4141
steps:
42-
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
42+
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
4343

4444
- name: Get Ubuntu OpenGL Dependencies
4545
if: matrix.os == 'ubuntu-latest'
@@ -48,29 +48,29 @@ jobs:
4848
sudo apt-get install -y libglfw3-dev
4949
5050
- name: Get OpenCL Headers
51-
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
51+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
5252
with:
5353
repository: KhronosGroup/OpenCL-Headers
5454
ref: cl_khr_unified_svm
5555
path: external/OpenCL-Headers
5656

5757
- name: Get OpenCL ICD Loader
58-
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
58+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
5959
with:
6060
repository: KhronosGroup/OpenCL-ICD-Loader
6161
path: external/opencl-icd-loader
6262

6363
- name: Get OpenCL Extension Loader
6464
if: matrix.ext == 'YES'
65-
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
65+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
6666
with:
6767
repository: bashbaug/opencl-extension-loader
6868
ref: cl_khr_unified_svm
6969
path: external/opencl-extension-loader
7070

7171
- name: Get SPIR-V Headers
7272
if: matrix.ext == 'YES'
73-
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
73+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
7474
with:
7575
repository: KhronosGroup/SPIRV-Headers
7676
path: external/SPIRV-Headers

layers/10_cmdbufemu/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ The following environment variables can modify the behavior of the command buffe
3333
| Environment Variable | Behavior | Example Format |
3434
|----------------------|----------|-----------------|
3535
| `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br/><br/>`set CMDBUFEMU_EnhancedErrorChecking=1` |
36+
| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br/><br/>`set CMDBUFEMU_KernelForProfiling=1` |
3637

3738
## Known Limitations
3839

layers/10_cmdbufemu/emulate.cpp

Lines changed: 118 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,26 @@ const cl_mutable_dispatch_fields_khr g_MutableDispatchCaps =
3939
CL_MUTABLE_DISPATCH_ARGUMENTS_KHR |
4040
CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
4141

42+
static cl_int enqueueProfilingKernel(
43+
cl_command_queue queue,
44+
cl_kernel kernel,
45+
cl_uint num_events_in_wait_list,
46+
const cl_event* event_wait_list,
47+
cl_event* event )
48+
{
49+
const size_t one = 1;
50+
return g_pNextDispatch->clEnqueueNDRangeKernel(
51+
queue,
52+
kernel,
53+
1,
54+
nullptr,
55+
&one,
56+
nullptr,
57+
num_events_in_wait_list,
58+
event_wait_list,
59+
event );
60+
}
61+
4262
typedef struct _cl_mutable_command_khr
4363
{
4464
static bool isValid( cl_mutable_command_khr command )
@@ -215,7 +235,7 @@ struct BarrierWithWaitList : Command
215235
return g_pNextDispatch->clEnqueueBarrierWithWaitList(
216236
queue,
217237
static_cast<cl_uint>(wait_list.size()),
218-
wait_list.data(),
238+
wait_list.size() ? wait_list.data() : nullptr,
219239
signal);
220240
}
221241

@@ -271,7 +291,7 @@ struct CopyBuffer : Command
271291
dst_offset,
272292
size,
273293
static_cast<cl_uint>(wait_list.size()),
274-
wait_list.data(),
294+
wait_list.size() ? wait_list.data() : nullptr,
275295
signal);
276296
}
277297

@@ -345,7 +365,7 @@ struct CopyBufferRect : Command
345365
dst_row_pitch,
346366
dst_slice_pitch,
347367
static_cast<cl_uint>(wait_list.size()),
348-
wait_list.data(),
368+
wait_list.size() ? wait_list.data() : nullptr,
349369
signal);
350370
}
351371

@@ -411,7 +431,7 @@ struct CopyBufferToImage : Command
411431
dst_origin.data(),
412432
region.data(),
413433
static_cast<cl_uint>(wait_list.size()),
414-
wait_list.data(),
434+
wait_list.size() ? wait_list.data() : nullptr,
415435
signal);
416436
}
417437

@@ -473,7 +493,7 @@ struct CopyImage : Command
473493
dst_origin.data(),
474494
region.data(),
475495
static_cast<cl_uint>(wait_list.size()),
476-
wait_list.data(),
496+
wait_list.size() ? wait_list.data() : nullptr,
477497
signal);
478498
}
479499

@@ -535,7 +555,7 @@ struct CopyImageToBuffer : Command
535555
region.data(),
536556
dst_offset,
537557
static_cast<cl_uint>(wait_list.size()),
538-
wait_list.data(),
558+
wait_list.size() ? wait_list.data() : nullptr,
539559
signal);
540560
}
541561

@@ -601,7 +621,7 @@ struct FillBuffer : Command
601621
offset,
602622
size,
603623
static_cast<cl_uint>(wait_list.size()),
604-
wait_list.data(),
624+
wait_list.size() ? wait_list.data() : nullptr,
605625
signal);
606626
}
607627

@@ -678,7 +698,7 @@ struct FillImage : Command
678698
origin.data(),
679699
region.data(),
680700
static_cast<cl_uint>(wait_list.size()),
681-
wait_list.data(),
701+
wait_list.size() ? wait_list.data() : nullptr,
682702
signal);
683703
}
684704

@@ -725,7 +745,7 @@ struct SVMMemcpy : Command
725745
src_ptr,
726746
size,
727747
static_cast<cl_uint>(wait_list.size()),
728-
wait_list.data(),
748+
wait_list.size() ? wait_list.data() : nullptr,
729749
signal);
730750
}
731751

@@ -779,7 +799,7 @@ struct SVMMemFill : Command
779799
pattern.size(),
780800
size,
781801
static_cast<cl_uint>(wait_list.size()),
782-
wait_list.data(),
802+
wait_list.size() ? wait_list.data() : nullptr,
783803
signal);
784804
}
785805

@@ -1073,7 +1093,7 @@ struct NDRangeKernel : Command
10731093
global_work_size.data(),
10741094
local_work_size.size() ? local_work_size.data() : nullptr,
10751095
static_cast<cl_uint>(wait_list.size()),
1076-
wait_list.data(),
1096+
wait_list.size() ? wait_list.data() : nullptr,
10771097
signal);
10781098
}
10791099

@@ -1229,6 +1249,7 @@ typedef struct _cl_command_buffer_khr
12291249
(props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0 );
12301250

12311251
cmdbuf->setupTestQueue(queue);
1252+
cmdbuf->setupProfilingKernel(queue);
12321253
}
12331254
}
12341255

@@ -1254,6 +1275,11 @@ typedef struct _cl_command_buffer_khr
12541275
{
12551276
g_pNextDispatch->clReleaseCommandQueue(queue);
12561277
}
1278+
1279+
for( auto kernel : ProfilingKernels )
1280+
{
1281+
g_pNextDispatch->clReleaseKernel(kernel);
1282+
}
12571283
}
12581284

12591285
static bool isValid( cl_command_buffer_khr cmdbuf )
@@ -1297,20 +1323,17 @@ typedef struct _cl_command_buffer_khr
12971323

12981324
cl_command_queue getQueue() const
12991325
{
1300-
if( Queues.size() > 0 )
1301-
{
1302-
return Queues[0];
1303-
}
1304-
return nullptr;
1326+
return Queues.empty() ? nullptr : Queues[0];
13051327
}
13061328

13071329
cl_command_queue getTestQueue() const
13081330
{
1309-
if( TestQueues.size() > 0 )
1310-
{
1311-
return TestQueues[0];
1312-
}
1313-
return nullptr;
1331+
return TestQueues.empty() ? nullptr : TestQueues[0];
1332+
}
1333+
1334+
cl_kernel getProfilingKernel() const
1335+
{
1336+
return ProfilingKernels.empty() ? nullptr : ProfilingKernels[0];
13141337
}
13151338

13161339
cl_mutable_dispatch_asserts_khr getMutableDispatchAsserts() const
@@ -1510,10 +1533,13 @@ typedef struct _cl_command_buffer_khr
15101533
NextSyncPoint.fetch_add(1, std::memory_order_relaxed) :
15111534
0;
15121535

1513-
command->addDependencies(
1514-
num_sync_points,
1515-
wait_list,
1516-
syncPoint);
1536+
// We only need to add dependencies if there is more than one queue (so
1537+
// we have possible cross-queue dependencies) or the queue is an
1538+
// out-of-order queue (so we have possible intra-queue dependencies).
1539+
if( Queues.size() > 1 || !IsInOrder[0] )
1540+
{
1541+
command->addDependencies(num_sync_points, wait_list, syncPoint);
1542+
}
15171543

15181544
if( sync_point != nullptr )
15191545
{
@@ -1671,6 +1697,7 @@ typedef struct _cl_command_buffer_khr
16711697
std::vector<bool> IsInOrder;
16721698
std::vector<cl_command_queue> TestQueues;
16731699
std::vector<cl_event> BlockingEvents;
1700+
std::vector<cl_kernel> ProfilingKernels;
16741701

16751702
std::vector<std::unique_ptr<Command>> Commands;
16761703
std::atomic<uint32_t> NextSyncPoint;
@@ -1747,6 +1774,52 @@ typedef struct _cl_command_buffer_khr
17471774
}
17481775
}
17491776

1777+
void setupProfilingKernel(cl_command_queue queue)
1778+
{
1779+
if( g_KernelForProfiling )
1780+
{
1781+
cl_context context = nullptr;
1782+
g_pNextDispatch->clGetCommandQueueInfo(
1783+
queue,
1784+
CL_QUEUE_CONTEXT,
1785+
sizeof(context),
1786+
&context,
1787+
nullptr );
1788+
1789+
cl_device_id device = nullptr;
1790+
g_pNextDispatch->clGetCommandQueueInfo(
1791+
queue,
1792+
CL_QUEUE_DEVICE,
1793+
sizeof(device),
1794+
&device,
1795+
nullptr );
1796+
1797+
const char* kernelString = "kernel void Empty() {}";
1798+
cl_program program = g_pNextDispatch->clCreateProgramWithSource(
1799+
context,
1800+
1,
1801+
&kernelString,
1802+
nullptr,
1803+
nullptr );
1804+
g_pNextDispatch->clBuildProgram(
1805+
program,
1806+
1,
1807+
&device,
1808+
nullptr,
1809+
nullptr,
1810+
nullptr );
1811+
1812+
cl_kernel kernel = g_pNextDispatch->clCreateKernel(
1813+
program,
1814+
"Empty",
1815+
nullptr );
1816+
g_pNextDispatch->clReleaseProgram(
1817+
program );
1818+
1819+
ProfilingKernels.push_back(kernel);
1820+
}
1821+
}
1822+
17501823
_cl_command_buffer_khr(
17511824
cl_command_buffer_flags_khr flags,
17521825
cl_mutable_dispatch_asserts_khr mutableDispatchAsserts) :
@@ -1993,7 +2066,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
19932066
queue,
19942067
num_events_in_wait_list,
19952068
event_wait_list,
1996-
event ? &startEvent : nullptr);
2069+
event == nullptr || g_KernelForProfiling ? nullptr : &startEvent );
2070+
if( errorCode == CL_SUCCESS && event && g_KernelForProfiling )
2071+
{
2072+
errorCode = enqueueProfilingKernel(
2073+
queue,
2074+
cmdbuf->getProfilingKernel(),
2075+
0,
2076+
nullptr,
2077+
&startEvent );
2078+
}
19972079
}
19982080

19992081
if( errorCode == CL_SUCCESS )
@@ -2007,7 +2089,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
20072089
queue,
20082090
0,
20092091
nullptr,
2010-
event );
2092+
g_KernelForProfiling ? nullptr : event );
2093+
if( errorCode == CL_SUCCESS && g_KernelForProfiling )
2094+
{
2095+
errorCode = enqueueProfilingKernel(
2096+
queue,
2097+
cmdbuf->getProfilingKernel(),
2098+
0,
2099+
nullptr,
2100+
event );
2101+
}
20112102
}
20122103

20132104
if( event )

layers/10_cmdbufemu/emulate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <map>
1111

1212
extern bool g_EnhancedErrorChecking;
13+
extern bool g_KernelForProfiling;
1314

1415
extern const struct _cl_icd_dispatch* g_pNextDispatch;
1516

layers/10_cmdbufemu/main.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@
3434

3535
bool g_EnhancedErrorChecking = false;
3636

37+
// Using kernels for profiling can fix issues with some implementations
38+
// that do not properly support event profiling on barrkers.
39+
40+
bool g_KernelForProfiling = false;
41+
3742
const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
3843

3944
static cl_int CL_API_CALL
@@ -283,6 +288,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
283288
_init_dispatch();
284289

285290
getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
291+
getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
286292

287293
g_pNextDispatch = target_dispatch;
288294

samples/python/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ numpy==1.26.3
33
pyopencl==2023.1.4
44
pillow==10.3.0
55

6-
jupyterlab==4.2.5
6+
jupyterlab==4.4.8

0 commit comments

Comments
 (0)