@@ -39,6 +39,26 @@ const cl_mutable_dispatch_fields_khr g_MutableDispatchCaps =
3939 CL_MUTABLE_DISPATCH_ARGUMENTS_KHR |
4040 CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
4141
42+ static cl_int enqueueProfilingKernel (
43+ cl_command_queue queue,
44+ cl_kernel kernel,
45+ cl_uint num_events_in_wait_list,
46+ const cl_event* event_wait_list,
47+ cl_event* event )
48+ {
49+ const size_t one = 1 ;
50+ return g_pNextDispatch->clEnqueueNDRangeKernel (
51+ queue,
52+ kernel,
53+ 1 ,
54+ nullptr ,
55+ &one,
56+ nullptr ,
57+ num_events_in_wait_list,
58+ event_wait_list,
59+ event );
60+ }
61+
4262typedef struct _cl_mutable_command_khr
4363{
4464 static bool isValid ( cl_mutable_command_khr command )
@@ -215,7 +235,7 @@ struct BarrierWithWaitList : Command
215235 return g_pNextDispatch->clEnqueueBarrierWithWaitList (
216236 queue,
217237 static_cast <cl_uint>(wait_list.size ()),
218- wait_list.data (),
238+ wait_list.size () ? wait_list. data () : nullptr ,
219239 signal);
220240 }
221241
@@ -271,7 +291,7 @@ struct CopyBuffer : Command
271291 dst_offset,
272292 size,
273293 static_cast <cl_uint>(wait_list.size ()),
274- wait_list.data (),
294+ wait_list.size () ? wait_list. data () : nullptr ,
275295 signal);
276296 }
277297
@@ -345,7 +365,7 @@ struct CopyBufferRect : Command
345365 dst_row_pitch,
346366 dst_slice_pitch,
347367 static_cast <cl_uint>(wait_list.size ()),
348- wait_list.data (),
368+ wait_list.size () ? wait_list. data () : nullptr ,
349369 signal);
350370 }
351371
@@ -411,7 +431,7 @@ struct CopyBufferToImage : Command
411431 dst_origin.data (),
412432 region.data (),
413433 static_cast <cl_uint>(wait_list.size ()),
414- wait_list.data (),
434+ wait_list.size () ? wait_list. data () : nullptr ,
415435 signal);
416436 }
417437
@@ -473,7 +493,7 @@ struct CopyImage : Command
473493 dst_origin.data (),
474494 region.data (),
475495 static_cast <cl_uint>(wait_list.size ()),
476- wait_list.data (),
496+ wait_list.size () ? wait_list. data () : nullptr ,
477497 signal);
478498 }
479499
@@ -535,7 +555,7 @@ struct CopyImageToBuffer : Command
535555 region.data (),
536556 dst_offset,
537557 static_cast <cl_uint>(wait_list.size ()),
538- wait_list.data (),
558+ wait_list.size () ? wait_list. data () : nullptr ,
539559 signal);
540560 }
541561
@@ -601,7 +621,7 @@ struct FillBuffer : Command
601621 offset,
602622 size,
603623 static_cast <cl_uint>(wait_list.size ()),
604- wait_list.data (),
624+ wait_list.size () ? wait_list. data () : nullptr ,
605625 signal);
606626 }
607627
@@ -678,7 +698,7 @@ struct FillImage : Command
678698 origin.data (),
679699 region.data (),
680700 static_cast <cl_uint>(wait_list.size ()),
681- wait_list.data (),
701+ wait_list.size () ? wait_list. data () : nullptr ,
682702 signal);
683703 }
684704
@@ -725,7 +745,7 @@ struct SVMMemcpy : Command
725745 src_ptr,
726746 size,
727747 static_cast <cl_uint>(wait_list.size ()),
728- wait_list.data (),
748+ wait_list.size () ? wait_list. data () : nullptr ,
729749 signal);
730750 }
731751
@@ -779,7 +799,7 @@ struct SVMMemFill : Command
779799 pattern.size (),
780800 size,
781801 static_cast <cl_uint>(wait_list.size ()),
782- wait_list.data (),
802+ wait_list.size () ? wait_list. data () : nullptr ,
783803 signal);
784804 }
785805
@@ -1073,7 +1093,7 @@ struct NDRangeKernel : Command
10731093 global_work_size.data (),
10741094 local_work_size.size () ? local_work_size.data () : nullptr ,
10751095 static_cast <cl_uint>(wait_list.size ()),
1076- wait_list.data (),
1096+ wait_list.size () ? wait_list. data () : nullptr ,
10771097 signal);
10781098 }
10791099
@@ -1229,6 +1249,7 @@ typedef struct _cl_command_buffer_khr
12291249 (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0 );
12301250
12311251 cmdbuf->setupTestQueue (queue);
1252+ cmdbuf->setupProfilingKernel (queue);
12321253 }
12331254 }
12341255
@@ -1254,6 +1275,11 @@ typedef struct _cl_command_buffer_khr
12541275 {
12551276 g_pNextDispatch->clReleaseCommandQueue (queue);
12561277 }
1278+
1279+ for ( auto kernel : ProfilingKernels )
1280+ {
1281+ g_pNextDispatch->clReleaseKernel (kernel);
1282+ }
12571283 }
12581284
12591285 static bool isValid ( cl_command_buffer_khr cmdbuf )
@@ -1297,20 +1323,17 @@ typedef struct _cl_command_buffer_khr
12971323
12981324 cl_command_queue getQueue () const
12991325 {
1300- if ( Queues.size () > 0 )
1301- {
1302- return Queues[0 ];
1303- }
1304- return nullptr ;
1326+ return Queues.empty () ? nullptr : Queues[0 ];
13051327 }
13061328
13071329 cl_command_queue getTestQueue () const
13081330 {
1309- if ( TestQueues.size () > 0 )
1310- {
1311- return TestQueues[0 ];
1312- }
1313- return nullptr ;
1331+ return TestQueues.empty () ? nullptr : TestQueues[0 ];
1332+ }
1333+
1334+ cl_kernel getProfilingKernel () const
1335+ {
1336+ return ProfilingKernels.empty () ? nullptr : ProfilingKernels[0 ];
13141337 }
13151338
13161339 cl_mutable_dispatch_asserts_khr getMutableDispatchAsserts () const
@@ -1510,10 +1533,13 @@ typedef struct _cl_command_buffer_khr
15101533 NextSyncPoint.fetch_add (1 , std::memory_order_relaxed) :
15111534 0 ;
15121535
1513- command->addDependencies (
1514- num_sync_points,
1515- wait_list,
1516- syncPoint);
1536+ // We only need to add dependencies if there is more than one queue (so
1537+ // we have possible cross-queue dependencies) or the queue is an
1538+ // out-of-order queue (so we have possible intra-queue dependencies).
1539+ if ( Queues.size () > 1 || !IsInOrder[0 ] )
1540+ {
1541+ command->addDependencies (num_sync_points, wait_list, syncPoint);
1542+ }
15171543
15181544 if ( sync_point != nullptr )
15191545 {
@@ -1671,6 +1697,7 @@ typedef struct _cl_command_buffer_khr
16711697 std::vector<bool > IsInOrder;
16721698 std::vector<cl_command_queue> TestQueues;
16731699 std::vector<cl_event> BlockingEvents;
1700+ std::vector<cl_kernel> ProfilingKernels;
16741701
16751702 std::vector<std::unique_ptr<Command>> Commands;
16761703 std::atomic<uint32_t > NextSyncPoint;
@@ -1747,6 +1774,52 @@ typedef struct _cl_command_buffer_khr
17471774 }
17481775 }
17491776
1777+ void setupProfilingKernel (cl_command_queue queue)
1778+ {
1779+ if ( g_KernelForProfiling )
1780+ {
1781+ cl_context context = nullptr ;
1782+ g_pNextDispatch->clGetCommandQueueInfo (
1783+ queue,
1784+ CL_QUEUE_CONTEXT,
1785+ sizeof (context),
1786+ &context,
1787+ nullptr );
1788+
1789+ cl_device_id device = nullptr ;
1790+ g_pNextDispatch->clGetCommandQueueInfo (
1791+ queue,
1792+ CL_QUEUE_DEVICE,
1793+ sizeof (device),
1794+ &device,
1795+ nullptr );
1796+
1797+ const char * kernelString = " kernel void Empty() {}" ;
1798+ cl_program program = g_pNextDispatch->clCreateProgramWithSource (
1799+ context,
1800+ 1 ,
1801+ &kernelString,
1802+ nullptr ,
1803+ nullptr );
1804+ g_pNextDispatch->clBuildProgram (
1805+ program,
1806+ 1 ,
1807+ &device,
1808+ nullptr ,
1809+ nullptr ,
1810+ nullptr );
1811+
1812+ cl_kernel kernel = g_pNextDispatch->clCreateKernel (
1813+ program,
1814+ " Empty" ,
1815+ nullptr );
1816+ g_pNextDispatch->clReleaseProgram (
1817+ program );
1818+
1819+ ProfilingKernels.push_back (kernel);
1820+ }
1821+ }
1822+
17501823 _cl_command_buffer_khr (
17511824 cl_command_buffer_flags_khr flags,
17521825 cl_mutable_dispatch_asserts_khr mutableDispatchAsserts) :
@@ -1993,7 +2066,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
19932066 queue,
19942067 num_events_in_wait_list,
19952068 event_wait_list,
1996- event ? &startEvent : nullptr );
2069+ event == nullptr || g_KernelForProfiling ? nullptr : &startEvent );
2070+ if ( errorCode == CL_SUCCESS && event && g_KernelForProfiling )
2071+ {
2072+ errorCode = enqueueProfilingKernel (
2073+ queue,
2074+ cmdbuf->getProfilingKernel (),
2075+ 0 ,
2076+ nullptr ,
2077+ &startEvent );
2078+ }
19972079 }
19982080
19992081 if ( errorCode == CL_SUCCESS )
@@ -2007,7 +2089,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
20072089 queue,
20082090 0 ,
20092091 nullptr ,
2010- event );
2092+ g_KernelForProfiling ? nullptr : event );
2093+ if ( errorCode == CL_SUCCESS && g_KernelForProfiling )
2094+ {
2095+ errorCode = enqueueProfilingKernel (
2096+ queue,
2097+ cmdbuf->getProfilingKernel (),
2098+ 0 ,
2099+ nullptr ,
2100+ event );
2101+ }
20112102 }
20122103
20132104 if ( event )
0 commit comments