Skip to content

Commit 2d734c0

Browse files
committed
Merge branch 'main' into matrixperf-final
2 parents ada8a4a + c167fde commit 2d734c0

30 files changed

Lines changed: 271 additions & 101 deletions

File tree

.github/workflows/build.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
check:
1919
runs-on: ubuntu-latest
2020
steps:
21-
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
21+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
2222

2323
- name: Check Tabs
2424
run: |
@@ -39,7 +39,7 @@ jobs:
3939
runs-on: ${{matrix.os}}
4040

4141
steps:
42-
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
42+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
4343

4444
- name: Get Ubuntu OpenGL Dependencies
4545
if: matrix.os == 'ubuntu-latest'
@@ -48,27 +48,27 @@ jobs:
4848
sudo apt-get install -y libglfw3-dev
4949
5050
- name: Get OpenCL Headers
51-
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
51+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
5252
with:
5353
repository: KhronosGroup/OpenCL-Headers
5454
path: external/OpenCL-Headers
5555

5656
- name: Get OpenCL ICD Loader
57-
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
57+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
5858
with:
5959
repository: KhronosGroup/OpenCL-ICD-Loader
6060
path: external/opencl-icd-loader
6161

6262
- name: Get OpenCL Extension Loader
6363
if: matrix.ext == 'YES'
64-
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
64+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
6565
with:
6666
repository: bashbaug/opencl-extension-loader
6767
path: external/opencl-extension-loader
6868

6969
- name: Get SPIR-V Headers
7070
if: matrix.ext == 'YES'
71-
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
71+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
7272
with:
7373
repository: KhronosGroup/SPIRV-Headers
7474
path: external/SPIRV-Headers

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ Many samples that use extensions additionally require the OpenCL Extension Loade
3939

4040
git clone https://github.com/bashbaug/opencl-extension-loader external/opencl-extension-loader
4141

42-
Several samples that interact with SPIR-V require the SPIR-V headres:
42+
Several samples that interact with SPIR-V require the SPIR-V headers:
4343

4444
git clone https://github.com/KhronosGroup/SPIRV-Headers external/SPIRV-Headers
4545

@@ -97,4 +97,4 @@ parsing, which is licensed under the MIT License.
9797
---
9898
OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
9999

100-
\* Other names and brands may be claimed as the property of others.
100+
\* Other names and brands may be claimed as the property of others.

layers/00_example/main.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,16 +97,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
9797
return CL_SUCCESS;
9898
}
9999

100-
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
100+
CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
101101
cl_uint num_entries,
102102
const struct _cl_icd_dispatch* target_dispatch,
103103
cl_uint* num_entries_out,
104-
const struct _cl_icd_dispatch** layer_dispatch_ret)
104+
const struct _cl_icd_dispatch** layer_dispatch_ret,
105+
const cl_layer_properties* properties)
105106
{
106107
const size_t dispatchTableSize =
107108
sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
108109

109-
if (target_dispatch == nullptr ||
110+
if (target_dispatch == nullptr ||
110111
num_entries_out == nullptr ||
111112
layer_dispatch_ret == nullptr) {
112113
return CL_INVALID_VALUE;
@@ -126,3 +127,16 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
126127
return CL_SUCCESS;
127128
}
128129

130+
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
131+
cl_uint num_entries,
132+
const struct _cl_icd_dispatch* target_dispatch,
133+
cl_uint* num_entries_out,
134+
const struct _cl_icd_dispatch** layer_dispatch_ret)
135+
{
136+
return clInitLayerWithProperties(
137+
num_entries,
138+
target_dispatch,
139+
num_entries_out,
140+
layer_dispatch_ret,
141+
nullptr);
142+
}

layers/10_cmdbufemu/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe
3434
|----------------------|----------|-----------------|
3535
| `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br/><br/>`set CMDBUFEMU_EnhancedErrorChecking=1` |
3636
| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br/><br/>`set CMDBUFEMU_KernelForProfiling=1` |
37+
| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension to eliminate `NULL` local work-group sizes. Only valid when an implementation supports the suggested local work-group size extension and the command is not mutable. By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`<br/><br/>`set CMDBUFEMU_SuggestedLocalWorkSize=0` |
3738

3839
## Known Limitations
3940

layers/10_cmdbufemu/emulate.cpp

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,7 @@ struct SVMMemFill : Command
817817
struct NDRangeKernel : Command
818818
{
819819
static std::unique_ptr<NDRangeKernel> create(
820+
const bool isMutable,
820821
const cl_command_properties_khr* properties,
821822
cl_command_buffer_khr cmdbuf,
822823
cl_command_queue queue,
@@ -1120,7 +1121,7 @@ struct NDRangeKernel : Command
11201121
const size_t* global_work_size,
11211122
const size_t* local_work_size )
11221123
{
1123-
if( work_dim == 0 ||
1124+
if( work_dim == 0 ||
11241125
global_work_size == nullptr ||
11251126
local_work_size == nullptr )
11261127
{
@@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr
12351236
cmdbuf->TestQueues.reserve(num_queues);
12361237
cmdbuf->BlockingEvents.reserve(num_queues);
12371238

1239+
if( cmdbuf->Queues.size() == 1 )
1240+
{
1241+
cmdbuf->setupSuggestedLocalWorkSize();
1242+
}
1243+
12381244
for( auto queue : cmdbuf->Queues )
12391245
{
12401246
g_pNextDispatch->clRetainCommandQueue(queue);
@@ -1604,7 +1610,7 @@ typedef struct _cl_command_buffer_khr
16041610
for( const auto& command : Commands )
16051611
{
16061612
errorCode = command->playback(queue, deps);
1607-
if( (errorCode == CL_SUCCESS) &&
1613+
if( (errorCode == CL_SUCCESS) &&
16081614
isRecordQueueInOrder && !isReplayQueueInOrder )
16091615
{
16101616
errorCode = g_pNextDispatch->clEnqueueBarrierWithWaitList(
@@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr
16831689
return CL_SUCCESS;
16841690
}
16851691

1692+
cl_int clGetKernelSuggestedLocalWorkSize(
1693+
cl_command_queue queue,
1694+
cl_kernel kernel,
1695+
cl_uint work_dim,
1696+
const size_t* global_work_offset,
1697+
const size_t* global_work_size,
1698+
size_t* suggested_local_work_size )
1699+
{
1700+
if( ptrGetKernelSuggestedLocalWorkSizeKHR == nullptr )
1701+
{
1702+
return CL_INVALID_OPERATION;
1703+
}
1704+
if( queue != nullptr && queue != Queues[0] )
1705+
{
1706+
return CL_INVALID_COMMAND_QUEUE;
1707+
}
1708+
1709+
return ptrGetKernelSuggestedLocalWorkSizeKHR(
1710+
Queues[0],
1711+
kernel,
1712+
work_dim,
1713+
global_work_offset,
1714+
global_work_size,
1715+
suggested_local_work_size );
1716+
}
1717+
16861718
private:
16871719
static constexpr cl_uint cMagic = 0x434d4442; // "CMDB"
16881720

@@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr
17031735
std::vector<std::unique_ptr<Command>> Commands;
17041736
std::atomic<uint32_t> NextSyncPoint;
17051737

1738+
clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;
1739+
1740+
void setupSuggestedLocalWorkSize()
1741+
{
1742+
cl_device_id device = nullptr;
1743+
g_pNextDispatch->clGetCommandQueueInfo(
1744+
Queues[0],
1745+
CL_QUEUE_DEVICE,
1746+
sizeof(device),
1747+
&device,
1748+
nullptr );
1749+
1750+
cl_platform_id platform = nullptr;
1751+
g_pNextDispatch->clGetDeviceInfo(
1752+
device,
1753+
CL_DEVICE_PLATFORM,
1754+
sizeof(platform),
1755+
&platform,
1756+
nullptr );
1757+
1758+
ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)
1759+
g_pNextDispatch->clGetExtensionFunctionAddressForPlatform(
1760+
platform,
1761+
"clGetKernelSuggestedLocalWorkSizeKHR" );
1762+
}
1763+
17061764
void setupTestQueue(cl_command_queue src)
17071765
{
17081766
if( g_EnhancedErrorChecking )
@@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr(
18471905
Queue(queue ? queue : cmdbuf->getQueue()) {}
18481906

18491907
std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
1908+
const bool isMutable,
18501909
const cl_command_properties_khr* properties,
18511910
cl_command_buffer_khr cmdbuf,
18521911
cl_command_queue queue,
@@ -1964,6 +2023,21 @@ std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
19642023
local_work_size,
19652024
local_work_size + work_dim);
19662025
}
2026+
else if( g_SuggestedLocalWorkSize && isMutable == false )
2027+
{
2028+
command->local_work_size.resize(work_dim);
2029+
cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize(
2030+
queue,
2031+
kernel,
2032+
work_dim,
2033+
global_work_offset,
2034+
global_work_size,
2035+
command->local_work_size.data() );
2036+
if( checkError != CL_SUCCESS )
2037+
{
2038+
command->local_work_size.clear();
2039+
}
2040+
}
19672041

19682042
g_pNextDispatch->clRetainKernel(command->original_kernel);
19692043

@@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU(
28382912
}
28392913
}
28402914

2915+
const bool isMutable = mutable_handle != nullptr;
2916+
28412917
cl_int errorCode = CL_SUCCESS;
28422918
auto command = NDRangeKernel::create(
2919+
isMutable,
28432920
properties,
28442921
cmdbuf,
28452922
command_queue,

layers/10_cmdbufemu/emulate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
extern bool g_EnhancedErrorChecking;
1313
extern bool g_KernelForProfiling;
14+
extern bool g_SuggestedLocalWorkSize;
1415

1516
extern const struct _cl_icd_dispatch* g_pNextDispatch;
1617

layers/10_cmdbufemu/main.cpp

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,16 @@
3535
bool g_EnhancedErrorChecking = false;
3636

3737
// Using kernels for profiling can fix issues with some implementations
38-
// that do not properly support event profiling on barrkers.
38+
// that do not properly support event profiling on barriers.
3939

4040
bool g_KernelForProfiling = false;
4141

42+
// Using the suggested local work-group size can reduce overhead by determining
43+
// the values for a NULL local work-group size when the command buffer is
44+
// created rather than when it is executed.
45+
46+
bool g_SuggestedLocalWorkSize = true;
47+
4248
const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
4349

4450
static cl_int CL_API_CALL
@@ -231,7 +237,7 @@ static void _init_dispatch()
231237
}
232238

233239
CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
234-
cl_layer_info param_name,
240+
cl_layer_info param_name,
235241
size_t param_value_size,
236242
void* param_value,
237243
size_t* param_value_size_ret)
@@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
251257
#if defined(CL_LAYER_NAME)
252258
case CL_LAYER_NAME:
253259
{
260+
char str[256];
261+
snprintf(str, 256, "Emulation Layer for "
262+
CL_KHR_COMMAND_BUFFER_EXTENSION_NAME
263+
" (EEC: %s, KFP: %s, SLWS: %s)",
264+
g_EnhancedErrorChecking ? "Y" : "N",
265+
g_KernelForProfiling ? "Y" : "N",
266+
g_SuggestedLocalWorkSize ? "Y" : "N");
254267
auto ptr = (char*)param_value;
255268
return writeStringToMemory(
256269
param_value_size,
257-
"Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME,
270+
str,
258271
param_value_size_ret,
259272
ptr);
260273
}
@@ -266,16 +279,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
266279
return CL_SUCCESS;
267280
}
268281

269-
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
282+
CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
270283
cl_uint num_entries,
271284
const struct _cl_icd_dispatch* target_dispatch,
272285
cl_uint* num_entries_out,
273-
const struct _cl_icd_dispatch** layer_dispatch_ret)
286+
const struct _cl_icd_dispatch** layer_dispatch_ret,
287+
const cl_layer_properties* properties)
274288
{
275289
const size_t dispatchTableSize =
276290
sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
277291

278-
if (target_dispatch == nullptr ||
292+
if (target_dispatch == nullptr ||
279293
num_entries_out == nullptr ||
280294
layer_dispatch_ret == nullptr) {
281295
return CL_INVALID_VALUE;
@@ -289,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
289303

290304
getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
291305
getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
306+
getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize);
292307

293308
g_pNextDispatch = target_dispatch;
294309

@@ -297,3 +312,17 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
297312

298313
return CL_SUCCESS;
299314
}
315+
316+
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
317+
cl_uint num_entries,
318+
const struct _cl_icd_dispatch* target_dispatch,
319+
cl_uint* num_entries_out,
320+
const struct _cl_icd_dispatch** layer_dispatch_ret)
321+
{
322+
return clInitLayerWithProperties(
323+
num_entries,
324+
target_dispatch,
325+
num_entries_out,
326+
layer_dispatch_ret,
327+
nullptr);
328+
}

layers/11_semaemu/main.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,16 +225,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
225225
return CL_SUCCESS;
226226
}
227227

228-
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
228+
CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
229229
cl_uint num_entries,
230230
const struct _cl_icd_dispatch* target_dispatch,
231231
cl_uint* num_entries_out,
232-
const struct _cl_icd_dispatch** layer_dispatch_ret)
232+
const struct _cl_icd_dispatch** layer_dispatch_ret,
233+
const cl_layer_properties* properties)
233234
{
234235
const size_t dispatchTableSize =
235236
sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
236237

237-
if (target_dispatch == nullptr ||
238+
if (target_dispatch == nullptr ||
238239
num_entries_out == nullptr ||
239240
layer_dispatch_ret == nullptr) {
240241
return CL_INVALID_VALUE;
@@ -253,3 +254,17 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
253254

254255
return CL_SUCCESS;
255256
}
257+
258+
CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
259+
cl_uint num_entries,
260+
const struct _cl_icd_dispatch* target_dispatch,
261+
cl_uint* num_entries_out,
262+
const struct _cl_icd_dispatch** layer_dispatch_ret)
263+
{
264+
return clInitLayerWithProperties(
265+
num_entries,
266+
target_dispatch,
267+
num_entries_out,
268+
layer_dispatch_ret,
269+
nullptr);
270+
}

0 commit comments

Comments
 (0)