bashbaug
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
Lines changed: 6 additions & 6 deletions
diff --git a/README.md b/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/layers/00_example/main.cpp b/layers/00_example/main.cpp
Lines changed: 17 additions & 3 deletions
diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp
Lines changed: 79 additions & 2 deletions
diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h
Lines changed: 1 addition & 0 deletions
diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp
Lines changed: 35 additions & 6 deletions
diff --git a/layers/11_semaemu/main.cpp b/layers/11_semaemu/main.cpp
Lines changed: 18 additions & 3 deletions
@@ -18,7 +18,7 @@ jobs:
   check:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
     - name: Check Tabs
       run: |
@@ -39,7 +39,7 @@ jobs:
     runs-on: ${{matrix.os}}
 
     steps:
-    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
     - name: Get Ubuntu OpenGL Dependencies
       if: matrix.os == 'ubuntu-latest'
@@ -48,27 +48,27 @@ jobs:
         sudo apt-get install -y libglfw3-dev
 
     - name: Get OpenCL Headers
-      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       with:
         repository: KhronosGroup/OpenCL-Headers
         path: external/OpenCL-Headers
 
     - name: Get OpenCL ICD Loader
-      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       with:
         repository: KhronosGroup/OpenCL-ICD-Loader
         path: external/opencl-icd-loader
 
     - name: Get OpenCL Extension Loader
       if: matrix.ext == 'YES'
-      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       with:
         repository: bashbaug/opencl-extension-loader
         path: external/opencl-extension-loader
 
     - name: Get SPIR-V Headers
       if: matrix.ext == 'YES'
-      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       with:
         repository: KhronosGroup/SPIRV-Headers
         path: external/SPIRV-Headers
 
@@ -39,7 +39,7 @@ Many samples that use extensions additionally require the OpenCL Extension Loade
 
     git clone https://github.com/bashbaug/opencl-extension-loader external/opencl-extension-loader
 
-Several samples that interact with SPIR-V require the SPIR-V headres:
+Several samples that interact with SPIR-V require the SPIR-V headers:
 
     git clone https://github.com/KhronosGroup/SPIRV-Headers external/SPIRV-Headers
 
@@ -97,4 +97,4 @@ parsing, which is licensed under the MIT License.
 ---
 OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
 
-\* Other names and brands may be claimed as the property of others.
+\* Other names and brands may be claimed as the property of others.
@@ -97,16 +97,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
     return CL_SUCCESS;
 }
 
-CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
+CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
     cl_uint num_entries,
     const struct _cl_icd_dispatch* target_dispatch,
     cl_uint* num_entries_out,
-    const struct _cl_icd_dispatch** layer_dispatch_ret)
+    const struct _cl_icd_dispatch** layer_dispatch_ret,
+    const cl_layer_properties* properties)
 {
     const size_t dispatchTableSize =
         sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
 
-    if (target_dispatch == nullptr || 
+    if (target_dispatch == nullptr ||
         num_entries_out == nullptr ||
         layer_dispatch_ret == nullptr) {
         return CL_INVALID_VALUE;
@@ -126,3 +127,16 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
     return CL_SUCCESS;
 }
 
+CL_API_ENTRY cl_int CL_API_CALL clInitLayer(  // Layer init entry point that takes no properties argument.
+    cl_uint num_entries,
+    const struct _cl_icd_dispatch* target_dispatch,
+    cl_uint* num_entries_out,
+    const struct _cl_icd_dispatch** layer_dispatch_ret)
+{
+    return clInitLayerWithProperties(  // Delegate; the four arguments are forwarded unchanged.
+        num_entries,
+        target_dispatch,
+        num_entries_out,
+        layer_dispatch_ret,
+        nullptr);  // No layer properties are supplied on this path.
+}
@@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe
 |----------------------|----------|-----------------|
 | `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue".  By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br/><br/>`set CMDBUFEMU_EnhancedErrorChecking=1` |
 | `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier.  By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br/><br/>`set CMDBUFEMU_KernelForProfiling=1` |
+| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension to eliminate `NULL` local work-group sizes.  Only takes effect when the command buffer has a single queue, the implementation supports the suggested local work-group size extension, and the command is not mutable.  By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`<br/><br/>`set CMDBUFEMU_SuggestedLocalWorkSize=0` |
 
 ## Known Limitations
 
 
@@ -817,6 +817,7 @@ struct SVMMemFill : Command
 struct NDRangeKernel : Command
 {
     static std::unique_ptr<NDRangeKernel> create(
+        const bool isMutable,
         const cl_command_properties_khr* properties,
         cl_command_buffer_khr cmdbuf,
         cl_command_queue queue,
@@ -1120,7 +1121,7 @@ struct NDRangeKernel : Command
         const size_t* global_work_size,
         const size_t* local_work_size )
     {
-        if( work_dim == 0 || 
+        if( work_dim == 0 ||
             global_work_size == nullptr ||
             local_work_size == nullptr )
         {
@@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr
             cmdbuf->TestQueues.reserve(num_queues);
             cmdbuf->BlockingEvents.reserve(num_queues);
 
+            if( cmdbuf->Queues.size() == 1 )
+            {
+                cmdbuf->setupSuggestedLocalWorkSize();
+            }
+
             for( auto queue : cmdbuf->Queues )
             {
                 g_pNextDispatch->clRetainCommandQueue(queue);
@@ -1604,7 +1610,7 @@ typedef struct _cl_command_buffer_khr
         for( const auto& command : Commands )
         {
             errorCode = command->playback(queue, deps);
-            if( (errorCode == CL_SUCCESS) && 
+            if( (errorCode == CL_SUCCESS) &&
                 isRecordQueueInOrder && !isReplayQueueInOrder )
             {
                 errorCode = g_pNextDispatch->clEnqueueBarrierWithWaitList(
@@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr
         return CL_SUCCESS;
     }
 
+    cl_int  clGetKernelSuggestedLocalWorkSize(  // Calls the stored clGetKernelSuggestedLocalWorkSizeKHR pointer on Queues[0].
+                cl_command_queue queue,  // May be nullptr; otherwise it must equal Queues[0].
+                cl_kernel kernel,
+                cl_uint work_dim,
+                const size_t* global_work_offset,
+                const size_t* global_work_size,
+                size_t* suggested_local_work_size )  // Output pointer, passed through to the extension function.
+    {
+        if( ptrGetKernelSuggestedLocalWorkSizeKHR == nullptr )  // Function pointer is unset (it defaults to nullptr).
+        {
+            return CL_INVALID_OPERATION;
+        }
+        if( queue != nullptr && queue != Queues[0] )  // An explicit queue other than the first one is rejected.
+        {
+            return CL_INVALID_COMMAND_QUEUE;
+        }
+
+        return ptrGetKernelSuggestedLocalWorkSizeKHR(  // The extension function's return code is returned as-is.
+            Queues[0],  // Always Queues[0], including when the queue argument is nullptr.
+            kernel,
+            work_dim,
+            global_work_offset,
+            global_work_size,
+            suggested_local_work_size );
+    }
+
 private:
     static constexpr cl_uint cMagic = 0x434d4442;   // "CMDB"
 
@@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr
     std::vector<std::unique_ptr<Command>> Commands;
     std::atomic<uint32_t> NextSyncPoint;
 
+    clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;
+
+    void setupSuggestedLocalWorkSize()  // Looks up clGetKernelSuggestedLocalWorkSizeKHR via the platform of Queues[0].
+    {
+        cl_device_id device = nullptr;
+        g_pNextDispatch->clGetCommandQueueInfo(  // NOTE(review): the return value is not checked.
+            Queues[0],
+            CL_QUEUE_DEVICE,
+            sizeof(device),
+            &device,
+            nullptr );
+
+        cl_platform_id platform = nullptr;
+        g_pNextDispatch->clGetDeviceInfo(  // NOTE(review): return value is not checked; device may still be nullptr here.
+            device,
+            CL_DEVICE_PLATFORM,
+            sizeof(platform),
+            &platform,
+            nullptr );
+
+        ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)  // Store whatever the lookup returns.
+            g_pNextDispatch->clGetExtensionFunctionAddressForPlatform(
+                platform,  // NOTE(review): still nullptr if either query above failed - confirm this is acceptable.
+                "clGetKernelSuggestedLocalWorkSizeKHR" );
+    }
+
     void setupTestQueue(cl_command_queue src)
     {
         if( g_EnhancedErrorChecking )
@@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr(
     Queue(queue ? queue : cmdbuf->getQueue()) {}
 
 std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
+    const bool isMutable,
     const cl_command_properties_khr* properties,
     cl_command_buffer_khr cmdbuf,
     cl_command_queue queue,
@@ -1964,6 +2023,21 @@ std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
             local_work_size,
             local_work_size + work_dim);
     }
+    else if( g_SuggestedLocalWorkSize && isMutable == false )
+    {
+        command->local_work_size.resize(work_dim);
+        cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize(
+            queue,
+            kernel,
+            work_dim,
+            global_work_offset,
+            global_work_size,
+            command->local_work_size.data() );
+        if( checkError != CL_SUCCESS )
+        {
+            command->local_work_size.clear();
+        }
+    }
 
     g_pNextDispatch->clRetainKernel(command->original_kernel);
 
@@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU(
         }
     }
 
+    const bool isMutable = mutable_handle != nullptr;
+
     cl_int errorCode = CL_SUCCESS;
     auto command = NDRangeKernel::create(
+        isMutable,
         properties,
         cmdbuf,
         command_queue,
 
@@ -11,6 +11,7 @@
 
 extern bool g_EnhancedErrorChecking;
 extern bool g_KernelForProfiling;
+extern bool g_SuggestedLocalWorkSize;
 
 extern const struct _cl_icd_dispatch* g_pNextDispatch;
 
 
@@ -35,10 +35,16 @@
 bool g_EnhancedErrorChecking = false;
 
 // Using kernels for profiling can fix issues with some implementations
-// that do not properly support event profiling on barrkers.
+// that do not properly support event profiling on barriers.
 
 bool g_KernelForProfiling = false;
 
+// Using the suggested local work-group size can reduce overhead by determining
+// the values for a NULL local work-group size when the command buffer is
+// created rather than when it is executed.
+
+bool g_SuggestedLocalWorkSize = true;
+
 const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
 
 static cl_int CL_API_CALL
@@ -231,7 +237,7 @@ static void _init_dispatch()
 }
 
 CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
-    cl_layer_info  param_name,
+    cl_layer_info param_name,
     size_t param_value_size,
     void* param_value,
     size_t* param_value_size_ret)
@@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
 #if defined(CL_LAYER_NAME)
     case CL_LAYER_NAME:
         {
+            char str[256];
+            snprintf(str, 256, "Emulation Layer for "
+                CL_KHR_COMMAND_BUFFER_EXTENSION_NAME
+                " (EEC: %s, KFP: %s, SLWS: %s)",
+                g_EnhancedErrorChecking ? "Y" : "N",
+                g_KernelForProfiling ? "Y" : "N",
+                g_SuggestedLocalWorkSize ? "Y" : "N");
             auto ptr = (char*)param_value;
             return writeStringToMemory(
                 param_value_size,
-                "Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME,
+                str,
                 param_value_size_ret,
                 ptr);
         }
@@ -266,16 +279,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
     return CL_SUCCESS;
 }
 
-CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
+CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
     cl_uint num_entries,
     const struct _cl_icd_dispatch* target_dispatch,
     cl_uint* num_entries_out,
-    const struct _cl_icd_dispatch** layer_dispatch_ret)
+    const struct _cl_icd_dispatch** layer_dispatch_ret,
+    const cl_layer_properties* properties)
 {
     const size_t dispatchTableSize =
         sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
 
-    if (target_dispatch == nullptr || 
+    if (target_dispatch == nullptr ||
         num_entries_out == nullptr ||
         layer_dispatch_ret == nullptr) {
         return CL_INVALID_VALUE;
@@ -289,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
 
     getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
     getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
+    getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize);
 
     g_pNextDispatch = target_dispatch;
 
@@ -297,3 +312,17 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
 
     return CL_SUCCESS;
 }
+
+CL_API_ENTRY cl_int CL_API_CALL clInitLayer(  // Layer init entry point that takes no properties argument.
+    cl_uint num_entries,
+    const struct _cl_icd_dispatch* target_dispatch,
+    cl_uint* num_entries_out,
+    const struct _cl_icd_dispatch** layer_dispatch_ret)
+{
+    return clInitLayerWithProperties(  // Delegate; the four arguments are forwarded unchanged.
+        num_entries,
+        target_dispatch,
+        num_entries_out,
+        layer_dispatch_ret,
+        nullptr);  // No layer properties are supplied on this path.
+}
@@ -225,16 +225,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
     return CL_SUCCESS;
 }
 
-CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
+CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
     cl_uint num_entries,
     const struct _cl_icd_dispatch* target_dispatch,
     cl_uint* num_entries_out,
-    const struct _cl_icd_dispatch** layer_dispatch_ret)
+    const struct _cl_icd_dispatch** layer_dispatch_ret,
+    const cl_layer_properties* properties)
 {
     const size_t dispatchTableSize =
         sizeof(dispatch) / sizeof(dispatch.clGetPlatformIDs);
 
-    if (target_dispatch == nullptr || 
+    if (target_dispatch == nullptr ||
         num_entries_out == nullptr ||
         layer_dispatch_ret == nullptr) {
         return CL_INVALID_VALUE;
@@ -253,3 +254,17 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
 
     return CL_SUCCESS;
 }
+
+CL_API_ENTRY cl_int CL_API_CALL clInitLayer(  // Layer init entry point that takes no properties argument.
+    cl_uint num_entries,
+    const struct _cl_icd_dispatch* target_dispatch,
+    cl_uint* num_entries_out,
+    const struct _cl_icd_dispatch** layer_dispatch_ret)
+{
+    return clInitLayerWithProperties(  // Delegate; the four arguments are forwarded unchanged.
+        num_entries,
+        target_dispatch,
+        num_entries_out,
+        layer_dispatch_ret,
+        nullptr);  // No layer properties are supplied on this path.
+}