add a queue family scenario explicitly using the same queue index (#57)

bashbaug · web-flow · commit 9f279b02db16 · 2022-06-16T14:58:45.000-07:00
diff --git a/samples/10_queueexperiments/README.md b/samples/10_queueexperiments/README.md
@@ -21,6 +21,9 @@ Some devices that do not support out-of-order command-queues can still execute c
 5. The fifth pattern executes independent ND-range kernels using multiple in-order command-queues, except in this scenario the command-queues are created using different command-queue indices.
 This pattern requires support for the [cl_intel_command_queue_families](https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_command_queue_families.html) extension.
 Command-queues with different command-queue indices may execute differently than ordinary command-queues.
+6. The sixth pattern is similar to the fifth, except in this scenario the command-queues are explicitly created using the same command-queue index.
+This pattern also requires support for the [cl_intel_command_queue_families](https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_command_queue_families.html) extension.
+Command-queues created with the same command-queue index may execute differently than ordinary command-queues.
 6. The sixth pattern executes independent ND-range kernels using in-order command-queues created different OpenCL contexts.
 This pattern simulates the behavior of multiple applications running in parallel, or multiple isolated threads running in parallel.
 
@@ -52,6 +55,6 @@ For GPU OpenCL devices this may require running the application in a console or
 |:--|:-:|:--|
 | `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute on the sample on.
 | `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
-| `-k <number>` | 8 | Specify the number of kernels to execute for the variable execution.
+| `-k <number>` | 0 | Specify the number of kernels to execute for the variable execution.  Must be less than or equal to 64.  Specifying zero runs a sweep over different values.
 | `-i <number>` | 1 | Specify the number of kernel iterations to execute.
 | `-e <number>` | 1 | Specify the number of ND-range elements to execute (the global work size).
diff --git a/samples/10_queueexperiments/main.cpp b/samples/10_queueexperiments/main.cpp
@@ -33,7 +33,6 @@ using test_clock = std::chrono::high_resolution_clock;
 constexpr int maxKernels = 64;
 constexpr int testIterations = 32;
 
-int numKernels = 8;
 int numIterations = 1;
 size_t numElements = 1;
 
@@ -272,6 +271,67 @@ static void go_kernel_qf_ioqxN( cl::Context& context, cl::Device& device, const
     printf("Finished in %f seconds\n", best);
 }
 
+static void go_kernel_qfs_ioqxN( cl::Context& context, cl::Device& device, const int numKernels )
+{   // Times numKernels kernel launches, one per queue, with every queue created on queue index 0 of one family.
+    init(context, device);  // helper defined outside this hunk
+
+    printf("%s (n=%d): ", __FUNCTION__, numKernels); fflush(stdout);
+
+    if (!checkDeviceForExtension(device, "cl_intel_command_queue_families")) {
+        printf("Skipping (device does not support cl_intel_command_queue_families).\n");
+        return;
+    }
+
+    cl_uint family = 0;
+    cl_uint numQueues = 0;
+    findQueueFamily(device, family, numQueues);  // presumably fills family and numQueues; helper not visible here
+    if (numQueues == 0) {
+        printf("Skipping (no queues found?).\n");
+        return;
+    } else {
+        printf("Using queue family %u with %u queue(s): ", family, numQueues); fflush(stdout);
+    }
+
+    cl_command_queue_properties props = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
+    if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
+        // Create a dummy out-of-order queue to enable command aggregation.
+        cl::CommandQueue dummy{context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE};
+    }   // dummy goes out of scope here; it is never used to enqueue work
+
+    std::vector<cl::CommandQueue> queues;
+    for (int i = 0; i < numKernels; i++) {
+        cl_queue_properties props[] = {  // shadows the outer props variable (different type)
+            CL_QUEUE_FAMILY_INTEL, family,
+            CL_QUEUE_INDEX_INTEL, 0,    // always use queue index zero
+            0  // terminates the property list
+        };
+        queues.push_back(cl::CommandQueue{
+            clCreateCommandQueueWithProperties(context(), device(), props, NULL)});  // NULL errcode_ret: creation failure is not checked
+    }
+
+    float best = 999.0f;  // seconds; starting value that the first measured time is expected to replace
+    for (int test = 0; test < testIterations; test++) {
+        auto start = test_clock::now();
+        for (int i = 0; i < numKernels; i++) {
+            queues[i].enqueueNDRangeKernel(
+                kernels[i],  // kernels is declared elsewhere in the file; assumes numKernels <= its size (maxKernels?) - TODO confirm
+                cl::NullRange,
+                cl::NDRange{numElements});
+        }
+        for (int i = 0; i < numKernels; i++) {
+            queues[i].flush();  // flush every queue before waiting on any of them
+        }
+        for (int i = 0; i < numKernels; i++) {
+            queues[i].finish();
+        }
+
+        auto end = test_clock::now();
+        std::chrono::duration<float> elapsed_seconds = end - start;
+        best = std::min(best, elapsed_seconds.count());  // keep the fastest of the testIterations runs
+    }
+    printf("Finished in %f seconds\n", best);
+}
+
 static void go_kernel_ctx_ioqxN( cl::Device& device, const int numKernels )
 {
     printf("%s (n=%d): ", __FUNCTION__, numKernels); fflush(stdout);
@@ -335,12 +395,13 @@ int main(
 {
     int platformIndex = 0;
     int deviceIndex = 0;
+    int numKernels = 0;
 
     {
         popl::OptionParser op("Supported Options");
         op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
         op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
-        op.add<popl::Value<int>>("k", "kernels", "Kernel to Execute", numKernels, &numKernels);
+        op.add<popl::Value<int>>("k", "kernels", "Kernel to Execute (<=0 to sweep)", numKernels, &numKernels);
         op.add<popl::Value<int>>("i", "iterations", "Kernel Iterations", numIterations, &numIterations);
         op.add<popl::Value<size_t>>("e", "elements", "Number of ND-Range Elements", numElements, &numElements);
         bool printUsage = false;
@@ -391,35 +452,40 @@ int main(
         kernels[i].setArg(1, numIterations);
     }
 
-    go_kernelxN(context, device, 1);
-    go_kernelxN(context, device, 2);
-    go_kernelxN(context, device, 4);
-    go_kernelxN(context, device, numKernels);
+    std::vector<int> counts;
+    if (numKernels <= 0) {
+        counts.assign({1, 2, 4, 8, 16, 32, 64});
+    } else {
+        counts.assign({numKernels});
+    }
 
-    go_kernelxN_ooq(context, device, 1);
-    go_kernelxN_ooq(context, device, 2);
-    go_kernelxN_ooq(context, device, 4);
-    go_kernelxN_ooq(context, device, numKernels);
+    for (auto count : counts) {
+        go_kernelxN(context, device, count);
+    }
 
-    go_kernelxN_ooq_events(context, device, 1);
-    go_kernelxN_ooq_events(context, device, 2);
-    go_kernelxN_ooq_events(context, device, 4);
-    go_kernelxN_ooq_events(context, device, numKernels);
+    for (auto count : counts) {
+        go_kernelxN_ooq(context, device, count);
+    }
+
+    for (auto count : counts) {
+        go_kernelxN_ooq_events(context, device, count);
+    }
+
+    for (auto count : counts) {
+        go_kernel_ioqxN(context, device, count);
+    }
 
-    go_kernel_ioqxN(context, device, 1);
-    go_kernel_ioqxN(context, device, 2);
-    go_kernel_ioqxN(context, device, 4);
-    go_kernel_ioqxN(context, device, numKernels);
+    for (auto count : counts) {
+        go_kernel_qf_ioqxN(context, device, count);
+    }
 
-    go_kernel_qf_ioqxN(context, device, 1);
-    go_kernel_qf_ioqxN(context, device, 2);
-    go_kernel_qf_ioqxN(context, device, 4);
-    go_kernel_qf_ioqxN(context, device, numKernels);
+    for (auto count : counts) {
+        go_kernel_qfs_ioqxN(context, device, count);
+    }
 
-    go_kernel_ctx_ioqxN(device, 1);
-    go_kernel_ctx_ioqxN(device, 2);
-    go_kernel_ctx_ioqxN(device, 4);
-    go_kernel_ctx_ioqxN(device, numKernels);
+    for (auto count : counts) {
+        go_kernel_ctx_ioqxN(device, count);
+    }
 
     return 0;
 }