Skip to content

Commit 9f279b0

Browse files
authored
add a queue family scenario explicitly using the same queue index (#57)
1 parent 58b6371 commit 9f279b0

2 files changed

Lines changed: 96 additions & 27 deletions

File tree

samples/10_queueexperiments/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ Some devices that do not support out-of-order command-queues can still execute c
2121
5. The fifth pattern executes independent ND-range kernels using multiple in-order command-queues, except in this scenario the command-queues are created using different command-queue indices.
2222
This pattern requires support for the [cl_intel_command_queue_families](https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_command_queue_families.html) extension.
2323
Command-queues with different command-queue indices may execute differently than ordinary command-queues.
24+
6. The sixth pattern is similar to the fifth, except in this scenario the command-queues are explicitly created using the same command-queue index.
25+
This pattern also requires support for the [cl_intel_command_queue_families](https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_command_queue_families.html) extension.
26+
Command-queues explicitly created with the same command-queue index may execute differently than ordinary command-queues.
2427
7. The seventh pattern executes independent ND-range kernels using in-order command-queues created in different OpenCL contexts.
2528
This pattern simulates the behavior of multiple applications running in parallel, or multiple isolated threads running in parallel.
2629

@@ -52,6 +55,6 @@ For GPU OpenCL devices this may require running the application in a console or
5255
|:--|:-:|:--|
5356
| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
5457
| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
55-
| `-k <number>` | 8 | Specify the number of kernels to execute for the variable execution.
58+
| `-k <number>` | 0 | Specify the number of kernels to execute for the variable execution. Must be less than or equal to 64. Specifying zero runs a sweep over different values.
5659
| `-i <number>` | 1 | Specify the number of kernel iterations to execute.
5760
| `-e <number>` | 1 | Specify the number of ND-range elements to execute (the global work size).

samples/10_queueexperiments/main.cpp

Lines changed: 92 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ using test_clock = std::chrono::high_resolution_clock;
3333
constexpr int maxKernels = 64;
3434
constexpr int testIterations = 32;
3535

36-
int numKernels = 8;
3736
int numIterations = 1;
3837
size_t numElements = 1;
3938

@@ -272,6 +271,67 @@ static void go_kernel_qf_ioqxN( cl::Context& context, cl::Device& device, const
272271
printf("Finished in %f seconds\n", best);
273272
}
274273

274+
// Executes numKernels independent ND-range kernels, one per in-order
// command-queue, where every command-queue is explicitly created from the
// same queue family and the same queue index (index zero).
//
// Each of the testIterations timed runs enqueues one kernel per queue, flushes
// every queue, and then waits for every queue to finish. The minimum elapsed
// wall-clock time across all runs is printed.
//
// The scenario is skipped (with a message) when:
//   - the device does not report cl_intel_command_queue_families,
//   - findQueueFamily reports zero queues, or
//   - a command-queue cannot be created.
//
// context:    context used to create the command-queues.
// device:     device the command-queues are created for.
// numKernels: number of kernels (and command-queues) to use; indexes the
//             file-level kernels collection, so it must not exceed its size.
static void go_kernel_qfs_ioqxN( cl::Context& context, cl::Device& device, const int numKernels )
{
    init(context, device);

    printf("%s (n=%d): ", __FUNCTION__, numKernels); fflush(stdout);

    if (!checkDeviceForExtension(device, "cl_intel_command_queue_families")) {
        printf("Skipping (device does not support cl_intel_command_queue_families).\n");
        return;
    }

    cl_uint family = 0;
    cl_uint numQueues = 0;
    findQueueFamily(device, family, numQueues);
    if (numQueues == 0) {
        printf("Skipping (no queues found?).\n");
        return;
    }
    printf("Using queue family %u with %u queue(s): ", family, numQueues); fflush(stdout);

    const cl_command_queue_properties deviceQueueProps = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
    if (deviceQueueProps & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
        // Create a dummy out-of-order queue to enable command aggregation.
        cl::CommandQueue dummy{context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE};
    }

    // Every queue uses the same property list, so build it once.
    const cl_queue_properties queueProps[] = {
        CL_QUEUE_FAMILY_INTEL, family,
        CL_QUEUE_INDEX_INTEL, 0,    // always use queue index zero
        0
    };

    std::vector<cl::CommandQueue> queues;
    for (int i = 0; i < numKernels; i++) {
        cl_int errorCode = CL_SUCCESS;
        cl_command_queue queue = clCreateCommandQueueWithProperties(
            context(), device(), queueProps, &errorCode);
        if (errorCode != CL_SUCCESS || queue == nullptr) {
            // Do not time a run against a queue that was never created.
            // Queues created so far are released when the vector goes out of scope.
            printf("Skipping (unable to create command-queue %d, error %d).\n", i, errorCode);
            return;
        }
        // The wrapper takes over the reference returned by the create call.
        queues.push_back(cl::CommandQueue{queue});
    }

    float best = 999.0f;
    for (int test = 0; test < testIterations; test++) {
        auto start = test_clock::now();
        for (int i = 0; i < numKernels; i++) {
            queues[i].enqueueNDRangeKernel(
                kernels[i],
                cl::NullRange,
                cl::NDRange{numElements});
        }
        // Flush everything before waiting so all queues are submitted
        // before the first blocking finish.
        for (int i = 0; i < numKernels; i++) {
            queues[i].flush();
        }
        for (int i = 0; i < numKernels; i++) {
            queues[i].finish();
        }

        auto end = test_clock::now();
        std::chrono::duration<float> elapsed_seconds = end - start;
        best = std::min(best, elapsed_seconds.count());
    }
    printf("Finished in %f seconds\n", best);
}
334+
275335
static void go_kernel_ctx_ioqxN( cl::Device& device, const int numKernels )
276336
{
277337
printf("%s (n=%d): ", __FUNCTION__, numKernels); fflush(stdout);
@@ -335,12 +395,13 @@ int main(
335395
{
336396
int platformIndex = 0;
337397
int deviceIndex = 0;
398+
int numKernels = 0;
338399

339400
{
340401
popl::OptionParser op("Supported Options");
341402
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
342403
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
343-
op.add<popl::Value<int>>("k", "kernels", "Kernel to Execute", numKernels, &numKernels);
404+
op.add<popl::Value<int>>("k", "kernels", "Kernel to Execute (<=0 to sweep)", numKernels, &numKernels);
344405
op.add<popl::Value<int>>("i", "iterations", "Kernel Iterations", numIterations, &numIterations);
345406
op.add<popl::Value<size_t>>("e", "elements", "Number of ND-Range Elements", numElements, &numElements);
346407
bool printUsage = false;
@@ -391,35 +452,40 @@ int main(
391452
kernels[i].setArg(1, numIterations);
392453
}
393454

394-
go_kernelxN(context, device, 1);
395-
go_kernelxN(context, device, 2);
396-
go_kernelxN(context, device, 4);
397-
go_kernelxN(context, device, numKernels);
455+
std::vector<int> counts;
456+
if (numKernels <= 0) {
457+
counts.assign({1, 2, 4, 8, 16, 32, 64});
458+
} else {
459+
counts.assign({numKernels});
460+
}
398461

399-
go_kernelxN_ooq(context, device, 1);
400-
go_kernelxN_ooq(context, device, 2);
401-
go_kernelxN_ooq(context, device, 4);
402-
go_kernelxN_ooq(context, device, numKernels);
462+
for (auto count : counts) {
463+
go_kernelxN(context, device, count);
464+
}
403465

404-
go_kernelxN_ooq_events(context, device, 1);
405-
go_kernelxN_ooq_events(context, device, 2);
406-
go_kernelxN_ooq_events(context, device, 4);
407-
go_kernelxN_ooq_events(context, device, numKernels);
466+
for (auto count : counts) {
467+
go_kernelxN_ooq(context, device, count);
468+
}
469+
470+
for (auto count : counts) {
471+
go_kernelxN_ooq_events(context, device, count);
472+
}
473+
474+
for (auto count : counts) {
475+
go_kernel_ioqxN(context, device, count);
476+
}
408477

409-
go_kernel_ioqxN(context, device, 1);
410-
go_kernel_ioqxN(context, device, 2);
411-
go_kernel_ioqxN(context, device, 4);
412-
go_kernel_ioqxN(context, device, numKernels);
478+
for (auto count : counts) {
479+
go_kernel_qf_ioqxN(context, device, count);
480+
}
413481

414-
go_kernel_qf_ioqxN(context, device, 1);
415-
go_kernel_qf_ioqxN(context, device, 2);
416-
go_kernel_qf_ioqxN(context, device, 4);
417-
go_kernel_qf_ioqxN(context, device, numKernels);
482+
for (auto count : counts) {
483+
go_kernel_qfs_ioqxN(context, device, count);
484+
}
418485

419-
go_kernel_ctx_ioqxN(device, 1);
420-
go_kernel_ctx_ioqxN(device, 2);
421-
go_kernel_ctx_ioqxN(device, 4);
422-
go_kernel_ctx_ioqxN(device, numKernels);
486+
for (auto count : counts) {
487+
go_kernel_ctx_ioqxN(device, count);
488+
}
423489

424490
return 0;
425491
}

0 commit comments

Comments
 (0)