Skip to content

Commit a813ccd

Browse files
No OpenCL advantage hints
Until now we had the per-device advantage setting; it was used to decide whether tiled processing was worth doing via OpenCL. That decision was based on the assumption that comparing the overall amount of data processed on the CPU vs. OpenCL, given a known "performance ratio", would be a good bet. Unfortunately, this didn't work that well. 1. The "calibration" was difficult, and thus the advantage feature was used by only very few people. 2. The assumption was often misleading. Still, on some systems, like those with very small OpenCL memory but decent CPUs, it might be worthwhile to have non-demanding modules (like exposure, temperature ...) running on the GPU but exclude those requiring a lot of graphics memory. So let's 1. remove the advantage feature 2. introduce a per-device entry in the conf database, '***device***_nocl'. This can hold a comma-separated list of module->so names. Any module in this list will be executed on the CPU instead of OpenCL. It would be tempting to allow users to toggle OpenCL processing on/off in the module header, but that would only make sense for systems with a single OpenCL device. - some minor code cleanup and simplifications when reading/writing the per-device conf - improved logs for unified memory
1 parent a16f69a commit a813ccd

6 files changed

Lines changed: 52 additions & 187 deletions

File tree

src/common/guided_filter.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,10 +484,8 @@ static int _guided_filter_cl_impl(int devid,
484484
const gboolean tiling = num_tiles > 1;
485485

486486
// When should we avoid internal tiling and thus use CPU fallback code?
487-
// Lets use advantage hint if provided or assume OpenCL is 10 times faster
488-
const float hint = darktable.opencl->dev[devid].advantage;
489-
const float advantage = hint > 1.0f ? 1.0f / hint : 0.1f;
490-
const gboolean possible = ((float)valid_rows / (float)tile_height) > advantage;
487+
// Lets assume OpenCL is 10 times faster
488+
const gboolean possible = ((float)valid_rows / (float)tile_height) > 0.1f;
491489

492490
if(tiling || (darktable.unmuted & DT_DEBUG_VERBOSE))
493491
dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING,

src/common/opencl.c

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -359,26 +359,33 @@ static void _opencl_write_device_config(const int devid)
359359

360360
gchar key[256] = { 0 };
361361
gchar dat[512] = { 0 };
362-
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
363-
g_snprintf(dat, 510, "%i %i %i %i %i %.3f %.3f",
362+
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
363+
g_snprintf(dat, sizeof(dat), "%i %i %i %i %i %.3f %.3f",
364364
cl->dev[devid].micro_nap,
365365
cl->dev[devid].pinned_memory,
366366

367367
// this used to define the number of slots, now a bool and using DT_OPENCL_EVENTS if true
368368
cl->dev[devid].use_events ? 1 : 0,
369369
cl->dev[devid].asyncmode,
370370
cl->dev[devid].disabled,
371-
cl->dev[devid].advantage,
371+
0.0f,
372372
cl->dev[devid].unified_fraction);
373373
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
374-
"\n[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
374+
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
375+
dt_conf_set_string(key, dat);
376+
377+
// write per device list of modules that should not use OpenCL
378+
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
379+
g_snprintf(dat, sizeof(dat), "%s", cl->dev[devid].avoid ? cl->dev[devid].avoid : "");
380+
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
381+
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
375382
dt_conf_set_string(key, dat);
376383

377384
// Also take care of extended device data, these are not only device
378385
// specific but also depend on the devid to support systems with two
379386
// similar cards.
380-
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
381-
g_snprintf(dat, 510, "%i", cl->dev[devid].headroom);
387+
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
388+
g_snprintf(dat, sizeof(dat), "%i", cl->dev[devid].headroom);
382389
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
383390
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
384391
dt_conf_set_string(key, dat);
@@ -413,7 +420,7 @@ static gboolean _opencl_read_device_config(const int devid)
413420
dt_opencl_t *cl = darktable.opencl;
414421
dt_opencl_device_t *cldid = &cl->dev[devid];
415422
gchar key[256] = { 0 };
416-
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
423+
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
417424

418425
const gboolean existing_device = dt_conf_key_not_empty(key);
419426
gboolean safety_ok = TRUE;
@@ -425,17 +432,16 @@ static gboolean _opencl_read_device_config(const int devid)
425432
int events;
426433
int asyncmode;
427434
int disabled;
428-
float advantage;
435+
float dummy;
429436
float unified_fraction;
430437
sscanf(dat, "%i %i %i %i %i %f %f",
431-
&micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &advantage, &unified_fraction);
438+
&micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &dummy, &unified_fraction);
432439

433440
cldid->use_events = events ? TRUE : FALSE;
434441
cldid->micro_nap = micro_nap;
435442
cldid->pinned_memory = pinned_memory ? TRUE : FALSE;
436443
cldid->asyncmode = asyncmode ? TRUE : FALSE;
437444
cldid->disabled = disabled ? TRUE : FALSE;
438-
cldid->advantage = advantage;
439445
cldid->unified_fraction = unified_fraction;
440446
}
441447

@@ -444,12 +450,14 @@ static gboolean _opencl_read_device_config(const int devid)
444450
cldid->unified_fraction = 0.25f;
445451
if((cldid->micro_nap < 0) || (cldid->micro_nap > 1000000))
446452
cldid->micro_nap = 250;
447-
if((cldid->advantage < 0.0f) || (cldid->advantage > 10000.0f))
448-
cldid->advantage = 0.0f;
453+
454+
// Also read the per-device list of modules to be avoided for OpenCL
455+
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
456+
cldid->avoid = dt_conf_key_not_empty(key) ? dt_conf_get_string(key) : NULL;
449457

450458
// Also take care of extended device data, these are not only device
451459
// specific but also depend on the devid
452-
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
460+
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
453461
if(dt_conf_key_not_empty(key))
454462
{
455463
const gchar *dat = dt_conf_get_string_const(key);
@@ -516,6 +524,7 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
516524
cl->dev[dev].cname = NULL;
517525
cl->dev[dev].options = NULL;
518526
cl->dev[dev].cflags = NULL;
527+
cl->dev[dev].avoid = NULL;
519528
cl->dev[dev].memory_in_use = 0;
520529
cl->dev[dev].peak_memory = 0;
521530
cl->dev[dev].used_available = 0;
@@ -528,7 +537,6 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
528537
cl->dev[dev].clmem_error = FALSE;
529538
cl->dev[dev].clroundup_wd = 16;
530539
cl->dev[dev].clroundup_ht = 16;
531-
cl->dev[dev].advantage = 0.0f;
532540
cl->dev[dev].use_events = TRUE;
533541
cl->dev[dev].asyncmode = FALSE;
534542
cl->dev[dev].disabled = FALSE;
@@ -785,13 +793,16 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
785793
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE VERSION: %s API=%s\n",
786794
cl->dev[dev].device_version,
787795
cl->api30 ? "300" : "120");
788-
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s\n",
796+
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s",
789797
((type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) ? "CPU" : "",
790798
((type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) ? "GPU" : "",
791799
((type & CL_DEVICE_TYPE_CUSTOM) == CL_DEVICE_TYPE_CUSTOM) ? "CUSTOM" : "",
792800
(type & CL_DEVICE_TYPE_ACCELERATOR) ? ", Accelerator" : "",
793801
unified_memory ? ", unified mem" : ", dedicated mem" );
794802

803+
if(unified_memory) dt_print_nts(DT_DEBUG_OPENCL, " (%i%%)\n", (int)(100.f * cl->dev[dev].unified_fraction));
804+
else dt_print_nts(DT_DEBUG_OPENCL, "\n");
805+
795806
if(is_custom_device && newdevice)
796807
{
797808
dt_print_nts(DT_DEBUG_OPENCL,
@@ -853,8 +864,8 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
853864
if(cl->dev[dev].max_global_mem < (uint64_t)800ul * DT_MEGA)
854865
{
855866
dt_print_nts(DT_DEBUG_OPENCL,
856-
" *** insufficient global memory (%" PRIu64 "MB) ***\n",
857-
cl->dev[dev].max_global_mem / DT_MEGA);
867+
" *** insufficient global memory %zu MB) ***\n",
868+
(size_t)cl->dev[dev].max_global_mem / DT_MEGA);
858869
res = TRUE;
859870
cl->dev[dev].disabled |= TRUE;
860871
goto end;
@@ -875,18 +886,15 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
875886
}
876887

877888
dt_print_nts(DT_DEBUG_OPENCL,
878-
" GLOBAL MEM SIZE: %.0f MB\n",
879-
(double)cl->dev[dev].max_global_mem / (double)DT_MEGA);
889+
" GLOBAL MEM SIZE: %zu MB\n", (size_t)(cl->dev[dev].max_global_mem / DT_MEGA));
880890
dt_print_nts(DT_DEBUG_OPENCL,
881-
" MAX IMAGE ALLOC: %.0f MB\n",
882-
(double)cl->dev[dev].max_mem_alloc / (double)DT_MEGA);
891+
" MAX IMAGE ALLOC: %zu MB\n", (size_t)(cl->dev[dev].max_mem_alloc / DT_MEGA));
883892
dt_print_nts(DT_DEBUG_OPENCL,
884-
" MAX IMAGE SIZE: %zd x %zd\n",
885-
cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
893+
" MAX IMAGE SIZE: %zu x %zu\n", cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
886894
dt_print_nts(DT_DEBUG_OPENCL,
887-
" MAX CONSTANT BUFFER: %.0f KB\n", (double)cl->dev[dev].max_mem_constant / 1024.0);
895+
" MAX CONSTANT BUFFER: %zu KB\n", (size_t)(cl->dev[dev].max_mem_constant / 1024));
888896
dt_print_nts(DT_DEBUG_OPENCL,
889-
" LOCAL MEM SIZE: %zu KB\n", cl->dev[dev].local_size / 1024lu);
897+
" LOCAL MEM SIZE: %zu KB\n", (size_t)(cl->dev[dev].local_size / 1024));
890898
dt_print_nts(DT_DEBUG_OPENCL,
891899
" ADDRESS ALIGN: %d B\n", cl->dev[dev].alignsize / 8);
892900
dt_print_nts(DT_DEBUG_OPENCL,
@@ -961,10 +969,10 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
961969
" EVENTS HANDLED: %s\n", STR_YESNO(cl->dev[dev].use_events));
962970
dt_print_nts(DT_DEBUG_OPENCL,
963971
" OPENCL FAST MODE: %s\n", STR_YESNO(fastopencl));
964-
dt_print_nts(DT_DEBUG_OPENCL,
965-
" TILING ADVANTAGE: %.3f\n", cl->dev[dev].advantage);
966972
dt_print_nts(DT_DEBUG_OPENCL,
967973
" DEFAULT DEVICE: %s\n", STR_YESNO(type & CL_DEVICE_TYPE_DEFAULT));
974+
dt_print_nts(DT_DEBUG_OPENCL,
975+
" AVOIDED MODULES: %s\n", cl->dev[dev].avoid ? cl->dev[dev].avoid : "none");
968976

969977
if(cl->dev[dev].disabled)
970978
{
@@ -1221,6 +1229,7 @@ static void _cleanup_cl_device_mem(dt_opencl_t *cl, const int i)
12211229
free((void *)(cl->dev[i].cname));
12221230
free((void *)(cl->dev[i].options));
12231231
free((void *)(cl->dev[i].cflags));
1232+
g_free((void *)(cl->dev[i].avoid));
12241233
}
12251234

12261235
void dt_opencl_init(dt_opencl_t *cl,
@@ -1607,7 +1616,7 @@ void dt_opencl_init(dt_opencl_t *cl,
16071616
dt_opencl_scheduling_profile_t profile = _opencl_get_scheduling_profile();
16081617
_opencl_apply_scheduling_profile(profile);
16091618

1610-
// let's keep track on unified memory devices
1619+
// let's report unified memory per device
16111620
dt_sys_resources_t *res = &darktable.dtresources;
16121621
for(int i = 0; i < cl->num_devs; i++)
16131622
{
@@ -3598,9 +3607,9 @@ void dt_opencl_memory_statistics(int devid,
35983607
{
35993608
dt_print(DT_DEBUG_OPENCL,"[opencl memory] device '%s' id=%d: %.1fMB in use, %.1fMB available GPU mem of %.1fMB",
36003609
cl->dev[devid].fullname, devid,
3601-
(float)cl->dev[devid].memory_in_use/(1024*1024),
3602-
(float)cl->dev[devid].used_available/(1024*1024),
3603-
(float)cl->dev[devid].max_global_mem/(1024*1024));
3610+
(float)cl->dev[devid].memory_in_use / DT_MEGA,
3611+
(float)cl->dev[devid].used_available / DT_MEGA,
3612+
(float)cl->dev[devid].max_global_mem / DT_MEGA);
36043613
if(cl->dev[devid].memory_in_use > darktable.opencl->dev[devid].used_available)
36053614
{
36063615
dt_print(DT_DEBUG_OPENCL,

src/common/opencl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ typedef struct dt_opencl_device_t
151151
const char *cname;
152152
const char *options;
153153
const char *cflags;
154+
const char *avoid;
154155
cl_int summary;
155156
size_t memory_in_use;
156157
size_t peak_memory;
@@ -211,8 +212,6 @@ typedef struct dt_opencl_device_t
211212

212213
// lets keep the vendor for runtime checks
213214
int vendor_id;
214-
215-
float advantage;
216215
} dt_opencl_device_t;
217216

218217
struct dt_bilateral_cl_global_t;

src/develop/pixelpipe_hb.c

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,6 +1694,13 @@ static void _opencl_dump_diff_pipe_pfm(dt_dev_pixelpipe_t *pipe,
16941694
dt_free_align(clin);
16951695
}
16961696
}
1697+
1698+
static inline gboolean _avoid_cl_module(const dt_dev_pixelpipe_iop_t *piece)
1699+
{
1700+
const dt_opencl_device_t *cldid = &darktable.opencl->dev[piece->pipe->devid];
1701+
return cldid->avoid && dt_str_commasubstring(cldid->avoid, piece->module->op);
1702+
}
1703+
16971704
#endif
16981705

16991706
static inline gboolean _skip_piece_on_tags(const dt_dev_pixelpipe_iop_t *piece)
@@ -2140,7 +2147,8 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
21402147
gboolean possible_cl =
21412148
module->process_cl
21422149
&& piece->process_cl_ready
2143-
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL));
2150+
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL))
2151+
&& !_avoid_cl_module(piece);
21442152

21452153
const uint32_t m_bpp = MAX(in_bpp, bpp);
21462154
const size_t m_width = MAX(roi_in.width, roi_out->width);
@@ -2154,24 +2162,6 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
21542162
{
21552163
if(!_piece_may_tile(piece))
21562164
possible_cl = FALSE;
2157-
2158-
const float advantage = darktable.opencl->dev[pipe->devid].advantage;
2159-
if(possible_cl && (advantage > 0.0f))
2160-
{
2161-
const float tilemem_cl = dt_tiling_estimate_clmem(&tiling, piece,
2162-
&roi_in, roi_out, m_bpp);
2163-
const float tilemem_cpu = dt_tiling_estimate_cpumem(&tiling, piece,
2164-
&roi_in, roi_out, m_bpp);
2165-
if((tilemem_cpu * advantage) < tilemem_cl)
2166-
{
2167-
dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING,
2168-
"[dt_dev_pixelpipetiling_cl] [%s] estimates cpu"
2169-
" advantage in `%s', (dev=%i, adv=%.2f, GPU %.2f CPU %.2f)",
2170-
dt_dev_pixelpipe_type_to_str(pipe->type), module->op, pipe->devid,
2171-
advantage, tilemem_cl / 1e9, tilemem_cpu / 1e9);
2172-
possible_cl = FALSE;
2173-
}
2174-
}
21752165
}
21762166

21772167
if(possible_cl)

0 commit comments

Comments
 (0)