Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/common/guided_filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -484,10 +484,8 @@ static int _guided_filter_cl_impl(int devid,
const gboolean tiling = num_tiles > 1;

// When should we avoid internal tiling and thus use CPU fallback code?
// Let's use the advantage hint if provided, otherwise assume OpenCL is 10 times faster
const float hint = darktable.opencl->dev[devid].advantage;
const float advantage = hint > 1.0f ? 1.0f / hint : 0.1f;
const gboolean possible = ((float)valid_rows / (float)tile_height) > advantage;
// Let's assume OpenCL is 10 times faster
const gboolean possible = ((float)valid_rows / (float)tile_height) > 0.1f;

if(tiling || (darktable.unmuted & DT_DEBUG_VERBOSE))
dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING,
Expand Down
60 changes: 33 additions & 27 deletions src/common/opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -359,26 +359,33 @@ static void _opencl_write_device_config(const int devid)

gchar key[256] = { 0 };
gchar dat[512] = { 0 };
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, 510, "%i %i %i %i %i %.3f %.3f",
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, sizeof(dat), "%i %i %i %i %i %.3f %.3f",
cl->dev[devid].micro_nap,
cl->dev[devid].pinned_memory,

// this used to define the number of slots, now a bool and using DT_OPENCL_EVENTS if true
cl->dev[devid].use_events ? 1 : 0,
cl->dev[devid].asyncmode,
cl->dev[devid].disabled,
cl->dev[devid].advantage,
0.0f,
cl->dev[devid].unified_fraction);
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"\n[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);

// write per device list of modules that should not use OpenCL
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, sizeof(dat), "%s", cl->dev[devid].avoid ? cl->dev[devid].avoid : "");
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);

// Also take care of extended device data, these are not only device
// specific but also depend on the devid to support systems with two
// similar cards.
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
g_snprintf(dat, 510, "%i", cl->dev[devid].headroom);
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
g_snprintf(dat, sizeof(dat), "%i", cl->dev[devid].headroom);
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);
Expand Down Expand Up @@ -413,7 +420,7 @@ static gboolean _opencl_read_device_config(const int devid)
dt_opencl_t *cl = darktable.opencl;
dt_opencl_device_t *cldid = &cl->dev[devid];
gchar key[256] = { 0 };
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);

const gboolean existing_device = dt_conf_key_not_empty(key);
gboolean safety_ok = TRUE;
Expand All @@ -435,7 +442,6 @@ static gboolean _opencl_read_device_config(const int devid)
cldid->pinned_memory = pinned_memory ? TRUE : FALSE;
cldid->asyncmode = asyncmode ? TRUE : FALSE;
cldid->disabled = disabled && dt_conf_get_int("performance_configuration_version_completed") != 19 ? TRUE : FALSE;
cldid->advantage = advantage;
cldid->unified_fraction = unified_fraction;
}

Expand All @@ -444,12 +450,14 @@ static gboolean _opencl_read_device_config(const int devid)
cldid->unified_fraction = 0.25f;
if((cldid->micro_nap < 0) || (cldid->micro_nap > 1000000))
cldid->micro_nap = 250;
if((cldid->advantage < 0.0f) || (cldid->advantage > 10000.0f))
cldid->advantage = 0.0f;

// Also read the per-device list of modules to be avoided for OpenCL
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
cldid->avoid = dt_conf_key_not_empty(key) ? dt_conf_get_string(key) : NULL;

// Also take care of extended device data, these are not only device
// specific but also depend on the devid
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
if(dt_conf_key_not_empty(key))
{
const gchar *dat = dt_conf_get_string_const(key);
Expand Down Expand Up @@ -516,6 +524,7 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
cl->dev[dev].cname = NULL;
cl->dev[dev].options = NULL;
cl->dev[dev].cflags = NULL;
cl->dev[dev].avoid = NULL;
cl->dev[dev].memory_in_use = 0;
cl->dev[dev].peak_memory = 0;
cl->dev[dev].used_available = 0;
Expand All @@ -528,7 +537,6 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
cl->dev[dev].clmem_error = FALSE;
cl->dev[dev].clroundup_wd = 16;
cl->dev[dev].clroundup_ht = 16;
cl->dev[dev].advantage = 0.0f;
cl->dev[dev].use_events = TRUE;
cl->dev[dev].asyncmode = FALSE;
cl->dev[dev].disabled = FALSE;
Expand Down Expand Up @@ -853,8 +861,8 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
if(cl->dev[dev].max_global_mem < (uint64_t)800ul * DT_MEGA)
{
dt_print_nts(DT_DEBUG_OPENCL,
" *** insufficient global memory (%" PRIu64 "MB) ***\n",
cl->dev[dev].max_global_mem / DT_MEGA);
" *** insufficient global memory %zu MB) ***\n",
(size_t)cl->dev[dev].max_global_mem / DT_MEGA);
res = TRUE;
cl->dev[dev].disabled |= TRUE;
goto end;
Expand All @@ -875,18 +883,15 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
}

dt_print_nts(DT_DEBUG_OPENCL,
" GLOBAL MEM SIZE: %.0f MB\n",
(double)cl->dev[dev].max_global_mem / (double)DT_MEGA);
" GLOBAL MEM SIZE: %zu MB\n", (size_t)(cl->dev[dev].max_global_mem / DT_MEGA));
dt_print_nts(DT_DEBUG_OPENCL,
" MAX IMAGE ALLOC: %.0f MB\n",
(double)cl->dev[dev].max_mem_alloc / (double)DT_MEGA);
" MAX IMAGE ALLOC: %zu MB\n", (size_t)(cl->dev[dev].max_mem_alloc / DT_MEGA));
dt_print_nts(DT_DEBUG_OPENCL,
" MAX IMAGE SIZE: %zd x %zd\n",
cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
" MAX IMAGE SIZE: %zu x %zu\n", cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
dt_print_nts(DT_DEBUG_OPENCL,
" MAX CONSTANT BUFFER: %.0f KB\n", (double)cl->dev[dev].max_mem_constant / 1024.0);
" MAX CONSTANT BUFFER: %zu KB\n", (size_t)(cl->dev[dev].max_mem_constant / 1024));
dt_print_nts(DT_DEBUG_OPENCL,
" LOCAL MEM SIZE: %zu KB\n", cl->dev[dev].local_size / 1024lu);
" LOCAL MEM SIZE: %zu KB\n", (size_t)(cl->dev[dev].local_size / 1024));
dt_print_nts(DT_DEBUG_OPENCL,
" ADDRESS ALIGN: %d B\n", cl->dev[dev].alignsize / 8);
dt_print_nts(DT_DEBUG_OPENCL,
Expand Down Expand Up @@ -961,10 +966,10 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
" EVENTS HANDLED: %s\n", STR_YESNO(cl->dev[dev].use_events));
dt_print_nts(DT_DEBUG_OPENCL,
" OPENCL FAST MODE: %s\n", STR_YESNO(fastopencl));
dt_print_nts(DT_DEBUG_OPENCL,
" TILING ADVANTAGE: %.3f\n", cl->dev[dev].advantage);
dt_print_nts(DT_DEBUG_OPENCL,
" DEFAULT DEVICE: %s\n", STR_YESNO(type & CL_DEVICE_TYPE_DEFAULT));
dt_print_nts(DT_DEBUG_OPENCL,
" AVOIDED MODULES: %s\n", cl->dev[dev].avoid ? cl->dev[dev].avoid : "none");

if(cl->dev[dev].disabled)
{
Expand Down Expand Up @@ -1221,6 +1226,7 @@ static void _cleanup_cl_device_mem(dt_opencl_t *cl, const int i)
free((void *)(cl->dev[i].cname));
free((void *)(cl->dev[i].options));
free((void *)(cl->dev[i].cflags));
g_free((void *)(cl->dev[i].avoid));
}

void dt_opencl_init(dt_opencl_t *cl,
Expand Down Expand Up @@ -3598,9 +3604,9 @@ void dt_opencl_memory_statistics(int devid,
{
dt_print(DT_DEBUG_OPENCL,"[opencl memory] device '%s' id=%d: %.1fMB in use, %.1fMB available GPU mem of %.1fMB",
cl->dev[devid].fullname, devid,
(float)cl->dev[devid].memory_in_use/(1024*1024),
(float)cl->dev[devid].used_available/(1024*1024),
(float)cl->dev[devid].max_global_mem/(1024*1024));
(float)cl->dev[devid].memory_in_use / DT_MEGA,
(float)cl->dev[devid].used_available / DT_MEGA,
(float)cl->dev[devid].max_global_mem / DT_MEGA);
if(cl->dev[devid].memory_in_use > darktable.opencl->dev[devid].used_available)
{
dt_print(DT_DEBUG_OPENCL,
Expand Down
3 changes: 1 addition & 2 deletions src/common/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ typedef struct dt_opencl_device_t
const char *cname;
const char *options;
const char *cflags;
const char *avoid;
cl_int summary;
size_t memory_in_use;
size_t peak_memory;
Expand Down Expand Up @@ -211,8 +212,6 @@ typedef struct dt_opencl_device_t

// Let's keep the vendor for runtime checks
int vendor_id;

float advantage;
} dt_opencl_device_t;

struct dt_bilateral_cl_global_t;
Expand Down
28 changes: 9 additions & 19 deletions src/develop/pixelpipe_hb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,13 @@ static void _opencl_dump_diff_pipe_pfm(dt_dev_pixelpipe_t *pipe,
dt_free_align(clin);
}
}

/* Tell whether the module behind `piece` must not run on the OpenCL device used by
   the piece's pipe. The device's `avoid` string is looked up through
   piece->pipe->devid and checked for the module's op name with
   dt_str_commasubstring(); a device without an `avoid` string never vetoes a module. */
static inline gboolean _avoid_cl_module(const dt_dev_pixelpipe_iop_t *piece)
{
  const char *const nocl_list = darktable.opencl->dev[piece->pipe->devid].avoid;

  // the short-circuit keeps a NULL list away from the string matcher
  return nocl_list != NULL
         && dt_str_commasubstring(nocl_list, piece->module->op);
}

#endif

static inline gboolean _skip_piece_on_tags(const dt_dev_pixelpipe_iop_t *piece)
Expand Down Expand Up @@ -2141,7 +2148,8 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
gboolean possible_cl =
module->process_cl
&& piece->process_cl_ready
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL));
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL))
&& !_avoid_cl_module(piece);

const uint32_t m_bpp = MAX(in_bpp, bpp);
const size_t m_width = MAX(roi_in.width, roi_out->width);
Expand All @@ -2155,24 +2163,6 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
{
if(!_piece_may_tile(piece))
possible_cl = FALSE;

const float advantage = darktable.opencl->dev[pipe->devid].advantage;
if(possible_cl && (advantage > 0.0f))
{
const float tilemem_cl = dt_tiling_estimate_clmem(&tiling, piece,
&roi_in, roi_out, m_bpp);
const float tilemem_cpu = dt_tiling_estimate_cpumem(&tiling, piece,
&roi_in, roi_out, m_bpp);
if((tilemem_cpu * advantage) < tilemem_cl)
{
dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING,
"[dt_dev_pixelpipetiling_cl] [%s] estimates cpu"
" advantage in `%s', (dev=%i, adv=%.2f, GPU %.2f CPU %.2f)",
dt_dev_pixelpipe_type_to_str(pipe->type), module->op, pipe->devid,
advantage, tilemem_cl / 1e9, tilemem_cpu / 1e9);
possible_cl = FALSE;
}
}
}

if(possible_cl)
Expand Down
122 changes: 0 additions & 122 deletions src/develop/tiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -1181,129 +1181,7 @@ void default_process_tiling(dt_iop_module_t *self,
return;
}

/* Estimate how much host memory processing `piece` on the CPU would take for the
   given regions of interest.

   tiling   tiling requirements of the module; factor, overhead, maxbuf, overlap and
            align are read
   piece    pixelpipe piece; its pipe is used to query the available memory
   roi_in   input region of interest
   roi_out  output region of interest
   max_bpp  per-pixel size used for every buffer computed here
            (presumably bytes per pixel -- TODO confirm)

   Returns the untiled footprint (width * height * max_bpp * factor + overhead) when
   dt_tiling_piece_fits_host_memory() accepts the piece. Otherwise a tile size is
   derived and the result is the number of tiles multiplied by the per-tile buffer
   budget. The unit is presumably bytes -- TODO confirm. */
float dt_tiling_estimate_cpumem(const dt_develop_tiling_t *tiling,
const dt_dev_pixelpipe_iop_t *piece,
const dt_iop_roi_t *const roi_in,
const dt_iop_roi_t *const roi_out,
const int max_bpp)
{
// largest extent of input and output in each dimension
const int m_dx = MAX(roi_in->width, roi_out->width);
const int m_dy = MAX(roi_in->height, roi_out->height);
// fast path: no tiling required, report the footprint of the whole buffer
if(dt_tiling_piece_fits_host_memory(piece, m_dx, m_dy, max_bpp, tiling->factor, tiling->overhead))
return (float)m_dx * m_dy * max_bpp * tiling->factor + tiling->overhead;

// input-to-output scale: the larger of the roi scale ratio and the square root of the area ratio
const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
/ ((float)roi_out->width * roi_out->height)));
// memory left for tiles after subtracting the full output buffer, the full input
// buffer and the module's overhead; clamped so it never goes negative
float available = dt_get_available_pipe_mem(piece->pipe);
available = fmaxf(available - ((float)roi_out->width * roi_out->height * max_bpp)
- ((float)roi_in->width * roi_in->height * max_bpp) - tiling->overhead, 0.0f);

// per-tile budget: the larger of available / factor and dt_get_singlebuffer_mem();
// factor and maxbuf are both treated as at least 1
float singlebuffer = dt_get_singlebuffer_mem();
const float factor = fmaxf(tiling->factor, 1.0f);
const float maxbuf = fmaxf(tiling->maxbuf, 1.0f);
singlebuffer = fmaxf(available / factor, singlebuffer);

// start with the full extent as tile size and shrink it below if it exceeds the budget
int width = MAX(roi_in->width, roi_out->width);
int height = MAX(roi_in->height, roi_out->height);

const unsigned int align = tiling->align;
if((float)width * height * max_bpp * maxbuf > singlebuffer)
{
// fraction of the current tile size that fits into the budget
const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
// for scale >= 0.333 only the longer side is cut (width on a tie),
// otherwise both sides are shrunk by sqrt(scale); results go through _align_down()
if(width < height && scale >= 0.333f)
height = _align_down((int)floorf(height * scale), align);
else if(height <= width && scale >= 0.333f)
width = _align_down((int)floorf(width * scale), align);
else
{
width = _align_down((int)floorf(width * sqrtf(scale)), align);
height = _align_down((int)floorf(height * sqrtf(scale)), align);
}
}

// if three overlaps do not fit into one side, switch to a square tile of the same area
if(3 * tiling->overlap > width || 3 * tiling->overlap > height)
width = height = _align_down((int)floorf(sqrtf((float)width * height)), align);
// overlap on the input side goes through _align_up(); the output side overlap is
// the input overlap divided by fullscale, rounded up
const int overlap_in = _align_up(tiling->overlap, align);
const int overlap_out = ceilf((float)overlap_in / fullscale);

int tiles_x = 1, tiles_y = 1;

// tiles per axis are driven by whichever roi is larger on that axis; a tile advances
// by its size minus twice the matching overlap (at least 1), and a single tile is
// used when the tile is not smaller than that roi
if(roi_in->width > roi_out->width)
tiles_x = (width < roi_in->width) ? ceilf((float)roi_in->width / (float)MAX(width - 2 * overlap_in, 1)) : 1;
else
tiles_x = (width < roi_out->width) ? ceilf((float)roi_out->width / (float)MAX(width - 2 * overlap_out, 1)) : 1;

if(roi_in->height > roi_out->height)
tiles_y = (height < roi_in->height) ? ceilf((float)roi_in->height / (float)MAX(height - 2 * overlap_in, 1)) : 1;
else
tiles_y = (height < roi_out->height) ? ceilf((float)roi_out->height / (float)MAX(height - 2 * overlap_out, 1)) : 1;
// report the tile grid on the tiling debug channel
dt_print(DT_DEBUG_TILING, "tilex = %i, tiley = %i", tiles_x, tiles_y);
// tiled estimate: every tile is charged with the full per-tile budget
return (float)tiles_x * tiles_y * singlebuffer ;
}

#ifdef HAVE_OPENCL
/* Estimate how much OpenCL device memory processing `piece` would take on the
   device of the piece's pipe for the given regions of interest.

   tiling   tiling requirements of the module; factor_cl, overhead, maxbuf_cl,
            overlap and align are read
   piece    pixelpipe piece; piece->pipe->devid selects the OpenCL device
   roi_in   input region of interest
   roi_out  output region of interest
   max_bpp  per-pixel size used for every buffer computed here
            (presumably bytes per pixel -- TODO confirm)

   Returns number of tiles * per-tile buffer budget * factor, where factor is
   factor_cl plus the pinned-memory overhead (at least 1). The unit is presumably
   bytes -- TODO confirm. */
float dt_tiling_estimate_clmem(const dt_develop_tiling_t *tiling,
const dt_dev_pixelpipe_iop_t *piece,
const dt_iop_roi_t *const roi_in,
const dt_iop_roi_t *const roi_out,
const int max_bpp)
{
const int devid = piece->pipe->devid;
// input-to-output scale: the larger of the roi scale ratio and the square root of the area ratio
const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
/ ((float)roi_out->width * roi_out->height)));
const gboolean use_pinned_memory = dt_opencl_use_pinned_memory(devid);
/* When pinned transfers are used on a device with dedicated GPU memory there is
   additional memory pressure, as buffers are also allocated on the device as a
   cache for performance. This is accounted for by adding 2.0 to the factor, and
   with pinned memory only 85% of the device's maximum allocation is used.
*/
const float pinned_buffer_overhead = use_pinned_memory && !dt_opencl_unified_memory(devid) ? 2.0f : 0.0f;
const float pinned_buffer_slack = use_pinned_memory ? 0.85f : 1.0f;
const float available = (float)dt_opencl_get_device_available(devid);
const float factor = fmaxf(tiling->factor_cl + pinned_buffer_overhead, 1.0f);
// per-tile budget: (available - overhead) / factor, never negative and capped by
// the (slack-scaled) value of dt_opencl_get_device_memalloc()
const float singlebuffer = fminf(fmaxf((available - tiling->overhead) / factor, 0.0f),
pinned_buffer_slack * (float)(dt_opencl_get_device_memalloc(devid)));
const float maxbuf = fmaxf(tiling->maxbuf_cl, 1.0f);

// start with the full extent as tile size, limited to the device's maximum image dimensions
int width = MIN(MAX(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
int height = MIN(MAX(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);

// alignment combines the module's requirement with the device's via _lcm()
const unsigned int align = _lcm(tiling->align, dt_opencl_tiling_align(devid));

if((float)width * height * max_bpp * maxbuf > singlebuffer)
{
// fraction of the current tile size that fits into the budget
const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);

// for scale >= 0.333 only the longer side is cut (width on a tie),
// otherwise both sides are shrunk by sqrt(scale); results go through _align_down()
if(width < height && scale >= 0.333f)
height = _align_down((int)floorf(height * scale), align);
else if(height <= width && scale >= 0.333f)
width = _align_down((int)floorf(width * scale), align);
else
{
width = _align_down((int)floorf(width * sqrtf(scale)), align);
height = _align_down((int)floorf(height * sqrtf(scale)), align);
}
}

// if three overlaps do not fit into one side, switch to a square tile of the same area
if(3 * tiling->overlap > width || 3 * tiling->overlap > height)
width = height = _align_down((int)floorf(sqrtf((float)width * height)), align);

// overlap on the input side goes through _align_up(); the output side overlap is
// the input overlap divided by fullscale, rounded up
const int overlap_in = _align_up(tiling->overlap, align);
const int overlap_out = ceilf((float)overlap_in / fullscale);

int tiles_x = 1, tiles_y = 1;

// tiles per axis are driven by whichever roi is larger on that axis; a tile advances
// by its size minus twice the matching overlap (at least 1), and a single tile is
// used when the tile is not smaller than that roi
if(roi_in->width > roi_out->width)
tiles_x = (width < roi_in->width) ? ceilf((float)roi_in->width / (float)MAX(width - 2 * overlap_in, 1)) : 1;
else
tiles_x = (width < roi_out->width) ? ceilf((float)roi_out->width / (float)MAX(width - 2 * overlap_out, 1)) : 1;

if(roi_in->height > roi_out->height)
tiles_y = (height < roi_in->height) ? ceilf((float)roi_in->height / (float)MAX(height - 2 * overlap_in, 1)) : 1;
else
tiles_y = (height < roi_out->height) ? ceilf((float)roi_out->height / (float)MAX(height - 2 * overlap_out, 1)) : 1;

// every tile is charged with the per-tile budget times the memory factor
return (float)tiles_x * tiles_y * singlebuffer * factor;
}

/* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
dt_dev_pixelpipe_iop_t *piece,
Expand Down
9 changes: 0 additions & 9 deletions src/develop/tiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,6 @@ void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t
/* NOTE(review): the implementation is not visible from this header. Judging by the
   name and parameters it presumably reports whether a buffer of width x height
   pixels of size bpp, scaled by factor and with overhead added, fits into the host
   memory available to the piece's pipe -- confirm against the definition. */
gboolean dt_tiling_piece_fits_host_memory(const struct dt_dev_pixelpipe_iop_t *piece, const size_t width, const size_t height, const unsigned bpp,
const float factor, const size_t overhead);

/* Estimate the host memory needed to process `piece` on the CPU for the given input
   and output regions of interest. The result is the untiled footprint when the piece
   fits host memory, otherwise the number of tiles times the per-tile buffer budget.
   max_bpp is the per-pixel size used for the buffer computations. */
float dt_tiling_estimate_cpumem(const dt_develop_tiling_t *tiling, const struct dt_dev_pixelpipe_iop_t *piece,
const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
const int max_bpp);

#ifdef HAVE_OPENCL
/* Estimate the OpenCL device memory needed to process `piece` on the device of its
   pipe for the given input and output regions of interest. The result is the number
   of tiles times the per-tile buffer budget times the module's OpenCL memory factor.
   max_bpp is the per-pixel size used for the buffer computations. */
float dt_tiling_estimate_clmem(const dt_develop_tiling_t *tiling, const struct dt_dev_pixelpipe_iop_t *piece,
const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
const int max_bpp);
#endif
// clang-format off
// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
// vim: shiftwidth=2 expandtab tabstop=2 cindent
Expand Down
Loading