Skip to content

Commit 8a619fc

Browse files
Some fixes related to unified OpenCL memory
1. The remaining total system memory was wrongly reduced too far when using unified OpenCL memory, possibly leading to decreased performance and to wrong log reports later on. 2. The available-memory restrictions due to OpenCL and the resource level are now calculated inside dt_opencl_update_settings(); this is called at startup and whenever preferences change, so the cl_mem settings are always up to date and are logged there. 3. The checks for available cl_mem when allocating buffers/images have been fixed; this is relevant for small devices with unified memory. 4. Simplified the pipe-start logs: we no longer do any checks there, as they are now done in (2).
1 parent b3c41a1 commit 8a619fc

5 files changed

Lines changed: 72 additions & 90 deletions

File tree

src/common/darktable.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1830,6 +1830,7 @@ int dt_init(int argc,
18301830
size_t total_mb = _get_total_memory() / 1024lu;
18311831
if(total_mb < 8192) total_mb -= 1024;
18321832
res->total_memory = total_mb * DT_MEGA;
1833+
res->cl_uni_memory = 0;
18331834

18341835
char *config_info = calloc(1, DT_PERF_INFOSIZE);
18351836
if(last_configure_version != DT_CURRENT_PERFORMANCE_CONFIGURE_VERSION
@@ -2457,7 +2458,7 @@ size_t dt_get_available_mem()
24572458
return res->refresource[4*(-level-1)] * DT_MEGA;
24582459

24592460
const int fraction = res->fractions[4*level];
2460-
return MAX(512lu * DT_MEGA, res->total_memory / 1024lu * fraction);
2461+
return MAX(512lu * DT_MEGA, (res->total_memory - res->cl_uni_memory) / 1024lu * fraction);
24612462
}
24622463

24632464
size_t dt_get_singlebuffer_mem()

src/common/darktable.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ typedef struct dt_sys_resources_t
395395
{
396396
size_t total_memory;
397397
size_t mipmap_memory;
398+
size_t cl_uni_memory;
398399
int *fractions; // fractions are calculated as res=input / 1024 * fraction
399400
int *refresource; // for the debug resource modes we use fixed settings
400401
int level;

src/common/opencl.c

Lines changed: 61 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,23 +1606,6 @@ void dt_opencl_init(dt_opencl_t *cl,
16061606
// priorities and pixelpipe synchronization timeout
16071607
dt_opencl_scheduling_profile_t profile = _opencl_get_scheduling_profile();
16081608
_opencl_apply_scheduling_profile(profile);
1609-
1610-
// let's keep track on unified memory devices
1611-
dt_sys_resources_t *res = &darktable.dtresources;
1612-
for(int i = 0; i < cl->num_devs; i++)
1613-
{
1614-
if(cl->dev[i].unified_memory)
1615-
{
1616-
const size_t reserved = MIN(cl->dev[i].max_global_mem, res->total_memory * cl->dev[i].unified_fraction);
1617-
cl->dev[i].max_global_mem = reserved;
1618-
cl->dev[i].max_mem_alloc = MIN(cl->dev[i].max_mem_alloc, reserved);
1619-
dt_print_nts(DT_DEBUG_OPENCL,
1620-
" UNIFIED MEM SIZE: %.0f MB (%i%%) reserved for '%s' id=%d\n",
1621-
(double)reserved / 1024.0 / 1024.0, (int)(100.0f * cl->dev[i].unified_fraction),
1622-
cl->dev[i].cname, i);
1623-
res->total_memory -= reserved;
1624-
}
1625-
}
16261609
}
16271610
else // initialization failed
16281611
{
@@ -3412,13 +3395,19 @@ void *dt_opencl_alloc_device(const int devid,
34123395
return dev;
34133396
}
34143397

3398+
static cl_ulong _opencl_get_device_memalloc(const int devid)
3399+
{
3400+
dt_opencl_t *cl = darktable.opencl;
3401+
return MIN(cl->dev[devid].used_available, cl->dev[devid].max_mem_alloc);
3402+
}
3403+
34153404
void *dt_opencl_alloc_device_buffer(const int devid,
34163405
const size_t size)
34173406
{
34183407
if(!_cldev_running(devid))
34193408
return NULL;
34203409
dt_opencl_t *cl = darktable.opencl;
3421-
if(cl->dev[devid].max_mem_alloc < size)
3410+
if(_opencl_get_device_memalloc(devid) < size)
34223411
return NULL;
34233412
cl_int err = CL_SUCCESS;
34243413

@@ -3443,7 +3432,7 @@ void *dt_opencl_alloc_device_buffer_with_flags(const int devid,
34433432
if(!_cldev_running(devid))
34443433
return NULL;
34453434
dt_opencl_t *cl = darktable.opencl;
3446-
if(cl->dev[devid].max_mem_alloc < size)
3435+
if(_opencl_get_device_memalloc(devid) < size)
34473436
return NULL;
34483437

34493438
cl_int err = CL_SUCCESS;
@@ -3610,61 +3599,12 @@ void dt_opencl_memory_statistics(int devid,
36103599
}
36113600
}
36123601

3613-
/* amount of graphics memory declared as available depends on max_global_mem and
3614-
"resourcelevel". We guarantee
3615-
- a headroom of DT_OPENCL_DEFAULT_HEADROOM MB in all cases not using tuned cl
3616-
- 256MB to simulate a minimum system
3617-
- 2GB to simulate a reference system
3618-
*/
3619-
void dt_opencl_check_tuning(const int devid)
3620-
{
3621-
dt_sys_resources_t *res = &darktable.dtresources;
3622-
dt_opencl_t *cl = darktable.opencl;
3623-
if(!_cldev_running(devid)) return;
3624-
3625-
const int level = res->level;
3626-
const gboolean tunehead = cl->num_devs > 1
3627-
&& level >= 0
3628-
&& !dt_gimpmode()
3629-
&& dt_conf_get_bool("opencl_tune_headroom");
3630-
3631-
cl->dev[devid].tunehead = tunehead;
3632-
3633-
if(level < 0)
3634-
{
3635-
cl->dev[devid].used_available = res->refresource[4*(-level-1) + 3] * DT_MEGA;
3636-
}
3637-
else
3638-
{
3639-
const size_t allmem = cl->dev[devid].max_global_mem;
3640-
const size_t lowmem = 256ul * DT_MEGA;
3641-
const size_t dhead = DT_OPENCL_DEFAULT_HEADROOM * DT_MEGA;
3642-
if(cl->dev[devid].tunehead)
3643-
{
3644-
const size_t headroom = (cl->dev[devid].headroom ? DT_MEGA * cl->dev[devid].headroom : dhead)
3645-
+ (cl->dev[devid].clmem_error ? dhead : 0);
3646-
cl->dev[devid].used_available = allmem > headroom ? allmem - headroom : lowmem;
3647-
}
3648-
else
3649-
{
3650-
const size_t disposable = allmem > dhead ? allmem - dhead : 0;
3651-
const int fraction = MIN(1024, res->fractions[4*res->level + 3]);
3652-
cl->dev[devid].used_available = MAX(lowmem, disposable / 1024ul * fraction);
3653-
}
3654-
}
3655-
}
3656-
36573602
cl_ulong dt_opencl_get_device_available(const int devid)
36583603
{
36593604
if(!darktable.opencl->inited || devid <= DT_DEVICE_CPU) return 0;
36603605
return darktable.opencl->dev[devid].used_available;
36613606
}
36623607

3663-
static cl_ulong _opencl_get_device_memalloc(const int devid)
3664-
{
3665-
return darktable.opencl->dev[devid].max_mem_alloc;
3666-
}
3667-
36683608
cl_ulong dt_opencl_get_device_memalloc(const int devid)
36693609
{
36703610
if(!darktable.opencl->inited || devid <= DT_DEVICE_CPU) return 0;
@@ -3756,6 +3696,59 @@ void dt_opencl_update_settings(void)
37563696
const char *pstr = dt_conf_get_string_const("opencl_scheduling_profile");
37573697
dt_print(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
37583698
"[opencl_update_settings] scheduling profile set to %s", pstr);
3699+
3700+
dt_sys_resources_t *res = &darktable.dtresources;
3701+
/* If we have cl devices with unified memory we should not use that part
3702+
for general dt use.
3703+
As that part might change with a different resource level we have to
3704+
fix that whenever that changes.
3705+
*/
3706+
res->cl_uni_memory = 0;
3707+
const int level = res->level;
3708+
const gboolean tunehead = cl->num_devs > 1
3709+
&& level >= 0
3710+
&& !dt_gimpmode()
3711+
&& dt_conf_get_bool("opencl_tune_headroom");
3712+
3713+
for(int i = 0; i < cl->num_devs; i++)
3714+
{
3715+
cl->dev[i].tunehead = tunehead;
3716+
if(level < 0)
3717+
{
3718+
cl->dev[i].used_available = res->refresource[4*(-level-1) + 3] * DT_MEGA;
3719+
}
3720+
else
3721+
{
3722+
const size_t allmem = cl->dev[i].max_global_mem;
3723+
const size_t lowmem = 256ul * DT_MEGA;
3724+
const size_t dhead = DT_OPENCL_DEFAULT_HEADROOM * DT_MEGA;
3725+
if(cl->dev[i].tunehead)
3726+
{
3727+
const size_t headroom = cl->dev[i].headroom ? DT_MEGA * cl->dev[i].headroom : dhead;
3728+
cl->dev[i].used_available = allmem > headroom ? allmem - headroom : lowmem;
3729+
}
3730+
else
3731+
{
3732+
const size_t disposable = allmem > dhead ? allmem - dhead : 0;
3733+
const int fraction = MIN(1024, res->fractions[4*res->level + 3]);
3734+
cl->dev[i].used_available = MAX(lowmem, disposable / 1024ul * fraction);
3735+
}
3736+
}
3737+
3738+
if(cl->dev[i].unified_memory)
3739+
{
3740+
cl->dev[i].used_available = MIN(cl->dev[i].used_available, res->total_memory * cl->dev[i].unified_fraction);
3741+
res->cl_uni_memory += cl->dev[i].used_available;
3742+
}
3743+
dt_print_nts(DT_DEBUG_OPENCL,
3744+
" AVAILABLE CLMEM SIZE: %zu MB%s%s\n",
3745+
(size_t)(cl->dev[i].used_available / DT_MEGA),
3746+
cl->dev[i].tunehead ? ", tuned" : "",
3747+
cl->dev[i].pinned_memory ? ", pinned": "");
3748+
}
3749+
if(res->cl_uni_memory)
3750+
dt_print_nts(DT_DEBUG_OPENCL,
3751+
" UNIFIED SYSMEM SIZE: %zu MB\n", (size_t)(res->cl_uni_memory / DT_MEGA));
37593752
}
37603753

37613754
/** read scheduling profile for config variables */

src/common/opencl.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -572,9 +572,6 @@ gboolean dt_opencl_image_fits_device(const int devid,
572572
/** get available memory for the device */
573573
cl_ulong dt_opencl_get_device_available(const int devid);
574574

575-
/** check tuning settings and available memory for the device */
576-
void dt_opencl_check_tuning(const int devid);
577-
578575
/** get size of allocatable single buffer */
579576
cl_ulong dt_opencl_get_device_memalloc(const int devid);
580577

src/develop/pixelpipe_hb.c

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3112,25 +3112,15 @@ gboolean dt_dev_pixelpipe_process(dt_dev_pixelpipe_t *pipe,
31123112
dt_iop_buffer_dsc_t _out_format = { 0 };
31133113
dt_iop_buffer_dsc_t *out_format = &_out_format;
31143114

3115-
#ifdef HAVE_OPENCL
3116-
dt_opencl_check_tuning(pipe->devid);
3117-
if(pipe->devid > DT_DEVICE_CPU)
3118-
dt_print_pipe(DT_DEBUG_PIPE, "pipe starting",
3119-
pipe, NULL, pipe->devid, &roi, &roi, "'%s' ID=%i, %s using %luMB%s%s",
3120-
pipe->image.filename, pipe->image.id,
3121-
darktable.opencl->dev[pipe->devid].cname,
3122-
darktable.opencl->dev[pipe->devid].used_available / DT_MEGA,
3123-
darktable.opencl->dev[pipe->devid].tunehead ? ", tuned" : "",
3124-
darktable.opencl->dev[pipe->devid].pinned_memory ? ", pinned": "");
3125-
else
3126-
dt_print_pipe(DT_DEBUG_PIPE, "pipe starting",
3127-
pipe, NULL, pipe->devid, &roi, &roi, "'%s' ID=%i using %luMB",
3128-
pipe->image.filename, pipe->image.id, dt_get_available_mem() / DT_MEGA);
3129-
#else
3115+
const size_t avail_mem =
3116+
#ifdef HAVE_OPENCL
3117+
pipe->devid > DT_DEVICE_CPU ? dt_opencl_get_device_available(pipe->devid) : dt_get_available_mem();
3118+
#else
3119+
dt_get_available_mem();
3120+
#endif
31303121
dt_print_pipe(DT_DEBUG_PIPE, "pipe starting",
3131-
pipe, NULL, pipe->devid, &roi, &roi, "'%s' ID=%i using %luMB",
3132-
pipe->image.filename, pipe->image.id, dt_get_available_mem() / DT_MEGA);
3133-
#endif
3122+
pipe, NULL, pipe->devid, &roi, &roi, "'%s' ID=%i using %luMB",
3123+
pipe->image.filename, pipe->image.id, avail_mem / DT_MEGA);
31343124
dt_print_mem_usage("before pixelpipe process");
31353125

31363126
// run pixelpipe recursively and get error status

0 commit comments

Comments
 (0)