Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions src/common/box_filters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,18 @@ static void _blur_vertical_1ch(float *const __restrict__ buf,
float *const __restrict__ scanlines,
const size_t padded_size)
{
(void)scanlines;
DT_OMP_FOR()
for(size_t x = 0; x < width; x += MAX_VECT)
{
float *const __restrict__ scratch = (float*)dt_get_perthread(scanlines,padded_size);
// Avoid dt_get_perthread() here: under ASan/libomp we can observe worker
// thread ids that step past the precomputed pool size, which makes the
// shared scratch pool alias unrelated heap memory. A local scratch buffer
// per OpenMP iteration keeps the vertical pass isolated.
float *const __restrict__ scratch = dt_alloc_align_float(padded_size);
if(!scratch)
continue;

if(x + MAX_VECT <= width)
{
_blur_vertical<MAX_VECT,compensated>(buf + x, height, width, radius, scratch);
Expand All @@ -329,6 +337,7 @@ static void _blur_vertical_1ch(float *const __restrict__ buf,
for( ; col < width; col++)
_blur_vertical<1,compensated>(buf + col, height, width, radius, scratch);
}
dt_free_align(scratch);
}
return;
}
Expand Down Expand Up @@ -377,8 +386,14 @@ static void _box_mean(float *const buf,
DT_OMP_FOR()
for(size_t row = 0; row < height; row++)
{
float *const __restrict__ scratch = (float*)dt_get_perthread(scanlines,padded_size);
// Avoid dt_get_perthread() here for the same reason as the vertical
// pass below: with ASan/libomp the worker identity can outrun the
// scratch-pool sizing assumptions and overrun the shared buffer.
float *const __restrict__ scratch = dt_alloc_align_float(padded_size);
if(!scratch)
continue;
_blur_horizontal<N,compensated>(buf + row * N * width, width, radius, scratch);
dt_free_align(scratch);
}
// we need to multiply width by N to get the correct stride for the vertical blur
_blur_vertical_1ch<compensated>(buf, height, N*width, radius, scanlines, padded_size);
Expand Down
16 changes: 10 additions & 6 deletions src/common/guided_filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,13 @@ static void _guided_filter_tiling(color_image imgg,
color_image mean = _new_color_image(width, height, 4);
color_image variance = _new_color_image(width, height, 9);
const size_t img_dimen = dt_round_size(mean.width, 16);
size_t img_bak_sz;
float *img_bak = dt_alloc_perthread_float(9*img_dimen, &img_bak_sz);
DT_OMP_FOR(shared(img, imgg, mean, variance, img_bak) dt_omp_sharedconst(source))
DT_OMP_FOR(shared(img, imgg, mean, variance) firstprivate(img_dimen) dt_omp_sharedconst(source))
for(int j_imgg = source.lower; j_imgg < source.upper; j_imgg++)
{
float *const restrict scratch = dt_alloc_align_float(9 * img_dimen);
if(!scratch)
continue;

int j = j_imgg - source.lower;
float *const restrict meanpx = mean.data + 4 * j * mean.width;
float *const restrict varpx = variance.data + 9 * j * variance.width;
Expand All @@ -146,12 +148,14 @@ static void _guided_filter_tiling(color_image imgg,
varpx[9*i+VAR_GB] = pixel[1] * pixel[2];
varpx[9*i+VAR_BB] = pixel[2] * pixel[2];
}
// apply horizontal pass of box mean filter while the cache is still hot
float *const restrict scratch = dt_get_perthread(img_bak, img_bak_sz);
// apply horizontal pass of box mean filter while the cache is still hot.
// Use a row-local scratch buffer instead of dt_alloc_perthread_float():
// ASan shows OpenMP occasionally uses a thread index beyond the pool size
// computed for that helper, which overruns the shared scratch pool here.
dt_box_mean_horizontal(meanpx, mean.width, 4|BOXFILTER_KAHAN_SUM, w, scratch);
dt_box_mean_horizontal(varpx, variance.width, 9|BOXFILTER_KAHAN_SUM, w, scratch);
dt_free_align(scratch);
}
dt_free_align(img_bak);
dt_box_mean_vertical(mean.data, mean.height, mean.width, 4|BOXFILTER_KAHAN_SUM, w);
dt_box_mean_vertical(variance.data, variance.height, variance.width, 9|BOXFILTER_KAHAN_SUM, w);
// we will recycle memory of 'mean' for the new coefficient arrays a_? and b to reduce memory foot print
Expand Down
9 changes: 9 additions & 0 deletions src/develop/masks/object.c
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,15 @@ static void _run_decoder(dt_masks_form_gui_t *gui)
if(gui->guipoints_count <= 0)
return;

// ENCODE_READY is published before the background thread finishes decoder
// warmup. Wait here so the first real decode can't race the warmup on the
// shared segmentation context.
if(d->encode_thread)
{
g_thread_join(d->encode_thread);
d->encode_thread = NULL;
}

dt_gui_cursor_set_busy();

const float *gp = dt_masks_dynbuf_buffer(gui->guipoints);
Expand Down