From 4bc6231b7841ac4018519b96bf5b194e779496f5 Mon Sep 17 00:00:00 2001
From: Jeff Cook
Date: Wed, 8 Apr 2026 16:37:20 -0600
Subject: [PATCH] Fix object mask ASan races and scratch overflows

---
 src/common/box_filters.cc  | 19 +++++++++++++++++--
 src/common/guided_filter.c | 16 ++++++++++------
 src/develop/masks/object.c |  9 +++++++++
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/src/common/box_filters.cc b/src/common/box_filters.cc
index 34f4cc9ef97a..a8e4aedb7264 100644
--- a/src/common/box_filters.cc
+++ b/src/common/box_filters.cc
@@ -312,10 +312,18 @@ static void _blur_vertical_1ch(float *const __restrict__ buf,
                                float *const __restrict__ scanlines,
                                const size_t padded_size)
 {
+  (void)scanlines;
   DT_OMP_FOR()
   for(size_t x = 0; x < width; x += MAX_VECT)
   {
-    float *const __restrict__ scratch = (float*)dt_get_perthread(scanlines,padded_size);
+    // Avoid dt_get_perthread() here: under ASan/libomp we can observe worker
+    // thread ids that step past the precomputed pool size, which makes the
+    // shared scratch pool alias unrelated heap memory. A local scratch buffer
+    // per OpenMP iteration keeps the vertical pass isolated.
+    float *const __restrict__ scratch = dt_alloc_align_float(padded_size);
+    if(!scratch)
+      continue;
+
     if(x + MAX_VECT <= width)
     {
       _blur_vertical(buf + x, height, width, radius, scratch);
@@ -329,6 +337,7 @@
       for( ; col < width; col++)
         _blur_vertical<1,compensated>(buf + col, height, width, radius, scratch);
     }
+    dt_free_align(scratch);
   }
   return;
 }
@@ -377,8 +386,14 @@ static void _box_mean(float *const buf,
   DT_OMP_FOR()
   for(size_t row = 0; row < height; row++)
   {
-    float *const __restrict__ scratch = (float*)dt_get_perthread(scanlines,padded_size);
+    // Avoid dt_get_perthread() here for the same reason as the vertical
+    // pass below: with ASan/libomp the worker identity can outrun the
+    // scratch-pool sizing assumptions and overrun the shared buffer.
+    float *const __restrict__ scratch = dt_alloc_align_float(padded_size);
+    if(!scratch)
+      continue;
     _blur_horizontal(buf + row * N * width, width, radius, scratch);
+    dt_free_align(scratch);
   }
   // we need to multiply width by N to get the correct stride for the vertical blur
   _blur_vertical_1ch(buf, height, N*width, radius, scanlines, padded_size);
diff --git a/src/common/guided_filter.c b/src/common/guided_filter.c
index ddf51c2634dc..90f2a6b5cece 100644
--- a/src/common/guided_filter.c
+++ b/src/common/guided_filter.c
@@ -117,11 +117,13 @@ static void _guided_filter_tiling(color_image imgg,
   color_image mean = _new_color_image(width, height, 4);
   color_image variance = _new_color_image(width, height, 9);
   const size_t img_dimen = dt_round_size(mean.width, 16);
-  size_t img_bak_sz;
-  float *img_bak = dt_alloc_perthread_float(9*img_dimen, &img_bak_sz);
-  DT_OMP_FOR(shared(img, imgg, mean, variance, img_bak) dt_omp_sharedconst(source))
+  DT_OMP_FOR(shared(img, imgg, mean, variance) firstprivate(img_dimen) dt_omp_sharedconst(source))
   for(int j_imgg = source.lower; j_imgg < source.upper; j_imgg++)
   {
+    float *const restrict scratch = dt_alloc_align_float(9 * img_dimen);
+    if(!scratch)
+      continue;
+
     int j = j_imgg - source.lower;
     float *const restrict meanpx = mean.data + 4 * j * mean.width;
     float *const restrict varpx = variance.data + 9 * j * variance.width;
@@ -146,12 +148,14 @@
       varpx[9*i+VAR_GB] = pixel[1] * pixel[2];
       varpx[9*i+VAR_BB] = pixel[2] * pixel[2];
     }
-    // apply horizontal pass of box mean filter while the cache is still hot
-    float *const restrict scratch = dt_get_perthread(img_bak, img_bak_sz);
+    // apply horizontal pass of box mean filter while the cache is still hot.
+    // Use a row-local scratch buffer instead of dt_alloc_perthread_float():
+    // ASan shows OpenMP occasionally uses a thread index beyond the pool size
+    // computed for that helper, which overruns the shared scratch pool here.
     dt_box_mean_horizontal(meanpx, mean.width, 4|BOXFILTER_KAHAN_SUM, w, scratch);
     dt_box_mean_horizontal(varpx, variance.width, 9|BOXFILTER_KAHAN_SUM, w, scratch);
+    dt_free_align(scratch);
   }
-  dt_free_align(img_bak);
   dt_box_mean_vertical(mean.data, mean.height, mean.width, 4|BOXFILTER_KAHAN_SUM, w);
   dt_box_mean_vertical(variance.data, variance.height, variance.width, 9|BOXFILTER_KAHAN_SUM, w);
   // we will recycle memory of 'mean' for the new coefficient arrays a_? and b to reduce memory foot print
diff --git a/src/develop/masks/object.c b/src/develop/masks/object.c
index 8a55a2f88d77..a02306784fa9 100644
--- a/src/develop/masks/object.c
+++ b/src/develop/masks/object.c
@@ -690,6 +690,15 @@ static void _run_decoder(dt_masks_form_gui_t *gui)
   if(gui->guipoints_count <= 0)
     return;
 
+  // ENCODE_READY is published before the background thread finishes decoder
+  // warmup. Wait here so the first real decode can't race the warmup on the
+  // shared segmentation context.
+  if(d->encode_thread)
+  {
+    g_thread_join(d->encode_thread);
+    d->encode_thread = NULL;
+  }
+
   dt_gui_cursor_set_busy();
 
   const float *gp = dt_masks_dynbuf_buffer(gui->guipoints);