darktable-org · masterpiga · Jun 18, 2026
diff --git a/data/kernels/overlay.cl b/data/kernels/overlay.cl
@@ -18,20 +18,54 @@
 
 #include "common.h"
 
-// Alpha-blend a Cairo ARGB32 overlay onto a float4 image.
+// Alpha-blend a straight-alpha float RGBA overlay onto a float4 image.
 //
-// overlay_argb is a flat byte buffer (Cairo ARGB32, little-endian byte order
-// [B, G, R, A] at each pixel), with row pitch = stride bytes.
+// overlay_rgba is a flat float buffer, 4 floats per pixel [R, G, B, coverage],
+// in the pipe's scene-referred linear working RGB, row pitch = width*4 floats.
 // The blend formula matches the CPU path in overlay.c:
-//   alpha  = (s_a / 255) * opacity
-//   out_c  = (1 - alpha) * in_c + opacity * s_c / 255
+//   alpha  = coverage * opacity
+//   out_c  = (1 - alpha) * in_c + alpha * s_c
 kernel void overlay_blend(read_only  image2d_t in,
-                          __global const uchar *overlay_argb,
+                          __global const float *overlay_rgba,
                           write_only image2d_t out,
                           const int   width,
                           const int   height,
-                          const float opacity,
-                          const int   stride)
+                          const float opacity)
+{
+  // One work-item per output pixel; work-items outside the image do nothing.
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  if(px >= width || py >= height) return;
+
+  // Overlay texel: straight (non-premultiplied) RGB in .xyz, coverage in .w.
+  const float4 base  = Areadpixel(in, px, py);
+  const float4 layer = vload4(py * width + px, overlay_rgba);
+
+  // Effective alpha is the overlay coverage scaled by the global opacity.
+  const float alpha = layer.w * opacity;
+  const float keep  = 1.f - alpha;
+
+  // Blend the colour channels; the image's own fourth channel passes through.
+  float4 result = keep * base + alpha * layer;
+  result.w = base.w;
+
+  write_imagef(out, (int2)(px, py), result);
+}
+
+// Legacy alpha-blend of a Cairo ARGB32 overlay onto a float4 image.
+//
+// overlay_argb is a flat byte buffer (Cairo ARGB32, little-endian byte order
+// [B, G, R, A] at each pixel), with row pitch = stride bytes. This reproduces
+// the original 8-bit compositing for edits made before the float path existed:
+//   alpha  = (s_a / 255) * opacity
+//   out_c  = (1 - alpha) * in_c + opacity * s_c / 255
+kernel void overlay_blend_legacy(read_only  image2d_t in,
+                                 __global const uchar *overlay_argb,
+                                 write_only image2d_t out,
+                                 const int   width,
+                                 const int   height,
+                                 const float opacity,
+                                 const int   stride)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);

diff --git a/src/develop/develop.c b/src/develop/develop.c
@@ -3927,14 +3927,20 @@ void dt_dev_image(const dt_imgid_t imgid,
                   const int snapshot_id,
                   GList *module_filter_out,
                   const int devid,
-                  const gboolean finalscale)
+                  const gboolean finalscale,
+                  const gboolean want_float)
 {
   dt_develop_t dev;
   dt_dev_init(&dev, TRUE);
   dev.gui_attached = FALSE;
   dt_dev_pixelpipe_t *pipe = dev.full.pipe;
 
   pipe->type |= DT_DEV_PIXELPIPE_IMAGE | (finalscale ? DT_DEV_PIXELPIPE_IMAGE_FINAL : DT_DEV_PIXELPIPE_NONE);
+  // want_float: keep gamma as the terminal module (so backbuf dimensions stay
+  // consistent) but have it pass the linear-float working RGB straight through
+  // instead of packing it to 8-bit. See gamma.c process().
+  if(want_float)
+    pipe->type |= DT_DEV_PIXELPIPE_IMAGE_FLOAT;
   // load image and set history_end
 
   dev.snapshot_id = snapshot_id;
@@ -3965,10 +3971,16 @@ void dt_dev_image(const dt_imgid_t imgid,
 
   // record resulting image and dimensions
 
-  const uint32_t bufsize =
-    sizeof(uint32_t) * pipe->backbuf_width * pipe->backbuf_height;
+  // Destination size the caller expects: 16 B/px for float, else 8-bit ARGB.
+  // The pipe's terminate step (dt_dev_pixelpipe_process) sizes pipe->backbuf to
+  // match: 4 floats/px when want_float (gamma passed the linear float through),
+  // 8-bit ARGB otherwise. So a straight copy of bufsize is exact.
+  const size_t bufsize = (want_float ? 4 * sizeof(float) : sizeof(uint32_t)) * pipe->backbuf_width *
+                         pipe->backbuf_height;
   *buf = dt_alloc_aligned(bufsize);
-  memcpy(*buf, pipe->backbuf, bufsize);
+  // dt_alloc_aligned() may return NULL: never write through a failed allocation.
+  if(*buf) memset(*buf, 0, bufsize);
+  if(*buf && pipe->backbuf) memcpy(*buf, pipe->backbuf, MIN(bufsize, pipe->backbuf_size));
 
   if(buf_width) *buf_width = pipe->backbuf_width;
   if(buf_height) *buf_height = pipe->backbuf_height;

diff --git a/src/develop/develop.h b/src/develop/develop.h
@@ -643,8 +643,8 @@ void dt_dev_image(const dt_imgid_t imgid,
                   const int32_t snapshot_id,
                   GList *module_filter_out,
                   const int devid,
-                  const gboolean finalscale);
-
+                  const gboolean finalscale,
+                  const gboolean want_float);
 
 gboolean dt_dev_equal_chroma(const float *f, const double *d);
 void dt_dev_reset_chroma(dt_develop_t *dev);

diff --git a/src/develop/pixelpipe.h b/src/develop/pixelpipe.h
@@ -40,20 +40,21 @@ typedef struct dt_iop_roi_t
 /* The pixelpipe types here are all defined as a bit mask to ensure easy testing via & operator */
 typedef enum dt_dev_pixelpipe_type_t
 {
   DT_DEV_PIXELPIPE_NONE      = 0,
   DT_DEV_PIXELPIPE_EXPORT    = 1 << 0,
   DT_DEV_PIXELPIPE_FULL      = 1 << 1,
   DT_DEV_PIXELPIPE_PREVIEW   = 1 << 2,
   DT_DEV_PIXELPIPE_THUMBNAIL = 1 << 3,
   DT_DEV_PIXELPIPE_PREVIEW2  = 1 << 4,
   DT_DEV_PIXELPIPE_SCREEN    = DT_DEV_PIXELPIPE_PREVIEW | DT_DEV_PIXELPIPE_FULL | DT_DEV_PIXELPIPE_PREVIEW2,
   DT_DEV_PIXELPIPE_CANVAS    = DT_DEV_PIXELPIPE_FULL | DT_DEV_PIXELPIPE_PREVIEW2,
   DT_DEV_PIXELPIPE_ANY       = DT_DEV_PIXELPIPE_EXPORT | DT_DEV_PIXELPIPE_FULL | DT_DEV_PIXELPIPE_PREVIEW
                                | DT_DEV_PIXELPIPE_THUMBNAIL | DT_DEV_PIXELPIPE_PREVIEW2,
   DT_DEV_PIXELPIPE_FAST      = 1 << 8,
   DT_DEV_PIXELPIPE_IMAGE     = 1 << 9,    // special additional flag used by dt_dev_image()
   DT_DEV_PIXELPIPE_IMAGE_FINAL = 1 << 10, // special additional flag used by dt_dev_image(), mark to use finalscale
+  DT_DEV_PIXELPIPE_IMAGE_FLOAT = 1 << 11, // special additional flag used by dt_dev_image(want_float), gamma passes float through unpacked
   DT_DEV_PIXELPIPE_BASIC     = DT_DEV_PIXELPIPE_FULL | DT_DEV_PIXELPIPE_PREVIEW
 } dt_dev_pixelpipe_type_t;
 
 /** when to collect histogram */

diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
@@ -3219,16 +3219,22 @@ gboolean dt_dev_pixelpipe_process(dt_dev_pixelpipe_t *pipe,
   //FIXME lock/release cache line instead of copying
   if(dt_pipe_is_screen(pipe))
   {
-    if(pipe->backbuf == NULL
-       || pipe->backbuf_width * pipe->backbuf_height != width * height)
+    // dt_dev_image(want_float) keeps gamma terminal but lets it pass the
+    // 4-channel linear float working RGB through unpacked. In that case the
+    // backbuf is 16 B/px (4 floats) rather than the usual 8-bit ARGB.
+    const size_t bbpp =
+      (pipe->type & DT_DEV_PIXELPIPE_IMAGE_FLOAT) ? 4 * sizeof(float) : 4 * sizeof(uint8_t);
+    if(pipe->backbuf == NULL || pipe->backbuf_width * pipe->backbuf_height != width * height ||
+       pipe->backbuf_size != bbpp * width * height)
     {
       g_free(pipe->backbuf);
-      pipe->backbuf = g_malloc0(sizeof(uint8_t) * 4 * width * height);
+      pipe->backbuf = g_malloc0(bbpp * width * height);
+      pipe->backbuf_size = bbpp * width * height;
     }
 
     if(pipe->backbuf)
     {
-      memcpy(pipe->backbuf, buf, sizeof(uint8_t) * 4 * width * height);
+      memcpy(pipe->backbuf, buf, bbpp * width * height);
       pipe->backbuf_scale = scale;
       for(int i = 0; i < 6; i++) pipe->backbuf_zoom_pos[i] = pts[i] * pipe->iscale;
       pipe->output_imgid = pipe->image.id;

diff --git a/src/iop/gamma.c b/src/iop/gamma.c
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "common/colorspaces_inline_conversions.h"
+#include "common/imagebuf.h"
 #include "control/control.h"
 #include "develop/develop.h"
 #include "gui/accelerators.h"
@@ -293,6 +294,17 @@ void process(dt_iop_module_t *self,
   if(roi_in->width != roi_out->width || roi_in->height != roi_out->height)
     return;
 
+  // dt_dev_image(want_float): the caller wants the scene-referred linear float
+  // working RGB, not display-encoded 8-bit ARGB. Pass the 4-channel float buffer
+  // straight through (the output buffer is already allocated 16 B/px). gamma
+  // stays the terminal module so backbuf dimensions remain consistent.
+  if(piece->pipe->type & DT_DEV_PIXELPIPE_IMAGE_FLOAT)
+  {
+    dt_iop_image_copy_by_size(
+      (float *const restrict)o, (const float *const restrict)i, roi_out->width, roi_out->height, 4);
+    return;
+  }
+
   const dt_dev_pixelpipe_display_mask_t mask_display = piece->pipe->mask_display;
   const gboolean fcolor = dt_conf_is_equal("channel_display", "false color");