Skip to content

Commit 791cf93

Browse files
authored
Merge pull request #20752 from jenshannoschwalm/opencl_cpu_maths_5
Opencl / cpu maths part 5
2 parents 8d95339 + 1c287d3 commit 791cf93

33 files changed

Lines changed: 242 additions & 215 deletions

data/kernels/basic.cl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3243,12 +3243,12 @@ colorzones_v3 (read_only image2d_t in,
32433243

32443244
if(x >= width || y >= height) return;
32453245

3246-
float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
3246+
float4 pixel = readpixel(in, x, y);
32473247

32483248
const float a = pixel.y;
32493249
const float b = pixel.z;
3250-
const float h = fmod(atan2(b, a) + 2.0f*M_PI_F, 2.0f*M_PI_F)/(2.0f*M_PI_F);
3251-
const float C = sqrt(b*b + a*a);
3250+
const float h = fmod(atan2(b, a) + DT_2PI_F, DT_2PI_F) / DT_2PI_F;
3251+
const float C = dt_fast_hypot(b, a);
32523252

32533253
float select = 0.0f;
32543254
float blend = 0.0f;
@@ -3264,7 +3264,7 @@ colorzones_v3 (read_only image2d_t in,
32643264
default:
32653265
case DT_IOP_COLORZONES_h:
32663266
select = h;
3267-
blend = pow(1.0f - C/128.0f, 2.0f);
3267+
blend = dtcl_pow(1.0f - C/128.0f, 2.0f);
32683268
break;
32693269
}
32703270

@@ -3273,11 +3273,11 @@ colorzones_v3 (read_only image2d_t in,
32733273
blend *= blend; // saturation isn't as prone to artifacts:
32743274
// const float Cm = 2.0f* (blend*0.5f + (1.0f-blend)*lookup(d->lut[1], select));
32753275
const float Cm = 2.0f * lookup(table_a, select);
3276-
const float L = pixel.x * pow(2.0f, 4.0f*Lm);
3276+
const float L = pixel.x * dtcl_pow(2.0f, 4.0f*Lm);
32773277

32783278
pixel.x = L;
3279-
pixel.y = cos(2.0f*M_PI_F*(h + hm)) * Cm * C;
3280-
pixel.z = sin(2.0f*M_PI_F*(h + hm)) * Cm * C;
3279+
pixel.y = dtcl_cos(DT_2PI_F*(h + hm)) * Cm * C;
3280+
pixel.z = dtcl_sin(DT_2PI_F*(h + hm)) * Cm * C;
32813281

32823282
write_imagef (out, (int2)(x, y), pixel);
32833283
}
@@ -3297,10 +3297,10 @@ colorzones (read_only image2d_t in,
32973297

32983298
if(x >= width || y >= height) return;
32993299

3300-
float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
3300+
float4 pixel = readpixel(in, x, y);
33013301

33023302
float4 LCh;
3303-
const float normalize_C = 1.f / (128.0f * sqrt(2.f));
3303+
const float normalize_C = 1.f / (128.0f * M_SQRT2_F);
33043304

33053305
LCh = Lab_2_LCH(pixel);
33063306

data/kernels/blendop.cl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,10 @@ blendif_factor_Lab(const float4 input, const float4 output,
194194
float4 LCH_input = Lab_2_LCH(input);
195195
float4 LCH_output = Lab_2_LCH(output);
196196

197-
scaled[DEVELOP_BLENDIF_C_in] = LCH_input.y / (128.0f*sqrt(2.0f)); // C scaled to 0..1
197+
scaled[DEVELOP_BLENDIF_C_in] = LCH_input.y / (128.0f*M_SQRT2_F); // C scaled to 0..1
198198
scaled[DEVELOP_BLENDIF_h_in] = LCH_input.z; // h scaled to 0..1
199199

200-
scaled[DEVELOP_BLENDIF_C_out] = LCH_output.y / (128.0f*sqrt(2.0f)); // C scaled to 0..1
200+
scaled[DEVELOP_BLENDIF_C_out] = LCH_output.y / (128.0f*M_SQRT2_F); // C scaled to 0..1
201201
scaled[DEVELOP_BLENDIF_h_out] = LCH_output.z; // h scaled to 0..1
202202
}
203203

@@ -1266,10 +1266,10 @@ blendop_rgb_hsl(__read_only image2d_t in_a, __read_only image2d_t in_b, __read_o
12661266
ta = RGB_2_HSV(a);
12671267
tb = RGB_2_HSV(b);
12681268
// blend color vectors of input and output
1269-
d = ta.y*cos(2.0f*M_PI_F*ta.x) * (1.0f - opacity) + tb.y*cos(2.0f*M_PI_F*tb.x) * opacity;
1270-
s = ta.y*sin(2.0f*M_PI_F*ta.x) * (1.0f - opacity) + tb.y*sin(2.0f*M_PI_F*tb.x) * opacity;
1271-
to.x = fmod(atan2(s, d)/(2.0f*M_PI_F)+1.0f, 1.0f);
1272-
to.y = sqrt(s*s + d*d);
1269+
d = ta.y*cos(DT_2PI_F*ta.x) * (1.0f - opacity) + tb.y*cos(DT_2PI_F*tb.x) * opacity;
1270+
s = ta.y*sin(DT_2PI_F*ta.x) * (1.0f - opacity) + tb.y*sin(DT_2PI_F*tb.x) * opacity;
1271+
to.x = fmod(atan2(s, d)/DT_2PI_F+1.0f, 1.0f);
1272+
to.y = dt_fast_hypot(s, d);
12731273
to.z = ta.z;
12741274
o = HSV_2_RGB(to);
12751275
break;
@@ -1560,12 +1560,12 @@ blendop_display_channel(__read_only image2d_t in_a, __read_only image2d_t in_b,
15601560
break;
15611561
case DT_DEV_PIXELPIPE_DISPLAY_LCH_C:
15621562
LCH = Lab_2_LCH(a);
1563-
c = clipf(LCH.y / (128.0f * sqrt(2.0f) / exp2(boost_factors[DEVELOP_BLENDIF_C_in])));
1563+
c = clipf(LCH.y / (128.0f * M_SQRT2_F / exp2(boost_factors[DEVELOP_BLENDIF_C_in])));
15641564
is_lab = 1;
15651565
break;
15661566
case (DT_DEV_PIXELPIPE_DISPLAY_LCH_C | DT_DEV_PIXELPIPE_DISPLAY_OUTPUT):
15671567
LCH = Lab_2_LCH(b);
1568-
c = clipf(LCH.y / (128.0f * sqrt(2.0f)) / exp2(boost_factors[DEVELOP_BLENDIF_C_out]));
1568+
c = clipf(LCH.y / (128.0f * M_SQRT2_F) / exp2(boost_factors[DEVELOP_BLENDIF_C_out]));
15691569
is_lab = 1;
15701570
break;
15711571
case DT_DEV_PIXELPIPE_DISPLAY_LCH_h:

data/kernels/colorequal.cl

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,23 +37,12 @@ typedef enum dt_iop_colorequal_channel_t
3737

3838
// Linearly interpolated lookup into `weights`, indexed by saturation.
// `sat` is clamped to [-1, 1 - 1/SATSIZE], shifted by +1 and scaled by SATSIZE,
// giving a fractional table position `isat`; the result blends weights[i] and
// weights[i+1] by the fractional part (isat - base).
// The `40-` / `40+` pair below is the removed and the added version of the same
// statement in this diff; the added one only makes the float conversion of
// SATSIZE explicit.
// NOTE(review): weights[i+1] is read, so the table presumably holds at least
// 2*SATSIZE + 1 entries -- confirm against the host-side allocation.
static inline float _get_satweight(const float sat, global float *weights)
3939
{
40-
const float isat = SATSIZE * (1.0f + clamp(sat, -1.0f, 1.0f - (1.0f / SATSIZE)));
40+
const float isat = (float)SATSIZE * (1.0f + clamp(sat, -1.0f, 1.0f - (1.0f / SATSIZE)));
4141
const float base = floor(isat);
4242
const int i = (int)base;
4343
return weights[i] + (isat - base) * (weights[i+1] - weights[i]);
4444
}
4545

46-
static inline float _scharr_gradient(global float *in,
47-
const size_t k,
48-
const int w)
49-
{
50-
const float gx = 47.0f / 255.0f * (in[k-w-1] - in[k-w+1] + in[k+w-1] - in[k+w+1])
51-
+ 162.0f / 255.0f * (in[k-1] - in[k+1]);
52-
const float gy = 47.0f / 255.0f * (in[k-w-1] - in[k+w-1] + in[k-w+1] - in[k+w+1])
53-
+ 162.0f / 255.0f * (in[k-w] - in[k+w]);
54-
return dt_fast_hypot(gx, gy);
55-
}
56-
5746
static inline float gamut_map_HSB(const float4 HSB, global float *gamut_LUT, const float L_white)
5847
{
5948
const float4 JCH = dt_UCS_HSB_to_JCH(HSB);
@@ -258,9 +247,9 @@ __kernel void apply_guided(global float2 *uv,
258247
const float2 CV = { a[k].x * uv[k].x + a[k].y * uv[k].y + b[k].x,
259248
a[k].z * uv[k].x + a[k].w * uv[k].y + b[k].y };
260249

261-
corrections[k].y = mix(1.0f, CV.x, _get_satweight(saturation[k] - sat_shift, weights));
250+
corrections[k].y = 1.0f + (CV.x - 1.0f) * _get_satweight(saturation[k] - sat_shift, weights);
262251
const float gradient_weight = 1.0f - clipf(scharr[k]);
263-
b_corrections[k] = mix(0.0f, CV.y, gradient_weight * _get_satweight(saturation[k] - bright_shift, weights));
252+
b_corrections[k] = CV.y * gradient_weight * _get_satweight(saturation[k] - bright_shift, weights);
264253
}
265254

266255
__kernel void sample_input(__read_only image2d_t dev_in,
@@ -443,7 +432,7 @@ __kernel void process_data(global float2 *uv,
443432
{
444433
const int kk = mad24(clamp(row, 1, height - 2), width, clamp(col, 1, width - 2));
445434

446-
const float kscharr = fmax(0.0f, _scharr_gradient(saturation, kk, width) - 0.02f);
435+
const float kscharr = fmax(0.0f, scharr_gradient(saturation, kk, width) - 0.02f);
447436
Lscharr[k] = gradient_amp * kscharr * kscharr;
448437
}
449438

data/kernels/colorharmonizer.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ kernel void colorharmonizer_map(read_only image2d_t in,
9999
float4 xyY = dt_D65_XYZ_to_xyY(XYZ_D65);
100100
float4 JCH = xyY_to_dt_UCS_JCH(xyY, L_white);
101101

102-
const float hue = (JCH.z + M_PI_F) / (2.0f * M_PI_F);
102+
const float hue = (JCH.z + M_PI_F) / DT_2PI_F;
103103

104104
const int idx = y * width + x;
105105
jch_out[idx] = (float4)(JCH.x, JCH.y, hue, pix_in.w);
@@ -146,7 +146,7 @@ kernel void colorharmonizer_apply(write_only image2d_t out,
146146
float4 JCH;
147147
JCH.x = J;
148148
JCH.y = fmax(chroma * (1.0f + corr.y * chroma_weight), 0.0f);
149-
JCH.z = wrap_hue(hue + corr.x * effect_strength * chroma_weight) * 2.0f * M_PI_F - M_PI_F;
149+
JCH.z = wrap_hue(hue + corr.x * effect_strength * chroma_weight) * DT_2PI_F - M_PI_F;
150150

151151
float4 xyY = dt_UCS_JCH_to_xyY(JCH, L_white);
152152
float4 XYZ_D65 = dt_xyY_to_XYZ(xyY);

data/kernels/colorreconstruction.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ colorreconstruction_splat(
9999
case COLORRECONSTRUCT_PRECEDENCE_HUE:
100100
m = atan2(pixel.z, pixel.y) - params.x;
101101
// readjust m into [-pi, +pi] interval
102-
m = m > M_PI_F ? m - 2*M_PI_F : (m < -M_PI_F ? m + 2*M_PI_F : m);
102+
m = m > M_PI_F ? m - DT_2PI_F : (m < -M_PI_F ? m + DT_2PI_F : m);
103103
weight = exp(-m*m/params.y);
104104
break;
105105

data/kernels/colorspace.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,10 @@ static inline float4 Lab_2_LCH(float4 Lab)
5656
{
5757
float H = atan2(Lab.z, Lab.y);
5858

59-
H = (H > 0.0f) ? H / (2.0f*M_PI_F) : 1.0f - fabs(H) / (2.0f*M_PI_F);
59+
H = (H > 0.0f) ? H / DT_2PI_F : 1.0f - fabs(H) / DT_2PI_F;
6060

6161
const float L = Lab.x;
62-
const float C = hypot(Lab.y, Lab.z);
62+
const float C = dt_fast_hypot(Lab.y, Lab.z);
6363

6464
return (float4)(L, C, H, Lab.w);
6565
}
@@ -68,8 +68,8 @@ static inline float4 Lab_2_LCH(float4 Lab)
6868
// Converts an LCH vector back to Lab.
// LCH.x is copied to the first component, LCH.y scales the cos/sin pair, and
// LCH.z is multiplied by 2*pi before cos/sin, i.e. it is treated as a fraction
// of a full turn. LCH.w is passed through unchanged.
// The `71-`/`72-` lines are the removed side of this diff, `71+`/`72+` the added
// side; they differ only in spelling 2*pi as DT_2PI_F.
static inline float4 LCH_2_Lab(float4 LCH)
6969
{
7070
const float L = LCH.x;
71-
const float a = cos(2.0f*M_PI_F*LCH.z) * LCH.y;
72-
const float b = sin(2.0f*M_PI_F*LCH.z) * LCH.y;
71+
const float a = cos(DT_2PI_F*LCH.z) * LCH.y;
72+
const float b = sin(DT_2PI_F*LCH.z) * LCH.y;
7373

7474
return (float4)(L, a, b, LCH.w);
7575
}
@@ -434,10 +434,10 @@ static inline float4 JzAzBz_2_XYZ(const float4 JzAzBz)
434434

435435
static inline float4 JzAzBz_to_JzCzhz(float4 JzAzBz)
436436
{
437-
const float h = atan2(JzAzBz.z, JzAzBz.y) / (2.0f * M_PI_F);
437+
const float h = atan2(JzAzBz.z, JzAzBz.y) / DT_2PI_F;
438438
float4 JzCzhz;
439439
JzCzhz.x = JzAzBz.x;
440-
JzCzhz.y = hypot(JzAzBz.y, JzAzBz.z);
440+
JzCzhz.y = dt_fast_hypot(JzAzBz.y, JzAzBz.z);
441441
JzCzhz.z = (h >= 0.0f) ? h : 1.0f + h;
442442
JzCzhz.w = JzAzBz.w;
443443
return JzCzhz;
@@ -561,7 +561,7 @@ static inline float4 Yrg_to_Ych(const float4 Yrg)
561561
// -> grading RGB conversion.
562562
const float r = Yrg.y - 0.21902143f;
563563
const float g = Yrg.z - 0.54371398f;
564-
const float c = hypot(g, r);
564+
const float c = dt_fast_hypot(g, r);
565565
const float cos_h = c != 0.f ? r / c : 1.f;
566566
const float sin_h = c != 0.f ? g / c : 0.f;
567567
return (float4)(Y, c, cos_h, sin_h);
@@ -955,7 +955,7 @@ static inline float lookup_gamut(global const float *gamut_lut, const float x)
955955

956956
// Linearly interpolate the value of the gamut LUT at the hue angle in radians.
957957
// convert in LUT coordinate
958-
const float x_test = (float)LUT_ELEM * (x + M_PI_F) / (2.f * M_PI_F);
958+
const float x_test = (float)LUT_ELEM * (x + M_PI_F) / DT_2PI_F;
959959

960960
// find the 2 closest integer coordinates (next/previous)
961961
const float x_prev = floor(x_test);

data/kernels/common.h

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,26 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
3131

3232

3333
#ifndef M_PI_F
34-
#define M_PI_F 3.14159265358979323846f // should be defined by the OpenCL compiler acc. to standard
34+
#define M_PI_F 3.14159265358979323846f
3535
#endif
3636

37+
#ifndef M_LN2f
3738
#define M_LN2f 0.69314718055994530942f
39+
#endif
40+
41+
#ifndef M_PI_2f
42+
#define M_PI_2f 1.57079632679489661923f
43+
#endif
44+
45+
#ifndef M_PI_4f
46+
#define M_PI_4f 0.78539816339744830962f
47+
#endif
48+
49+
#ifndef M_SQRT2_F
50+
#define M_SQRT2_F 1.41421356237309504880f
51+
#endif
52+
53+
#define DT_2PI_F 6.28318530717958647693f
3854

3955
#define LUT_ELEM 512 // gamut LUT number of elements:
4056

@@ -55,6 +71,11 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
5571
#define dtcl_sin(A) native_sin(A)
5672
#define dtcl_cos(A) native_cos(A)
5773

74+
// Length of the 2-D vector (x, y), computed as native_sqrt(x*x + y*y).
// This definition sits in the preprocessor branch that maps dtcl_sin/dtcl_cos
// to the native_* built-ins; the other branch defines the same name via hypot().
// NOTE(review): unlike hypot(), the explicit x*x + y*y is not protected against
// intermediate overflow/underflow -- presumably acceptable for the value ranges
// used by the kernels; confirm if inputs can be very large or very small.
static inline float dt_fast_hypot(const float x, const float y)
75+
{
76+
return native_sqrt(x * x + y * y);
77+
}
78+
5879
// Allow the compiler to convert a * b + c to fused multiply-add to use hardware acceleration
5980
// on compatible platforms
6081
#pragma OPENCL FP_CONTRACT ON
@@ -70,6 +91,11 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
7091
#define dtcl_sin(A) sin(A)
7192
#define dtcl_cos(A) cos(A)
7293

94+
// Length of the 2-D vector (x, y), delegating to the hypot() built-in.
// This definition sits in the preprocessor branch that maps dtcl_sin/dtcl_cos
// to the standard sin/cos built-ins; the other branch defines the same name
// via native_sqrt().
static inline float dt_fast_hypot(const float x, const float y)
95+
{
96+
return hypot(x, y);
97+
}
98+
7399
#pragma OPENCL FP_CONTRACT OFF
74100
#endif
75101

@@ -82,6 +108,15 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
82108
m = t2; \
83109
}
84110

111+
// Gradient magnitude at element `k` of the buffer `in`, where `w` is the offset
// between vertically adjacent elements (row stride).
// gx and gy are weighted differences over the 3x3 neighbourhood: 47/255 for the
// four diagonal neighbours and 162/255 for the two axis neighbours; the result
// is dt_fast_hypot(gx, gy).
// All eight neighbours k+-1, k+-w and k+-w+-1 are read without bounds checks, so
// `k` must lie at least one row and one column inside the buffer; the callers
// visible in this diff clamp row/col to [1, size - 2] before computing the index.
static inline float scharr_gradient(global float *in, const int k, const int w)
112+
{
113+
const float gx = 47.0f / 255.0f * (in[k-w-1] - in[k-w+1] + in[k+w-1] - in[k+w+1])
114+
+ 162.0f / 255.0f * (in[k-1] - in[k+1]);
115+
const float gy = 47.0f / 255.0f * (in[k-w-1] - in[k+w-1] + in[k-w+1] - in[k+w+1])
116+
+ 162.0f / 255.0f * (in[k-w] - in[k+w]);
117+
return dt_fast_hypot(gx, gy);
118+
}
119+
85120
static inline int
86121
FC(const int row, const int col, const unsigned int filters)
87122
{
@@ -143,12 +178,6 @@ atomic_add_f(
143178
#endif
144179
}
145180

146-
static inline float
147-
dt_fast_hypot(const float x, const float y)
148-
{
149-
return dtcl_sqrt(x * x + y * y);
150-
}
151-
152181
/* we use this exp approximation to maintain full identity with cpu path */
153182
static inline float
154183
dt_fast_expf(const float x)

data/kernels/demosaic_rcd.cl

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ __kernel void rcd_populate (__read_only image2d_t in, global float *cfa, global
3232
const int col = get_global_id(0);
3333
const int row = get_global_id(1);
3434
if(col >= w || row >= height) return;
35-
const float val = scale * fmax(0.0f, readsingle(in, col, row));
35+
const float val = scale * fmax(0.0f, Areadsingle(in, col, row));
3636
const int color = FC(row, col, filters);
3737

3838
global float *rgbcol = rgb0;
@@ -51,7 +51,7 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0,
5151
if(!(col >= border && col < w - border && row >= border && row < height - border)) return;
5252
const int idx = mad24(row, w, col);
5353

54-
write_imagef(out, (int2)(col, row), (float4)(fmax(scale * rgb0[idx], 0.0f), fmax(scale * rgb1[idx], 0.0f), fmax(scale * rgb2[idx], 0.0f), 0.0f));
54+
write_imagef(out, (int2)(col, row), fmax(0.0f, (float4)(scale * rgb0[idx], scale * rgb1[idx], scale * rgb2[idx], 0.0f)));
5555
}
5656

5757
#define eps 1e-5f // Tolerance to avoid dividing by zero
@@ -278,8 +278,8 @@ __kernel void write_blended_dual(__read_only image2d_t high,
278278
const int row = get_global_id(1);
279279
if((col >= w) || (row >= height)) return;
280280

281-
const float4 high_val = readpixel(high, col, row);
282-
const float4 low_val = readpixel(low, col, row);
281+
const float4 high_val = Areadpixel(high, col, row);
282+
const float4 low_val = Areadpixel(low, col, row);
283283
const float4 blender = (float4)clipf(mask[mad24(row, w, col)]);
284284
float4 data = mix(low_val, high_val, blender);
285285
data.w = showmask ? blender.x : 0.0f;
@@ -297,8 +297,8 @@ __kernel void calc_Y0_mask(global float *mask,
297297
if((col >= w) || (row >= height)) return;
298298
const int idx = mad24(row, w, col);
299299

300-
const float4 pt = wb * fmax(0.0f, readpixel(in, col, row));
301-
mask[idx] = dtcl_sqrt(0.33333333f * (pt.x + pt.y + pt.z));
300+
const float4 pt = wb * fmax(0.0f, Areadpixel(in, col, row));
301+
mask[idx] = dtcl_sqrt((pt.x + pt.y + pt.z) / 3.0f);
302302
}
303303

304304
__kernel void calc_scharr_mask(global float *in, global float *out, const int w, const int height)
@@ -311,11 +311,7 @@ __kernel void calc_scharr_mask(global float *in, global float *out, const int w,
311311
const int incol = clamp(col, 1, w - 2);
312312
const int inrow = clamp(row, 1, height -2);
313313
const int idx = mad24(inrow, w, incol);
314-
const float gx = 47.0f / 255.0f * (in[idx-w-1] - in[idx-w+1] + in[idx+w-1] - in[idx+w+1])
315-
+ 162.0f / 255.0f * (in[idx-1] - in[idx+1]);
316-
const float gy = 47.0f / 255.0f * (in[idx-w-1] - in[idx+w-1] + in[idx-w+1] - in[idx+w+1])
317-
+ 162.0f / 255.0f * (in[idx-w] - in[idx+w]);
318-
const float gradient_magnitude = dt_fast_hypot(gx, gy);
314+
const float gradient_magnitude = scharr_gradient(in, idx, w);
319315
out[oidx] = clipf(gradient_magnitude / 16.0f);
320316
}
321317

@@ -350,7 +346,7 @@ kernel void demosaic_box3(read_only image2d_t in,
350346
if(x >= 0 && y >= 0 && x < width && y < height)
351347
{
352348
const int color = fcol(y, x, filters, xtrans);
353-
sum[color] += fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
349+
sum[color] += fmax(0.0f, Areadsingle(in, x, y));
354350
cnt[color] += 1.0f;
355351
}
356352
}

src/chart/thinplate.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ int thinplate_match(const tonecurve_t *curve, // tonecurve to apply after this (
440440
// Maps a Lab colour to a single scalar: 256 * sector + L.
// The hue angle atan2f(b, a) is shifted by +pi into [0, 2*pi], scaled so a full
// turn spans 4, and truncated to an int `sector` (0..3, or 4 when the angle is
// exactly 2*pi). L is added as the offset within the sector.
// The `443-` / `443+` pair is the removed and the added version of the same
// statement in this diff; they differ only in spelling 2*pi as DT_2PI_F.
float thinplate_color_pos(const float L, const float a, const float b)
441441
{
442442
const float h = atan2f(b, a) + M_PI_F;
443-
const int sector = 4.0f * h / (2.0f * M_PI_F);
443+
const int sector = 4.0f * h / DT_2PI_F;
444444
return 256.0 * sector + L; // C;
445445
}
446446

0 commit comments

Comments
 (0)