Skip to content

Commit f2fe675

Browse files
committed
Improve SIMD normalization for signed integer image types
Refactors the SIMD load and store routines to consistently normalize signed integer types (int8, int16, int32) to an approximately [-1.0, 1.0] range, and updates denormalization to match. This ensures symmetric mapping and clamping for negative values, improving accuracy and consistency in image operations.
1 parent 53b0294 commit f2fe675

6 files changed

Lines changed: 466 additions & 287 deletions

File tree

src/libOpenImageIO/imagebufalgo_addsub.cpp

Lines changed: 46 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,8 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
6464
// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
6565
template<class T>
6666
static bool
67-
add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
68-
int nthreads)
67+
add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
68+
ROI roi, int nthreads)
6969
{
7070
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
7171
const ImageSpec& Rspec = R.spec();
@@ -103,31 +103,34 @@ add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI r
103103
if (contig) {
104104
// Native integer saturated add - much faster than float conversion!
105105
size_t n = static_cast<size_t>(roi.width()) * nchannels;
106-
RunHwyBinaryNativeInt<T>(
107-
reinterpret_cast<T*>(r_row),
108-
reinterpret_cast<const T*>(a_row),
109-
reinterpret_cast<const T*>(b_row), n,
110-
[](auto d, auto a, auto b) { return hn::SaturatedAdd(a, b); });
106+
RunHwyBinaryNativeInt<T>(reinterpret_cast<T*>(r_row),
107+
reinterpret_cast<const T*>(a_row),
108+
reinterpret_cast<const T*>(b_row), n,
109+
[](auto d, auto a, auto b) {
110+
return hn::SaturatedAdd(a, b);
111+
});
111112
} else {
112113
// Scalar fallback
113114
for (int x = 0; x < roi.width(); ++x) {
114115
T* r_ptr = reinterpret_cast<T*>(r_row)
115-
+ x * r_pixel_bytes / sizeof(T);
116+
+ x * r_pixel_bytes / sizeof(T);
116117
const T* a_ptr = reinterpret_cast<const T*>(a_row)
117-
+ x * a_pixel_bytes / sizeof(T);
118+
+ x * a_pixel_bytes / sizeof(T);
118119
const T* b_ptr = reinterpret_cast<const T*>(b_row)
119-
+ x * b_pixel_bytes / sizeof(T);
120+
+ x * b_pixel_bytes / sizeof(T);
120121
for (int c = 0; c < nchannels; ++c) {
121122
// Saturating add in scalar
122123
int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c];
123124
if constexpr (std::is_unsigned_v<T>) {
124125
r_ptr[c] = (sum > std::numeric_limits<T>::max())
125-
? std::numeric_limits<T>::max() : (T)sum;
126+
? std::numeric_limits<T>::max()
127+
: (T)sum;
126128
} else {
127129
r_ptr[c] = (sum > std::numeric_limits<T>::max())
128-
? std::numeric_limits<T>::max()
129-
: (sum < std::numeric_limits<T>::min())
130-
? std::numeric_limits<T>::min() : (T)sum;
130+
? std::numeric_limits<T>::max()
131+
: (sum < std::numeric_limits<T>::min())
132+
? std::numeric_limits<T>::min()
133+
: (T)sum;
131134
}
132135
}
133136
}
@@ -193,8 +196,9 @@ add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
193196
const Btype* b_ptr = reinterpret_cast<const Btype*>(b_row)
194197
+ x * b_pixel_bytes / sizeof(Btype);
195198
for (int c = 0; c < nchannels; ++c) {
196-
r_ptr[c] = static_cast<Rtype>(static_cast<float>(a_ptr[c]) +
197-
static_cast<float>(b_ptr[c]));
199+
r_ptr[c] = static_cast<Rtype>(
200+
static_cast<float>(a_ptr[c])
201+
+ static_cast<float>(b_ptr[c]));
198202
}
199203
}
200204
}
@@ -248,7 +252,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
248252
&& B.localpixels()) {
249253
// Use native integer path for scale-invariant add when all types match
250254
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
251-
constexpr bool all_same = std::is_same_v<Rtype, Atype> && std::is_same_v<Atype, Btype>;
255+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
256+
&& std::is_same_v<Atype, Btype>;
252257
constexpr bool is_integer = std::is_integral_v<Rtype>;
253258
if constexpr (all_same && is_integer) {
254259
return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
@@ -270,8 +275,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
270275
// Native integer sub using SaturatedSub (scale-invariant, no float conversion)
271276
template<class T>
272277
static bool
273-
sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
274-
int nthreads)
278+
sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
279+
ROI roi, int nthreads)
275280
{
276281
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
277282
const ImageSpec& Rspec = R.spec();
@@ -309,31 +314,35 @@ sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI r
309314
if (contig) {
310315
// Native integer saturated sub - much faster than float conversion!
311316
size_t n = static_cast<size_t>(roi.width()) * nchannels;
312-
RunHwyBinaryNativeInt<T>(
313-
reinterpret_cast<T*>(r_row),
314-
reinterpret_cast<const T*>(a_row),
315-
reinterpret_cast<const T*>(b_row), n,
316-
[](auto d, auto a, auto b) { return hn::SaturatedSub(a, b); });
317+
RunHwyBinaryNativeInt<T>(reinterpret_cast<T*>(r_row),
318+
reinterpret_cast<const T*>(a_row),
319+
reinterpret_cast<const T*>(b_row), n,
320+
[](auto d, auto a, auto b) {
321+
return hn::SaturatedSub(a, b);
322+
});
317323
} else {
318324
// Scalar fallback
319325
for (int x = 0; x < roi.width(); ++x) {
320326
T* r_ptr = reinterpret_cast<T*>(r_row)
321-
+ x * r_pixel_bytes / sizeof(T);
327+
+ x * r_pixel_bytes / sizeof(T);
322328
const T* a_ptr = reinterpret_cast<const T*>(a_row)
323-
+ x * a_pixel_bytes / sizeof(T);
329+
+ x * a_pixel_bytes / sizeof(T);
324330
const T* b_ptr = reinterpret_cast<const T*>(b_row)
325-
+ x * b_pixel_bytes / sizeof(T);
331+
+ x * b_pixel_bytes / sizeof(T);
326332
for (int c = 0; c < nchannels; ++c) {
327333
// Saturating sub in scalar
328334
if constexpr (std::is_unsigned_v<T>) {
329335
r_ptr[c] = (a_ptr[c] > b_ptr[c])
330-
? (a_ptr[c] - b_ptr[c]) : T(0);
336+
? (a_ptr[c] - b_ptr[c])
337+
: T(0);
331338
} else {
332-
int64_t diff = (int64_t)a_ptr[c] - (int64_t)b_ptr[c];
339+
int64_t diff = (int64_t)a_ptr[c]
340+
- (int64_t)b_ptr[c];
333341
r_ptr[c] = (diff > std::numeric_limits<T>::max())
334-
? std::numeric_limits<T>::max()
335-
: (diff < std::numeric_limits<T>::min())
336-
? std::numeric_limits<T>::min() : (T)diff;
342+
? std::numeric_limits<T>::max()
343+
: (diff < std::numeric_limits<T>::min())
344+
? std::numeric_limits<T>::min()
345+
: (T)diff;
337346
}
338347
}
339348
}
@@ -396,8 +405,9 @@ sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
396405
const Btype* b_ptr = reinterpret_cast<const Btype*>(b_row)
397406
+ x * b_pixel_bytes / sizeof(Btype);
398407
for (int c = 0; c < nchannels; ++c) {
399-
r_ptr[c] = static_cast<Rtype>(static_cast<float>(a_ptr[c]) -
400-
static_cast<float>(b_ptr[c]));
408+
r_ptr[c] = static_cast<Rtype>(
409+
static_cast<float>(a_ptr[c])
410+
- static_cast<float>(b_ptr[c]));
401411
}
402412
}
403413
}
@@ -415,7 +425,8 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
415425
&& B.localpixels()) {
416426
// Use native integer path for scale-invariant sub when all types match
417427
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
418-
constexpr bool all_same = std::is_same_v<Rtype, Atype> && std::is_same_v<Atype, Btype>;
428+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
429+
&& std::is_same_v<Atype, Btype>;
419430
constexpr bool is_integer = std::is_integral_v<Rtype>;
420431
if constexpr (all_same && is_integer) {
421432
return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);

0 commit comments

Comments
 (0)