Skip to content

Commit 76d7f22

Browse files
committed
Add HWY helpers for RGBA-with-RGB ROI
Replace duplicated ad-hoc SIMD special-cases in add/sub/mad with generalized HWY helpers that handle the common packed-RGBA-but-ROI-is-RGB case. Introduce PromoteVec/DemoteVec, lane-type mapping for half, interleaved Load/Store helpers (including partial-vector variants), and per-pixel/ternary routines that preserve alpha or mask it for native integer ops. Also switch HwyPixels to use pixel/scanline stride, add necessary forward declarations and includes, and simplify callers to use the new helpers, broadening support to integer/native ops and reducing code duplication. Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
1 parent 927501c commit 76d7f22

File tree

4 files changed

+584
-753
lines changed

4 files changed

+584
-753
lines changed

src/libOpenImageIO/imagebufalgo_addsub.cpp

Lines changed: 49 additions & 268 deletions
Original file line numberDiff line numberDiff line change
@@ -86,125 +86,20 @@ add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
8686
return hn::Add(a, b);
8787
};
8888

89-
// Special-case: RGBA images but ROI is RGB (strided channel subset). We
90-
// still can SIMD the RGB channels by processing full RGBA and preserving
91-
// alpha exactly (bitwise) from the destination.
92-
if (roi.chbegin == 0 && roi.chend == 3) {
93-
// Only support same-type float/half/double in this fast path.
94-
constexpr bool floaty = (std::is_same_v<Rtype, float>
95-
|| std::is_same_v<Rtype, double>
96-
|| std::is_same_v<Rtype, half>)
97-
&& std::is_same_v<Rtype, Atype>
98-
&& std::is_same_v<Rtype, Btype>;
99-
if constexpr (floaty) {
100-
auto Rv = HwyPixels(R);
101-
auto Av = HwyPixels(A);
102-
auto Bv = HwyPixels(B);
103-
if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
104-
&& ChannelsContiguous<Rtype>(Rv, 4)
105-
&& ChannelsContiguous<Atype>(Av, 4)
106-
&& ChannelsContiguous<Btype>(Bv, 4)) {
107-
ROI roi4 = roi;
108-
roi4.chbegin = 0;
109-
roi4.chend = 4;
110-
using MathT = typename SimdMathType<Rtype>::type;
111-
const hn::ScalableTag<MathT> d;
112-
const size_t lanes = hn::Lanes(d);
113-
ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
114-
for (int y = roi4.ybegin; y < roi4.yend; ++y) {
115-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi4);
116-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
117-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
118-
const size_t npixels = static_cast<size_t>(roi4.width());
119-
120-
size_t x = 0;
121-
for (; x + lanes <= npixels; x += lanes) {
122-
const size_t off = x * 4;
123-
if constexpr (std::is_same_v<Rtype, half>) {
124-
using T16 = hwy::float16_t;
125-
auto d16 = hn::Rebind<T16, decltype(d)>();
126-
const T16* a16
127-
= reinterpret_cast<const T16*>(a_row + off);
128-
const T16* b16
129-
= reinterpret_cast<const T16*>(b_row + off);
130-
T16* r16 = reinterpret_cast<T16*>(r_row + off);
131-
132-
hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
133-
hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
134-
hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
135-
hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
136-
aa16);
137-
hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
138-
ba16);
139-
hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
140-
da16);
141-
(void)aa16;
142-
(void)ba16;
143-
(void)dr16;
144-
(void)dg16;
145-
(void)db16;
146-
147-
auto rr = op(d, hn::PromoteTo(d, ar16),
148-
hn::PromoteTo(d, br16));
149-
auto rg = op(d, hn::PromoteTo(d, ag16),
150-
hn::PromoteTo(d, bg16));
151-
auto rb = op(d, hn::PromoteTo(d, ab16),
152-
hn::PromoteTo(d, bb16));
153-
154-
auto rr16 = hn::DemoteTo(d16, rr);
155-
auto rg16 = hn::DemoteTo(d16, rg);
156-
auto rb16 = hn::DemoteTo(d16, rb);
157-
hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
158-
r16);
159-
} else {
160-
hn::Vec<decltype(d)> ar, ag, ab, aa;
161-
hn::Vec<decltype(d)> br, bg, bb, ba;
162-
hn::Vec<decltype(d)> dr, dg, db, da;
163-
hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
164-
aa);
165-
hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
166-
ba);
167-
hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
168-
da);
169-
(void)aa;
170-
(void)ba;
171-
(void)dr;
172-
(void)dg;
173-
(void)db;
174-
175-
auto rr = op(d, ar, br);
176-
auto rg = op(d, ag, bg);
177-
auto rb = op(d, ab, bb);
178-
hn::StoreInterleaved4(rr, rg, rb, da, d,
179-
r_row + off);
180-
}
181-
}
182-
183-
for (; x < npixels; ++x) {
184-
const size_t off = x * 4;
185-
if constexpr (std::is_same_v<Rtype, half>) {
186-
r_row[off + 0]
187-
= half((float)a_row[off + 0]
188-
+ (float)b_row[off + 0]);
189-
r_row[off + 1]
190-
= half((float)a_row[off + 1]
191-
+ (float)b_row[off + 1]);
192-
r_row[off + 2]
193-
= half((float)a_row[off + 2]
194-
+ (float)b_row[off + 2]);
195-
} else {
196-
r_row[off + 0] = a_row[off + 0] + b_row[off + 0];
197-
r_row[off + 1] = a_row[off + 1] + b_row[off + 1];
198-
r_row[off + 2] = a_row[off + 2] + b_row[off + 2];
199-
}
200-
// Preserve alpha (off+3).
201-
}
202-
}
203-
});
204-
return true;
205-
}
206-
}
89+
// Handle packed RGBA images with an RGB ROI (preserve alpha).
90+
if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
91+
&& std::is_same_v<Rtype, Btype>) {
92+
auto op_int = [](auto /*d*/, auto a, auto b) {
93+
return hn::SaturatedAdd(a, b);
94+
};
95+
if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
96+
nthreads,
97+
op_int))
98+
return true;
20799
}
100+
if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
101+
nthreads, op))
102+
return true;
208103

209104
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
210105
op);
@@ -267,22 +162,15 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
267162
}
268163

269164
// Handle the common RGBA + RGB ROI strided case (preserving alpha).
270-
constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
271-
|| std::is_same_v<Rtype, double>
272-
|| std::is_same_v<Rtype, half>)
273-
&& std::is_same_v<Rtype, Atype>
274-
&& std::is_same_v<Rtype, Btype>;
275-
if constexpr (floaty_strided) {
276-
if (roi.chbegin == 0 && roi.chend == 3) {
277-
const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
278-
&& Bv.nchannels >= 4)
279-
&& ChannelsContiguous<Rtype>(Rv, 4)
280-
&& ChannelsContiguous<Atype>(Av, 4)
281-
&& ChannelsContiguous<Btype>(Bv, 4);
282-
if (contig4)
283-
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
284-
nthreads);
285-
}
165+
if (roi.chbegin == 0 && roi.chend == 3) {
166+
const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
167+
&& Bv.nchannels >= 4)
168+
&& ChannelsContiguous<Rtype>(Rv, 4)
169+
&& ChannelsContiguous<Atype>(Av, 4)
170+
&& ChannelsContiguous<Btype>(Bv, 4);
171+
if (contig4)
172+
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
173+
nthreads);
286174
}
287175
}
288176
#endif
@@ -322,131 +210,31 @@ sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
322210
return hn::Sub(a, b);
323211
};
324212

325-
// Special-case: RGBA images but ROI is RGB (strided channel subset). We
326-
// still can SIMD the RGB channels by processing full RGBA and preserving
327-
// alpha exactly (bitwise) from the destination.
328-
if (roi.chbegin == 0 && roi.chend == 3) {
329-
// Only support same-type float/half/double in this fast path.
330-
constexpr bool floaty = (std::is_same_v<Rtype, float>
331-
|| std::is_same_v<Rtype, double>
332-
|| std::is_same_v<Rtype, half>)
333-
&& std::is_same_v<Rtype, Atype>
334-
&& std::is_same_v<Rtype, Btype>;
335-
if constexpr (floaty) {
336-
auto Rv = HwyPixels(R);
337-
auto Av = HwyPixels(A);
338-
auto Bv = HwyPixels(B);
339-
if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
340-
&& ChannelsContiguous<Rtype>(Rv, 4)
341-
&& ChannelsContiguous<Atype>(Av, 4)
342-
&& ChannelsContiguous<Btype>(Bv, 4)) {
343-
ROI roi4 = roi;
344-
roi4.chbegin = 0;
345-
roi4.chend = 4;
346-
using MathT = typename SimdMathType<Rtype>::type;
347-
const hn::ScalableTag<MathT> d;
348-
const size_t lanes = hn::Lanes(d);
349-
ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
350-
for (int y = roi4.ybegin; y < roi4.yend; ++y) {
351-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi4);
352-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
353-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
354-
const size_t npixels = static_cast<size_t>(roi4.width());
355-
356-
size_t x = 0;
357-
for (; x + lanes <= npixels; x += lanes) {
358-
const size_t off = x * 4;
359-
if constexpr (std::is_same_v<Rtype, half>) {
360-
using T16 = hwy::float16_t;
361-
auto d16 = hn::Rebind<T16, decltype(d)>();
362-
const T16* a16
363-
= reinterpret_cast<const T16*>(a_row + off);
364-
const T16* b16
365-
= reinterpret_cast<const T16*>(b_row + off);
366-
T16* r16 = reinterpret_cast<T16*>(r_row + off);
367-
368-
hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
369-
hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
370-
hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
371-
hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
372-
aa16);
373-
hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
374-
ba16);
375-
hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
376-
da16);
377-
(void)aa16;
378-
(void)ba16;
379-
(void)dr16;
380-
(void)dg16;
381-
(void)db16;
382-
383-
auto rr = op(d, hn::PromoteTo(d, ar16),
384-
hn::PromoteTo(d, br16));
385-
auto rg = op(d, hn::PromoteTo(d, ag16),
386-
hn::PromoteTo(d, bg16));
387-
auto rb = op(d, hn::PromoteTo(d, ab16),
388-
hn::PromoteTo(d, bb16));
389-
390-
auto rr16 = hn::DemoteTo(d16, rr);
391-
auto rg16 = hn::DemoteTo(d16, rg);
392-
auto rb16 = hn::DemoteTo(d16, rb);
393-
hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
394-
r16);
395-
} else {
396-
hn::Vec<decltype(d)> ar, ag, ab, aa;
397-
hn::Vec<decltype(d)> br, bg, bb, ba;
398-
hn::Vec<decltype(d)> dr, dg, db, da;
399-
hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
400-
aa);
401-
hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
402-
ba);
403-
hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
404-
da);
405-
(void)aa;
406-
(void)ba;
407-
(void)dr;
408-
(void)dg;
409-
(void)db;
410-
411-
auto rr = op(d, ar, br);
412-
auto rg = op(d, ag, bg);
413-
auto rb = op(d, ab, bb);
414-
hn::StoreInterleaved4(rr, rg, rb, da, d,
415-
r_row + off);
416-
}
417-
}
418-
419-
for (; x < npixels; ++x) {
420-
const size_t off = x * 4;
421-
if constexpr (std::is_same_v<Rtype, half>) {
422-
r_row[off + 0]
423-
= half((float)a_row[off + 0]
424-
- (float)b_row[off + 0]);
425-
r_row[off + 1]
426-
= half((float)a_row[off + 1]
427-
- (float)b_row[off + 1]);
428-
r_row[off + 2]
429-
= half((float)a_row[off + 2]
430-
- (float)b_row[off + 2]);
431-
} else {
432-
r_row[off + 0] = a_row[off + 0] - b_row[off + 0];
433-
r_row[off + 1] = a_row[off + 1] - b_row[off + 1];
434-
r_row[off + 2] = a_row[off + 2] - b_row[off + 2];
435-
}
436-
// Preserve alpha (off+3).
437-
}
438-
}
439-
});
440-
return true;
441-
}
442-
}
213+
// Handle packed RGBA images with an RGB ROI (preserve alpha).
214+
if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
215+
&& std::is_same_v<Rtype, Btype>) {
216+
auto op_int = [](auto /*d*/, auto a, auto b) {
217+
return hn::SaturatedSub(a, b);
218+
};
219+
if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
220+
nthreads,
221+
op_int))
222+
return true;
443223
}
224+
if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
225+
nthreads, op))
226+
return true;
444227

445228
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
446229
op);
447230
}
448231
#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY
449232

233+
template<class Rtype, class Atype, class Btype>
234+
static bool
235+
sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
236+
int nthreads);
237+
450238
template<class Rtype, class Atype, class Btype>
451239
static bool
452240
sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
@@ -475,22 +263,15 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
475263
}
476264

477265
// Handle the common RGBA + RGB ROI strided case (preserving alpha).
478-
constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
479-
|| std::is_same_v<Rtype, double>
480-
|| std::is_same_v<Rtype, half>)
481-
&& std::is_same_v<Rtype, Atype>
482-
&& std::is_same_v<Rtype, Btype>;
483-
if constexpr (floaty_strided) {
484-
if (roi.chbegin == 0 && roi.chend == 3) {
485-
const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
486-
&& Bv.nchannels >= 4)
487-
&& ChannelsContiguous<Rtype>(Rv, 4)
488-
&& ChannelsContiguous<Atype>(Av, 4)
489-
&& ChannelsContiguous<Btype>(Bv, 4);
490-
if (contig4)
491-
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
492-
nthreads);
493-
}
266+
if (roi.chbegin == 0 && roi.chend == 3) {
267+
const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
268+
&& Bv.nchannels >= 4)
269+
&& ChannelsContiguous<Rtype>(Rv, 4)
270+
&& ChannelsContiguous<Atype>(Av, 4)
271+
&& ChannelsContiguous<Btype>(Bv, 4);
272+
if (contig4)
273+
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
274+
nthreads);
494275
}
495276
}
496277
#endif

0 commit comments

Comments
 (0)