11// Copyright 2024 Google LLC
2- // Copyright 2025 Fujitsu Limited
2+ // Copyright 2026 Fujitsu Limited
33// SPDX-License-Identifier: Apache-2.0
44// SPDX-License-Identifier: BSD-3-Clause
55//
1515// See the License for the specific language governing permissions and
1616// limitations under the License.
1717
18+
19+ /* *********************************************************************************
20+ ** Integer division
21+ **********************************************************************************
22+ * Almost all architectures (except Power10) don't support integer vector division,
23+ * and the cost of scalar division on architectures like x86 is too high - it can
24+ * take 30 to 40 cycles on modern chips and up to 100 on older ones.
25+ *
26+ * Therefore we use division by multiplying with a precomputed reciprocal. The
27+ * method used in this implementation is based on T. Granlund and P. L. Montgomery,
28+ * "Division by invariant integers using multiplication" (see [Figure 4.1, 5.1])
29+ * https://gmplib.org/~tege/divcnst-pldi94.pdf
30+ * https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
31+ *
32+ * It shows good performance gains for all architectures, especially on x86.
33+ * However, computing divisor parameters is expensive, so this implementation
34+ * should only be used when the divisor is a scalar and used multiple times.
35+ *
36+ * We split the work into two steps:
37+ * 1) Precompute parameters from the scalar divisor (multiplier + shifts).
38+ * DivisorParams{U,S}<T> ComputeDivisorParams(T divisor);
39+ * Computes the divisor parameters (multiplier + shifts + sign of divisor).
40+ *
41+ * 2) Use those parameters to replace per-lane division with MulHigh + shifts.
42+ * Vec<D> IntDiv(D d, Vec<D> dividend, const DivisorParams{U,S}<T>& params);
43+ * Performs the actual division using the precomputed parameters.
44+ *
45+ ** NOTES:
46+ * - For 64-bit division on Aarch64 and IBM/Power, we fall back to scalar division
47+ * since emulating multiply-high is expensive and both architectures have very
48+ * fast hardware dividers.
49+ * - Power-of-two divisors are optimized to simple shifts.
50+ * - Edge cases like INT_MIN / -1 are handled specially.
51+ *
52+ ***************************************************************
53+ ** Figure 4.1: Unsigned division by run-time invariant divisor
54+ ***************************************************************
55+ * Initialization (given uword d with 1 <= d < 2^N):
56+ * int l = ceil(log2(d));
57+ * uword m = 2^N * (2^l - d) / d + 1;
58+ * int sh1 = min(l, 1);
59+ * int sh2 = max(l - 1, 0);
60+ *
61+ * For q = FLOOR(a/d), all uword:
62+ * uword t1 = MULUH(m, a);
63+ * q = SRL(t1 + SRL(a - t1, sh1), sh2);
64+ *
65+ ************************************************************************************
66+ ** Figure 5.1: Signed division by run-time invariant divisor, rounded towards zero
67+ ************************************************************************************
68+ * Initialization (given constant sword d with d != 0):
69+ * int l = max(ceil(log2(abs(d))), 1);
70+ * udword m0 = 1 + (2^(N+l-1)) / abs(d);
71+ * sword m = m0 - 2^N;
72+ * sword dsign = XSIGN(d);
73+ * int sh = l - 1;
74+ *
75+ * For q = TRUNC(a/d), all sword:
76+ * sword q0 = a + MULSH(m, a);
77+ * q0 = SRA(q0, sh) - XSIGN(a);
78+ * q = EOR(q0, dsign) - dsign;
79+ *
80+ **********************************************************************************/
81+
82+
1883// Per-target include guard
1984#if defined(HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_) == defined(HWY_TARGET_TOGGLE)
2085#ifdef HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_
@@ -196,6 +261,21 @@ HWY_INLINE unsigned LeadingZeroCount64(uint64_t x) {
196261#endif
197262}
198263
264+ /* *
265+ * Divides a 128-bit unsigned integer (high:0) by a 64-bit divisor.
266+ * Computes: (high << 64) / divisor
267+ *
268+ * This function is needed to calculate the multiplier for 64-bit integer
269+ * division in the Granlund-Montgomery algorithm.
270+ *
271+ * Minified version based on Donald Knuth's Algorithm D (Division of nonnegative
272+ * integers), and generic implementation in Hacker's Delight.
273+ *
274+ * See https://skanthak.homepage.t-online.de/division.html
275+ * with respect to the license of the Hacker's Delight book
276+ * (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm)
277+ */
278+
199279HWY_INLINE uint64_t DivideHighBy (uint64_t high, uint64_t divisor) {
200280 HWY_DASSERT (divisor != 0 );
201281
@@ -262,15 +342,9 @@ HWY_INLINE V ShiftRightUniform(D d, V v, int sh) {
262342 if constexpr (sizeof (T) * 8 > 8 ) {
263343 if (sh & 8 ) v = ShiftRight<8 >(v);
264344 }
265- if constexpr (sizeof (T) * 8 > 4 ) {
266- if (sh & 4 ) v = ShiftRight<4 >(v);
267- }
268- if constexpr (sizeof (T) * 8 > 2 ) {
269- if (sh & 2 ) v = ShiftRight<2 >(v);
270- }
271- if constexpr (sizeof (T) * 8 > 1 ) {
272- if (sh & 1 ) v = ShiftRight<1 >(v);
273- }
345+ if (sh & 4 ) v = ShiftRight<4 >(v);
346+ if (sh & 2 ) v = ShiftRight<2 >(v);
347+ if (sh & 1 ) v = ShiftRight<1 >(v);
274348 return v;
275349}
276350
@@ -623,7 +697,7 @@ HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsU<T>& params) {
623697 }
624698 #endif
625699
626- if constexpr (sizeof (T) == 1 ) {
700+ if constexpr (sizeof (T) <= 2 ) {
627701 if constexpr (D::kPrivateLanes < 2 ) {
628702 return detail::ScalarDivPerLane (d, dividend, params.divisor );
629703 } else {
@@ -638,42 +712,14 @@ if constexpr (sizeof(T) == 1) {
638712 const auto prod_lo = Mul (lo_wide, mul_wide);
639713 const auto prod_hi = Mul (hi_wide, mul_wide);
640714
641- #if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
642- const V t1 = OrderedDemote2To (d, ShiftRight<8 >(prod_lo), ShiftRight<8 >(prod_hi));
643- #else
644- const Half<D> d_half;
645- const auto t1_lo = DemoteTo (d_half, ShiftRight<8 >(prod_lo));
646- const auto t1_hi = DemoteTo (d_half, ShiftRight<8 >(prod_hi));
647- const V t1 = Combine (d, t1_hi, t1_lo);
648- #endif
649-
650- const V diff = Sub (dividend, t1);
651- const V shifted = detail::ShiftRightUniform (d, diff, params.shift1 );
652- const V sum = Add (t1, shifted);
653- return detail::ShiftRightUniform (d, sum, params.shift2 );
654- }
655-
656- } else if constexpr (sizeof (T) == 2 ) {
657- if constexpr (D::kPrivateLanes < 2 ) {
658- return detail::ScalarDivPerLane (d, dividend, params.divisor );
659- } else {
660- using TWide = MulType_t<T>;
661- const Repartition<TWide, D> d_wide;
662-
663- const auto lo_wide = PromoteLowerTo (d_wide, dividend);
664- const auto hi_wide = PromoteUpperTo (d_wide, dividend);
665-
666- const auto mul_wide = Set (d_wide, static_cast <TWide>(params.multiplier ));
667-
668- const auto prod_lo = Mul (lo_wide, mul_wide);
669- const auto prod_hi = Mul (hi_wide, mul_wide);
715+ constexpr int kShift = static_cast <int >(sizeof (T) * 8 );
670716
671717 #if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
672- const V t1 = OrderedDemote2To (d, ShiftRight<16 >(prod_lo), ShiftRight<16 >(prod_hi));
718+ const V t1 = OrderedDemote2To (d, ShiftRight<kShift >(prod_lo), ShiftRight<kShift >(prod_hi));
673719 #else
674720 const Half<D> d_half;
675- const auto t1_lo = DemoteTo (d_half, ShiftRight<16 >(prod_lo));
676- const auto t1_hi = DemoteTo (d_half, ShiftRight<16 >(prod_hi));
721+ const auto t1_lo = DemoteTo (d_half, ShiftRight<kShift >(prod_lo));
722+ const auto t1_hi = DemoteTo (d_half, ShiftRight<kShift >(prod_hi));
677723 const V t1 = Combine (d, t1_hi, t1_lo);
678724 #endif
679725
@@ -682,15 +728,7 @@ if constexpr (sizeof(T) == 1) {
682728 const V sum = Add (t1, shifted);
683729 return detail::ShiftRightUniform (d, sum, params.shift2 );
684730 }
685- } else if constexpr (sizeof (T) == 4 ) {
686- const V multiplier = Set (d, params.multiplier );
687- const V t1 = MulHigh (dividend, multiplier);
688- const V diff = Sub (dividend, t1);
689- const V shifted = detail::ShiftRightUniform (d, diff, params.shift1 );
690- const V sum = Add (t1, shifted);
691- return detail::ShiftRightUniform (d, sum, params.shift2 );
692-
693- } else {
731+ } else {
694732 const V multiplier = Set (d, params.multiplier );
695733 const V t1 = MulHigh (dividend, multiplier);
696734 const V diff = Sub (dividend, t1);
@@ -742,34 +780,7 @@ HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsS<T>& params) {
742780
743781 V q0;
744782
745- if constexpr (sizeof (T) == 1 ) {
746- if constexpr (D::kPrivateLanes < 2 ) {
747- return detail::ScalarDivPerLane (d, dividend, params.divisor );
748- } else {
749- using TWide = MulType_t<T>;
750- const Repartition<TWide, D> d_wide;
751-
752- const auto lo_wide = PromoteLowerTo (d_wide, dividend);
753- const auto hi_wide = PromoteUpperTo (d_wide, dividend);
754-
755- const auto mul_wide = Set (d_wide, static_cast <TWide>(params.multiplier ));
756-
757- const auto prod_lo = Mul (lo_wide, mul_wide);
758- const auto prod_hi = Mul (hi_wide, mul_wide);
759-
760- #if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
761- const auto high = OrderedDemote2To (d, ShiftRight<8 >(prod_lo), ShiftRight<8 >(prod_hi));
762- #else
763- const Half<D> d_half;
764- const auto high_lo = DemoteTo (d_half, ShiftRight<8 >(prod_lo));
765- const auto high_hi = DemoteTo (d_half, ShiftRight<8 >(prod_hi));
766- const auto high = Combine (d, high_hi, high_lo);
767- #endif
768-
769- q0 = Add (dividend, high);
770- }
771-
772- } else if constexpr (sizeof (T) == 2 ) {
783+ if constexpr (sizeof (T) <= 2 ) {
773784 if constexpr (D::kPrivateLanes < 2 ) {
774785 return detail::ScalarDivPerLane (d, dividend, params.divisor );
775786 } else {
@@ -784,23 +795,20 @@ if constexpr (sizeof(T) == 1) {
784795 const auto prod_lo = Mul (lo_wide, mul_wide);
785796 const auto prod_hi = Mul (hi_wide, mul_wide);
786797
798+ constexpr int kShift = static_cast <int >(sizeof (T) * 8 );
799+
787800 #if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
788- const auto high = OrderedDemote2To (d, ShiftRight<16 >(prod_lo), ShiftRight<16 >(prod_hi));
801+ const auto high = OrderedDemote2To (d, ShiftRight<kShift >(prod_lo), ShiftRight<kShift >(prod_hi));
789802 #else
790803 const Half<D> d_half;
791- const auto high_lo = DemoteTo (d_half, ShiftRight<16 >(prod_lo));
792- const auto high_hi = DemoteTo (d_half, ShiftRight<16 >(prod_hi));
804+ const auto high_lo = DemoteTo (d_half, ShiftRight<kShift >(prod_lo));
805+ const auto high_hi = DemoteTo (d_half, ShiftRight<kShift >(prod_hi));
793806 const auto high = Combine (d, high_hi, high_lo);
794807 #endif
795808
796809 q0 = Add (dividend, high);
797810 }
798- } else if constexpr (sizeof (T) == 4 ) {
799- const V multiplier = Set (d, params.multiplier );
800- const V mulh = MulHigh (dividend, multiplier);
801- q0 = Add (dividend, mulh);
802-
803- } else {
811+ } else {
804812 const V multiplier = Set (d, params.multiplier );
805813 const V mulh = MulHigh (dividend, multiplier);
806814 q0 = Add (dividend, mulh);
0 commit comments