Skip to content

Commit 9b1a2f6

Browse files
review comment
1 parent 7724f63 commit 9b1a2f6

4 files changed

Lines changed: 99 additions & 129 deletions

File tree

hwy/contrib/intdiv/README.md

Lines changed: 0 additions & 38 deletions
This file was deleted.

hwy/contrib/intdiv/intdiv-inl.h

Lines changed: 97 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Copyright 2024 Google LLC
2-
// Copyright 2025 Fujitsu Limited
2+
// Copyright 2026 Fujitsu Limited
33
// SPDX-License-Identifier: Apache-2.0
44
// SPDX-License-Identifier: BSD-3-Clause
55
//
@@ -15,6 +15,71 @@
1515
// See the License for the specific language governing permissions and
1616
// limitations under the License.
1717

18+
19+
/**********************************************************************************
20+
** Integer division
21+
**********************************************************************************
22+
* Almost all architectures (except Power10) don't support integer vector division,
23+
* and the cost of scalar division on architectures like x86 is too high - it can
24+
* take 30 to 40 cycles on modern chips and up to 100 on older ones.
25+
*
26+
* Therefore we use division by multiplying with a precomputed reciprocal. The
27+
* method used in this implementation is based on T. Granlund and P. L. Montgomery,
28+
* "Division by invariant integers using multiplication" (see [Figure 4.1, 5.1])
29+
* https://gmplib.org/~tege/divcnst-pldi94.pdf
30+
* https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
31+
*
32+
* It shows good performance gains for all architectures, especially on x86.
33+
* However, computing divisor parameters is expensive, so this implementation
34+
* should only be used when the divisor is a scalar and used multiple times.
35+
*
36+
* We split the work into two steps:
37+
* 1) Precompute parameters from the scalar divisor (multiplier + shifts).
38+
* DivisorParams{U,S}<T> ComputeDivisorParams(T divisor);
39+
* Computes the divisor parameters (multiplier + shifts + sign of divisor).
40+
*
41+
* 2) Use those parameters to replace per-lane division with MulHigh + shifts.
42+
* Vec<D> IntDiv(D d, Vec<D> dividend, const DivisorParams{U,S}<T>& params);
43+
* Performs the actual division using the precomputed parameters.
44+
*
45+
** NOTES:
46+
* - For 64-bit division on Aarch64 and IBM/Power, we fall back to scalar division
47+
* since emulating multiply-high is expensive and both architectures have very
48+
* fast hardware dividers.
49+
* - Power-of-two divisors are optimized to simple shifts.
50+
* - Edge cases like INT_MIN / -1 are handled specially.
51+
*
52+
***************************************************************
53+
** Figure 4.1: Unsigned division by run-time invariant divisor
54+
***************************************************************
55+
* Initialization (given uword d with 1 <= d < 2^N):
56+
* int l = ceil(log2(d));
57+
* uword m = 2^N * (2^l - d) / d + 1;
58+
* int sh1 = min(l, 1);
59+
* int sh2 = max(l - 1, 0);
60+
*
61+
* For q = FLOOR(a/d), all uword:
62+
* uword t1 = MULUH(m, a);
63+
* q = SRL(t1 + SRL(a - t1, sh1), sh2);
64+
*
65+
************************************************************************************
66+
** Figure 5.1: Signed division by run-time invariant divisor, rounded towards zero
67+
************************************************************************************
68+
* Initialization (given constant sword d with d != 0):
69+
* int l = max(ceil(log2(abs(d))), 1);
70+
* udword m0 = 1 + (2^(N+l-1)) / abs(d);
71+
* sword m = m0 - 2^N;
72+
* sword dsign = XSIGN(d);
73+
* int sh = l - 1;
74+
*
75+
* For q = TRUNC(a/d), all sword:
76+
* sword q0 = a + MULSH(m, a);
77+
* q0 = SRA(q0, sh) - XSIGN(a);
78+
* q = EOR(q0, dsign) - dsign;
79+
*
80+
**********************************************************************************/
81+
82+
1883
// Per-target include guard
1984
#if defined(HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_) == defined(HWY_TARGET_TOGGLE)
2085
#ifdef HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_
@@ -196,6 +261,21 @@ HWY_INLINE unsigned LeadingZeroCount64(uint64_t x) {
196261
#endif
197262
}
198263

264+
/**
265+
* Divides a 128-bit unsigned integer (high:0) by a 64-bit divisor.
266+
* Computes: (high << 64) / divisor
267+
*
268+
* This function is needed to calculate the multiplier for 64-bit integer
269+
* division in the Granlund-Montgomery algorithm.
270+
*
271+
* Minified version based on Donald Knuth's Algorithm D (Division of nonnegative
272+
* integers), and generic implementation in Hacker's Delight.
273+
*
274+
* See https://skanthak.homepage.t-online.de/division.html
275+
* with respect to the license of the Hacker's Delight book
276+
* (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm)
277+
*/
278+
199279
HWY_INLINE uint64_t DivideHighBy(uint64_t high, uint64_t divisor) {
200280
HWY_DASSERT(divisor != 0);
201281

@@ -262,15 +342,9 @@ HWY_INLINE V ShiftRightUniform(D d, V v, int sh) {
262342
if constexpr (sizeof(T) * 8 > 8) {
263343
if (sh & 8) v = ShiftRight<8>(v);
264344
}
265-
if constexpr (sizeof(T) * 8 > 4) {
266-
if (sh & 4) v = ShiftRight<4>(v);
267-
}
268-
if constexpr (sizeof(T) * 8 > 2) {
269-
if (sh & 2) v = ShiftRight<2>(v);
270-
}
271-
if constexpr (sizeof(T) * 8 > 1) {
272-
if (sh & 1) v = ShiftRight<1>(v);
273-
}
345+
if (sh & 4) v = ShiftRight<4>(v);
346+
if (sh & 2) v = ShiftRight<2>(v);
347+
if (sh & 1) v = ShiftRight<1>(v);
274348
return v;
275349
}
276350

@@ -623,7 +697,7 @@ HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsU<T>& params) {
623697
}
624698
#endif
625699

626-
if constexpr (sizeof(T) == 1) {
700+
if constexpr (sizeof(T) <= 2) {
627701
if constexpr (D::kPrivateLanes < 2) {
628702
return detail::ScalarDivPerLane(d, dividend, params.divisor);
629703
} else {
@@ -638,42 +712,14 @@ if constexpr (sizeof(T) == 1) {
638712
const auto prod_lo = Mul(lo_wide, mul_wide);
639713
const auto prod_hi = Mul(hi_wide, mul_wide);
640714

641-
#if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
642-
const V t1 = OrderedDemote2To(d, ShiftRight<8>(prod_lo), ShiftRight<8>(prod_hi));
643-
#else
644-
const Half<D> d_half;
645-
const auto t1_lo = DemoteTo(d_half, ShiftRight<8>(prod_lo));
646-
const auto t1_hi = DemoteTo(d_half, ShiftRight<8>(prod_hi));
647-
const V t1 = Combine(d, t1_hi, t1_lo);
648-
#endif
649-
650-
const V diff = Sub(dividend, t1);
651-
const V shifted = detail::ShiftRightUniform(d, diff, params.shift1);
652-
const V sum = Add(t1, shifted);
653-
return detail::ShiftRightUniform(d, sum, params.shift2);
654-
}
655-
656-
} else if constexpr (sizeof(T) == 2) {
657-
if constexpr (D::kPrivateLanes < 2) {
658-
return detail::ScalarDivPerLane(d, dividend, params.divisor);
659-
} else {
660-
using TWide = MulType_t<T>;
661-
const Repartition<TWide, D> d_wide;
662-
663-
const auto lo_wide = PromoteLowerTo(d_wide, dividend);
664-
const auto hi_wide = PromoteUpperTo(d_wide, dividend);
665-
666-
const auto mul_wide = Set(d_wide, static_cast<TWide>(params.multiplier));
667-
668-
const auto prod_lo = Mul(lo_wide, mul_wide);
669-
const auto prod_hi = Mul(hi_wide, mul_wide);
715+
constexpr int kShift = static_cast<int>(sizeof(T) * 8);
670716

671717
#if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
672-
const V t1 = OrderedDemote2To(d, ShiftRight<16>(prod_lo), ShiftRight<16>(prod_hi));
718+
const V t1 = OrderedDemote2To(d, ShiftRight<kShift>(prod_lo), ShiftRight<kShift>(prod_hi));
673719
#else
674720
const Half<D> d_half;
675-
const auto t1_lo = DemoteTo(d_half, ShiftRight<16>(prod_lo));
676-
const auto t1_hi = DemoteTo(d_half, ShiftRight<16>(prod_hi));
721+
const auto t1_lo = DemoteTo(d_half, ShiftRight<kShift>(prod_lo));
722+
const auto t1_hi = DemoteTo(d_half, ShiftRight<kShift>(prod_hi));
677723
const V t1 = Combine(d, t1_hi, t1_lo);
678724
#endif
679725

@@ -682,15 +728,7 @@ if constexpr (sizeof(T) == 1) {
682728
const V sum = Add(t1, shifted);
683729
return detail::ShiftRightUniform(d, sum, params.shift2);
684730
}
685-
} else if constexpr (sizeof(T) == 4) {
686-
const V multiplier = Set(d, params.multiplier);
687-
const V t1 = MulHigh(dividend, multiplier);
688-
const V diff = Sub(dividend, t1);
689-
const V shifted = detail::ShiftRightUniform(d, diff, params.shift1);
690-
const V sum = Add(t1, shifted);
691-
return detail::ShiftRightUniform(d, sum, params.shift2);
692-
693-
} else {
731+
} else {
694732
const V multiplier = Set(d, params.multiplier);
695733
const V t1 = MulHigh(dividend, multiplier);
696734
const V diff = Sub(dividend, t1);
@@ -742,34 +780,7 @@ HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsS<T>& params) {
742780

743781
V q0;
744782

745-
if constexpr (sizeof(T) == 1) {
746-
if constexpr (D::kPrivateLanes < 2) {
747-
return detail::ScalarDivPerLane(d, dividend, params.divisor);
748-
} else {
749-
using TWide = MulType_t<T>;
750-
const Repartition<TWide, D> d_wide;
751-
752-
const auto lo_wide = PromoteLowerTo(d_wide, dividend);
753-
const auto hi_wide = PromoteUpperTo(d_wide, dividend);
754-
755-
const auto mul_wide = Set(d_wide, static_cast<TWide>(params.multiplier));
756-
757-
const auto prod_lo = Mul(lo_wide, mul_wide);
758-
const auto prod_hi = Mul(hi_wide, mul_wide);
759-
760-
#if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
761-
const auto high = OrderedDemote2To(d, ShiftRight<8>(prod_lo), ShiftRight<8>(prod_hi));
762-
#else
763-
const Half<D> d_half;
764-
const auto high_lo = DemoteTo(d_half, ShiftRight<8>(prod_lo));
765-
const auto high_hi = DemoteTo(d_half, ShiftRight<8>(prod_hi));
766-
const auto high = Combine(d, high_hi, high_lo);
767-
#endif
768-
769-
q0 = Add(dividend, high);
770-
}
771-
772-
} else if constexpr (sizeof(T) == 2) {
783+
if constexpr (sizeof(T) <= 2) {
773784
if constexpr (D::kPrivateLanes < 2) {
774785
return detail::ScalarDivPerLane(d, dividend, params.divisor);
775786
} else {
@@ -784,23 +795,20 @@ if constexpr (sizeof(T) == 1) {
784795
const auto prod_lo = Mul(lo_wide, mul_wide);
785796
const auto prod_hi = Mul(hi_wide, mul_wide);
786797

798+
constexpr int kShift = static_cast<int>(sizeof(T) * 8);
799+
787800
#if defined(HWY_HAVE_ORDEREDDEMOTE2TO)
788-
const auto high = OrderedDemote2To(d, ShiftRight<16>(prod_lo), ShiftRight<16>(prod_hi));
801+
const auto high = OrderedDemote2To(d, ShiftRight<kShift>(prod_lo), ShiftRight<kShift>(prod_hi));
789802
#else
790803
const Half<D> d_half;
791-
const auto high_lo = DemoteTo(d_half, ShiftRight<16>(prod_lo));
792-
const auto high_hi = DemoteTo(d_half, ShiftRight<16>(prod_hi));
804+
const auto high_lo = DemoteTo(d_half, ShiftRight<kShift>(prod_lo));
805+
const auto high_hi = DemoteTo(d_half, ShiftRight<kShift>(prod_hi));
793806
const auto high = Combine(d, high_hi, high_lo);
794807
#endif
795808

796809
q0 = Add(dividend, high);
797810
}
798-
} else if constexpr (sizeof(T) == 4) {
799-
const V multiplier = Set(d, params.multiplier);
800-
const V mulh = MulHigh(dividend, multiplier);
801-
q0 = Add(dividend, mulh);
802-
803-
} else {
811+
} else {
804812
const V multiplier = Set(d, params.multiplier);
805813
const V mulh = MulHigh(dividend, multiplier);
806814
q0 = Add(dividend, mulh);

hwy/contrib/intdiv/intdiv.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Copyright 2024 Google LLC
2-
// Copyright 2025 Fujitsu Limited
2+
// Copyright 2026 Fujitsu Limited
33
// SPDX-License-Identifier: Apache-2.0
44
// SPDX-License-Identifier: BSD-3-Clause
55
//

hwy/contrib/intdiv/intdiv_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Copyright 2024 Google LLC
2-
// Copyright 2025 Fujitsu Limited
2+
// Copyright 2026 Fujitsu Limited
33
// SPDX-License-Identifier: Apache-2.0
44
// SPDX-License-Identifier: BSD-3-Clause
55
//

0 commit comments

Comments
 (0)