diff --git a/BUILD b/BUILD index 20adda10da..f14f32ccd6 100644 --- a/BUILD +++ b/BUILD @@ -539,6 +539,22 @@ cc_library( local_defines = ["HWY_HEADER_ONLY"], ) +cc_library( + name = "intdiv", + compatible_with = [], + copts = COPTS, + hdrs = [ + "hwy/contrib/intdiv/intdiv.h", + ], + textual_hdrs = [ + "hwy/contrib/intdiv/intdiv-inl.h", + ], + deps = [ + ":hwy", + ], +) + + cc_test( name = "list_targets", size = "small", @@ -644,6 +660,17 @@ cc_test( ], ) +cc_test( + name = "intdiv_test", + size = "medium", + timeout = "long", + srcs = ["hwy/contrib/intdiv/intdiv_test.cc"], + copts = COPTS + HWY_TEST_COPTS, + local_defines = ["HWY_IS_TEST"], + tags = ["hwy_ops_test"], + deps = HWY_TEST_DEPS + [":intdiv"], +) + # For manually building the tests we define here (:all does not work in --config=msvc) test_suite( name = "hwy_ops_tests", diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e306286ea..142107bb73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,8 @@ list(APPEND HWY_CONTRIB_SOURCES hwy/contrib/image/image.cc hwy/contrib/image/image.h hwy/contrib/math/fast_math-inl.h + hwy/contrib/intdiv/intdiv.h + hwy/contrib/intdiv/intdiv-inl.h hwy/contrib/math/math-inl.h hwy/contrib/matvec/matvec-inl.h hwy/contrib/random/random-inl.h @@ -915,6 +917,7 @@ list(APPEND HWY_TEST_FILES hwy/contrib/bit_pack/bit_pack_test.cc hwy/contrib/dot/dot_test.cc hwy/contrib/matvec/matvec_test.cc + hwy/contrib/intdiv/intdiv_test.cc hwy/contrib/image/image_test.cc # Disabled due to SIGILL in clang7 debug build during gtest discovery phase, # not reproducible locally. Still tested via bazel build. diff --git a/hwy.gni b/hwy.gni index f922010b29..35f932d430 100644 --- a/hwy.gni +++ b/hwy.gni @@ -52,6 +52,8 @@ hwy_contrib_public = [ "$_hwy/contrib/dot/dot-inl.h", "$_hwy/contrib/image/image.h", "$_hwy/contrib/math/fast_math-inl.h", + "$_hwy/contrib/intdiv/intdiv.h", + "$_hwy/contrib/intdiv/intdiv-inl.h", "$_hwy/contrib/math/math-inl.h", ] diff --git a/hwy/base.h b/hwy/base.h index 342bfed214..c088354c55 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -3049,10 +3049,11 @@ class Divisor64 { // Returns n % divisor_. uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); } - private: - uint64_t divisor_; +// private: +// uint64_t divisor_; static uint64_t Div128(uint64_t hi, uint64_t div) { + HWY_DASSERT(div != 0); #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64 unsigned __int64 remainder; // unused return _udiv128(hi, uint64_t{0}, div, &remainder); @@ -3063,7 +3064,9 @@ class Divisor64 { #endif } - static uint64_t MulHigh(uint64_t a, uint64_t b) { + private: + uint64_t divisor_; + static uint64_t MulHigh(uint64_t a, uint64_t b) { #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64 return __umulh(a, b); #else @@ -3074,9 +3077,9 @@ class Divisor64 { #endif } - uint64_t mul_ = 1; - uint64_t shift1_ = 0; - uint64_t shift2_ = 0; + uint64_t mul_ = 1; + uint64_t shift1_ = 0; + uint64_t shift2_ = 0; }; #else // No Div128 available, use built-in 64-bit division on each call. diff --git a/hwy/contrib/intdiv/intdiv-inl.h b/hwy/contrib/intdiv/intdiv-inl.h new file mode 100644 index 0000000000..e224790a89 --- /dev/null +++ b/hwy/contrib/intdiv/intdiv-inl.h @@ -0,0 +1,724 @@ +// Copyright 2024 Google LLC +// Copyright 2026 Fujitsu Limited +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/********************************************************************************** + ** Integer division + ********************************************************************************** + * + * Most SIMD ISAs do not provide a portable integer vector division operation. + * This implementation replaces division by a run-time invariant scalar divisor + * with multiplication by precomputed reciprocal parameters and shifts. + * + * The method is based on T. Granlund and P. L. Montgomery, + * "Division by invariant integers using multiplication" (Figures 4.1 and 5.1): + * https://gmplib.org/~tege/divcnst-pldi94.pdf + * + * hwy/base.h already provides scalar classes `hwy::Divisor` (uint32_t) and + * `hwy::Divisor64` (uint64_t, gated on HWY_HAVE_DIV128) implementing the same + * scheme. This file extends it to SIMD vectors and adds: + * - Signed division (Figure 5.1). + * - 8-bit and 16-bit lanes via widening multiply. + * - Power-of-two fast path. + * - Floor-division variant (Python/NumPy semantics). + * + * Usage is split into two steps: + * 1) Precompute parameters from the scalar divisor: + * DivisorParams{U,S} ComputeDivisorParams(T divisor); + * + * 2) Use those parameters to divide vector lanes: + * Vec IntDiv(D d, Vec dividend, + * const DivisorParams{U,S}& params); + * + * Computing divisor parameters is relatively expensive, so this is intended + * for divisors reused across multiple vector operations. + * + * For 64-bit lanes, some targets use a scalar fallback. This is required when + * 128-bit arithmetic is unavailable, and is also used for NEON/PPC8/VSX where + * the vectorized 64-bit reciprocal-multiply path is not expected to outperform + * scalar division. Array-level APIs skip the vector round-trip in this case. + * + *************************************************************** + ** Figure 4.1: Unsigned division by run-time invariant divisor + *************************************************************** + * Initialization (given uword d with 1 <= d < 2^N): + * int l = ceil(log2(d)); + * uword m = 2^N * (2^l - d) / d + 1; + * int sh1 = min(l, 1); + * int sh2 = max(l - 1, 0); + * + * For q = FLOOR(a/d), all uword: + * uword t1 = MULUH(m, a); + * q = SRL(t1 + SRL(a - t1, sh1), sh2); + * + ************************************************************************************ + ** Figure 5.1: Signed division by run-time invariant divisor, rounded toward zero + ************************************************************************************ + * Initialization (given sword d with d != 0): + * int l = max(ceil(log2(abs(d))), 1); + * udword m0 = 1 + (2^(N+l-1)) / abs(d); + * sword m = m0 - 2^N; + * sword dsign = XSIGN(d); + * int sh = l - 1; + * + * For q = TRUNC(a/d), all sword: + * sword q0 = a + MULSH(m, a); + * q0 = SRA(q0, sh) - XSIGN(a); + * q = EOR(q0, dsign) - dsign; + * + **********************************************************************************/ + + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_INL_H_ +#endif + +#include +#include +#include +#include "hwy/highway.h" + +#ifdef HWY_INTDIV_SCALAR64 +#undef HWY_INTDIV_SCALAR64 +#endif + +#if !HWY_HAVE_DIV128 || HWY_TARGET_IS_NEON || HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_VSX +#define HWY_INTDIV_SCALAR64 1 +#else +#define HWY_INTDIV_SCALAR64 0 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +namespace detail { + +template +struct MultiplierType { + using type = T; +}; +template +struct MultiplierType { + using type = MakeWide; +}; +template +using MultiplierType_T = typename MultiplierType::type; + +template +HWY_INLINE constexpr bool IsPow2(T x) { + return x > 0 && (x & (x - 1)) == 0; +} + +HWY_INLINE constexpr int CountTrailingZeros32(uint32_t x) { + return x == 0 ? 32 : static_cast(Num0BitsBelowLS1Bit_Nonzero32(x)); +} +HWY_INLINE constexpr int CountTrailingZeros64(uint64_t x) { + return x == 0 ? 64 : static_cast(Num0BitsBelowLS1Bit_Nonzero64(x)); +} +HWY_INLINE constexpr unsigned LeadingZeroCount32(uint32_t x) { + return x == 0 ? 32u : static_cast(Num0BitsAboveMS1Bit_Nonzero32(x)); +} +HWY_INLINE constexpr unsigned LeadingZeroCount64(uint64_t x) { + return x == 0 ? 64u : static_cast(Num0BitsAboveMS1Bit_Nonzero64(x)); +} + +#if HWY_HAVE_DIV128 +HWY_INLINE uint64_t Div128HighBy(uint64_t high, uint64_t divisor) { + HWY_DASSERT(divisor != 0); + return ::hwy::Divisor64::Div128(high, divisor); +} +#endif // HWY_HAVE_DIV128 + +template > +HWY_INLINE V ShiftRightUniform(D /*d*/, V v, int sh) { + HWY_DASSERT(sh >= 0); + HWY_DASSERT(sh < static_cast(sizeof(TFromD) * 8)); + return ShiftRightSame(v, sh); +} + +template , typename T = TFromD> +HWY_INLINE V ScalarDivPerLane(D d, V dividend, T divisor) { + const size_t N = Lanes(d); + HWY_ALIGN T buf[HWY_MAX_BYTES / sizeof(T)]; + StoreU(dividend, d, buf); + for (size_t i = 0; i < N; ++i) { + buf[i] = static_cast(buf[i] / divisor); + } + return LoadU(d, buf); +} + +} // namespace detail + +template +struct DivisorParamsU { + detail::MultiplierType_T multiplier; + int shift2; + bool is_pow2; + int pow2_shift; + T divisor; +}; + +template +struct DivisorParamsS { + detail::MultiplierType_T multiplier; + int shift; + T divisor; + T dsign; + bool is_pow2; + bool is_neg_one; + int pow2_shift; +}; + +template +HWY_INLINE constexpr DivisorParamsU ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + DivisorParamsU params{}; + params.divisor = divisor; + + if (detail::IsPow2(divisor)) { // also catches divisor == 1 + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(divisor); + params.multiplier = 1; + params.shift2 = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned l = 32u - detail::LeadingZeroCount32(divisor - 1u); + const uint16_t two_l = static_cast(1U << l); + const uint32_t m = ((static_cast(two_l - divisor) << 8) / divisor) + 1u; + params.multiplier = static_cast(m); + params.shift2 = static_cast(l) - 1; + return params; +} + +template +HWY_INLINE constexpr DivisorParamsU ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + DivisorParamsU params{}; + params.divisor = divisor; + + if (detail::IsPow2(divisor)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(divisor); + params.multiplier = 1; + params.shift2 = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned l = 32u - detail::LeadingZeroCount32(divisor - 1u); + const uint32_t two_l = 1U << l; + const uint64_t tmp = ((static_cast(two_l - divisor) << 16) / divisor) + 1u; + params.multiplier = static_cast(tmp); + params.shift2 = static_cast(l) - 1; + return params; +} + +template +HWY_INLINE constexpr DivisorParamsU ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + DivisorParamsU params{}; + params.divisor = divisor; + + if (detail::IsPow2(divisor)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(divisor); + params.multiplier = 1; + params.shift2 = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned l = 32u - detail::LeadingZeroCount32(divisor - 1u); + const uint64_t two_l = 1ULL << l; + const uint64_t m = ((two_l - divisor) << 32) / divisor + 1u; + params.multiplier = static_cast(m); + params.shift2 = static_cast(l) - 1; + return params; +} + +template +HWY_INLINE +#if !HWY_INTDIV_SCALAR64 && (!HWY_COMPILER_MSVC || !HWY_ARCH_X86_64) + constexpr +#endif + DivisorParamsU + ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + DivisorParamsU params{}; + params.divisor = divisor; + + if (detail::IsPow2(divisor)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros64(divisor); + params.multiplier = 1; + params.shift2 = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + +#if HWY_INTDIV_SCALAR64 + params.multiplier = 1; + params.shift2 = 0; +#else + const unsigned l = 64u - detail::LeadingZeroCount64(divisor - 1u); + const uint64_t two_l_minus_d = (l < 64) ? ((1ULL << l) - divisor) : (0 - divisor); + const uint64_t m = detail::Div128HighBy(two_l_minus_d, divisor) + 1u; + params.multiplier = m; + params.shift2 = static_cast(l) - 1; +#endif + return params; +} + +template +HWY_INLINE constexpr DivisorParamsS ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + using UT = MakeUnsigned; + DivisorParamsS params{}; + params.divisor = divisor; + params.dsign = (divisor < 0) ? static_cast(-1) : static_cast(0); + params.is_neg_one = (divisor == T(-1)); + + const UT abs_d = divisor < 0 ? static_cast(UT{0} - static_cast(divisor)) + : static_cast(divisor); + + if (detail::IsPow2(abs_d)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(abs_d); + params.multiplier = 1; + params.shift = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned sh = 31u - detail::LeadingZeroCount32(static_cast(abs_d - 1u)); + const uint32_t m = (256U << sh) / abs_d + 1u; + params.multiplier = static_cast(static_cast(m)); + params.shift = static_cast(sh); + return params; +} + +template +HWY_INLINE constexpr DivisorParamsS ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + using UT = MakeUnsigned; + DivisorParamsS params{}; + params.divisor = divisor; + params.dsign = (divisor < 0) ? static_cast(-1) : static_cast(0); + params.is_neg_one = (divisor == T(-1)); + + const UT abs_d = divisor < 0 ? static_cast(UT{0} - static_cast(divisor)) + : static_cast(divisor); + + if (detail::IsPow2(abs_d)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(abs_d); + params.multiplier = 1; + params.shift = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned sh = 31u - detail::LeadingZeroCount32(static_cast(abs_d - 1u)); + const uint64_t tmp = ((uint64_t{1} << 16) << sh) / abs_d + 1u; + params.multiplier = static_cast(static_cast(static_cast(tmp))); + params.shift = static_cast(sh); + return params; +} + +template +HWY_INLINE constexpr DivisorParamsS ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + using UT = MakeUnsigned; + DivisorParamsS params{}; + params.divisor = divisor; + params.dsign = (divisor < 0) ? static_cast(-1) : static_cast(0); + params.is_neg_one = (divisor == T(-1)); + + const UT abs_d = divisor < 0 ? static_cast(UT{0} - static_cast(divisor)) + : static_cast(divisor); + + if (detail::IsPow2(abs_d)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros32(abs_d); + params.multiplier = 1; + params.shift = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + + const unsigned sh = 31u - detail::LeadingZeroCount32(abs_d - 1u); + const uint64_t m = (0x100000000ULL << sh) / abs_d + 1u; + params.multiplier = static_cast(m); + params.shift = static_cast(sh); + return params; +} + +template +HWY_INLINE +#if !HWY_INTDIV_SCALAR64 && (!HWY_COMPILER_MSVC || !HWY_ARCH_X86_64) + constexpr +#endif + DivisorParamsS + ComputeDivisorParams(T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) { + HWY_ABORT("intdiv: division by zero in ComputeDivisorParams"); + } + using UT = MakeUnsigned; + DivisorParamsS params{}; + params.divisor = divisor; + params.dsign = (divisor < 0) ? static_cast(-1) : static_cast(0); + params.is_neg_one = (divisor == T(-1)); + + const UT abs_d = divisor < 0 ? static_cast(UT{0} - static_cast(divisor)) + : static_cast(divisor); + + if (detail::IsPow2(abs_d)) { + params.is_pow2 = true; + params.pow2_shift = detail::CountTrailingZeros64(abs_d); + params.multiplier = 1; + params.shift = 0; + return params; + } + params.is_pow2 = false; + params.pow2_shift = 0; + +#if HWY_INTDIV_SCALAR64 + params.multiplier = 1; + params.shift = 0; +#else + const unsigned sh = 63u - detail::LeadingZeroCount64(abs_d - 1u); + const uint64_t m = detail::Div128HighBy(1ULL << sh, abs_d) + 1u; + params.multiplier = static_cast(m); + params.shift = static_cast(sh); +#endif + return params; +} + +template , typename T = TFromD, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsU& params) { + HWY_DASSERT(params.divisor != 0); + + if (params.is_pow2) { + if (params.pow2_shift == 0) return dividend; // divisor == 1 + return detail::ShiftRightUniform(d, dividend, params.pow2_shift); + } + +#if HWY_INTDIV_SCALAR64 + if constexpr (sizeof(T) == 8) { + return detail::ScalarDivPerLane(d, dividend, params.divisor); + } +#endif + + if constexpr (sizeof(T) <= 2) { + if constexpr (D::kPrivateLanes < 2) { + return detail::ScalarDivPerLane(d, dividend, params.divisor); + } else { + using TWide = detail::MultiplierType_T; + const Repartition d_wide; + + const auto lo_wide = PromoteLowerTo(d_wide, dividend); + const auto hi_wide = PromoteUpperTo(d_wide, dividend); + + const auto mul_wide = Set(d_wide, static_cast(params.multiplier)); + + const auto prod_lo = Mul(lo_wide, mul_wide); + const auto prod_hi = Mul(hi_wide, mul_wide); + + constexpr int kShift = static_cast(sizeof(T) * 8); + + const V t1 = OrderedDemote2To(d, ShiftRight(prod_lo), ShiftRight(prod_hi)); + + const V diff = Sub(dividend, t1); + const V shifted = ShiftRight<1>(diff); + const V sum = Add(t1, shifted); + return detail::ShiftRightUniform(d, sum, params.shift2); + } + } else { + const V multiplier = Set(d, params.multiplier); + const V t1 = MulHigh(dividend, multiplier); + const V diff = Sub(dividend, t1); + const V shifted = ShiftRight<1>(diff); + const V sum = Add(t1, shifted); + return detail::ShiftRightUniform(d, sum, params.shift2); + } +} + +template , typename T = TFromD, HWY_IF_SIGNED_D(D)> +HWY_INLINE V IntDiv(D d, V dividend, const DivisorParamsS& params) { + HWY_DASSERT(params.divisor != 0); + const V dsign_vec = Set(d, params.dsign); + + if (params.is_pow2) { + if (params.pow2_shift == 0) { // |divisor| == 1 + return Sub(Xor(dividend, dsign_vec), dsign_vec); + } + + using UT = MakeUnsigned; + HWY_DASSERT(params.pow2_shift > 0 && + params.pow2_shift < static_cast(8 * sizeof(T))); + const T mask_val = static_cast( + (static_cast(1) << static_cast(params.pow2_shift)) - 1); + const V mask = Set(d, mask_val); + constexpr int kSignBit = int(sizeof(T) * 8) - 1; + const V sign = ShiftRight(dividend); + const V bias = And(sign, mask); + + V q = detail::ShiftRightUniform(d, Add(dividend, bias), params.pow2_shift); + return Sub(Xor(q, dsign_vec), dsign_vec); + } + +#if HWY_INTDIV_SCALAR64 + if constexpr (sizeof(T) == 8) { + return detail::ScalarDivPerLane(d, dividend, params.divisor); + } +#endif + + V q0; + + if constexpr (sizeof(T) <= 2) { + if constexpr (D::kPrivateLanes < 2) { + return detail::ScalarDivPerLane(d, dividend, params.divisor); + } else { + using TWide = detail::MultiplierType_T; + const Repartition d_wide; + + const auto lo_wide = PromoteLowerTo(d_wide, dividend); + const auto hi_wide = PromoteUpperTo(d_wide, dividend); + + const auto mul_wide = Set(d_wide, static_cast(params.multiplier)); + + const auto prod_lo = Mul(lo_wide, mul_wide); + const auto prod_hi = Mul(hi_wide, mul_wide); + + constexpr int kShift = static_cast(sizeof(T) * 8); + + const auto high = OrderedDemote2To(d, ShiftRight(prod_lo), ShiftRight(prod_hi)); + + q0 = Add(dividend, high); + } + } else { + const V multiplier = Set(d, params.multiplier); + const V mulh = MulHigh(dividend, multiplier); + q0 = Add(dividend, mulh); + } + + q0 = detail::ShiftRightUniform(d, q0, params.shift); + + constexpr int kSignBit2 = int(sizeof(T) * 8) - 1; + const V sign_dividend = ShiftRight(dividend); + q0 = Sub(q0, sign_dividend); + + return Sub(Xor(q0, dsign_vec), dsign_vec); +} + +template , typename T = TFromD, HWY_IF_SIGNED_D(D)> +HWY_INLINE V IntDivFloor(D d, V dividend, const DivisorParamsS& params) { + if (params.is_neg_one) { + const V kMin = Set(d, std::numeric_limits::min()); + const auto kMinMask = Eq(dividend, kMin); + return IfThenElse(kMinMask, Zero(d), Neg(dividend)); + } + V q = IntDiv(d, dividend, params); + + const V divisor = Set(d, params.divisor); + const V prod = Mul(q, divisor); + const auto neq = Ne(dividend, prod); + const auto sdiff = Xor(Lt(dividend, Zero(d)), Lt(divisor, Zero(d))); + const V one = Set(d, static_cast(1)); + + return Sub(q, IfThenElse(And(neq, sdiff), one, Zero(d))); +} + +template , typename T = TFromD, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V IntDivFloor(D d, V dividend, const DivisorParamsU& params) { + return IntDiv(d, dividend, params); +} + +template , typename T = TFromD, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V DivideByScalar(D d, V dividend, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + const auto params = ComputeDivisorParams(divisor); + return IntDiv(d, dividend, params); +} + +template , typename T = TFromD, HWY_IF_SIGNED_D(D)> +HWY_INLINE V DivideByScalar(D d, V dividend, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + const auto params = ComputeDivisorParams(divisor); + return IntDiv(d, dividend, params); +} + +template , typename T = TFromD, HWY_IF_SIGNED_D(D)> +HWY_INLINE V FloorDivideByScalar(D d, V dividend, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + const auto params = ComputeDivisorParams(divisor); + return IntDivFloor(d, dividend, params); +} + +template , typename T = TFromD, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V FloorDivideByScalar(D d, V dividend, T divisor) { + return DivideByScalar(d, dividend, divisor); +} + +template +HWY_INLINE void DivideArrayByScalar(T* HWY_RESTRICT array, size_t count, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + +#if HWY_INTDIV_SCALAR64 + if constexpr (sizeof(T) == 8) { + for (size_t i = 0; i < count; ++i) { + array[i] = static_cast(array[i] / divisor); + } + return; + } +#endif + + const ScalableTag d; + const size_t N = Lanes(d); + const auto params = ComputeDivisorParams(divisor); + + size_t i = 0; + for (; i + N <= count; i += N) { + const auto vec = LoadU(d, array + i); + const auto result = IntDiv(d, vec, params); + StoreU(result, d, array + i); + } + if (i < count) { + const size_t remaining = count - i; + const auto vec = LoadN(d, array + i, remaining); + const auto result = IntDiv(d, vec, params); + StoreN(result, d, array + i, remaining); + } +} + +template +HWY_INLINE void DivideArrayByScalar(T* HWY_RESTRICT array, size_t count, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + +#if HWY_INTDIV_SCALAR64 + if constexpr (sizeof(T) == 8) { + for (size_t i = 0; i < count; ++i) { + array[i] = static_cast(array[i] / divisor); + } + return; + } +#endif + + const ScalableTag d; + const size_t N = Lanes(d); + const auto params = ComputeDivisorParams(divisor); + + size_t i = 0; + for (; i + N <= count; i += N) { + const auto vec = LoadU(d, array + i); + const auto result = IntDiv(d, vec, params); + StoreU(result, d, array + i); + } + if (i < count) { + const size_t remaining = count - i; + const auto vec = LoadN(d, array + i, remaining); + const auto result = IntDiv(d, vec, params); + StoreN(result, d, array + i, remaining); + } +} + +template +HWY_INLINE void FloorDivideArrayByScalar(T* HWY_RESTRICT array, size_t count, T divisor) { + HWY_DASSERT(divisor != 0); + if (HWY_UNLIKELY(divisor == 0)) HWY_ABORT("intdiv: division by zero"); + +#if HWY_INTDIV_SCALAR64 + if constexpr (sizeof(T) == 8) { + for (size_t i = 0; i < count; ++i) { + const T a = array[i]; + T q = static_cast(a / divisor); + const T r = static_cast(a % divisor); + if (r != 0 && ((a < 0) != (divisor < 0))) --q; + array[i] = q; + } + return; + } +#endif + + const ScalableTag d; + const size_t N = Lanes(d); + const auto params = ComputeDivisorParams(divisor); + + size_t i = 0; + for (; i + N <= count; i += N) { + const auto vec = LoadU(d, array + i); + const auto result = IntDivFloor(d, vec, params); + StoreU(result, d, array + i); + } + if (i < count) { + const size_t remaining = count - i; + const auto vec = LoadN(d, array + i, remaining); + const auto result = IntDivFloor(d, vec, params); + StoreN(result, d, array + i, remaining); + } +} + +template +HWY_INLINE void FloorDivideArrayByScalar(T* HWY_RESTRICT array, size_t count, T divisor) { + DivideArrayByScalar(array, count, divisor); // Same for unsigned +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // per-target include guard diff --git a/hwy/contrib/intdiv/intdiv.h b/hwy/contrib/intdiv/intdiv.h new file mode 100644 index 0000000000..7e5aa5b840 --- /dev/null +++ b/hwy/contrib/intdiv/intdiv.h @@ -0,0 +1,129 @@ +// Copyright 2024 Google LLC +// Copyright 2026 Fujitsu Limited +// SPDX-License-Identifier: Apache-2.0 +// SPDX-License-Identifier: BSD-3-Clause +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_H_ +#define HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_H_ + +#include "hwy/highway.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/intdiv/intdiv-inl.h" + +#if HWY_ONCE + +namespace hwy { + +template +using VecD = HWY_NAMESPACE::Vec; + +template +using TFromD_ = HWY_NAMESPACE::TFromD; + +template +using DivisorParamsU = HWY_NAMESPACE::DivisorParamsU; + +template +using DivisorParamsS = HWY_NAMESPACE::DivisorParamsS; + +template +HWY_INLINE DivisorParamsS ComputeDivisorParams(T d) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "ComputeDivisorParams only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::ComputeDivisorParams(d); +} + +template +HWY_INLINE DivisorParamsU ComputeDivisorParams(T d) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "ComputeDivisorParams only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::ComputeDivisorParams(d); +} + +template , typename T = TFromD_> +HWY_INLINE V IntDiv(D d, V a, const DivisorParamsU& p) { + static_assert(!hwy::IsSigned(), "DivisorParamsU requires unsigned lane type"); + return HWY_NAMESPACE::IntDiv(d, a, p); +} + +template , typename T = TFromD_> +HWY_INLINE V IntDiv(D d, V a, const DivisorParamsS& p) { + static_assert(hwy::IsSigned(), "DivisorParamsS requires signed lane type"); + return HWY_NAMESPACE::IntDiv(d, a, p); +} + +template , typename T = TFromD_> +HWY_INLINE V IntDivFloor(D d, V a, const DivisorParamsU& p) { + static_assert(!hwy::IsSigned(), "DivisorParamsU requires unsigned lane type"); + return HWY_NAMESPACE::IntDivFloor(d, a, p); +} +template , typename T = TFromD_> +HWY_INLINE V IntDivFloor(D d, V a, const DivisorParamsS& p) { + static_assert(hwy::IsSigned(), "DivisorParamsS requires signed lane type"); + return HWY_NAMESPACE::IntDivFloor(d, a, p); +} + +template , typename T = TFromD_, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V DivideByScalar(D d, V a, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "DivideByScalar only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::DivideByScalar(d, a, div); +} + +template , typename T = TFromD_, HWY_IF_SIGNED_D(D)> +HWY_INLINE V DivideByScalar(D d, V a, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "DivideByScalar only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::DivideByScalar(d, a, div); +} + +template , typename T = TFromD_, HWY_IF_UNSIGNED_D(D)> +HWY_INLINE V FloorDivideByScalar(D d, V a, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "FloorDivideByScalar only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::FloorDivideByScalar(d, a, div); +} + +template , typename T = TFromD_, HWY_IF_SIGNED_D(D)> +HWY_INLINE V FloorDivideByScalar(D d, V a, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "FloorDivideByScalar only supports 8/16/32/64-bit integers"); + return HWY_NAMESPACE::FloorDivideByScalar(d, a, div); +} + +template +HWY_INLINE void DivideArrayByScalar(T* HWY_RESTRICT arr, size_t n, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "DivideArrayByScalar only supports 8/16/32/64-bit integers"); + HWY_NAMESPACE::DivideArrayByScalar(arr, n, div); +} + +template +HWY_INLINE void DivideArrayByScalar(T* HWY_RESTRICT arr, size_t n, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "DivideArrayByScalar only supports 8/16/32/64-bit integers"); + HWY_NAMESPACE::DivideArrayByScalar(arr, n, div); +} + +template +HWY_INLINE void FloorDivideArrayByScalar(T* HWY_RESTRICT arr, size_t n, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "FloorDivideArrayByScalar only supports 8/16/32/64-bit integers"); + HWY_NAMESPACE::FloorDivideArrayByScalar(arr, n, div); +} + +template +HWY_INLINE void FloorDivideArrayByScalar(T* HWY_RESTRICT arr, size_t n, T div) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "FloorDivideArrayByScalar only supports 8/16/32/64-bit integers"); + HWY_NAMESPACE::FloorDivideArrayByScalar(arr, n, div); +} + +} // namespace hwy + +#endif // HWY_ONCE + +#endif // HIGHWAY_HWY_CONTRIB_INTDIV_INTDIV_H_ diff --git a/hwy/contrib/intdiv/intdiv_test.cc b/hwy/contrib/intdiv/intdiv_test.cc new file mode 100644 index 0000000000..478dd489c2 --- /dev/null +++ b/hwy/contrib/intdiv/intdiv_test.cc @@ -0,0 +1,726 @@ +// Copyright 2024 Google LLC +// Copyright 2026 Fujitsu Limited +// SPDX-License-Identifier: Apache-2.0 +// SPDX-License-Identifier: BSD-3-Clause +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include + +#include +#include +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/intdiv/intdiv_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/intdiv/intdiv-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +template +T RandWithin(hwy::RandomState& rng) { + if constexpr (sizeof(T) <= 4) { + return static_cast(Random32(&rng)); + } else { + return static_cast(Random64(&rng)); + } +} + +template +T SafeFloorDivScalar(T a, T b) { + if constexpr (!hwy::IsSigned()) { + return static_cast(a / b); + } else { + if (b == T(-1) && a == std::numeric_limits::min()) { + return T(); + } + const T q = static_cast(a / b); + const T r = static_cast(a % b); + const bool adjust = (r != T(0)) && ((a < T(0)) != (b < T(0))); + return static_cast(q - (adjust ? T(1) : T(0))); + } +} + +struct TestBasicDivision { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + if (count == 0) return; + + using T = TFromD; + const size_t N = Lanes(d); + + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + AlignedFreeUniquePtr actual = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected && actual); + + T* a = pa.get() + misalign_a; + + const T divisors[] = {T(1), T(2), T(3), T(5), T(7), T(10), T(11), T(13), + T(16), T(17), T(25), T(31), T(32), T(64), T(100), + T(127), T(128), T(255), T(256), T(1000)}; + + for (T divisor : divisors) { + if (divisor == T(0)) continue; + + const auto params = ComputeDivisorParams(divisor); + + HWY_ASSERT_EQ(divisor, params.divisor); + if constexpr (!hwy::IsSigned()) { + if (detail::IsPow2(divisor)) { + HWY_ASSERT(params.is_pow2); + } + } + for (size_t i = 0; i < count; ++i) { + a[i] = static_cast((static_cast(i) * 123u) & 255u); + expected[i] = static_cast(a[i] / divisor); + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = IntDiv(d, v, params); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = IntDiv(d, v, params); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], actual[i]); + } + } + } +}; + +struct TestPowerOf2Division { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + if (count == 0) return; + + using T = TFromD; + const size_t N = Lanes(d); + + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + AlignedFreeUniquePtr actual = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected && actual); + + T* a = pa.get() + misalign_a; + + const int max_shift = static_cast(sizeof(T) * 8 - hwy::IsSigned()) - 1; + + for (int shift = 0; shift <= max_shift; ++shift) { + const T divisor = static_cast(T(1) << shift); + if (divisor <= T(0)) break; + + const auto params = ComputeDivisorParams(divisor); + HWY_ASSERT(params.is_pow2); + HWY_ASSERT_EQ(shift, params.pow2_shift); + + for (size_t i = 0; i < count; ++i) { + if constexpr (hwy::IsSigned()) { + a[i] = static_cast(static_cast(i) - static_cast(count / 2)); + } else { + a[i] = static_cast(i); + } + expected[i] = static_cast(a[i] / divisor); + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = IntDiv(d, v, params); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = IntDiv(d, v, params); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], actual[i]); + } + } + } +}; + +struct TestSignedDivision { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + using T = TFromD; + if (!hwy::IsSigned()) return; + if (count == 0) return; + + const size_t N = Lanes(d); + + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + AlignedFreeUniquePtr actual = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected && actual); + + T* a = pa.get() + misalign_a; + + const T divisors[] = {T(3), T(5), T(7), T(-3), T(-5), T(-7), + T(17), T(-17), T(100), T(-100)}; + + for (T divisor : divisors) { + const auto params = ComputeDivisorParams(divisor); + + for (size_t i = 0; i < count; ++i) { + a[i] = static_cast(static_cast(i) - static_cast(count / 2)); + + if (divisor == T(-1) && a[i] == std::numeric_limits::min()) { + expected[i] = T(); + } else { + expected[i] = static_cast(a[i] / divisor); + } + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = IntDiv(d, v, params); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = IntDiv(d, v, params); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + if (divisor == T(-1) && a[i] == std::numeric_limits::min()) { + continue; + } + HWY_ASSERT_EQ(expected[i], actual[i]); + } + } + } +}; + +struct TestFloorDivision { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + if (count == 0) return; + + using T = TFromD; + const size_t N = Lanes(d); + + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + AlignedFreeUniquePtr actual = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected && actual); + + T* a = pa.get() + misalign_a; + + std::vector divisors = {T(1), T(2), T(3), T(5), T(7), T(11), T(17), T(100)}; + if constexpr (hwy::IsSigned()) { + std::vector neg = {T(-1), T(-2), T(-3), T(-5), T(-7), T(-11), T(-17)}; + divisors.insert(divisors.end(), neg.begin(), neg.end()); + } + + for (T divisor : divisors) { + const auto params = ComputeDivisorParams(divisor); + + for (size_t i = 0; i < count; ++i) { + if constexpr (hwy::IsSigned()) { + a[i] = static_cast(static_cast(i) - 50); + } else { + a[i] = static_cast(i); + } + expected[i] = SafeFloorDivScalar(a[i], divisor); + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = IntDivFloor(d, v, params); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = IntDivFloor(d, v, params); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], actual[i]); + } + } + } +}; + +struct TestEdgeCases { + template + void operator()(D d, size_t /*count*/, size_t /*misalign_a*/, + size_t /*misalign_b*/, RandomState& /*rng*/) { + using T = TFromD; + + { + const auto params = ComputeDivisorParams(T(1)); + for (T val : {T(0), T(1), T(100), T(std::numeric_limits::max())}) { + HWY_ASSERT_EQ(val, GetLane(IntDiv(d, Set(d, val), params))); + } + } + + { + const T divisor = std::numeric_limits::max(); + const auto params = ComputeDivisorParams(divisor); + HWY_ASSERT_EQ(T(0), GetLane(IntDiv(d, Set(d, T(100)), params))); + HWY_ASSERT_EQ(T(1), GetLane(IntDiv(d, Set(d, divisor), params))); + } + + if constexpr (hwy::IsSigned()) { + const T kMin = std::numeric_limits::min(); + const T kMax = std::numeric_limits::max(); + + { + const auto params = ComputeDivisorParams(T(-1)); + HWY_ASSERT_EQ(kMin, GetLane(IntDiv(d, Set(d, kMin), params))); + HWY_ASSERT_EQ(T(0), GetLane(IntDiv(d, Set(d, T(0)), params))); + HWY_ASSERT_EQ(T(-1), GetLane(IntDiv(d, Set(d, T(1)), params))); + } + + { + const auto params = ComputeDivisorParams(T(7)); + HWY_ASSERT_EQ(static_cast(kMax / T(7)), + GetLane(IntDiv(d, Set(d, kMax), params))); + HWY_ASSERT_EQ(static_cast((kMin + 1) / T(7)), + GetLane(IntDiv(d, Set(d, static_cast(kMin + 1)), params))); + } + } + + if constexpr (sizeof(T) == 4) { + for (T divisor : {T(65535), T(65536)}) { + const auto params = ComputeDivisorParams(divisor); + for (T dividend : {T(0), T(1), T(100)}) { + HWY_ASSERT_EQ(static_cast(dividend / divisor), + GetLane(IntDiv(d, Set(d, dividend), params))); + } + } + } else if constexpr (sizeof(T) == 8 && !hwy::IsSigned()) { + for (T divisor : {T(0xFFFFFFFFull), T(0x100000000ull)}) { + const auto params = ComputeDivisorParams(divisor); + for (T dividend : {T(0), T(1), T(100)}) { + HWY_ASSERT_EQ(static_cast(dividend / divisor), + GetLane(IntDiv(d, Set(d, dividend), params))); + } + } + } + } +}; + + +struct TestRandomDivision { + template + void operator()(D d, size_t /*count*/, size_t /*misalign_a*/, + size_t /*misalign_b*/, RandomState& rng) { + using T = TFromD; + + std::vector divisors = {T(3), T(7), T(17), T(100), T(1000)}; + if constexpr (hwy::IsSigned()) { + divisors.insert(divisors.end(), {T(-3), T(-7), T(-17)}); + } + + for (T divisor : divisors) { + const auto params = ComputeDivisorParams(divisor); + + for (int iter = 0; iter < 100; ++iter) { + const T dividend = RandWithin(rng); + + if constexpr (hwy::IsSigned()) { + if (divisor == T(-1) && dividend == std::numeric_limits::min()) { + continue; + } + } + + const T expected = static_cast(dividend / divisor); + HWY_ASSERT_EQ(expected, GetLane(IntDiv(d, Set(d, dividend), params))); + + const T expected_floor = SafeFloorDivScalar(dividend, divisor); + HWY_ASSERT_EQ(expected_floor, + GetLane(IntDivFloor(d, Set(d, dividend), params))); + } + } + } +}; + + +struct TestLargeDivisors { + template + void operator()(D d, size_t /*count*/, size_t /*misalign_a*/, + size_t /*misalign_b*/, RandomState& rng) { + using T = TFromD; + + std::vector large_divisors; + + if constexpr (sizeof(T) == 1) { + if constexpr (hwy::IsSigned()) { + large_divisors = {T(125), T(126), T(127), T(-125), T(-126), T(-127), + std::numeric_limits::min()}; + } else { + large_divisors = {T(250), T(251), T(252), T(253), T(254), T(255)}; + } + } else if constexpr (sizeof(T) == 2) { + if constexpr (hwy::IsSigned()) { + large_divisors = {T(32765), T(32766), T(32767), + T(-32765), T(-32766), T(-32767), + std::numeric_limits::min()}; + } else { + large_divisors = {T(65530), T(65531), T(65532), T(65533), T(65534), T(65535)}; + } + } else if constexpr (sizeof(T) == 4) { + if constexpr (hwy::IsSigned()) { + const T kMin = std::numeric_limits::min(); + const T kMax = std::numeric_limits::max(); + large_divisors = {static_cast(kMax - 2), static_cast(kMax - 1), kMax, + static_cast(kMin + 3), static_cast(kMin + 2), + static_cast(kMin + 1), kMin}; + } else { + const T kMax = std::numeric_limits::max(); + large_divisors = {static_cast(kMax - 5), static_cast(kMax - 4), + static_cast(kMax - 3), static_cast(kMax - 2), + static_cast(kMax - 1), kMax}; + } + } else if constexpr (sizeof(T) == 8) { + if constexpr (hwy::IsSigned()) { + const T kMin = std::numeric_limits::min(); + const T kMax = std::numeric_limits::max(); + large_divisors = {static_cast(kMax - 2), static_cast(kMax - 1), kMax, + static_cast(kMin + 3), static_cast(kMin + 2), + static_cast(kMin + 1), kMin}; + } else { + const T kMax = std::numeric_limits::max(); + large_divisors = {static_cast(kMax - 5), static_cast(kMax - 4), + static_cast(kMax - 3), static_cast(kMax - 2), + static_cast(kMax - 1), kMax}; + } + } + + for (T divisor : large_divisors) { + const auto params = ComputeDivisorParams(divisor); + + for (int iter = 0; iter < 20; ++iter) { + T dividend = RandWithin(rng); + + T expected; + if constexpr (hwy::IsSigned()) { + if (divisor == T(-1) && dividend == std::numeric_limits::min()) { + continue; + } + expected = static_cast(dividend / divisor); + } else { + expected = static_cast(dividend / divisor); + } + + const T actual = GetLane(IntDiv(d, Set(d, dividend), params)); + HWY_ASSERT_EQ(expected, actual); + + const T expected_floor = SafeFloorDivScalar(dividend, divisor); + const T actual_floor = GetLane(IntDivFloor(d, Set(d, dividend), params)); + HWY_ASSERT_EQ(expected_floor, actual_floor); + } + + std::vector boundary_dividends; + if constexpr (hwy::IsSigned()) { + boundary_dividends = {T(0), T(1), T(-1), + std::numeric_limits::max()}; + + if (divisor != T(-1)) { + boundary_dividends.push_back(std::numeric_limits::min()); + } + } else { + boundary_dividends = {T(0), T(1), std::numeric_limits::max()}; + } + + for (T dividend : boundary_dividends) { + + const T expected = static_cast(dividend / divisor); + const T actual = GetLane(IntDiv(d, Set(d, dividend), params)); + HWY_ASSERT_EQ(expected, actual); + + const T expected_floor = SafeFloorDivScalar(dividend, divisor); + const T actual_floor = GetLane(IntDivFloor(d, Set(d, dividend), params)); + HWY_ASSERT_EQ(expected_floor, actual_floor); + } + } + } +}; + + +struct TestConvenienceAPI { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + if (count == 0) return; + + using T = TFromD; + const size_t N = Lanes(d); + + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + AlignedFreeUniquePtr actual = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected && actual); + + T* a = pa.get() + misalign_a; + + const T divisor = T(7); + + for (size_t i = 0; i < count; ++i) { + a[i] = static_cast(i * T(10)); + expected[i] = static_cast(a[i] / divisor); + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = DivideByScalar(d, v, divisor); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = DivideByScalar(d, v, divisor); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], actual[i]); + } + + if constexpr (hwy::IsSigned()) { + for (size_t i = 0; i < count; ++i) { + expected[i] = SafeFloorDivScalar(a[i], divisor); + } + + for (size_t i = 0; i + N <= count; i += N) { + const auto v = LoadU(d, a + i); + const auto q = FloorDivideByScalar(d, v, divisor); + StoreU(q, d, actual.get() + i); + } + + if (count % N != 0) { + const size_t i = count - (count % N); + const size_t remaining = count - i; + const auto v = LoadN(d, a + i, remaining); + const auto q = FloorDivideByScalar(d, v, divisor); + StoreN(q, d, actual.get() + i, remaining); + } + + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], actual[i]); + } + } + } +}; + +struct TestArrayOperations { + template + void operator()(D /*d*/, size_t /*count*/, size_t /*misalign_a*/, + size_t /*misalign_b*/, RandomState& /*rng*/) { + using T = TFromD; + + constexpr size_t kCount = 127; + auto array = AllocateAligned(kCount); + auto expected = AllocateAligned(kCount); + HWY_ASSERT(array && expected); + + const T divisor = T(11); + + for (size_t i = 0; i < kCount; ++i) { + array[i] = static_cast((static_cast(i) * 13u) & 255u); + expected[i] = static_cast(array[i] / divisor); + } + + DivideArrayByScalar(array.get(), kCount, divisor); + + for (size_t i = 0; i < kCount; ++i) { + HWY_ASSERT_EQ(expected[i], array[i]); + } + + if constexpr (hwy::IsSigned()) { + constexpr size_t kCount2 = 100; + auto array2 = AllocateAligned(kCount2); + auto expected2 = AllocateAligned(kCount2); + HWY_ASSERT(array2 && expected2); + + const T divisor2 = T(-7); + + for (size_t i = 0; i < kCount2; ++i) { + array2[i] = static_cast(static_cast(i) - 50); + expected2[i] = SafeFloorDivScalar(array2[i], divisor2); + } + + FloorDivideArrayByScalar(array2.get(), kCount2, divisor2); + + for (size_t i = 0; i < kCount2; ++i) { + HWY_ASSERT_EQ(expected2[i], array2[i]); + } + } + } +}; + +#if HWY_HAVE_DIV128 +struct TestDiv128HighBy { + HWY_INLINE void operator()() const { + using detail::Div128HighBy; + + { + const uint64_t out = Div128HighBy(1ull, 3ull); + HWY_ASSERT_EQ(0x5555555555555555ull, out); + } + + { + // Must keep high < divisor. Otherwise the 128/64 quotient does not fit + // in uint64_t, and MSVC _udiv128 / x86 div can trap. + const uint64_t high = 1ull << 62; + const uint64_t div = 1ull << 63; + const uint64_t out = Div128HighBy(high, div); + HWY_ASSERT_EQ(1ull << 63, out); + } + + { + const uint64_t out = Div128HighBy(1ull, ~0ull); + HWY_ASSERT_EQ(1ull, out); + } + } +}; +#endif // HWY_HAVE_DIV128 + +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + + for (size_t count : {10 * N, 16 * N, size_t{100}}) { + for (size_t ma : misalignments) { + Test()(d, count, ma, 0, rng); + } + } + } +}; + +void TestAllBasicDivision() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllPowerOf2Division() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllSignedDivision() { + ForSignedTypes(ForPartialVectors>()); +} + +void TestAllFloorDivision() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllEdgeCases() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllRandomDivision() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllLargeDivisors() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllConvenienceAPI() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllArrayOperations() { + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllDiv128HighBy() { +#if HWY_HAVE_DIV128 + TestDiv128HighBy{}(); +#endif // HWY_HAVE_DIV128 +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { + +HWY_BEFORE_TEST(IntDivTest); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllBasicDivision); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllPowerOf2Division); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllSignedDivision); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllFloorDivision); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllEdgeCases); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllRandomDivision); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllLargeDivisors); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllConvenienceAPI); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllArrayOperations); +HWY_EXPORT_AND_TEST_P(IntDivTest, TestAllDiv128HighBy); +HWY_AFTER_TEST(); + +} // namespace +} // namespace hwy + +HWY_TEST_MAIN(); +#endif // HWY_ONCE diff --git a/meson.build b/meson.build index 18ca70cd06..f79fdf33f7 100644 --- a/meson.build +++ b/meson.build @@ -129,6 +129,8 @@ hwy_contrib_headers = files( 'hwy/contrib/dot/dot-inl.h', 'hwy/contrib/image/image.h', 'hwy/contrib/math/fast_math-inl.h', + 'hwy/contrib/intdiv/intdiv.h', + 'hwy/contrib/intdiv/intdiv-inl.h', 'hwy/contrib/math/math-inl.h', 'hwy/contrib/matvec/matvec-inl.h', 'hwy/contrib/random/random-inl.h', @@ -692,6 +694,7 @@ if tests_enabled 'hwy/contrib/thread_pool/thread_pool_test.cc', 'hwy/contrib/thread_pool/topology_test.cc', 'hwy/contrib/unroller/unroller_test.cc', + 'hwy/contrib/intdiv/intdiv_test.cc', ) foreach test_src : hwy_contrib_test_files