Skip to content
This repository was archived by the owner on Oct 12, 2022. It is now read-only.

Commit 4563b96

Browse files
committed
core.int128: Optimize udivmod()
1 parent 55be6b1 commit 4563b96

1 file changed

Lines changed: 132 additions & 20 deletions

File tree

src/core/int128.d

Lines changed: 132 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -453,36 +453,135 @@ pure
453453
Cent udivmod(Cent c1, Cent c2, out Cent modulus)
454454
{
455455
//printf("udiv c1(%llx,%llx) c2(%llx,%llx)\n", c1.lo, c1.hi, c2.lo, c2.hi);
456+
// Based on "Unsigned Doubleword Division" in Hacker's Delight
457+
import core.bitop;
458+
459+
// Divides a 128-bit dividend by a 64-bit divisor.
460+
// The result must fit in 64 bits.
461+
static U udivmod128_64(Cent c1, U c2, out U modulus)
462+
{
463+
// We work in base 2^^32
464+
enum base = 1UL << 32;
465+
enum divmask = (1UL << (Ubits / 2)) - 1;
466+
enum divshift = Ubits / 2;
467+
468+
// Check for overflow and divide by 0
469+
if (c1.hi >= c2)
470+
{
471+
modulus = 0UL;
472+
return ~0UL;
473+
}
474+
475+
// Computes [num1 num0] / den
476+
static uint udiv96_64(U num1, uint num0, U den)
477+
{
478+
// Extract both digits of the denominator
479+
const den1 = cast(uint)(den >> divshift);
480+
const den0 = cast(uint)(den & divmask);
481+
// Estimate ret as num1 / den1, and then correct it
482+
U ret = num1 / den1;
483+
const t2 = (num1 % den1) * base + num0;
484+
const t1 = ret * den0;
485+
if (t1 > t2)
486+
ret -= (t1 - t2 > den) ? 2 : 1;
487+
return cast(uint)ret;
488+
}
489+
490+
// Determine the normalization factor. We multiply c2 by this, so that its leading
491+
// digit is at least half base. In binary this means just shifting left by the number
492+
// of leading zeros, so that there's a 1 in the MSB.
493+
// We also shift number by the same amount. This cannot overflow because c1.hi < c2.
494+
const shift = (Ubits - 1) - bsr(c2);
495+
c2 <<= shift;
496+
U num2 = c1.hi;
497+
num2 <<= shift;
498+
num2 |= (c1.lo >> (-shift & 63)) & (-cast(I)shift >> 63);
499+
c1.lo <<= shift;
500+
501+
// Extract the low digits of the numerator (after normalizing)
502+
const num1 = cast(uint)(c1.lo >> divshift);
503+
const num0 = cast(uint)(c1.lo & divmask);
504+
505+
// Compute q1 = [num2 num1] / c2
506+
const q1 = udiv96_64(num2, num1, c2);
507+
// Compute the true (partial) remainder
508+
const rem = num2 * base + num1 - q1 * c2;
509+
// Compute q0 = [rem num0] / c2
510+
const q0 = udiv96_64(rem, num0, c2);
511+
512+
modulus = (rem * base + num0 - q0 * c2) >> shift;
513+
return (cast(U)q1 << divshift) | q0;
514+
}
515+
516+
// Special cases
456517
if (!tst(c2))
457518
{
458519
// Divide by zero
459520
modulus = Zero;
460521
return com(modulus);
461522
}
462-
463-
// left justify c2
464-
uint shifts = 1;
465-
while (cast(I)c2.hi >= 0)
523+
if (c1.hi == 0 && c2.hi == 0)
524+
{
525+
// Single precision divide
526+
modulus = Cent(c1.lo % c2.lo);
527+
return Cent(c1.lo / c2.lo);
528+
}
529+
if (c1.hi == 0)
530+
{
531+
// Numerator is smaller than the divisor
532+
modulus = c1;
533+
return Zero;
534+
}
535+
if (c2.hi == 0)
466536
{
467-
c2 = shl1(c2);
468-
++shifts;
537+
// Divisor is a 64-bit value, so we just need one 128/64 division.
538+
// If c1 / c2 would overflow, break c1 up into two halves.
539+
const q1 = (c1.hi < c2.lo) ? 0 : (c1.hi / c2.lo);
540+
if (q1)
541+
c1.hi = c1.hi % c2.lo;
542+
U rem;
543+
const q0 = udivmod128_64(c1, c2.lo, rem);
544+
modulus = Cent(rem);
545+
return Cent(q0, q1);
469546
}
470547

471-
// subtract and shift, just like 3rd grade long division
472-
Cent quotient;
473-
while (shifts--)
548+
// Full cent precision division.
549+
// Here c2 >= 2^^64
550+
// We know that c2.hi != 0, so count leading zeros is OK
551+
// We have 0 <= shift <= 63
552+
const shift = (Ubits - 1) - bsr(c2.hi);
553+
554+
// Normalize the divisor so its MSB is 1
555+
// v1 = (c2 << shift) >> 64
556+
U v1 = shl(c2, shift).hi;
557+
558+
// To ensure no overflow.
559+
Cent u1 = shr1(c1);
560+
561+
// Get quotient from divide unsigned operation.
562+
U rem_ignored;
563+
const q1 = udivmod128_64(u1, v1, rem_ignored);
564+
565+
// Undo normalization and division of c1 by 2.
566+
Cent quotient = shr(shl(Cent(q1), shift), 63);
567+
568+
// Make quotient correct or too small by 1
569+
if (tst(quotient))
570+
quotient = dec(quotient);
571+
572+
// Now quotient is correct.
573+
// Compute rem = c1 - (quotient * c2);
574+
Cent rem = sub(c1, mul(quotient, c2));
575+
576+
// Check if remainder is larger than the divisor
577+
if (uge(rem, c2))
474578
{
475-
//printf("shifts %d c1(%llx,%llx) c2(%llx,%llx)\n", shifts, c1.lo, c1.hi, c2.lo, c2.hi);
476-
quotient = shl1(quotient);
477-
if (uge(c1, c2))
478-
{
479-
//printf("sub\n");
480-
c1 = sub(c1, c2);
481-
quotient.lo |= 1;
482-
}
483-
c2 = shr1(c2);
579+
// Increment quotient
580+
quotient = inc(quotient);
581+
// Subtract c2 from remainder
582+
rem = sub(rem, c2);
484583
}
485-
modulus = c1;
584+
modulus = rem;
486585
//printf("quotient "); print(quotient);
487586
//printf("modulus "); print(modulus);
488587
return quotient;
@@ -705,6 +804,9 @@ unittest
705804

706805
enum Cs_3 = Cent(3, I.min);
707806

807+
const Cbig_1 = Cent(0xa3ccac1832952398, 0xc3ac542864f652f8);
808+
const Cbig_2 = Cent(0x5267b85f8a42fc20, 0);
809+
708810
/************************/
709811

710812
assert( ugt(C1, C0) );
@@ -776,19 +878,29 @@ unittest
776878
assert(udivmod(C10,C2, modulus) == C5); assert(modulus == C0);
777879
assert(udivmod(C10,C3, modulus) == C3); assert(modulus == C1);
778880
assert(udivmod(C10,C0, modulus) == Cm1); assert(modulus == C0);
881+
assert(udivmod(C2,C90_30, modulus) == C0); assert(modulus == C2);
882+
assert(udiv(mul(C90_30, C2), C2) == C90_30);
883+
assert(udiv(mul(C90_30, C2), C90_30) == C2);
779884

780885
assert(div(C10,C3) == C3);
781886
assert(divmod( C10, C3, modulus) == C3); assert(modulus == C1);
782887
assert(divmod(Cm10, C3, modulus) == Cm3); assert(modulus == Cm1);
783888
assert(divmod( C10, Cm3, modulus) == Cm3); assert(modulus == C1);
784889
assert(divmod(Cm10, Cm3, modulus) == C3); assert(modulus == Cm1);
890+
assert(divmod(C2, C90_30, modulus) == C0); assert(modulus == C2);
891+
assert(div(mul(C90_30, C2), C2) == C90_30);
892+
assert(div(mul(C90_30, C2), C90_30) == C2);
893+
894+
assert(divmod(Cbig_1, Cbig_2, modulus) == Cent(0x4496aa309d4d4a2f, U.max));
895+
assert(modulus == Cent(0xd83203d0fdc799b8, U.max));
896+
assert(udivmod(Cbig_1, Cbig_2, modulus) == Cent(0x5fe0e9bace2bedad, 2));
897+
assert(modulus == Cent(0x2c923125a68721f8, 0));
785898

786899
assert(mul(Cm10, C1) == Cm10);
787900
assert(mul(C1, Cm10) == Cm10);
788901
assert(mul(C9_3, C10) == C90_30);
789902
assert(mul(Cs_3, C10) == C30);
790903
assert(mul(Cm10, Cm10) == C100);
791-
assert(div(mul(C90_30, C2), C2) == C90_30);
792904

793905
assert( or(C4_8, C3_1) == C7_9);
794906
assert(and(C4_8, C7_9) == C4_8);

0 commit comments

Comments
 (0)