|
| 1 | +diff --git a/compat.c b/compat.c |
| 2 | +index b4b44ce..3f563dd 100644 |
| 3 | +--- a/compat.c |
| 4 | ++++ b/compat.c |
| 5 | +@@ -31,6 +31,12 @@ see https://www.gnu.org/licenses/. */ |
| 6 | + #include <stdio.h> |
| 7 | + #include "gmp-impl.h" |
| 8 | + |
| 9 | ++/* RUNTIMECPUID: cached results of the run-time CPUID probes declared in longlong.h */ |
| 10 | ++int bCheckedBMI = 0; /* set to 1 by hasBMI() once the BMI probe has started */ |
| 11 | ++int bBMI1 = 0; /* BMI1 flag: CPUID leaf 7, EBX bit 3 */ |
| 12 | ++int bBMI2 = 0; /* BMI2 flag: CPUID leaf 7, EBX bit 8 */ |
| 13 | ++int bCheckedLZCNT = 0; /* set to 1 by hasLZCNT() once the LZCNT probe has started */ |
| 14 | ++int bLZCNT = 0; /* LZCNT flag: CPUID leaf 0x80000001, ECX bit 5 */ |
| 15 | + |
| 16 | + /* mpn_divexact_by3 was a function in gmp 3.0.1, but as of gmp 3.1 it's a |
| 17 | + macro calling mpn_divexact_by3c. */ |
| 18 | +diff --git a/longlong.h b/longlong.h |
| 19 | +index edbaf56..c0a7468 100644 |
| 20 | +--- a/longlong.h |
| 21 | ++++ b/longlong.h |
| 22 | +@@ -1040,6 +1040,86 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); |
| 23 | + #endif /* 80x86 */ |
| 24 | + |
| 25 | + #if defined (__amd64__) && W_TYPE_SIZE == 64 |
| 26 | ++ |
| 27 | ++#ifndef RUNTIMECPUID |
| 28 | ++#define RUNTIMECPUID |
| 29 | ++ |
| 30 | ++extern int bCheckedBMI; |
| 31 | ++extern int bBMI1; |
| 32 | ++extern int bBMI2; |
| 33 | ++ |
| 34 | ++/* Execute CPUID for `leaf` with ECX = 0; info[] receives EAX, EBX, ECX, EDX (all 0 if no CPUID path is compiled in). */ |
| 35 | ++static inline void gmp_runtime_cpuid(int info[4], int leaf) |
| 36 | ++{ |
| 37 | ++  info[0] = info[1] = info[2] = info[3] = 0; |
| 38 | ++#if defined(_MSC_VER) |
| 39 | ++  __cpuid(info, leaf); |
| 40 | ++#elif defined(__GNUC__) || defined(__clang__) |
| 41 | ++#if defined(ARCH_X86) && defined(__PIC__) |
| 42 | ++  __asm__ __volatile__ ( |
| 43 | ++    "xchg{l} {%%}ebx, %k1;" |
| 44 | ++    "cpuid;" |
| 45 | ++    "xchg{l} {%%}ebx, %k1;" |
| 46 | ++    : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(leaf), "c"(0) |
| 47 | ++  ); |
| 48 | ++#else |
| 49 | ++  __asm__ __volatile__ ( |
| 50 | ++    "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(leaf), "c"(0) |
| 51 | ++  ); |
| 52 | ++#endif |
| 53 | ++#endif |
| 54 | ++} |
| 55 | ++/* Probe BMI1/BMI2 once and cache the answers in bBMI1/bBMI2; static so every translation unit gets a definition. */ |
| 56 | ++static inline void hasBMI(void) |
| 57 | ++{ |
| 58 | ++  int info[4]; |
| 59 | ++  if (bCheckedBMI) return; |
| 60 | ++  bCheckedBMI = 1; |
| 61 | ++  gmp_runtime_cpuid(info, 0);   /* EAX = highest supported basic leaf */ |
| 62 | ++  if (info[0] < 7) return;      /* no leaf 7: leave both flags at 0 */ |
| 63 | ++  gmp_runtime_cpuid(info, 0x7); |
| 64 | ++  bBMI1 = ((info[1] & (1 << 3)) != 0);  /* EBX bit 3 */ |
| 65 | ++  bBMI2 = ((info[1] & (1 << 8)) != 0);  /* EBX bit 8 */ |
| 66 | ++} |
| 67 | ++/* Return nonzero when BMI1 was detected (probing on first use). */ |
| 68 | ++static inline int hasBMI1(void) { hasBMI(); return bBMI1; } |
| 69 | ++/* Return nonzero when BMI2 was detected (probing on first use). */ |
| 70 | ++static inline int hasBMI2(void) { hasBMI(); return bBMI2; } |
| 71 | ++ |
| 72 | ++ |
| 73 | ++extern int bCheckedLZCNT; |
| 74 | ++extern int bLZCNT; |
| 75 | ++ |
| 76 | ++/* Return nonzero when CPUID leaf 0x80000001 reports LZCNT (ECX bit 5); the answer is cached in bLZCNT. */ |
| 77 | ++static inline int hasLZCNT(void) |
| 78 | ++{ |
| 79 | ++  if (bCheckedLZCNT) return bLZCNT; |
| 80 | ++  /* Mark the probe as done; later calls return the cached flag. */ |
| 81 | ++  bCheckedLZCNT = 1; |
| 82 | ++  int info[4] = {0}; |
| 83 | ++ #if defined(_MSC_VER) |
| 84 | ++  __cpuid(info, 0x80000001); |
| 85 | ++ #elif defined(__GNUC__) || defined(__clang__) |
| 86 | ++ #if defined(ARCH_X86) && defined(__PIC__) |
| 87 | ++  __asm__ __volatile__ ( |
| 88 | ++    "xchg{l} {%%}ebx, %k1;" |
| 89 | ++    "cpuid;" |
| 90 | ++    "xchg{l} {%%}ebx, %k1;" |
| 91 | ++    : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) |
| 92 | ++  ); |
| 93 | ++ #else |
| 94 | ++  __asm__ __volatile__ ( |
| 95 | ++    "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) |
| 96 | ++  ); |
| 97 | ++ #endif |
| 98 | ++ #endif |
| 99 | ++  /* info[] stays all-zero when neither compiler branch above is taken, so the flag is then 0. */ |
| 100 | ++  bLZCNT = ((info[2] & (1 << 5)) != 0); |
| 101 | ++  return bLZCNT; |
| 102 | ++} |
| 103 | ++ |
| 104 | ++#endif // RUNTIMECPUID |
| 105 | ++ |
| 106 | + #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ |
| 107 | + __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ |
| 108 | + : "=r" (sh), "=&r" (sl) \ |
| 109 | +@@ -1050,61 +1130,52 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); |
| 110 | + : "=r" (sh), "=&r" (sl) \ |
| 111 | + : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ |
| 112 | + "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) |
| 113 | +-#if X86_ASM_MULX \ |
| 114 | +- && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \ |
| 115 | +- || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen) |
| 116 | + #define umul_ppmm(w1, w0, u, v) \ |
| 117 | +- __asm__ ("mulx\t%3, %q0, %q1" \ |
| 118 | ++ do { if(hasBMI2()) { /* run-time BMI2 dispatch; do/while keeps the macro one statement */ \ |
| 119 | ++ __asm__ ("mulx\t%3, %q0, %q1" \ |
| 120 | + : "=r" (w0), "=r" (w1) \ |
| 121 | +- : "%d" ((UDItype)(u)), "rm" ((UDItype)(v))) |
| 122 | +-#else |
| 123 | +-#define umul_ppmm(w1, w0, u, v) \ |
| 124 | +- __asm__ ("mulq\t%3" \ |
| 125 | ++ : "%d" ((UDItype)(u)), "rm" ((UDItype)(v))); \ |
| 126 | ++ } else { /* no BMI2: use mulq */ \ |
| 127 | ++ __asm__ ("mulq\t%3" \ |
| 128 | + : "=a" (w0), "=d" (w1) \ |
| 129 | +- : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) |
| 130 | +-#endif |
| 131 | ++ : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))); \ |
| 132 | ++ } } while (0) |
| 133 | + #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ |
| 134 | + __asm__ ("divq %4" /* stringification in K&R C */ \ |
| 135 | + : "=a" (q), "=d" (r) \ |
| 136 | + : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) |
| 137 | + |
| 138 | +-#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \ |
| 139 | +- || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \ |
| 140 | +- || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \ |
| 141 | +- || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar |
| 142 | + #define count_leading_zeros(count, x) \ |
| 143 | +- do { \ |
| 144 | +- /* This is lzcnt, spelled for older assemblers. Destination and */ \ |
| 145 | +- /* source must be a 64-bit registers, hence cast and %q. */ \ |
| 146 | +- __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ |
| 147 | +- } while (0) |
| 148 | ++ do { /* single statement, safe in unbraced if/else */ \ |
| 149 | ++   UDItype __clz_x = (x); /* evaluate the argument once */ \ |
| 150 | ++   if (hasLZCNT()) { \ |
| 151 | ++     /* This is lzcnt, spelled for older assemblers. Destination and */ \ |
| 152 | ++     /* source must be a 64-bit registers, hence %q. */ \ |
| 153 | ++     __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" (__clz_x)); \ |
| 154 | ++   } else if (__clz_x == 0) { \ |
| 155 | ++     (count) = 64; /* bsr is undefined for 0; honor COUNT_LEADING_ZEROS_0 */ \ |
| 156 | ++   } else { \ |
| 157 | ++     UDItype __cbtmp; \ |
| 158 | ++     __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" (__clz_x)); \ |
| 159 | ++     (count) = __cbtmp ^ 63; /* bit index -> number of leading zeros */ \ |
| 160 | ++   } \ |
| 161 | ++ } while (0) |
| 162 | + #define COUNT_LEADING_ZEROS_0 64 |
| 163 | +-#else |
| 164 | +-#define count_leading_zeros(count, x) \ |
| 165 | +- do { \ |
| 166 | +- UDItype __cbtmp; \ |
| 167 | +- ASSERT ((x) != 0); \ |
| 168 | +- __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ |
| 169 | +- (count) = __cbtmp ^ 63; \ |
| 170 | +- } while (0) |
| 171 | +-#endif |
| 172 | + |
| 173 | +-#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \ |
| 174 | +- || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar |
| 175 | + #define count_trailing_zeros(count, x) \ |
| 176 | +- do { \ |
| 177 | +- /* This is tzcnt, spelled for older assemblers. Destination and */ \ |
| 178 | +- /* source must be a 64-bit registers, hence cast and %q. */ \ |
| 179 | +- __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ |
| 180 | +- } while (0) |
| 181 | ++ do { /* single statement, safe in unbraced if/else */ \ |
| 182 | ++   UDItype __ctz_x = (x); /* evaluate the argument once */ \ |
| 183 | ++   if (hasBMI1()) { \ |
| 184 | ++     /* This is tzcnt, spelled for older assemblers. Destination and */ \ |
| 185 | ++     /* source must be a 64-bit registers, hence %q. */ \ |
| 186 | ++     __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" (__ctz_x)); \ |
| 187 | ++   } else if (__ctz_x == 0) { \ |
| 188 | ++     (count) = 64; /* bsf is undefined for 0; honor COUNT_TRAILING_ZEROS_0 */ \ |
| 189 | ++   } else { \ |
| 190 | ++     __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" (__ctz_x)); \ |
| 191 | ++   } \ |
| 192 | ++ } while (0) |
| 193 | + #define COUNT_TRAILING_ZEROS_0 64 |
| 194 | +-#else |
| 195 | +-#define count_trailing_zeros(count, x) \ |
| 196 | +- do { \ |
| 197 | +- ASSERT ((x) != 0); \ |
| 198 | +- __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ |
| 199 | +- } while (0) |
| 200 | +-#endif |
| 201 | + #endif /* __amd64__ */ |
| 202 | + |
| 203 | + #if defined (__i860__) && W_TYPE_SIZE == 32 |
| 204 | +diff --git a/mpz/inp_raw.c b/mpz/inp_raw.c |
| 205 | +index 378c42b..f88fea9 100644 |
| 206 | +--- a/mpz/inp_raw.c |
| 207 | ++++ b/mpz/inp_raw.c |
| 208 | +@@ -88,8 +88,11 @@ mpz_inp_raw (mpz_ptr x, FILE *fp) |
| 209 | + |
| 210 | + abs_csize = ABS (csize); |
| 211 | + |
| 212 | ++ if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8)) |
| 213 | ++ return 0; /* Bit size overflows */ |
| 214 | ++ |
| 215 | + /* round up to a multiple of limbs */ |
| 216 | +- abs_xsize = BITS_TO_LIMBS (abs_csize*8); |
| 217 | ++ abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8); |
| 218 | + |
| 219 | + if (abs_xsize != 0) |
| 220 | + { |
0 commit comments