|
| 1 | +//===-- extendsfdf2.S - single- to double-precision FP conversion ---------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This file implements the __extendsfdf2 function (single to double precision |
| 10 | +// floating point conversion) for the Arm and Thumb2 ISAs. |
| 11 | +// |
| 12 | +//===----------------------------------------------------------------------===// |
| 13 | + |
| 14 | +#include "../assembly.h" |
| 15 | +#include "crt_endian.h" |
| 16 | + |
| 17 | + .syntax unified |
| 18 | + .text |
| 19 | + .p2align 2 |
| 20 | + |
| 21 | +#if __ARM_PCS_VFP |
| | +// In this configuration __extendsfdf2 takes its float argument in s0 and |
| | +// returns the double in d0, while __aeabi_f2d (below) takes its input in r0 |
| | +// and returns in xh:xl. This wrapper moves the values between the two. |
| 22 | +DEFINE_COMPILERRT_FUNCTION(__extendsfdf2) |
| | + // lr must survive the BL. r4 is not used anywhere in this function, so it |
| | + // is presumably pushed only to keep the stack 8-byte aligned at the call. |
| 23 | + push {r4, lr} |
| 24 | + vmov r0, s0 |
| 25 | + bl __aeabi_f2d |
| | + // VMOV_TO_DOUBLE is defined outside this file; judging by its name and |
| | + // operands it transfers the r0/r1 result pair into d0. |
| 26 | + VMOV_TO_DOUBLE(d0, r0, r1) |
| 27 | + pop {r4, pc} |
| | +// Close the function, matching the DEFINE/END pairing used for __aeabi_f2d. |
| | +END_COMPILERRT_FUNCTION(__extendsfdf2) |
| 28 | +#else |
| | +// No register shuffling is needed here, so __extendsfdf2 is simply defined |
| | +// as an alias of __aeabi_f2d. |
| 29 | +DEFINE_COMPILERRT_FUNCTION_ALIAS(__extendsfdf2, __aeabi_f2d) |
| 30 | +#endif |
| 31 | + |
| 32 | +DEFINE_COMPILERRT_FUNCTION(__aeabi_f2d) |
| 33 | + |
| 34 | + // In: r0 = single-precision input. Out: xh:xl = double-precision result. |
| 35 | + // Scratch: r2, r3, r12 and the flags. The fast path for normalized inputs |
| 36 | + // comes first, as straight-line code ending in a conditional return; any |
| 37 | + // other input falls through to code that works out which special case it is. |
| 38 | + |
| 39 | + // Extend the exponent field by 3 bits, by shifting the sign bit off the top |
| 40 | + // of r0 into the carry flag, shifting the rest of the input word right by 3, |
| 41 | + // then using RRX to put the sign back. So we end up with a word shaped like |
| 42 | + // the top half of a double, but the exponent field is still biased by the |
| 43 | + // single-precision offset of 0x7f instead of the double-precision 0x3ff. |
| 44 | + lsls r3, r0, #1 |
| 45 | + lsr r12, r3, #3 |
| 46 | + rrx r12, r12 |
| 47 | + |
| 48 | + // For a normalized number, the remaining steps are to rebias the exponent, |
| 49 | + // recover the remaining 3 mantissa bits from r0 which aren't included in the |
| 50 | + // word we've just made, and move both into the right output registers. |
| 51 | + // |
| 52 | + // But we must also check for the difficult cases. These occur when the input |
| 53 | + // exponent is either 0 or 0xFF: the only two values whose 8 bits are all |
| 54 | + // equal, so that input XOR (input << 1) is zero in bits 30..24. |
| 55 | + |
| 56 | + // Do the test for uncommon values. Instead of using a shifter operand in the |
| 57 | + // obvious way (EOR output, r0, r0, lsl #1), we use the fact that the setup |
| 58 | + // code above already has a shifted-left copy of the input word in r3. In |
| 59 | + // Thumb, this makes the EORS a 16-bit instruction instead of 32-bit. |
| 60 | + eors r3, r3, r0 |
| 61 | + |
| 62 | + // Now prepare the output, for normal inputs. |
| 63 | + // |
| 64 | + // We make this pair of instructions conditional on NE, i.e. we skip it if r3 |
| 65 | + // and r0 were actually equal (which could only happen if r0 was 0, i.e. the |
| 66 | + // input was +0). This is fine, because in that situation the input wasn't |
| 67 | + // normalized, so we aren't going to return this output anyway. |
| 68 | + // |
| 69 | + // The _point_ of conditionalizing these two instructions is that this way we |
| 70 | + // have only one IT instruction on the fast path, and it's _here_, where this |
| 71 | + // comment is, so that it comes immediately after the above 16-bit EORS and |
| 72 | + // can be executed in the same cycle by Cortex-M3. |
| 73 | + lslne xl, r0, #29 // xl now has the bottom 3 |
| 74 | + // input mantissa bits |
| 75 | + addne xh, r12, #((0x3ff - 0x7f) << 20) // rebias exponent in xh |
| 76 | + |
| 77 | + // Finally, check whether the test word in r3 has its top 7 exponent bits |
| 78 | + // zero. If not, we can return the fast-path answer. |
| 79 | + tstne r3, #0x7f000000 |
| 80 | + bxne lr |
| 81 | + |
| 82 | + // Now we've handled the fast-path cases as fast as we know how, what do we |
| 83 | + // do next? We almost certainly don't have the input value in r0 any more, |
| 84 | + // because we overwrote it by writing an unused output to xh:xl in the above |
| 85 | + // code. Worse, we didn't _reliably_ overwrite it, because those writes to |
| 86 | + // xh:xl might not have happened if the whole test word in r3 was zero. So |
| 87 | + // where can we find the input bits? |
| 88 | + // |
| 89 | + // We have r3 = input XOR (input << 1). That's actually an invertible |
| 90 | + // transformation, so in principle we could recover the full original input |
| 91 | + // float from just r3. The quickest way to do that involves these five |
| 92 | + // instructions (in any order, since they commute): |
| 93 | + // |
| 94 | + // EOR r3, r3, r3, lsl #16 |
| 95 | + // EOR r3, r3, r3, lsl #8 |
| 96 | + // EOR r3, r3, r3, lsl #4 |
| 97 | + // EOR r3, r3, r3, lsl #2 |
| 98 | + // EOR r3, r3, r3, lsl #1 |
| 99 | + // |
| 100 | + // But that's rather slow, and we can do better. r12 contains most of the |
| 101 | + // input bits in a more usable form: we inserted three zero bits between the |
| 102 | + // sign and the top of the exponent, but everything from the input is there |
| 103 | + // _somewhere_, except for the low 3 bits. |
| 104 | + // |
| 105 | + // However, on one code path below we'll use a subset of those EOR |
| 106 | + // instructions to recover the low 3 bits of the input. |
| 107 | + |
| 108 | + // First, find out whether the input exponent was 0 (zero or denormal), or |
| 109 | + // 0xFF (infinity or NaN). We know it was one of the two, or we would have |
| 110 | + // taken the early return from the fast path. So it's enough to test any |
| 111 | + // single bit of the exponent in r12. |
| 112 | + tst r12, #(1 << 27) // bit 27 is topmost bit of the 8-bit exponent |
| 113 | + bne LOCAL_LABEL(inf_or_nan) |
| 114 | + |
| 115 | + // If we didn't take that branch, we have a denormal or zero. Zeroes are |
| 116 | + // likely to be common, so we'd prefer to handle those with highest priority. |
| 117 | + // |
| 118 | + // r3 = (input XOR (input << 1)) will take the values 0 or 0x80000000 for a |
| 119 | + // zero input. So it contains precisely the right value to return in xh. |
| 120 | + // |
| 121 | + // The BICS here combines the zeroing of xl with the test of r3, because it |
| 122 | + // sets Z if and only if the input was one of those two values, and if so, |
| 123 | + // sets xl=0. |
| 124 | + // |
| 125 | + // Unfortunately this has the side effect of clobbering xl in the case where |
| 126 | + // we _don't_ take the early return, so now we've lost our verbatim copy of |
| 127 | + // the low 3 input bits! On the denormal-handling path we'll have to recover |
| 128 | + // those from r3 more awkwardly. But denormal handling is rare, and slow |
| 129 | + // anyway, so it's worth the awkwardness to save a cycle in the much more |
| 130 | + // common case of a zero input. |
| 131 | + bics xl, r3, #0x80000000 // EQ if output is zero |
| 132 | + moveq xh, r3 // if so, copy input sign into xh |
| 133 | + bxeq lr // and return |
| 134 | + |
| 135 | + // Now we know we're dealing with a denormal, so we need to recover the whole |
| 136 | + // input mantissa. Most of it is in r12, but those last three bits now need |
| 137 | + // to be reconstructed from r3 by using part of the shift+EOR trick shown |
| 138 | + // above. We only need the left shifts by 1 and by 2, because the other three |
| 139 | + // don't affect the bottom 3 bits at all. |
| 140 | + eor r3, r3, r3, lsl #2 |
| 141 | + eor r3, r3, r3, lsl #1 |
| 142 | + and r3, r3, #7 |
| 143 | + |
| 144 | + // Now r3 contains just the low bits of the mantissa. The rest of the |
| 145 | + // mantissa is in r12, shifted right by 3 bits, so this instruction rebuilds |
| 146 | + // the entire input mantissa in xh. (The exponent field is known to be zero, |
| 147 | + // and the sign bit at the top of r12 is discarded by the left shift.) |
| 148 | + orr xh, r3, r12, lsl #3 |
| 149 | + |
| 150 | + // Renormalize that input mantissa so that its high bit is at the top of the |
| 151 | + // word. |
| 152 | + clz r2, xh |
| 153 | + lsl xh, xh, r2 |
| 154 | + |
| 155 | + // Compute the right sign + exponent to go with that mantissa. |
| 156 | + // |
| 157 | + // If the input mantissa had had only its low bit set, then the input float |
| 158 | + // would be 2^-149, which has a double-precision exponent of 0x36a. In that |
| 159 | + // situation we'd have r2 = 31 (output from the CLZ). So we need the output |
| 160 | + // exponent to be (0x389 - r2). But the leading bit of the mantissa will |
| 161 | + // increment the exponent field when we add them together, so in fact we want |
| 162 | + // to calculate (0x388 - r2). That's particularly convenient, because 0x388 |
| 163 | + // fits in an AArch32 immediate field! |
| 164 | + and r3, r12, #0x80000000 // get the sign bit from the top of r12 |
| 165 | + add r3, r3, #(0x388 << 20) // add the exponent bias as calculated above |
| 166 | + sub r3, r3, r2, lsl #20 // subtract the CLZ output |
| 167 | + |
| 168 | + // Finally, distribute the normalized mantissa across the two output words, |
| 169 | + // and combine the top half with the exponent we just computed. |
| 170 | + lsls xl, xh, #21 // low word = bottom 11 bits of the normalized mantissa |
| 171 | + add xh, r3, xh, lsr #11 // high word = sign + exp + rest of mantissa |
| 172 | + bx lr |
| 173 | + |
| 174 | +LOCAL_LABEL(inf_or_nan): |
| 175 | + // We come here if the input was either infinity or a NaN. In this situation |
| 176 | + // we can be sure that the instructions that set up the fast-path return |
| 177 | + // value _did_ happen, because the input was nonzero. Also we branched away |
| 178 | + // before the test for a zero input clobbered xl. |
| 179 | + // |
| 180 | + // So xh:xl will contain what _would_ be the right output value if 0xFF were |
| 181 | + // not a special input: the exponent field will be 0x47f, and the sign and |
| 182 | + // mantissa will be in place. |
| 183 | + // |
| 184 | + // This is almost exactly what we really want to return, except for two |
| 185 | + // things: the exponent should be corrected to 0x7ff for an output infinity |
| 186 | + // or NaN, and if the mantissa is nonzero at all (so that we're returning a |
| 187 | + // NaN and not an infinity) then we should set its top bit to make it a quiet |
| 188 | + // NaN. |
| 189 | + orrs xh, xh, #0x7f000000 // set the missing bits in the exponent field |
| 190 | + orrs r2, xl, xh, lsl #12 // is any bit of the mantissa set? |
| 191 | + orrne xh, xh, #0x00080000 // if so, set the top mantissa bit |
| 192 | + bx lr |
| 193 | + |
| 194 | +END_COMPILERRT_FUNCTION(__aeabi_f2d) |
| 195 | + |
| 196 | +NO_EXEC_STACK_DIRECTIVE |
0 commit comments