Skip to content

Commit ed11d7a

Browse files
authored
[compiler-rt][ARM] Optimized FP double <-> single conversion (#179926)
This commit provides assembly versions of the conversions both ways between double and float.
1 parent bdaf3cf commit ed11d7a

5 files changed

Lines changed: 886 additions & 0 deletions

File tree

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm")
477477
arm/gesf2.S
478478
arm/unorddf2.S
479479
arm/unordsf2.S
480+
arm/extendsfdf2.S
481+
arm/truncdfsf2.S
480482
)
481483
set_source_files_properties(${assembly_files}
482484
PROPERTIES COMPILE_OPTIONS ${implicit_it_flag})
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
//===-- extendsfdf2.S - single- to double-precision FP conversion ---------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements the __extendsfdf2 function (single to double precision
10+
// floating point conversion) for the Arm and Thumb2 ISAs.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "../assembly.h"
15+
#include "crt_endian.h"
16+
17+
.syntax unified
18+
.text
19+
.p2align 2
20+
21+
#if __ARM_PCS_VFP
22+
// Hard-float entry point. The argument is read from s0 and the result is
// written to d0, so this wrapper shuttles the value through core registers
// around a call to __aeabi_f2d (defined below), which takes its input in r0.
// NOTE(review): VMOV_TO_DOUBLE is not defined in this file; presumably it
// comes from crt_endian.h and orders r0/r1 correctly for the target
// endianness -- confirm.
DEFINE_COMPILERRT_FUNCTION(__extendsfdf2)
23+
  // lr must survive the BL below. r4 is not otherwise used by this wrapper;
  // presumably it is pushed only to keep the stack 8-byte aligned -- confirm.
  push {r4, lr}
24+
  vmov r0, s0
25+
  bl __aeabi_f2d
26+
  VMOV_TO_DOUBLE(d0, r0, r1)
27+
  pop {r4, pc}
// Close the function, matching the END_COMPILERRT_FUNCTION(__aeabi_f2d) at
// the end of this file.
END_COMPILERRT_FUNCTION(__extendsfdf2)
28+
#else
29+
// Without the VFP calling convention, __extendsfdf2 is simply another name
// for __aeabi_f2d.
DEFINE_COMPILERRT_FUNCTION_ALIAS(__extendsfdf2, __aeabi_f2d)
30+
#endif
31+
32+
DEFINE_COMPILERRT_FUNCTION(__aeabi_f2d)
// Converts the single-precision bit pattern in r0 to a double-precision bit
// pattern returned in the register pair xh:xl (high word : low word).
// NOTE(review): xh and xl are not defined in this file; presumably
// crt_endian.h maps them onto r0/r1 according to endianness -- confirm.
// Uses r2, r3, r12 and the condition flags as scratch, never touches the
// stack, and returns with BX LR on every path.
33+
34+
// Start with the fast path, dealing with normalized single-precision inputs.
35+
// We handle these as quickly as possible in straight-line code, and branch
36+
// out of line to a single 'handle everything else' label which will have to
37+
// figure out what kind of unusual thing has happened.
38+
39+
// Extend the exponent field by 3 bits, by shifting the sign bit off the top
40+
// of r0 into the carry flag, shifting the rest of the input word right by 3,
41+
// then using RRX to put the sign back. So we end up with a word shaped like
42+
// the top half of a double, but the exponent field is still biased by the
43+
// single-precision offset of 0x7f instead of the double-precision 0x3ff.
44+
lsls r3, r0, #1
45+
lsr r12, r3, #3
46+
rrx r12, r12
47+
48+
// For a normalized number, the remaining steps are to rebias the exponent,
49+
// recover the remaining 3 mantissa bits from r0 which aren't included in the
50+
// word we've just made, and move both into the right output registers.
51+
//
52+
// But we must also check for the difficult cases. These occur when the input
53+
// exponent is either 0 or 0xFF. Those two values can be identified by the
54+
// property that exp XOR (exp << 1) has the top 7 bits all zero.
55+
56+
// Do the test for uncommon values. Instead of using a shifter operand in the
57+
// obvious way (EOR output, r0, r0, lsl #1), we use the fact that the setup
58+
// code above already has a shifted-left copy of the input word in r3. In
59+
// Thumb, this makes the EORS a 16-bit instruction instead of 32-bit.
60+
eors r3, r3, r0
61+
62+
// Now prepare the output, for normal inputs.
63+
//
64+
// We make this pair of instructions conditional on NE, i.e. we skip it if r3
65+
// and r0 were actually equal (which could only happen if r0 was 0, i.e. the
66+
// input was +0). This is fine, because in that situation the input wasn't
67+
// normalized, so we aren't going to return this output anyway.
68+
//
69+
// The _point_ of conditionalizing these two instructions is that this way we
70+
// have only one IT instruction on the fast path, and it's _here_, where this
71+
// comment is, so that it comes immediately after the above 16-bit EORS and
72+
// can be executed in the same cycle by Cortex-M3.
//
// NOTE(review): xl is written first because it still needs to read r0; if
// xh is the register that aliases r0, writing xh first would destroy the
// input before the low bits were extracted. Keep this order -- confirm
// against the xh/xl definitions.
73+
lslne xl, r0, #29 // xl now has the bottom 3
74+
// input mantissa bits
75+
addne xh, r12, #((0x3ff - 0x7f) << 20) // rebias exponent in xh
76+
77+
// Finally, check whether the test word in r3 has its top 7 exponent bits
78+
// zero. If not, we can return the fast-path answer.
79+
tstne r3, #0x7f000000
80+
bxne lr
81+
82+
// Now we've handled the fast-path cases as fast as we know how, what do we
83+
// do next? We almost certainly don't have the input value in r0 any more,
84+
// because we overwrote it by writing an unused output to xh:xl in the above
85+
// code. Worse, we didn't _reliably_ overwrite it, because those writes to
86+
// xh:xl might not have happened if the whole test word in r3 was zero. So
87+
// where can we find the input bits?
88+
//
89+
// We have r3 = input XOR (input << 1). That's actually an invertible
90+
// transformation, so in principle we could recover the full original input
91+
// float from just r3. The quickest way to do that involves these five
92+
// instructions (in any order, since they commute):
93+
//
94+
// EOR r3, r3, r3, lsl #16
95+
// EOR r3, r3, r3, lsl #8
96+
// EOR r3, r3, r3, lsl #4
97+
// EOR r3, r3, r3, lsl #2
98+
// EOR r3, r3, r3, lsl #1
99+
//
100+
// But that's rather slow, and we can do better. r12 contains most of the
101+
// input bits in a more usable form: we inserted three zero bits between the
102+
// sign and the top of the exponent, but everything from the input is there
103+
// _somewhere_, except for the low 3 bits.
104+
//
105+
// However, on one code path below we'll use a subset of those EOR
106+
// instructions to recover the low 3 bits of the input.
107+
108+
// First, find out whether the input exponent was 0 (zero or denormal), or
109+
// 0xFF (infinity or NaN). We know it was one of the two, or we would have
110+
// taken the early return from the fast path. So it's enough to test any
111+
// single bit of the exponent in r12.
112+
tst r12, #(1 << 27) // bit 27 is topmost bit of the 8-bit exponent
113+
bne LOCAL_LABEL(inf_or_nan)
114+
115+
// If we didn't take that branch, we have a denormal or zero. Zeroes are
116+
// likely to be common, so we'd prefer to handle those with highest priority.
117+
//
118+
// r3 = (input XOR (input << 1)) will take the values 0 or 0x80000000 for a
119+
// zero input. So it contains precisely the right value to return in xh.
120+
//
121+
// The BICS here combines the zeroing of xl with the test of r3, because it
122+
// sets Z if and only if the input was one of those two values, and if so,
123+
// sets xl=0.
124+
//
125+
// Unfortunately this has the side effect of clobbering xl in the case where
126+
// we _don't_ take the early return, so now we've lost our verbatim copy of
127+
// the low 3 input bits! On the denormal-handling path we'll have to recover
128+
// those from r3 more awkwardly. But denormal handling is rare, and slow
129+
// anyway, so it's worth the awkwardness to save a cycle in the much more
130+
// common case of a zero input.
131+
bics xl, r3, #0x80000000 // EQ if output is zero
132+
moveq xh, r3 // if so, copy input sign into xh
133+
bxeq lr // and return
134+
135+
// Now we know we're dealing with a denormal, so we need to recover the whole
136+
// input mantissa. Most of it is in r12, but those last three bits now need
137+
// to be reconstructed from r3 by using part of the shift+EOR trick shown
138+
// above. We only need the left shifts by 1 and by 2, because the other three
139+
// don't affect the bottom 3 bits at all.
140+
eor r3, r3, r3, lsl #2
141+
eor r3, r3, r3, lsl #1
142+
and r3, r3, #7
143+
144+
// Now r3 contains just the low bits of the mantissa. The rest of the
145+
// mantissa is in r12, shifted right by 3 bits, so this instruction rebuilds
146+
// the entire input mantissa in xh. (The exponent field is known to be zero,
147+
// and the sign bit at the top of r12 is discarded by the left shift.)
148+
orr xh, r3, r12, lsl #3
149+
150+
// Renormalize that input mantissa so that its high bit is at the top of the
151+
// word.
//
// xh is nonzero here, because both zero inputs took the early return above,
// so the CLZ result is at most 31.
152+
clz r2, xh
153+
lsl xh, xh, r2
154+
155+
// Compute the right sign + exponent to go with that mantissa.
156+
//
157+
// If the input mantissa had had only its low bit set, then the input float
158+
// would be 2^-149, which has a double-precision exponent of 0x36a. In that
159+
// situation we'd have r2 = 31 (output from the CLZ). So we need the output
160+
// exponent to be (0x389 - r2). But the leading bit of the mantissa will
161+
// increment the exponent field when we add them together, so in fact we want
162+
// to calculate (0x388 - r2). That's particularly convenient, because 0x388
163+
// fits in an AArch32 immediate field!
164+
and r3, r12, #0x80000000 // get the sign bit from the top of r12
165+
add r3, r3, #(0x388 << 20) // add the exponent bias as calculated above
166+
sub r3, r3, r2, lsl #20 // subtract the CLZ output
167+
168+
// Finally, distribute the normalized mantissa across the two output words,
169+
// and combine the top half with the exponent we just computed.
170+
lsls xl, xh, #21 // low word = bottom 11 bits of xh (those dropped by LSR #11 below)
171+
add xh, r3, xh, lsr #11 // high word = sign + exp + rest of mantissa
172+
bx lr
173+
174+
LOCAL_LABEL(inf_or_nan):
175+
// We come here if the input was either infinity or a NaN. In this situation
176+
// we can be sure that the instructions that set up the fast-path return
177+
// value _did_ happen, because the input was nonzero. Also we branched away
178+
// before the test for a zero input clobbered xl.
179+
//
180+
// So xh:xl will contain what _would_ be the right output value if 0xFF were
181+
// not a special input: the exponent field will be 0x47f, and the sign and
182+
// mantissa will be in place.
183+
//
184+
// This is almost exactly what we really want to return, except for two
185+
// things: the exponent should be corrected to 0x7ff for an output infinity
186+
// or NaN, and if the mantissa is nonzero at all (so that we're returning a
187+
// NaN and not an infinity) then we should set its top bit to make it a quiet
188+
// NaN.
189+
orrs xh, xh, #0x7f000000 // set the missing bits in the exponent field
190+
orrs r2, xl, xh, lsl #12 // is any bit of the mantissa set?
191+
orrne xh, xh, #0x00080000 // if so, set the top mantissa bit
192+
bx lr
193+
194+
END_COMPILERRT_FUNCTION(__aeabi_f2d)
195+
196+
NO_EXEC_STACK_DIRECTIVE

0 commit comments

Comments
 (0)