Skip to content

Commit b3df295

Browse files
authored
aarch64: Fix ABI layout calculation for apple_aarch64 (#12982)
This commit fixes an issue in the ABI layout calculated for the `apple_aarch64` calling convention when stack-returned values carry an argument extension (e.g. `uext` or `sext`). In this situation the machinst code would load/store a full pointer-width value, but the aarch64 ABI calculation would only allocate the type's own width for the slot. The fix in this PR is to skip the "slot sized to the type" logic for extended return values and instead use the minimum slot size of 8 bytes for them.
1 parent 9f93e49 commit b3df295

3 files changed

Lines changed: 103 additions & 7 deletions

File tree

cranelift/codegen/src/isa/aarch64/abi.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -347,15 +347,21 @@ impl ABIMachineSpec for AArch64MachineDeps {
347347
// Compute the stack slot's size.
348348
let size = (ty_bits(param.value_type) / 8) as u32;
349349

350-
let size = if is_apple_cc || is_winch_return {
351-
// MacOS and Winch aarch64 allows stack slots with
352-
// sizes less than 8 bytes. They still need to be
353-
// properly aligned on their natural data alignment,
354-
// though.
350+
// MacOS and Winch aarch64 allows stack slots with sizes less than 8
351+
// bytes. They still need to be properly aligned on their natural
352+
// data alignment, though, and this additionally is only applicable
353+
// for arguments or when there's no argument extension in play.
354+
// Stack slots for return values with argument extension get their
355+
// full machine-word-width loaded or stored.
356+
//
357+
// Otherwise every arg takes a minimum slot of 8 bytes. (16-byte
358+
// stack alignment happens separately after all args.)
359+
let size = if (is_apple_cc || is_winch_return)
360+
&& (args_or_rets == ArgsOrRets::Args
361+
|| param.extension == ir::ArgumentExtension::None)
362+
{
355363
size
356364
} else {
357-
// Every arg takes a minimum slot of 8 bytes. (16-byte stack
358-
// alignment happens separately after all args.)
359365
core::cmp::max(size, 8)
360366
};
361367

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
test compile precise-output
2+
set opt_level=speed_and_size
3+
set enable_nan_canonicalization=true
4+
set enable_multi_ret_implicit_sret=true
5+
target aarch64
6+
7+
function %a(i8, i8 uext, i8 uext, i8, i128 uext, i128 uext, i128 uext, f32, i16x8, i16x8, i16x8, f64, i8 uext, f32, f32) -> i128, i128, i128, i128, i128, i128, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 uext apple_aarch64 {
8+
block0(v0: i8, v1: i8, v2: i8, v3: i8, v4: i128, v5: i128, v6: i128, v7: f32, v8: i16x8, v9: i16x8, v10: i16x8, v11: f64, v12: i8, v13: f32, v14: f32):
9+
v16 = iconst.i8 0
10+
v17 = iconst.i16 0
11+
v18 = iconst.i32 0
12+
v19 = iconst.i64 0
13+
v20 = uextend.i128 v19
14+
return v4, v4, v4, v4, v4, v4, v0, v0, v0, v0, v0, v0, v0, v0, v0, v0
15+
}
16+
17+
; VCode:
18+
; stp fp, lr, [sp, #-16]!
19+
; mov fp, sp
20+
; block0:
21+
; str x4, [x8]
22+
; str x5, [x8, #8]
23+
; str x4, [x8, #16]
24+
; str x5, [x8, #24]
25+
; strb w0, [x8, #32]
26+
; strb w0, [x8, #33]
27+
; strb w0, [x8, #34]
28+
; strb w0, [x8, #35]
29+
; strb w0, [x8, #36]
30+
; strb w0, [x8, #37]
31+
; strb w0, [x8, #38]
32+
; strb w0, [x8, #39]
33+
; strb w0, [x8, #40]
34+
; uxtb w1, w0
35+
; str x0, [x8, #48]
36+
; mov x6, x4
37+
; mov x7, x5
38+
; mov x0, x6
39+
; mov x1, x7
40+
; mov x2, x6
41+
; mov x3, x7
42+
; ldp fp, lr, [sp], #16
43+
; ret
44+
;
45+
; Disassembled:
46+
; block0: ; offset 0x0
47+
; stp x29, x30, [sp, #-0x10]!
48+
; mov x29, sp
49+
; block1: ; offset 0x8
50+
; stur x4, [x8]
51+
; stur x5, [x8, #8]
52+
; stur x4, [x8, #0x10]
53+
; stur x5, [x8, #0x18]
54+
; sturb w0, [x8, #0x20]
55+
; sturb w0, [x8, #0x21]
56+
; sturb w0, [x8, #0x22]
57+
; sturb w0, [x8, #0x23]
58+
; sturb w0, [x8, #0x24]
59+
; sturb w0, [x8, #0x25]
60+
; sturb w0, [x8, #0x26]
61+
; sturb w0, [x8, #0x27]
62+
; sturb w0, [x8, #0x28]
63+
; uxtb w1, w0
64+
; stur x0, [x8, #0x30]
65+
; mov x6, x4
66+
; mov x7, x5
67+
; mov x0, x6
68+
; mov x1, x7
69+
; mov x2, x6
70+
; mov x3, x7
71+
; ldp x29, x30, [sp], #0x10
72+
; ret
73+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
test run
2+
set opt_level=speed_and_size
3+
set enable_nan_canonicalization=true
4+
set enable_multi_ret_implicit_sret=true
5+
target aarch64
6+
7+
function %a(i8, i8 uext, i8 uext, i8, i128 uext, i128 uext, i128 uext, f32, i16x8, i16x8, i16x8, f64, i8 uext, f32, f32) -> i128, i128, i128, i128, i128, i128, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 uext apple_aarch64 {
8+
block0(v0: i8, v1: i8, v2: i8, v3: i8, v4: i128, v5: i128, v6: i128, v7: f32, v8: i16x8, v9: i16x8, v10: i16x8, v11: f64, v12: i8, v13: f32, v14: f32):
9+
v16 = iconst.i8 0
10+
v17 = iconst.i16 0
11+
v18 = iconst.i32 0
12+
v19 = iconst.i64 0
13+
v20 = uextend.i128 v19 ; v19 = 0
14+
return v4, v4, v4, v4, v4, v4, v0, v0, v0, v0, v0, v0, v0, v0, v0, v0
15+
}
16+
17+
; run: %a(0, 0, 0, 0, 0, 0, 0, 0.0, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0.0, 0, 0.0, 0.0) == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

0 commit comments

Comments
 (0)