diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index 4e74f343c..cd0f464ac 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -2781,12 +2781,18 @@ class ror(AArch64Shift): outputs = ["Xd"] -class asr(AArch64Shift): +class asr_imm(AArch64Shift): pattern = "asr , , " inputs = ["Xa"] outputs = ["Xd"] +class asr(AArch64Shift): + pattern = "asr , , " + inputs = ["Xa", "Xb"] + outputs = ["Xd"] + + class AArch64Logical(AArch64Instruction): pass diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py index 8d46b0e5a..f3983cc2b 100644 --- a/slothy/targets/aarch64/cortex_a55.py +++ b/slothy/targets/aarch64/cortex_a55.py @@ -66,6 +66,7 @@ vmul, Instruction, csel, + csel_xzr_ne, fcsel, Q_Ld2_Lane_Post_Inc, q_ld2_lane_s, @@ -162,6 +163,8 @@ ngc_zero, subs_wform, asr_wform, + asr_imm, + asr, and_imm_wform, eor_wform, eon_wform, @@ -181,6 +184,7 @@ fmov_s_form, # from double/single to gen reg fmov_d_form, # from double/single to gen reg (64-bit) cmp, + cmp_xzr, vdup_w, crc32b, crc32h, @@ -439,12 +443,15 @@ def get_min_max_objective(slothy): sub, sub_imm, cmp, + cmp_xzr, sbcs_zero_to_zero, cmp_xzr2, mov, ngc_zero, subs_wform, asr_wform, + asr_imm, + asr, and_imm_wform, lsr_wform, lsr, @@ -457,7 +464,7 @@ def get_min_max_objective(slothy): # NOTE: AESE/AESMC and AESD/AESIMC pairs can be dual-issued on A55 but this # is not modeled AESInstruction: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], - csel: ExecutionUnit.SCALAR(), + (csel, csel_xzr_ne): ExecutionUnit.SCALAR(), ( crc32b, crc32h, @@ -486,7 +493,7 @@ def get_min_max_objective(slothy): umov_d, vuaddlv_sform, ): 1, - (sub_imm, cmp): 1, + (sub_imm, cmp, cmp_xzr): 1, ( vmla, vmla_lane, @@ -514,7 +521,7 @@ def get_min_max_objective(slothy): vshrn: 2, vtbl: 1, # N cycles (N = number of registers in the table) (fcsel): 1, - csel: 1, + (csel, csel_xzr_ne): 1, (VecToGprMov, Mov_xtov_d, mov_wtov_s): 1, ( movk_imm, @@ -545,7 +552,17 @@ def get_min_max_objective(slothy): adcs_zero_r_to_zero, cmn, ): 1, - (cmp_xzr2, cmp_imm, sub, subs_wform, asr_wform, sbcs_zero_to_zero, ngc_zero): 1, + ( + cmp_xzr2, + cmp_imm, + sub, + subs_wform, + asr_wform, + asr_imm, + sbcs_zero_to_zero, + ngc_zero, + ): 1, + asr: 2, (bfi, ubfx): 1, VShiftImmediateRounding: 1, AArch64NeonShiftInsert: 1, @@ -592,7 +609,7 @@ def get_min_max_objective(slothy): ): 4, (Ldr_D): 3, (Ldr_Q, Str_Q): 4, - (sub_imm, cmp): 2, + (sub_imm, cmp, cmp_xzr): 2, AArch64NeonCount: 2, St4: 5, St3: 3, @@ -612,7 +629,7 @@ def get_min_max_objective(slothy): (Vins, umov_d): 2, (tst_wform): 1, (fcsel): 2, - csel: 1, + (csel, csel_xzr_ne): 1, (VecToGprMov, Mov_xtov_d, mov_wtov_s): 2, ( movk_imm, @@ -643,12 +660,13 @@ def get_min_max_objective(slothy): cmn, sub, subs_wform, - asr_wform, + asr, sbcs_zero_to_zero, cmp_xzr2, ngc_zero, cmp_imm, ): 1, + (asr_wform, asr_imm): 2, (bfi, ubfx): 2, VShiftImmediateRounding: 3, VShiftImmediateBasic: 2, diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py index f77bf2334..66495a79d 100644 --- a/slothy/targets/aarch64/cortex_a72_frontend.py +++ b/slothy/targets/aarch64/cortex_a72_frontend.py @@ -123,10 +123,14 @@ q_ld2_lane_s, Ldp_W, cmp, + cmp_xzr, cmp_imm, csel, + csel_xzr_ne, q_ldp_with_inc, AArch64CRC32, + asr, + asr_imm, ) # From the A72 SWOG, Section "4.1 Dispatch Constraints" @@ -237,7 +241,7 @@ def get_min_max_objective(slothy): ], (AArch64NeonShiftInsert, vusra): [ExecutionUnit.ASIMD1], fcsel: ExecutionUnit.ASIMD(), - csel: ExecutionUnit.INT(), + (csel, csel_xzr_ne): ExecutionUnit.INT(), AArch64ConditionalCompare: ExecutionUnit.INT(), AArch64Logical: [ExecutionUnit.INT()], # 8B/8H occupies both F0, F1 @@ -262,11 +266,13 @@ def get_min_max_objective(slothy): lsr_imm: ExecutionUnit.INT(), lsr: ExecutionUnit.INT(), movk_imm_lsl: ExecutionUnit.INT(), - (sub_imm, cmp, cmp_imm): ExecutionUnit.INT(), + (sub_imm, cmp_imm): ExecutionUnit.INT(), + (cmp, cmp_xzr): ExecutionUnit.MINT(), Ldp_W: ExecutionUnit.LOAD(), q_ldp_with_inc: ExecutionUnit.LOAD(), Stp_W: ExecutionUnit.STORE(), AArch64CRC32: ExecutionUnit.MINT(), + (asr, asr_imm): ExecutionUnit.INT(), } inverse_throughput = { @@ -291,7 +297,7 @@ def get_min_max_objective(slothy): AArch64NeonLogical: 1, (AArch64NeonShiftInsert, vusra): 1, fcsel: 1, - csel: 1, + (csel, csel_xzr_ne): 1, AArch64ConditionalCompare: 1, AArch64Logical: 1, Vins: 1, @@ -312,7 +318,7 @@ def get_min_max_objective(slothy): q_ld2_lane_s: 1, vtbl: 1, # SWOG contains a blank throughput (approximating from AArch32) AESInstruction: 1, - (sub_imm, cmp, cmp_imm): 1, + (sub_imm, cmp, cmp_xzr, cmp_imm): 1, vuaddlv_sform: 1, fmov_s_form: 1, # from vec to gen reg fmov_d_form: 1, # from vec to gen reg (64-bit) @@ -327,6 +333,8 @@ def get_min_max_objective(slothy): Ldp_W: 1, Stp_W: 1, AArch64CRC32: 1, + asr: 1, + asr_imm: 1, } # REVISIT @@ -358,7 +366,7 @@ def get_min_max_objective(slothy): AArch64NeonShiftInsert: 3, vusra: 4, fcsel: 3, - csel: 1, + (csel, csel_xzr_ne): 1, AArch64ConditionalCompare: 1, AArch64Logical: 1, (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4, # approx @@ -381,7 +389,8 @@ def get_min_max_objective(slothy): q_ld2_lane_s: 8, vtbl: 6, # q-form: 3*N+3 cycles (N = number of registers in the table) AESInstruction: 3, - (sub_imm, cmp, cmp_imm): 1, + (sub_imm, cmp_imm): 1, + (cmp, cmp_xzr): 2, vuaddlv_sform: 6, # 8B/8H fmov_s_form: 5, # from vec to gen reg fmov_d_form: 5, # from vec to gen reg (64-bit) @@ -396,6 +405,8 @@ def get_min_max_objective(slothy): Ldp_W: 4, Stp_W: 1, AArch64CRC32: 2, + asr: 1, + asr_imm: 1, } diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s index 693de5b07..bdbd1db70 100644 --- a/tests/naive/aarch64/instructions.s +++ b/tests/naive/aarch64/instructions.s @@ -216,4 +216,13 @@ crc32ch w6, w6, w7 crc32cw w6, w6, w7 crc32cx w6, w6, x8 fmov x5, d7 + +asr x11, x12, x7 +asr x11, x12, #7 +csel x11, x10, xzr, eq +csel x11, x10, xzr, ne +csel x11, x10, xzr, lt +csel x11, x10, xzr, gt +cmp x3, xzr + end: