diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index a30980dc..be778ed8 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -542,6 +542,8 @@ def start( # Identify the register that is used as a loop counter body_code = [line for line in body_code if line.text != ""] loop_cnt_reg = None + loop_end_reg = None + subs_imm = None for idx, line in enumerate(body_code): inst = Instruction.parser(line) # Flags are set through cmp @@ -557,17 +559,42 @@ def start( "as end register." ) break + if loop_cnt_reg is None: + # Try to find a subs instruction in the body (countdown loop) + for idx, line in enumerate(body_code): + inst = Instruction.parser(line) + if ( + isinstance(inst[0], AArch64BasicArithmetic) + and inst[0].mnemonic.startswith("subs") + and inst[0].immediate is not None + and len(inst[0].args_out) > 0 + ): + loop_cnt_reg = inst[0].args_out[0] + subs_imm = inst[0].immediate + body_code[idx].add_tag("id", "cmp") + logging.debug( + f"Assuming {loop_cnt_reg} as countdown counter register " + f"(subs immediate={subs_imm})." + ) + break if loop_cnt_reg is None: raise FatalParsingException("No flag-setting instruction found!") if unroll > 1: assert unroll in [1, 2, 4, 8, 16, 32] - yield f"{indent}lsr {loop_end_reg}, {loop_end_reg}, #{int(math.log2(unroll))}" + if loop_end_reg is not None: + yield ( + f"{indent}lsr {loop_end_reg}, {loop_end_reg}," + f"#{int(math.log2(unroll))}" + ) inc_per_iter = 0 for idx, line in enumerate(body_code): inst = Instruction.parser(line) modifies_counter = False + # Skip the subs instruction itself - it is already tagged as the flag-setter + if subs_imm is not None and body_code[idx].tags.get("id") == "cmp": + continue # Increment happens through pointer modification if loop_cnt_reg.lower() == inst[0].addr and inst[0].increment is not None: inc_per_iter = inc_per_iter + simplify(inst[0].increment) @@ -587,7 +614,12 @@ def start( ) if fixup != 0: - yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}" + if loop_end_reg is not None: + yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}" + else: + # subs countdown loop: subtract fixup * subs_imm from the counter + fixup_val = simplify(f"{fixup} * ({subs_imm})") + yield f"{indent}sub {loop_cnt_reg}, {loop_cnt_reg}, #{fixup_val}" if jump_if_empty is not None: yield f"cbz {loop_cnt}, {jump_if_empty}" @@ -748,7 +780,15 @@ def is_q_form_vector_instruction(self): return self._is_instance_of([Str_Q, Ldr_Q]) # Operations on specific lanes are not counted as Q-form instructions - if self._is_instance_of([Q_Ld2_Lane_Post_Inc, st2_lane, st2_lane_post_inc]): + if self._is_instance_of( + [ + Q_Ld2_Lane_Post_Inc, + st2_lane, + st2_lane_post_inc, + q_ld1_lane_with_reg_postinc, + q_st1_lane_with_reg_postinc, + ] + ): return False dt = self.datatype @@ -763,12 +803,39 @@ def is_q_form_vector_instruction(self): def is_vector_load(self): """Indicates if an instruction is a Neon load instruction""" - return self._is_instance_of([Ldr_Q, Ldp_Q, Ld2, Ld3, Ld4, Q_Ld2_Lane_Post_Inc]) + return self._is_instance_of( + [ + Ldr_Q, + Ldp_Q, + Ld2, + Ld3, + Ld4, + Q_Ld2_Lane_Post_Inc, + q_ld1_lane_with_reg_postinc, + q_ld1_1_with_reg_postinc, + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_4, + q_ld1_4_with_postinc, + ] + ) def is_vector_store(self): """Indicates if an instruction is a Neon store instruction""" return self._is_instance_of( - [Str_Q, Stp_Q, St2, St3, St4, d_stp_stack_with_inc, d_str_stack_with_inc] + [ + Str_Q, + Stp_Q, + St2, + St3, + St4, + d_stp_stack_with_inc, + d_str_stack_with_inc, + q_st1_lane_with_reg_postinc, + q_st1_1_with_reg_postinc, + q_st1_4_with_postinc, + ] ) # scalar @@ -932,8 +999,6 @@ def _enforce_datatype_matching(pattern, res): raise FatalParsingException( f"Inconsistent data type: {datatypes[dt]} vs {val}" ) - elif dt not in datatypes and val in datatypes.values(): - raise FatalParsingException(f"Inconsistent dt: {dt}") datatypes[dt] = val @staticmethod @@ -1474,6 +1539,43 @@ def make(cls, src): return obj +class q_ld1_2(AArch64Instruction): + pattern = "ld1 {.
, .
}, []" + inputs = ["Xc"] + outputs = ["Va", "Vb"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in[0] + obj.args_out_combinations = [ + ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)]) + ] + return obj + + +class q_ld1_4(AArch64Instruction): + pattern = "ld1 {.
, .
, .
, .
}, []" + inputs = ["Xe"] + outputs = ["Va", "Vb", "Vc", "Vd"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in[0] + obj.args_out_combinations = [ + ( + [0, 1, 2, 3], + [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)], + ) + ] + return obj + + class prefetch(Ldr_Q): pattern = "prfm pld1lkeep, [, ]" inputs = ["Xc"] @@ -1767,6 +1869,91 @@ def make(cls, src): return obj +class q_ld1_2_with_postinc(AArch64Instruction): + pattern = "ld1 {.
, .
}, [], " + in_outs = ["Xc"] + outputs = ["Va", "Vb"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = obj.immediate + obj.pre_index = None + obj.addr = obj.args_in_out[0] + + obj.args_out_combinations = [ + ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)]) + ] + return obj + + +class q_ld1_4_with_postinc(AArch64Instruction): + pattern = "ld1 {.
, .
, .
, .
}, [], " + in_outs = ["Xe"] + outputs = ["Va", "Vb", "Vc", "Vd"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = obj.immediate + obj.pre_index = None + obj.addr = obj.args_in_out[0] + obj.args_out_combinations = [ + ( + [0, 1, 2, 3], + [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)], + ) + ] + return obj + + +class q_ld1_2_with_reg_postinc(AArch64Instruction): + pattern = "ld1 {.
, .
}, [], " + inputs = ["Xd"] + in_outs = ["Xc"] + outputs = ["Va", "Vb"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None # register-determined at runtime + obj.pre_index = None + obj.addr = obj.args_in_out[0] + obj.args_out_combinations = [ + ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)]) + ] + return obj + + +class q_ld1_1_with_reg_postinc(AArch64Instruction): + pattern = "ld1 {.
}, [], " + inputs = ["Xc"] + in_outs = ["Xb"] + outputs = ["Va"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in_out[0] + return obj + + +class q_ld1_lane_with_reg_postinc(AArch64Instruction): + pattern = "ld1 {.
}[], [], " + inputs = ["Xc"] + in_outs = ["Xb", "Va"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in_out[0] + return obj + + class q_ldp_with_postinc(Ldp_Q): pattern = "ldp , , [], " in_outs = ["Xc"] @@ -1964,6 +2151,34 @@ def make(cls, src): return obj +class q_st1_1_with_reg_postinc(AArch64Instruction): + pattern = "st1 {.
}, [], " + inputs = ["Va", "Xc"] + in_outs = ["Xb"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in_out[0] + return obj + + +class q_st1_lane_with_reg_postinc(AArch64Instruction): + pattern = "st1 {.
}[], [], " + inputs = ["Va", "Xc"] + in_outs = ["Xb"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = None + obj.pre_index = None + obj.addr = obj.args_in_out[0] + return obj + + class q_stp_with_postinc(Stp_Q): pattern = "stp , , [], " inputs = ["Qa", "Qb"] @@ -1979,7 +2194,7 @@ def make(cls, src): class q_st1_2_with_postinc(Stp_Q): - pattern = "st1 {., .}, [], " + pattern = "st1 {.
, .
}, [], " inputs = ["Va", "Vb"] in_outs = ["Xc"] @@ -1996,6 +2211,26 @@ def make(cls, src): return obj +class q_st1_4_with_postinc(AArch64Instruction): + pattern = "st1 {.
, .
, .
, .
}, [], " + inputs = ["Va", "Vb", "Vc", "Vd"] + in_outs = ["Xe"] + + @classmethod + def make(cls, src): + obj = AArch64Instruction.build(cls, src) + obj.increment = obj.immediate + obj.pre_index = None + obj.addr = obj.args_in_out[0] + obj.args_in_combinations = [ + ( + [0, 1, 2, 3], + [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)], + ) + ] + return obj + + class Ldr_X(AArch64Instruction): pass @@ -2728,6 +2963,12 @@ class sub(AArch64BasicArithmetic): outputs = ["Xd"] +class sub_wform(AArch64BasicArithmetic): + pattern = "sub , , " + inputs = ["Wa", "Wb"] + outputs = ["Wd"] + + class AArch64ShiftedArithmetic(AArch64Instruction): pass @@ -2750,6 +2991,12 @@ class add_shifted(AArch64ShiftedArithmetic): outputs = ["Xd"] +class sub_shifted(AArch64ShiftedArithmetic): + pattern = "sub , , , " + inputs = ["Xa", "Xb"] + outputs = ["Xd"] + + class adds_shifted(AArch64ShiftedArithmetic): pattern = "adds , , , " inputs = ["Xa", "Xb"] @@ -2779,6 +3026,12 @@ class lsl(AArch64Shift): outputs = ["Xd"] +class lsl_wform(AArch64Shift): + pattern = "lsl , , " + inputs = ["Wa"] + outputs = ["Wd"] + + class ror(AArch64Shift): pattern = "ror , , " inputs = ["Xa"] @@ -2904,6 +3157,12 @@ class sxtb(AArch64Logical): outputs = ["Xd"] +class sxtw(AArch64Logical): + pattern = "sxtw , " + inputs = ["Wa"] + outputs = ["Xd"] + + class uxtb(AArch64Logical): pattern = "uxtb , " inputs = ["Wa"] @@ -3279,6 +3538,73 @@ class uaddlp(AArch64Instruction): outputs = ["Vd"] +class uaddl(AArch64Instruction): + pattern = "uaddl ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class uaddl2(AArch64Instruction): + pattern = "uaddl2 ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class uaddw(AArch64Instruction): + pattern = "uaddw ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class uaddw2(AArch64Instruction): + pattern = "uaddw2 ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class saddl(AArch64Instruction): + pattern = "saddl ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class saddl2(AArch64Instruction): + pattern = "saddl2 ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class rshrn(AArch64Instruction): + pattern = "rshrn ., ., " + inputs = ["Va"] + outputs = ["Vd"] + + +class rshrn2(AArch64Instruction): + # rshrn2 writes the upper half of Vd, lower half is retained + pattern = "rshrn2 ., ., " + inputs = ["Va"] + in_outs = ["Vd"] + + +class sqxtun(AArch64Instruction): + pattern = "sqxtun ., ." + inputs = ["Va"] + outputs = ["Vd"] + + +class sqrshrun(AArch64Instruction): + pattern = "sqrshrun ., ., " + inputs = ["Va"] + outputs = ["Vd"] + + +class urhadd(AArch64Instruction): + pattern = "urhadd .
, .
, .
" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + class Vzip(AArch64Instruction): pass @@ -3366,6 +3692,12 @@ class vins_d(Vins): in_outs = ["Vd"] +class vins_d_from_v(AArch64Instruction): + pattern = "ins .d[], .d[]" + inputs = ["Va"] + in_outs = ["Vd"] + + class vins_d_force_output(Vins): pattern = "ins .d[], " inputs = ["Xa"] @@ -3433,6 +3765,12 @@ class mov_vtov_d(AArch64Instruction): in_outs = ["Vd"] +class mov_vtov_s(AArch64Instruction): + pattern = "mov .s[], .s[]" + inputs = ["Va"] + in_outs = ["Vd"] + + class SHA3Instruction( AArch64Instruction ): # pylint: disable=missing-docstring,invalid-name @@ -3607,12 +3945,72 @@ def make(cls, src): return obj +class fmla(AArch64Instruction): + pattern = "fmla .
, .
, .
" + inputs = ["Va", "Vb"] + in_outs = ["Vd"] + + +class faddp_vec(AArch64Instruction): + pattern = "faddp .
, .
, .
" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class faddp_scalar(AArch64Instruction): + pattern = "faddp , .
" + inputs = ["Va"] + outputs = ["Sd"] + + +class fadd_vec(AArch64Instruction): + pattern = "fadd .
, .
, .
" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class fsub_vec(AArch64Instruction): + pattern = "fsub .
, .
, .
" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class fmul_vec(AArch64Instruction): + pattern = "fmul .
, .
, .
" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + +class fmla_lane(AArch64Instruction): + pattern = "fmla ., ., .[]" + inputs = ["Va", "Vb"] + in_outs = ["Vd"] + + +class fmls_vec(AArch64Instruction): + pattern = "fmls .
, .
, .
" + inputs = ["Va", "Vb"] + in_outs = ["Vd"] + + +class fmul_lane(AArch64Instruction): + pattern = "fmul ., ., .[]" + inputs = ["Va", "Vb"] + outputs = ["Vd"] + + class vdup(AArch64Instruction): pattern = "dup .
, " inputs = ["Xa"] outputs = ["Vd"] +class vdup_lane(AArch64Instruction): + pattern = "dup ., .[]" + inputs = ["Va"] + outputs = ["Vd"] + + class vdup_w(AArch64Instruction): pattern = "dup .
, " inputs = ["Wa"] diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py index 26fe2061..d01145d9 100644 --- a/slothy/targets/aarch64/cortex_a55.py +++ b/slothy/targets/aarch64/cortex_a55.py @@ -58,7 +58,9 @@ Ldr_Q, Ldr_D, Str_Q, + Stp_Q, vmov, + vmovi, vadd, vxtn, vshrn, @@ -73,6 +75,12 @@ vmla_lane, vmls, vmls_lane, + fmla, + faddp_vec, + faddp_scalar, + fadd_vec, + fsub_vec, + fmul_vec, vmul_lane, vqrdmulh, vqrdmulh_lane, @@ -161,12 +169,15 @@ mov, ngc_zero, subs_wform, + subs_imm, asr_wform, and_imm_wform, eor_wform, eon_wform, lsr_wform, lsr, + lsl_wform, + sub_wform, ASimdCompare, and_twoarg, VShiftImmediateBasic, @@ -181,6 +192,36 @@ fmov_s_form, # from double/single to gen reg cmp, vdup_w, + vdup_lane, + fmla_lane, + fmls_vec, + fmul_lane, + mov_vtov_s, + Vrev, + uaddl, + uaddl2, + uaddw, + uaddw2, + saddl, + saddl2, + rshrn, + rshrn2, + sqxtun, + sqrshrun, + urhadd, + sub_shifted, + sxtw, + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_4, + q_ld1_4_with_postinc, + q_ld1_1_with_reg_postinc, + q_ld1_lane_with_reg_postinc, + q_st1_1_with_reg_postinc, + q_st1_lane_with_reg_postinc, + q_st1_4_with_postinc, + vins_d_from_v, ) issue_rate = 2 @@ -266,6 +307,7 @@ def get_min_max_objective(slothy): vqdmulh_vector, Ldr_Q, Str_Q, + Stp_Q, q_ldr1_stack, q_ldr1_post_inc, Q_Ld2_Lane_Post_Inc, @@ -279,6 +321,25 @@ def get_min_max_objective(slothy): VShiftImmediateRounding, AArch64NeonLogical, vuaddlv_sform, + uaddl, + uaddl2, + uaddw, + uaddw2, + saddl, + saddl2, + rshrn2, + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_4, + q_ld1_4_with_postinc, + # TODO: revisit -- lane load might also needs SCALAR_LOAD; placed here + # following the Ldr_Q convention which also omits a load unit. + q_ld1_1_with_reg_postinc, + q_ld1_lane_with_reg_postinc, + q_st1_lane_with_reg_postinc, + q_st1_4_with_postinc, + vmovi, ): [ [ExecutionUnit.VEC0, ExecutionUnit.VEC1] ], # these instructions use both VEC0 and VEC1 @@ -328,6 +389,8 @@ def get_min_max_objective(slothy): mov_d01, mov_b00, mov_vtov_d, + mov_vtov_s, + mov_vtov_s, fcsel, VecToGprMov, Mov_xtov_d, @@ -338,10 +401,17 @@ def get_min_max_objective(slothy): d_ldr_stack_with_inc, fmov_s_form, # from double/single to gen reg vdup_w, + is_dform_form_of(sqrshrun), + is_dform_form_of(rshrn), + is_dform_form_of(sqxtun), + is_dform_form_of(urhadd), + vins_d_from_v, + q_st1_1_with_reg_postinc, ): [ ExecutionUnit.VEC0, ExecutionUnit.VEC1, ], # these instructions use VEC0 or VEC1 + is_qform_form_of(urhadd): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_qform_form_of(vmov): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(vmov): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], is_qform_form_of(trn1): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], @@ -358,6 +428,10 @@ def get_min_max_objective(slothy): is_dform_form_of(vzip2): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], is_qform_form_of(vext): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(vext): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(vdup_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(vdup_lane): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(Vrev): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(Vrev): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], is_qform_form_of(vuzp1): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(vuzp1): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], is_qform_form_of(vuzp2): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], @@ -378,6 +452,23 @@ def get_min_max_objective(slothy): is_dform_form_of(vmla): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], is_qform_form_of(vmls): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(vmls): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fmla): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fmla): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fmla_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fmla_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_qform_form_of(faddp_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(faddp_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + faddp_scalar: [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fadd_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fadd_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fsub_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fsub_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fmul_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fmul_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fmls_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fmls_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(fmul_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(fmul_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], vshl_d: [ExecutionUnit.VEC0, ExecutionUnit.VEC1], vuxtl: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_qform_form_of(AArch64NeonShiftInsert): [ @@ -427,17 +518,22 @@ def get_min_max_objective(slothy): movz_imm, movz_imm_lsl, sub, + sub_shifted, sub_imm, + sxtw, cmp, sbcs_zero_to_zero, cmp_xzr2, mov, ngc_zero, subs_wform, + subs_imm, asr_wform, and_imm_wform, lsr_wform, lsr, + lsl_wform, + sub_wform, eor_wform, eon_wform, sxtb, @@ -455,6 +551,7 @@ def get_min_max_objective(slothy): vadd, vsub, vmov, + vmovi, vmul, vmul_lane, vqrdmulh, @@ -472,13 +569,25 @@ def get_min_max_objective(slothy): vmla_lane, vmls, vmls_lane, + fmla, + fmla_lane, + fmls_vec, + fmul_lane, + fmls_vec, + fmul_lane, + faddp_vec, + faddp_scalar, + fadd_vec, + fsub_vec, + fmul_vec, ): 1, (vshl, vshl_d, vsshr, vushr, vuxtl): 1, (trn2, trn1, ASimdCompare): 1, (Ldr_D): 1, - (Ldr_Q): 2, + (Ldr_Q, q_ld1_1_with_reg_postinc): 2, (AArch64NeonCount): 1, - (Str_Q): 1, + (Str_Q, Stp_Q, q_st1_1_with_reg_postinc): 1, + q_st1_4_with_postinc: 4, (tst_wform): 1, (nop, Vins, Ldr_X, Str_X): 1, Ldp_X: 2, @@ -511,7 +620,7 @@ def get_min_max_objective(slothy): (ldr_const): 1, (ldr_sxtw_wform): 3, (lsr_imm, ror): 1, - (lsr, lsr_wform): 2, + (lsr, lsr_wform, lsl_wform): 2, (umull_wform, mul_wform, umaddl_wform): 1, (and_twoarg, and_imm, and_imm_wform): 1, ( @@ -525,17 +634,43 @@ def get_min_max_objective(slothy): adcs_zero_r_to_zero, cmn, ): 1, - (cmp_xzr2, cmp_imm, sub, subs_wform, asr_wform, sbcs_zero_to_zero, ngc_zero): 1, + ( + cmp_xzr2, + cmp_imm, + sub, + sub_wform, + sub_shifted, + subs_wform, + subs_imm, + asr_wform, + sbcs_zero_to_zero, + ngc_zero, + ): 1, (bfi, ubfx): 1, VShiftImmediateRounding: 1, AArch64NeonShiftInsert: 1, (vusra): 1, + (uaddl, uaddl2): 1, + (uaddw, uaddw2): 1, + (saddl, saddl2): 1, + (rshrn, rshrn2, sqxtun): 1, + (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 4, + (q_ld1_4, q_ld1_4_with_postinc): 8, + q_ld1_lane_with_reg_postinc: 1, + q_st1_lane_with_reg_postinc: 1, + vins_d_from_v: 1, + sxtw: 1, + sqrshrun: 1, + urhadd: 1, AArch64NeonLogical: 1, vext: 1, + Vrev: 1, + vdup_lane: 1, (vuzp1, vuzp2): 1, (q_ldr1_stack, Q_Ld2_Lane_Post_Inc, q_ldr1_post_inc, q_ld2_lane_s): 1, (b_ldr_stack_with_inc, d_ldr_stack_with_inc): 1, - (mov_d01, mov_b00, mov_vtov_d): 1, + (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 1, + (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 1, (vzip1, vzip2): 1, (eor_wform, eon_wform): 1, (eon, eor, bic, bic_reg, eor_shifted, bic_shifted): 1, @@ -549,6 +684,7 @@ def get_min_max_objective(slothy): default_latencies = { vdup_w: 3, vmov: 2, + vmovi: 1, is_qform_form_of([vadd, vsub]): 3, is_dform_form_of([vadd, vsub]): 2, (trn1, trn2, ASimdCompare): 2, @@ -567,9 +703,29 @@ def get_min_max_objective(slothy): vmla_lane, vmls, vmls_lane, + fmla, + fmla_lane, + fmls_vec, + fmul_lane, + fmls_vec, + fmul_lane, + fadd_vec, + fsub_vec, + fmul_vec, ): 4, (Ldr_D): 3, - (Ldr_Q, Str_Q): 4, + ( + faddp_vec, + faddp_scalar, + ): 4, + ( + Ldr_Q, + Str_Q, + Stp_Q, + q_ld1_1_with_reg_postinc, + q_st1_1_with_reg_postinc, + q_st1_4_with_postinc, + ): 4, (sub_imm, cmp): 2, AArch64NeonCount: 2, St4: 5, @@ -582,12 +738,11 @@ def get_min_max_objective(slothy): vxtn: 2, vshrn: 2, vtbl: 2, # 2+N-1 cycles (N = number of registers in the table) - (vuxtl): 2, (Str_X, Ldr_X): 4, Ldp_X: 4, Ldp_W: 3, q_ldp_with_inc: 6, - (Vins, umov_d): 2, + (Vins, umov_d, vins_d_from_v): 2, (tst_wform): 1, (fcsel): 2, csel: 1, @@ -606,12 +761,12 @@ def get_min_max_objective(slothy): (Stp_X, Stp_W, w_stp_with_imm_sp): 1, (ldr_const): 3, (ldr_sxtw_wform): 5, - (lsr, lsr_wform): 1, + (lsr, lsr_wform, lsl_wform, sub_wform): 1, lsr_imm: 2, (umull_wform, mul_wform, umaddl_wform): 3, (vuaddlv_sform): 3, (and_imm, and_imm_wform): 1, - (add2, add_shifted, add_sp_imm): 2, + (add2, add_shifted, sub_shifted, add_sp_imm): 2, ( add, add_imm, @@ -620,7 +775,9 @@ def get_min_max_objective(slothy): adcs_zero2, cmn, sub, + sub_wform, subs_wform, + subs_imm, asr_wform, sbcs_zero_to_zero, cmp_xzr2, @@ -632,13 +789,29 @@ def get_min_max_objective(slothy): VShiftImmediateBasic: 2, AArch64NeonShiftInsert: 2, (vusra): 3, + (uaddl, uaddl2): 3, + (uaddw, uaddw2): 3, + # LD1 multi-reg Q-form latencies (SWOG section 4.18) + (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 6, + (q_ld1_4, q_ld1_4_with_postinc): 10, + q_ld1_lane_with_reg_postinc: 3, + q_st1_lane_with_reg_postinc: 1, + sxtw: 2, + (saddl, saddl2): 3, + (rshrn, rshrn2): 3, + sqxtun: 4, + sqrshrun: 4, + urhadd: 2, AArch64NeonLogical: 1, vext: 2, + Vrev: 2, + vdup_lane: 2, (vuzp1, vuzp2): 2, (q_ldr1_stack, Q_Ld2_Lane_Post_Inc, q_ldr1_post_inc): 3, q_ld2_lane_s: 3, (b_ldr_stack_with_inc, d_ldr_stack_with_inc): 3, - (mov_d01, mov_b00, mov_vtov_d): 2, + (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 2, + (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 2, (vzip1, vzip2): 2, (eor_wform, eon_wform): 1, # According to SWOG, this is 2 cycles, byt if the output is used as a @@ -694,6 +867,28 @@ def get_latency(src, out_idx, dst): lambda t_src, t_dst: t_dst.program_start_var == t_src.program_start_var + 1, ) + # Fast mul->mla forwarding (accumulate_latency=1) + if ( + instclass_src in [vmul, vmul_lane] + and instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] + and src.args_out[0] == dst.args_in_out[0] + ): + return 1 + # Fast mla->mla forwarding (accumulate_latency=1) + if ( + instclass_src in [vmla, vmla_lane, vmls, vmls_lane] + and instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] + and src.args_in_out[0] == dst.args_in_out[0] + ): + return 1 + # Fast mull->mlal forwarding (accumulate_latency=1) + if ( + isinstance(src, Vmull) + and isinstance(dst, Vmlal) + and src.args_out[0] == dst.args_in_out[0] + ): + return 1 + return latency diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py index 54fd2be4..407f8a6d 100644 --- a/slothy/targets/aarch64/cortex_a72_frontend.py +++ b/slothy/targets/aarch64/cortex_a72_frontend.py @@ -125,6 +125,45 @@ cmp_imm, csel, q_ldp_with_inc, + uaddl, + uaddl2, + uaddw, + uaddw2, + saddl, + saddl2, + urhadd, + rshrn, + rshrn2, + sqxtun, + sqrshrun, + q_ld1_2, + q_ld1_4, + q_ld1_2_with_postinc, + q_ld1_4_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_1_with_reg_postinc, + q_ld1_lane_with_reg_postinc, + q_st1_1_with_reg_postinc, + q_st1_lane_with_reg_postinc, + q_st1_4_with_postinc, + q_stp_with_inc, + sub_shifted, + subs_imm, + subs_wform, + fadd_vec, + fsub_vec, + fmul_vec, + faddp_vec, + faddp_scalar, + fmla, + fmls_vec, + fmla_lane, + fmul_lane, + vmovi, + vdup_lane, + rev64, + mov_vtov_s, + vins_d_from_v, ) # From the A72 SWOG, Section "4.1 Dispatch Constraints" @@ -257,6 +296,8 @@ def get_min_max_objective(slothy): fmov_s_form: ExecutionUnit.LOAD(), # from vec to gen reg eor_shifted: ExecutionUnit.SCALAR(), bic_shifted: ExecutionUnit.SCALAR(), + sub_shifted: ExecutionUnit.SCALAR(), + (subs_wform, subs_imm): ExecutionUnit.INT(), lsr_imm: ExecutionUnit.INT(), lsr: ExecutionUnit.INT(), movk_imm_lsl: ExecutionUnit.INT(), @@ -264,6 +305,43 @@ def get_min_max_objective(slothy): Ldp_W: ExecutionUnit.LOAD(), q_ldp_with_inc: ExecutionUnit.LOAD(), Stp_W: ExecutionUnit.STORE(), + q_stp_with_inc: [ + ExecutionUnit.STORE() + [ExecutionUnit.INT0], + ExecutionUnit.STORE() + [ExecutionUnit.INT1], + ], + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): [ + ExecutionUnit.ASIMD0, + ExecutionUnit.ASIMD1, + ], + (rshrn, rshrn2, sqxtun, sqrshrun): [ExecutionUnit.ASIMD1], + (fadd_vec, fsub_vec, fmul_vec, faddp_vec, fmla, fmls_vec): [ + ExecutionUnit.ASIMD0, + ExecutionUnit.ASIMD1, + ], + faddp_scalar: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + (fmla_lane, fmul_lane): [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + vmovi: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + vdup_lane: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + rev64: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + mov_vtov_s: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + vins_d_from_v: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + ( + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_4, + q_ld1_4_with_postinc, + q_ld1_1_with_reg_postinc, + ): ExecutionUnit.LOAD(), + q_ld1_lane_with_reg_postinc: [ + [ExecutionUnit.ASIMD0, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1], + [ExecutionUnit.ASIMD1, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1], + ], + q_st1_lane_with_reg_postinc: [ + [ExecutionUnit.ASIMD0, ExecutionUnit.STORE0, ExecutionUnit.STORE1], + [ExecutionUnit.ASIMD1, ExecutionUnit.STORE0, ExecutionUnit.STORE1], + ], + (q_st1_1_with_reg_postinc, q_st1_4_with_postinc): [ExecutionUnit.STORE()], } inverse_throughput = { @@ -295,6 +373,7 @@ def get_min_max_objective(slothy): umov_d: 1, (add, add_imm, add_shifted): 1, (Ldr_D, Ldr_Q, Str_Q, Ldr_X, Str_X): 1, + q_stp_with_inc: 4, (VShiftImmediateRounding, VShiftImmediateBasic): 1, # TODO: this seems in accurate; revisiting may improve performance St2: 4, @@ -314,6 +393,15 @@ def get_min_max_objective(slothy): fmov_s_form: 1, # from vec to gen reg eor_shifted: 1, bic_shifted: 1, + sub_shifted: 1, + (subs_wform, subs_imm): 1, + (fadd_vec, fsub_vec, fmul_vec, faddp_vec, faddp_scalar, fmla, fmls_vec): 1, + (fmla_lane, fmul_lane): 1, + vmovi: 1, + vdup_lane: 1, + rev64: 1, + mov_vtov_s: 1, + vins_d_from_v: 1, vdup_w: 1, mov_wtov_s: 1, mov_vtov_d: 1, @@ -322,6 +410,14 @@ def get_min_max_objective(slothy): movk_imm_lsl: 1, Ldp_W: 1, Stp_W: 1, + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 1, + (rshrn, rshrn2, sqxtun, sqrshrun): 1, + (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 2, + (q_ld1_4, q_ld1_4_with_postinc): 4, + (q_ld1_1_with_reg_postinc, q_ld1_lane_with_reg_postinc): 1, + q_st1_1_with_reg_postinc: 2, + q_st1_lane_with_reg_postinc: 1, + q_st1_4_with_postinc: 8, } # REVISIT @@ -356,7 +452,7 @@ def get_min_max_objective(slothy): csel: 1, AArch64ConditionalCompare: 1, AArch64Logical: 1, - (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4, # approx + (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X, q_stp_with_inc): 4, # approx Vins: 6, # approx umov_d: 4, # approx (add, add_imm, add_shifted): 2, @@ -381,6 +477,17 @@ def get_min_max_objective(slothy): fmov_s_form: 5, # from vec to gen reg eor_shifted: 2, bic_shifted: 2, + sub_shifted: 2, + (subs_wform, subs_imm): 1, + (fadd_vec, fsub_vec, faddp_vec, faddp_scalar): 4, + fmul_vec: 4, + (fmla, fmls_vec, fmla_lane): 7, + fmul_lane: 5, + vmovi: 3, + vdup_lane: 3, + rev64: 3, + mov_vtov_s: 3, + vins_d_from_v: 3, vdup_w: 8, mov_wtov_s: 8, mov_vtov_d: 3, @@ -389,6 +496,18 @@ def get_min_max_objective(slothy): movk_imm_lsl: 1, Ldp_W: 4, Stp_W: 1, + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 3, + (rshrn, rshrn2, sqrshrun, sqxtun): 4, + # Multi-register ld1 (SWOG: Q-form latencies) + q_ld1_1_with_reg_postinc: 5, + (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 6, + (q_ld1_4, q_ld1_4_with_postinc): 8, + # Single-lane ld1 B/H/S (SWOG: 8 cycles) + q_ld1_lane_with_reg_postinc: 8, + # Single-lane st1 B/H/S (SWOG: 3 cycles) + q_st1_lane_with_reg_postinc: 3, + q_st1_1_with_reg_postinc: 2, + q_st1_4_with_postinc: 8, } @@ -428,6 +547,20 @@ def get_latency(src, out_idx, dst): and src.args_in_out[0] == dst.args_in_out[0] ): return 1 + # Fast fmul->fmla forwarding (accumulate_latency=3) + if ( + instclass_src in [fmul_vec, fmul_lane] + and instclass_dst in [fmla, fmls_vec, fmla_lane] + and src.args_out[0] == dst.args_in_out[0] + ): + return 3 + # Fast fmla->fmla forwarding (accumulate_latency=3) + if ( + instclass_src in [fmla, fmls_vec, fmla_lane] + and instclass_dst in [fmla, fmls_vec, fmla_lane] + and src.args_in_out[0] == dst.args_in_out[0] + ): + return 3 return latency diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py index 16d538d0..59f628c2 100644 --- a/slothy/targets/aarch64/neoverse_n1_experimental.py +++ b/slothy/targets/aarch64/neoverse_n1_experimental.py @@ -111,6 +111,39 @@ mov_vtov_d, lsr, movk_imm_lsl, + uaddl, + uaddl2, + uaddw, + uaddw2, + saddl, + saddl2, + urhadd, + rshrn, + rshrn2, + sqxtun, + sqrshrun, + q_ld1_2, + q_ld1_4, + q_ld1_2_with_postinc, + q_ld1_4_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_1_with_reg_postinc, + q_ld1_lane_with_reg_postinc, + q_st1_1_with_reg_postinc, + q_st1_lane_with_reg_postinc, + q_st1_4_with_postinc, + fadd_vec, + fsub_vec, + fmul_vec, + faddp_vec, + faddp_scalar, + fmla, + fmls_vec, + fmla_lane, + fmul_lane, + mov_vtov_s, + vdup_lane, + vins_d_from_v, ) issue_rate = 4 @@ -229,6 +262,18 @@ def get_min_max_objective(slothy): vusra: ExecutionUnit.V1(), AESInstruction: ExecutionUnit.V0(), (Vmul, Vmla, Vqdmulh, Vmull, Vmlal): ExecutionUnit.V0(), + ( + fadd_vec, + fsub_vec, + fmul_vec, + faddp_vec, + faddp_scalar, + fmla, + fmls_vec, + fmla_lane, + fmul_lane, + ): ExecutionUnit.V(), + (mov_vtov_s, vdup_lane, vins_d_from_v): ExecutionUnit.V(), AArch64NeonLogical: ExecutionUnit.V(), vext: ExecutionUnit.V(), ( @@ -261,6 +306,34 @@ def get_min_max_objective(slothy): lsr: ExecutionUnit.I(), movk_imm_lsl: ExecutionUnit.I(), q_ld2_lane_s: ExecutionUnit.V(), + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): ExecutionUnit.V(), + (rshrn, rshrn2, sqxtun, sqrshrun): ExecutionUnit.V1(), + ( + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_4, + q_ld1_4_with_postinc, + q_ld1_1_with_reg_postinc, + ): ExecutionUnit.LSU(), + q_ld1_lane_with_reg_postinc: [ + [ExecutionUnit.LSU0, ExecutionUnit.VEC0], + [ExecutionUnit.LSU0, ExecutionUnit.VEC1], + [ExecutionUnit.LSU1, ExecutionUnit.VEC0], + [ExecutionUnit.LSU1, ExecutionUnit.VEC1], + ], + q_st1_lane_with_reg_postinc: [ + [ExecutionUnit.LSU0, ExecutionUnit.VEC0], + [ExecutionUnit.LSU0, ExecutionUnit.VEC1], + [ExecutionUnit.LSU1, ExecutionUnit.VEC0], + [ExecutionUnit.LSU1, ExecutionUnit.VEC1], + ], + (q_st1_1_with_reg_postinc, q_st1_4_with_postinc): [ + [ExecutionUnit.LSU0, ExecutionUnit.VEC0], + [ExecutionUnit.LSU0, ExecutionUnit.VEC1], + [ExecutionUnit.LSU1, ExecutionUnit.VEC0], + [ExecutionUnit.LSU1, ExecutionUnit.VEC1], + ], } inverse_throughput = { @@ -323,6 +396,25 @@ def get_min_max_objective(slothy): mov_vtov_d: 1, lsr: 1, movk_imm_lsl: 1, + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 1, + (rshrn, rshrn2, sqxtun, sqrshrun): 1, + (q_ld1_1_with_reg_postinc, q_ld1_lane_with_reg_postinc): 1, + (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 2, + (q_ld1_4, q_ld1_4_with_postinc): 4, + (q_st1_1_with_reg_postinc, q_st1_lane_with_reg_postinc): 2, + q_st1_4_with_postinc: 8, + ( + fadd_vec, + fsub_vec, + faddp_vec, + faddp_scalar, + fmla, + fmls_vec, + fmla_lane, + fmul_vec, + fmul_lane, + ): 1, + (mov_vtov_s, vdup_lane, vins_d_from_v): 1, } default_latencies = { @@ -386,6 +478,24 @@ def get_min_max_objective(slothy): mov_vtov_d: 2, lsr: 1, movk_imm_lsl: 1, + (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 2, + (rshrn, rshrn2, sqrshrun, sqxtun): 4, + ( + q_ld1_2, + q_ld1_2_with_postinc, + q_ld1_2_with_reg_postinc, + q_ld1_1_with_reg_postinc, + ): 5, + (q_ld1_4, q_ld1_4_with_postinc): 6, + q_ld1_lane_with_reg_postinc: 7, + q_st1_1_with_reg_postinc: 2, + q_st1_lane_with_reg_postinc: 4, + q_st1_4_with_postinc: 5, + (fadd_vec, fsub_vec, faddp_vec, faddp_scalar): 2, + fmul_vec: 3, + (fmla, fmls_vec, fmla_lane): 4, + fmul_lane: 3, + (mov_vtov_s, vdup_lane, vins_d_from_v): 2, } diff --git a/tests/naive/aarch64/_test.py b/tests/naive/aarch64/_test.py index 24869661..4ee9356a 100644 --- a/tests/naive/aarch64/_test.py +++ b/tests/naive/aarch64/_test.py @@ -42,6 +42,8 @@ def core(self, slothy): slothy.config.constraints.allow_reordering = False slothy.config.variable_size = True slothy.config.constraints.stalls_first_attempt = 256 + slothy.config.reserved_regs.add("x12") + slothy.config.selftest_initial_register_values = {"x12": 16} slothy.optimize(start="start", end="end") diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s index a764dd31..4d2b65d7 100644 --- a/tests/naive/aarch64/instructions.s +++ b/tests/naive/aarch64/instructions.s @@ -28,6 +28,32 @@ st2 { v0.s, v1.s}[0], [x11], #8 st2 {v0.s, v1.s}[0], [x11], #8 st2 { v0.S, v1.S }[1], [x1] +// ld1 multi-register (no post-increment) +ld1 {v0.4s, v1.4s}, [x1] +ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x1] + +// ld1 multi-register with immediate post-increment +ld1 {v0.4s, v1.4s}, [x1], #32 +ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x1], #64 + +// ld1 multi-register and single-register with register post-increment +mov x12, #16 +ld1 {v0.4s, v1.4s}, [x1], x12 +ld1 {v2.4s}, [x1], x12 +ld1 {v3.s}[0], [x1], x12 +ld1 {v3.s}[1], [x1], x12 + +// st1 with register post-increment +st1 {v0.4s}, [x1], x12 +st1 {v1.s}[0], [x1], x12 +st1 {v1.s}[1], [x1], x12 + +// st1 4-register with immediate post-increment +st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + +// Store vector pair Q-form +stp q0, q1, [x0, #32] + zip1 v5.16b, v6.16b, v7.16b zip2 v8.16b, v9.16b, v10.16b uzp1 v11.16b, v12.16b, v13.16b @@ -207,4 +233,59 @@ ld3 {v2.4s, v3.4s, v4.4s}, [x17], #48 st3 {v5.4s, v6.4s, v7.4s}, [x18], #48 ld2 {v8.4s, v9.4s}, [x19], #32 st2 {v10.4s, v11.4s}, [x20], #32 + +// ASIMD widening / narrowing arithmetic +uaddl v0.8h, v1.8b, v2.8b +uaddl2 v3.8h, v4.16b, v5.16b +uaddw v6.8h, v7.8h, v8.8b +uaddw2 v9.8h, v10.8h, v11.16b +saddl v12.8h, v13.8b, v14.8b +saddl2 v15.8h, v16.16b, v17.16b +rshrn v18.8b, v19.8h, #2 +rshrn2 v20.16b, v21.8h, #3 +sqxtun v22.8b, v23.8h +sqrshrun v24.8b, v25.8h, #4 +urhadd v26.8b, v27.8b, v28.8b + +// sub with shifted register operand +sub x0, x1, x2, lsl #3 +sub x3, x4, x5, lsr #2 + +// sign-extend word +sxtw x6, w7 + +// ASIMD FP arithmetic (vector) +fadd v0.4s, v1.4s, v2.4s +fsub v0.4s, v1.4s, v2.4s +fmul v0.4s, v1.4s, v2.4s +faddp v0.4s, v1.4s, v2.4s +faddp s0, v1.2s + +// ASIMD FP multiply accumulate / subtract +fmla v0.4s, v1.4s, v2.4s +fmls v0.4s, v1.4s, v2.4s + +// ASIMD FP by-element +fmla v0.4s, v1.4s, v2.s[0] +fmul v0.4s, v1.4s, v2.s[1] + +// ASIMD move immediate +movi v0.16b, #0 +movi v1.4s, #0 + +// SUBS immediate +subs x0, x1, #4 + +// ASIMD duplicate element +dup v0.4s, v1.s[0] +dup v2.2s, v3.s[1] + +// ASIMD reverse +rev64 v0.16b, v1.16b +rev64 v2.4s, v3.4s + +// ASIMD element move / insert +mov v0.s[0], v1.s[1] +ins v0.d[0], v1.d[1] + end: