Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion slothy/targets/aarch64/aarch64_neon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2781,12 +2781,18 @@ class ror(AArch64Shift):
outputs = ["Xd"]


class asr(AArch64Shift):
class asr_imm(AArch64Shift):
    """Immediate form of ASR: ``asr <Xd>, <Xa>, <imm>``.

    The instruction reads the register ``Xa`` and writes the register
    ``Xd``; ``<imm>`` is an immediate operand and therefore is not listed
    as a register input.
    """

    pattern = "asr <Xd>, <Xa>, <imm>"
    inputs = ["Xa"]
    outputs = ["Xd"]


class asr(AArch64Shift):
    """Register form of ASR: ``asr <Xd>, <Xa>, <Xb>``.

    Both source operands are registers, so ``Xa`` and ``Xb`` are listed as
    inputs; the result is written to ``Xd``.
    """

    pattern = "asr <Xd>, <Xa>, <Xb>"
    inputs = ["Xa", "Xb"]
    outputs = ["Xd"]


class AArch64Logical(AArch64Instruction):
    """Common parent class for logical instructions.

    Defines no members of its own; everything is inherited from
    ``AArch64Instruction``.
    """

Expand Down
32 changes: 25 additions & 7 deletions slothy/targets/aarch64/cortex_a55.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
vmul,
Instruction,
csel,
csel_xzr_ne,
fcsel,
Q_Ld2_Lane_Post_Inc,
q_ld2_lane_s,
Expand Down Expand Up @@ -162,6 +163,8 @@
ngc_zero,
subs_wform,
asr_wform,
asr_imm,
asr,
and_imm_wform,
eor_wform,
eon_wform,
Expand All @@ -181,6 +184,7 @@
fmov_s_form, # from double/single to gen reg
fmov_d_form, # from double/single to gen reg (64-bit)
cmp,
cmp_xzr,
vdup_w,
crc32b,
crc32h,
Expand Down Expand Up @@ -439,12 +443,15 @@ def get_min_max_objective(slothy):
sub,
sub_imm,
cmp,
cmp_xzr,
sbcs_zero_to_zero,
cmp_xzr2,
mov,
ngc_zero,
subs_wform,
asr_wform,
asr_imm,
asr,
and_imm_wform,
lsr_wform,
lsr,
Expand All @@ -457,7 +464,7 @@ def get_min_max_objective(slothy):
# NOTE: AESE/AESMC and AESD/AESIMC pairs can be dual-issued on A55 but this
# is not modeled
AESInstruction: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
csel: ExecutionUnit.SCALAR(),
(csel, csel_xzr_ne): ExecutionUnit.SCALAR(),
(
crc32b,
crc32h,
Expand Down Expand Up @@ -486,7 +493,7 @@ def get_min_max_objective(slothy):
umov_d,
vuaddlv_sform,
): 1,
(sub_imm, cmp): 1,
(sub_imm, cmp, cmp_xzr): 1,
(
vmla,
vmla_lane,
Expand Down Expand Up @@ -514,7 +521,7 @@ def get_min_max_objective(slothy):
vshrn: 2,
vtbl: 1, # N cycles (N = number of registers in the table)
(fcsel): 1,
csel: 1,
(csel, csel_xzr_ne): 1,
(VecToGprMov, Mov_xtov_d, mov_wtov_s): 1,
(
movk_imm,
Expand Down Expand Up @@ -545,7 +552,17 @@ def get_min_max_objective(slothy):
adcs_zero_r_to_zero,
cmn,
): 1,
(cmp_xzr2, cmp_imm, sub, subs_wform, asr_wform, sbcs_zero_to_zero, ngc_zero): 1,
(
cmp_xzr2,
cmp_imm,
sub,
subs_wform,
asr_wform,
asr_imm,
sbcs_zero_to_zero,
ngc_zero,
): 1,
asr: 2,
(bfi, ubfx): 1,
VShiftImmediateRounding: 1,
AArch64NeonShiftInsert: 1,
Expand Down Expand Up @@ -592,7 +609,7 @@ def get_min_max_objective(slothy):
): 4,
(Ldr_D): 3,
(Ldr_Q, Str_Q): 4,
(sub_imm, cmp): 2,
(sub_imm, cmp, cmp_xzr): 2,
AArch64NeonCount: 2,
St4: 5,
St3: 3,
Expand All @@ -612,7 +629,7 @@ def get_min_max_objective(slothy):
(Vins, umov_d): 2,
(tst_wform): 1,
(fcsel): 2,
csel: 1,
(csel, csel_xzr_ne): 1,
(VecToGprMov, Mov_xtov_d, mov_wtov_s): 2,
(
movk_imm,
Expand Down Expand Up @@ -643,12 +660,13 @@ def get_min_max_objective(slothy):
cmn,
sub,
subs_wform,
asr_wform,
asr,
sbcs_zero_to_zero,
cmp_xzr2,
ngc_zero,
cmp_imm,
): 1,
(asr_wform, asr_imm): 2,
(bfi, ubfx): 2,
VShiftImmediateRounding: 3,
VShiftImmediateBasic: 2,
Expand Down
23 changes: 17 additions & 6 deletions slothy/targets/aarch64/cortex_a72_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,14 @@
q_ld2_lane_s,
Ldp_W,
cmp,
cmp_xzr,
cmp_imm,
csel,
csel_xzr_ne,
q_ldp_with_inc,
AArch64CRC32,
asr,
asr_imm,
)

# From the A72 SWOG, Section "4.1 Dispatch Constraints"
Expand Down Expand Up @@ -237,7 +241,7 @@ def get_min_max_objective(slothy):
],
(AArch64NeonShiftInsert, vusra): [ExecutionUnit.ASIMD1],
fcsel: ExecutionUnit.ASIMD(),
csel: ExecutionUnit.INT(),
(csel, csel_xzr_ne): ExecutionUnit.INT(),
AArch64ConditionalCompare: ExecutionUnit.INT(),
AArch64Logical: [ExecutionUnit.INT()],
# 8B/8H occupies both F0, F1
Expand All @@ -262,11 +266,13 @@ def get_min_max_objective(slothy):
lsr_imm: ExecutionUnit.INT(),
lsr: ExecutionUnit.INT(),
movk_imm_lsl: ExecutionUnit.INT(),
(sub_imm, cmp, cmp_imm): ExecutionUnit.INT(),
(sub_imm, cmp_imm): ExecutionUnit.INT(),
(cmp, cmp_xzr): ExecutionUnit.MINT(),
Ldp_W: ExecutionUnit.LOAD(),
q_ldp_with_inc: ExecutionUnit.LOAD(),
Stp_W: ExecutionUnit.STORE(),
AArch64CRC32: ExecutionUnit.MINT(),
(asr, asr_imm): ExecutionUnit.INT(),
}

inverse_throughput = {
Expand All @@ -291,7 +297,7 @@ def get_min_max_objective(slothy):
AArch64NeonLogical: 1,
(AArch64NeonShiftInsert, vusra): 1,
fcsel: 1,
csel: 1,
(csel, csel_xzr_ne): 1,
AArch64ConditionalCompare: 1,
AArch64Logical: 1,
Vins: 1,
Expand All @@ -312,7 +318,7 @@ def get_min_max_objective(slothy):
q_ld2_lane_s: 1,
vtbl: 1, # SWOG contains a blank throughput (approximating from AArch32)
AESInstruction: 1,
(sub_imm, cmp, cmp_imm): 1,
(sub_imm, cmp, cmp_xzr, cmp_imm): 1,
vuaddlv_sform: 1,
fmov_s_form: 1, # from vec to gen reg
fmov_d_form: 1, # from vec to gen reg (64-bit)
Expand All @@ -327,6 +333,8 @@ def get_min_max_objective(slothy):
Ldp_W: 1,
Stp_W: 1,
AArch64CRC32: 1,
asr: 1,
asr_imm: 1,
}

# REVISIT
Expand Down Expand Up @@ -358,7 +366,7 @@ def get_min_max_objective(slothy):
AArch64NeonShiftInsert: 3,
vusra: 4,
fcsel: 3,
csel: 1,
(csel, csel_xzr_ne): 1,
AArch64ConditionalCompare: 1,
AArch64Logical: 1,
(Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4, # approx
Expand All @@ -381,7 +389,8 @@ def get_min_max_objective(slothy):
q_ld2_lane_s: 8,
vtbl: 6, # q-form: 3*N+3 cycles (N = number of registers in the table)
AESInstruction: 3,
(sub_imm, cmp, cmp_imm): 1,
(sub_imm, cmp_imm): 1,
(cmp, cmp_xzr): 2,
vuaddlv_sform: 6, # 8B/8H
fmov_s_form: 5, # from vec to gen reg
fmov_d_form: 5, # from vec to gen reg (64-bit)
Expand All @@ -396,6 +405,8 @@ def get_min_max_objective(slothy):
Ldp_W: 4,
Stp_W: 1,
AArch64CRC32: 2,
asr: 1,
asr_imm: 1,
}


Expand Down
9 changes: 9 additions & 0 deletions tests/naive/aarch64/instructions.s
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,13 @@ crc32ch w6, w6, w7
crc32cw w6, w6, w7
crc32cx w6, w6, x8
fmov x5, d7

asr x11, x12, x7
asr x11, x12, #7
csel x11, x10, xzr, eq
csel x11, x10, xzr, ne
csel x11, x10, xzr, lt
csel x11, x10, xzr, gt
cmp x3, xzr

end:
Loading