From 27663d869f7f8f54cda63e1682f0a879da9f8ca5 Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Sun, 10 May 2026 23:58:55 +0800
Subject: [PATCH 1/3] AArch64: Add support for asr (register) and asr
 (immediate) to A55, A72

This commit adds support for the asr (register) and asr (immediate)
instructions to the A55 and A72 models.
According to the A64 Base Instruction Descriptions, pages C6-1820 and
C6-1822, these two instructions are aliases of:
asr (register)  ---> ASRV
asr (immediate) ---> SBFM

- a55 SWOG ASRV(page: 19/48)
  - latency: 1
  - Inverse throughput: 2/1 = 2
  - ExecutionUnit: SCALAR (ALU0, ALU1)

- a55 SWOG SBFM (page: 21/48)
  - latency: 2
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: SCALAR (ALU0, ALU1)

- a72 SWOG ASRV (page: 9/42)
  - latency: 1
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: INT (INT0, INT1)

- a72 SWOG SBFM (page: 12/42)
  - latency: 1
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: INT (INT0, INT1)

- This commit also corrects the latency of the existing asr_wform in
  the A55 model: it should be 2 instead of 1 (taken from SBFM, since
  asr (immediate) is an alias of SBFM).

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 slothy/targets/aarch64/aarch64_neon.py        |  8 +++++++-
 slothy/targets/aarch64/cortex_a55.py          | 19 +++++++++++++++++--
 slothy/targets/aarch64/cortex_a72_frontend.py |  7 +++++++
 tests/naive/aarch64/instructions.s            |  4 ++++
 4 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 4e74f343..cd0f464a 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -2781,12 +2781,18 @@ class ror(AArch64Shift):
     outputs = ["Xd"]
 
 
-class asr(AArch64Shift):
+class asr_imm(AArch64Shift):
     pattern = "asr <Xd>, <Xa>, <imm>"
     inputs = ["Xa"]
     outputs = ["Xd"]
 
 
+class asr(AArch64Shift):
+    pattern = "asr <Xd>, <Xa>, <Xb>"
+    inputs = ["Xa", "Xb"]
+    outputs = ["Xd"]
+
+
 class AArch64Logical(AArch64Instruction):
     pass
 
diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
index 8d46b0e5..acc3eed7 100644
--- a/slothy/targets/aarch64/cortex_a55.py
+++ b/slothy/targets/aarch64/cortex_a55.py
@@ -162,6 +162,8 @@
     ngc_zero,
     subs_wform,
     asr_wform,
+    asr_imm,
+    asr,
     and_imm_wform,
     eor_wform,
     eon_wform,
@@ -445,6 +447,8 @@ def get_min_max_objective(slothy):
         ngc_zero,
         subs_wform,
         asr_wform,
+        asr_imm,
+        asr,
         and_imm_wform,
         lsr_wform,
         lsr,
@@ -545,7 +549,17 @@ def get_min_max_objective(slothy):
         adcs_zero_r_to_zero,
         cmn,
     ): 1,
-    (cmp_xzr2, cmp_imm, sub, subs_wform, asr_wform, sbcs_zero_to_zero, ngc_zero): 1,
+    (
+        cmp_xzr2,
+        cmp_imm,
+        sub,
+        subs_wform,
+        asr_wform,
+        asr_imm,
+        sbcs_zero_to_zero,
+        ngc_zero,
+    ): 1,
+    asr: 2,
     (bfi, ubfx): 1,
     VShiftImmediateRounding: 1,
     AArch64NeonShiftInsert: 1,
@@ -643,12 +657,13 @@ def get_min_max_objective(slothy):
         cmn,
         sub,
         subs_wform,
-        asr_wform,
+        asr,
         sbcs_zero_to_zero,
         cmp_xzr2,
         ngc_zero,
         cmp_imm,
     ): 1,
+    (asr_wform, asr_imm): 2,
     (bfi, ubfx): 2,
     VShiftImmediateRounding: 3,
     VShiftImmediateBasic: 2,
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
index f77bf233..219c2f41 100644
--- a/slothy/targets/aarch64/cortex_a72_frontend.py
+++ b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -127,6 +127,8 @@
     csel,
     q_ldp_with_inc,
     AArch64CRC32,
+    asr,
+    asr_imm,
 )
 
 # From the A72 SWOG, Section "4.1 Dispatch Constraints"
@@ -267,6 +269,7 @@ def get_min_max_objective(slothy):
     q_ldp_with_inc: ExecutionUnit.LOAD(),
     Stp_W: ExecutionUnit.STORE(),
     AArch64CRC32: ExecutionUnit.MINT(),
+    (asr, asr_imm): ExecutionUnit.INT(),
 }
 
 inverse_throughput = {
@@ -327,6 +330,8 @@ def get_min_max_objective(slothy):
     Ldp_W: 1,
     Stp_W: 1,
     AArch64CRC32: 1,
+    asr: 1,
+    asr_imm: 1,
 }
 
 # REVISIT
@@ -396,6 +401,8 @@ def get_min_max_objective(slothy):
     Ldp_W: 4,
     Stp_W: 1,
     AArch64CRC32: 2,
+    asr: 1,
+    asr_imm: 1,
 }
 
 
diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
index 693de5b0..9e2ad27f 100644
--- a/tests/naive/aarch64/instructions.s
+++ b/tests/naive/aarch64/instructions.s
@@ -216,4 +216,8 @@ crc32ch w6, w6, w7
 crc32cw w6, w6, w7
 crc32cx w6, w6, x8
 fmov x5, d7
+
+asr x11, x12, x7
+asr x11, x12, #7
+
 end:

From 86f8d4c8e6f3817094b3df72d6a980ff5844bcfd Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Mon, 11 May 2026 02:38:00 +0800
Subject: [PATCH 2/3] AArch64: Add support for `csel_xzr_ne` to the A55, A72
 uArch model

This commit adds support for the csel_xzr_ne instruction to the
A55, A72 uArch model.

This pattern is a variant of csel using the zero register xzr.
This commit reuses the existing csel uArch model definition for this
variant.

- a55 SWOG CSEL(page: 18/48)
  - latency: 1
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: SCALAR (ALU0, ALU1)

- a72 SWOG CSEL(page: 8/42)
  - latency: 1
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: INT (INT0, INT1)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 slothy/targets/aarch64/cortex_a55.py          | 7 ++++---
 slothy/targets/aarch64/cortex_a72_frontend.py | 7 ++++---
 tests/naive/aarch64/instructions.s            | 4 ++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
index acc3eed7..96eba0bd 100644
--- a/slothy/targets/aarch64/cortex_a55.py
+++ b/slothy/targets/aarch64/cortex_a55.py
@@ -66,6 +66,7 @@
     vmul,
     Instruction,
     csel,
+    csel_xzr_ne,
     fcsel,
     Q_Ld2_Lane_Post_Inc,
     q_ld2_lane_s,
@@ -461,7 +462,7 @@ def get_min_max_objective(slothy):
     # NOTE: AESE/AESMC and AESD/AESIMC pairs can be dual-issued on A55 but this
     # is not modeled
     AESInstruction: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
-    csel: ExecutionUnit.SCALAR(),
+    (csel, csel_xzr_ne): ExecutionUnit.SCALAR(),
     (
         crc32b,
         crc32h,
@@ -518,7 +519,7 @@ def get_min_max_objective(slothy):
     vshrn: 2,
     vtbl: 1,  # N cycles (N = number of registers in the table)
     (fcsel): 1,
-    csel: 1,
+    (csel, csel_xzr_ne): 1,
     (VecToGprMov, Mov_xtov_d, mov_wtov_s): 1,
     (
         movk_imm,
@@ -626,7 +627,7 @@ def get_min_max_objective(slothy):
     (Vins, umov_d): 2,
     (tst_wform): 1,
     (fcsel): 2,
-    csel: 1,
+    (csel, csel_xzr_ne): 1,
     (VecToGprMov, Mov_xtov_d, mov_wtov_s): 2,
     (
         movk_imm,
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
index 219c2f41..438abdb1 100644
--- a/slothy/targets/aarch64/cortex_a72_frontend.py
+++ b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -125,6 +125,7 @@
     cmp,
     cmp_imm,
     csel,
+    csel_xzr_ne,
     q_ldp_with_inc,
     AArch64CRC32,
     asr,
@@ -239,7 +240,7 @@ def get_min_max_objective(slothy):
     ],
     (AArch64NeonShiftInsert, vusra): [ExecutionUnit.ASIMD1],
     fcsel: ExecutionUnit.ASIMD(),
-    csel: ExecutionUnit.INT(),
+    (csel, csel_xzr_ne): ExecutionUnit.INT(),
     AArch64ConditionalCompare: ExecutionUnit.INT(),
     AArch64Logical: [ExecutionUnit.INT()],
     # 8B/8H occupies both F0, F1
@@ -294,7 +295,7 @@ def get_min_max_objective(slothy):
     AArch64NeonLogical: 1,
     (AArch64NeonShiftInsert, vusra): 1,
     fcsel: 1,
-    csel: 1,
+    (csel, csel_xzr_ne): 1,
     AArch64ConditionalCompare: 1,
     AArch64Logical: 1,
     Vins: 1,
@@ -363,7 +364,7 @@ def get_min_max_objective(slothy):
     AArch64NeonShiftInsert: 3,
     vusra: 4,
     fcsel: 3,
-    csel: 1,
+    (csel, csel_xzr_ne): 1,
     AArch64ConditionalCompare: 1,
     AArch64Logical: 1,
     (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4,  # approx
diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
index 9e2ad27f..707fa8ab 100644
--- a/tests/naive/aarch64/instructions.s
+++ b/tests/naive/aarch64/instructions.s
@@ -219,5 +219,9 @@ fmov x5, d7
 
 asr x11, x12, x7
 asr x11, x12, #7
+csel x11, x10, xzr, eq
+csel x11, x10, xzr, ne
+csel x11, x10, xzr, lt
+csel x11, x10, xzr, gt
 
 end:

From ee1067676cdf3df603e72175512061f41bef15fa Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Mon, 11 May 2026 02:56:13 +0800
Subject: [PATCH 3/3] AArch64: Add support for `cmp_xzr` to the A55, A72 uArch
 model

This commit adds support for the `cmp_xzr` instruction to the A55 and
A72 uArch models.
This pattern is a variant of cmp using the zero register xzr.

This commit reuses the existing cmp uArch model definition. Since
cmp (shifted register) is an alias of SUBS (see page C6-1953 of the
AArch64 Base Instruction Descriptions), the SUBS entries are used as
the reference for modelling this instruction.

- a55 SWOG SUBS(page: 18/48)
  - latency: 2
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: SCALAR (ALU0, ALU1)

- a72 SWOG SUBS (page: 8/42)
  - latency: 2
  - Inverse throughput: 1/1 = 1
  - ExecutionUnit: MINT(M)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 slothy/targets/aarch64/cortex_a55.py          | 6 ++++--
 slothy/targets/aarch64/cortex_a72_frontend.py | 9 ++++++---
 tests/naive/aarch64/instructions.s            | 1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
index 96eba0bd..f3983cc2 100644
--- a/slothy/targets/aarch64/cortex_a55.py
+++ b/slothy/targets/aarch64/cortex_a55.py
@@ -184,6 +184,7 @@
     fmov_s_form,  # from double/single to gen reg
     fmov_d_form,  # from double/single to gen reg (64-bit)
     cmp,
+    cmp_xzr,
     vdup_w,
     crc32b,
     crc32h,
@@ -442,6 +443,7 @@ def get_min_max_objective(slothy):
         sub,
         sub_imm,
         cmp,
+        cmp_xzr,
         sbcs_zero_to_zero,
         cmp_xzr2,
         mov,
@@ -491,7 +493,7 @@ def get_min_max_objective(slothy):
         umov_d,
         vuaddlv_sform,
     ): 1,
-    (sub_imm, cmp): 1,
+    (sub_imm, cmp, cmp_xzr): 1,
     (
         vmla,
         vmla_lane,
@@ -607,7 +609,7 @@ def get_min_max_objective(slothy):
     ): 4,
     (Ldr_D): 3,
     (Ldr_Q, Str_Q): 4,
-    (sub_imm, cmp): 2,
+    (sub_imm, cmp, cmp_xzr): 2,
     AArch64NeonCount: 2,
     St4: 5,
     St3: 3,
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
index 438abdb1..66495a79 100644
--- a/slothy/targets/aarch64/cortex_a72_frontend.py
+++ b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -123,6 +123,7 @@
     q_ld2_lane_s,
     Ldp_W,
     cmp,
+    cmp_xzr,
     cmp_imm,
     csel,
     csel_xzr_ne,
@@ -265,7 +266,8 @@ def get_min_max_objective(slothy):
     lsr_imm: ExecutionUnit.INT(),
     lsr: ExecutionUnit.INT(),
     movk_imm_lsl: ExecutionUnit.INT(),
-    (sub_imm, cmp, cmp_imm): ExecutionUnit.INT(),
+    (sub_imm, cmp_imm): ExecutionUnit.INT(),
+    (cmp, cmp_xzr): ExecutionUnit.MINT(),
     Ldp_W: ExecutionUnit.LOAD(),
     q_ldp_with_inc: ExecutionUnit.LOAD(),
     Stp_W: ExecutionUnit.STORE(),
@@ -316,7 +318,7 @@ def get_min_max_objective(slothy):
     q_ld2_lane_s: 1,
     vtbl: 1,  # SWOG contains a blank throughput (approximating from AArch32)
     AESInstruction: 1,
-    (sub_imm, cmp, cmp_imm): 1,
+    (sub_imm, cmp, cmp_xzr, cmp_imm): 1,
     vuaddlv_sform: 1,
     fmov_s_form: 1,  # from vec to gen reg
     fmov_d_form: 1,  # from vec to gen reg (64-bit)
@@ -387,7 +389,8 @@ def get_min_max_objective(slothy):
     q_ld2_lane_s: 8,
     vtbl: 6,  # q-form: 3*N+3 cycles (N = number of registers in the table)
     AESInstruction: 3,
-    (sub_imm, cmp, cmp_imm): 1,
+    (sub_imm, cmp_imm): 1,
+    (cmp, cmp_xzr): 2,
     vuaddlv_sform: 6,  # 8B/8H
     fmov_s_form: 5,  # from vec to gen reg
     fmov_d_form: 5,  # from vec to gen reg (64-bit)
diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
index 707fa8ab..bdbd1db7 100644
--- a/tests/naive/aarch64/instructions.s
+++ b/tests/naive/aarch64/instructions.s
@@ -223,5 +223,6 @@ csel x11, x10, xzr, eq
 csel x11, x10, xzr, ne
 csel x11, x10, xzr, lt
 csel x11, x10, xzr, gt
+cmp x3, xzr
 
 end: