AArch64: Add support for cmp_xzr to the A55, A72 uArch model

willieyz · willieyz · commit 6a6aac35cd00 · 2026-05-12T00:41:07.000+08:00
This commit adds support for the `cmp_xzr` instruction to the A55 and A72
uArch models.
This pattern is a variant of cmp using the zero register xzr.

This commit reuses the existing cmp uArch model definition for cmp_xzr.
cmp is an alias of SUBS (see page C6-1953 of the AArch64 Base Instruction
Descriptions), so we reference the SUBS entries in the SWOGs to model this
instruction.

- a55 SWOG SUBS(page: 18/48)
  - latency: 2
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: SCALAR (ALU0, ALU1)

- a72 SWOG SUBS(page: 8/48)
  - latency: 1
  - Inverse throughput: 2/2 = 1
  - ExecutionUnit: INT (INT0, INT1)

Signed-off-by: willieyz &lt;willie.zhao@chelpis.com&gt;
diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
@@ -184,6 +184,7 @@
     fmov_s_form,  # from double/single to gen reg
     fmov_d_form,  # from double/single to gen reg (64-bit)
     cmp,
+    cmp_xzr,
     vdup_w,
     crc32b,
     crc32h,
@@ -442,6 +443,7 @@ def get_min_max_objective(slothy):
         sub,
         sub_imm,
         cmp,
+        cmp_xzr,
         sbcs_zero_to_zero,
         cmp_xzr2,
         mov,
@@ -491,7 +493,7 @@ def get_min_max_objective(slothy):
         umov_d,
         vuaddlv_sform,
     ): 1,
-    (sub_imm, cmp): 1,
+    (sub_imm, cmp, cmp_xzr): 1,
     (
         vmla,
         vmla_lane,
@@ -607,7 +609,7 @@ def get_min_max_objective(slothy):
     ): 4,
     (Ldr_D): 3,
     (Ldr_Q, Str_Q): 4,
-    (sub_imm, cmp): 2,
+    (sub_imm, cmp, cmp_xzr): 2,
     AArch64NeonCount: 2,
     St4: 5,
     St3: 3,
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -123,6 +123,7 @@
     q_ld2_lane_s,
     Ldp_W,
     cmp,
+    cmp_xzr,
     cmp_imm,
     csel,
     csel_xzr_ne,
@@ -265,7 +266,7 @@ def get_min_max_objective(slothy):
     lsr_imm: ExecutionUnit.INT(),
     lsr: ExecutionUnit.INT(),
     movk_imm_lsl: ExecutionUnit.INT(),
-    (sub_imm, cmp, cmp_imm): ExecutionUnit.INT(),
+    (sub_imm, cmp, cmp_xzr, cmp_imm): ExecutionUnit.INT(),
     Ldp_W: ExecutionUnit.LOAD(),
     q_ldp_with_inc: ExecutionUnit.LOAD(),
     Stp_W: ExecutionUnit.STORE(),
@@ -316,7 +317,7 @@ def get_min_max_objective(slothy):
     q_ld2_lane_s: 1,
     vtbl: 1,  # SWOG contains a blank throughput (approximating from AArch32)
     AESInstruction: 1,
-    (sub_imm, cmp, cmp_imm): 1,
+    (sub_imm, cmp, cmp_xzr, cmp_imm): 1,
     vuaddlv_sform: 1,
     fmov_s_form: 1,  # from vec to gen reg
     fmov_d_form: 1,  # from vec to gen reg (64-bit)
@@ -387,7 +388,7 @@ def get_min_max_objective(slothy):
     q_ld2_lane_s: 8,
     vtbl: 6,  # q-form: 3*N+3 cycles (N = number of registers in the table)
     AESInstruction: 3,
-    (sub_imm, cmp, cmp_imm): 1,
+    (sub_imm, cmp, cmp_xzr, cmp_imm): 1,
     vuaddlv_sform: 6,  # 8B/8H
     fmov_s_form: 5,  # from vec to gen reg
     fmov_d_form: 5,  # from vec to gen reg (64-bit)
diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
@@ -223,5 +223,6 @@ csel x11, x10, xzr, eq
 csel x11, x10, xzr, ne
 csel x11, x10, xzr, lt
 csel x11, x10, xzr, gt
+cmp x3, xzr
 
 end: