diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index a30980dc..be778ed8 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -542,6 +542,8 @@ def start(
         # Identify the register that is used as a loop counter
         body_code = [line for line in body_code if line.text != ""]
         loop_cnt_reg = None
+        loop_end_reg = None
+        subs_imm = None
         for idx, line in enumerate(body_code):
             inst = Instruction.parser(line)
             # Flags are set through cmp
@@ -557,17 +559,42 @@ def start(
                     "as end register."
                 )
                 break
+        if loop_cnt_reg is None:
+            # Try to find a subs instruction in the body (countdown loop)
+            for idx, line in enumerate(body_code):
+                inst = Instruction.parser(line)
+                if (
+                    isinstance(inst[0], AArch64BasicArithmetic)
+                    and inst[0].mnemonic.startswith("subs")
+                    and inst[0].immediate is not None
+                    and len(inst[0].args_out) > 0
+                ):
+                    loop_cnt_reg = inst[0].args_out[0]
+                    subs_imm = inst[0].immediate
+                    body_code[idx].add_tag("id", "cmp")
+                    logging.debug(
+                        f"Assuming {loop_cnt_reg} as countdown counter register "
+                        f"(subs immediate={subs_imm})."
+                    )
+                    break
         if loop_cnt_reg is None:
             raise FatalParsingException("No flag-setting instruction found!")
 
         if unroll > 1:
             assert unroll in [1, 2, 4, 8, 16, 32]
-            yield f"{indent}lsr {loop_end_reg}, {loop_end_reg}, #{int(math.log2(unroll))}"
+            if loop_end_reg is not None:
+                yield (
+                    f"{indent}lsr {loop_end_reg}, {loop_end_reg}, "
+                    f"#{int(math.log2(unroll))}"
+                )
 
         inc_per_iter = 0
         for idx, line in enumerate(body_code):
             inst = Instruction.parser(line)
             modifies_counter = False
+            # Skip the subs instruction itself - it is already tagged as the flag-setter
+            if subs_imm is not None and body_code[idx].tags.get("id") == "cmp":
+                continue
             # Increment happens through pointer modification
             if loop_cnt_reg.lower() == inst[0].addr and inst[0].increment is not None:
                 inc_per_iter = inc_per_iter + simplify(inst[0].increment)
@@ -587,7 +614,12 @@ def start(
         )
 
         if fixup != 0:
-            yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
+            if loop_end_reg is not None:
+                yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
+            else:
+                # subs countdown loop: subtract fixup * subs_imm from the counter
+                fixup_val = simplify(f"{fixup} * ({subs_imm})")
+                yield f"{indent}sub {loop_cnt_reg}, {loop_cnt_reg}, #{fixup_val}"
 
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
@@ -748,7 +780,15 @@ def is_q_form_vector_instruction(self):
             return self._is_instance_of([Str_Q, Ldr_Q])
 
         # Operations on specific lanes are not counted as Q-form instructions
-        if self._is_instance_of([Q_Ld2_Lane_Post_Inc, st2_lane, st2_lane_post_inc]):
+        if self._is_instance_of(
+            [
+                Q_Ld2_Lane_Post_Inc,
+                st2_lane,
+                st2_lane_post_inc,
+                q_ld1_lane_with_reg_postinc,
+                q_st1_lane_with_reg_postinc,
+            ]
+        ):
             return False
 
         dt = self.datatype
@@ -763,12 +803,39 @@ def is_q_form_vector_instruction(self):
 
     def is_vector_load(self):
         """Indicates if an instruction is a Neon load instruction"""
-        return self._is_instance_of([Ldr_Q, Ldp_Q, Ld2, Ld3, Ld4, Q_Ld2_Lane_Post_Inc])
+        return self._is_instance_of(
+            [
+                Ldr_Q,
+                Ldp_Q,
+                Ld2,
+                Ld3,
+                Ld4,
+                Q_Ld2_Lane_Post_Inc,
+                q_ld1_lane_with_reg_postinc,
+                q_ld1_1_with_reg_postinc,
+                q_ld1_2,
+                q_ld1_2_with_postinc,
+                q_ld1_2_with_reg_postinc,
+                q_ld1_4,
+                q_ld1_4_with_postinc,
+            ]
+        )
 
     def is_vector_store(self):
         """Indicates if an instruction is a Neon store instruction"""
         return self._is_instance_of(
-            [Str_Q, Stp_Q, St2, St3, St4, d_stp_stack_with_inc, d_str_stack_with_inc]
+            [
+                Str_Q,
+                Stp_Q,
+                St2,
+                St3,
+                St4,
+                d_stp_stack_with_inc,
+                d_str_stack_with_inc,
+                q_st1_lane_with_reg_postinc,
+                q_st1_1_with_reg_postinc,
+                q_st1_4_with_postinc,
+            ]
         )
 
     # scalar
@@ -932,8 +999,6 @@ def _enforce_datatype_matching(pattern, res):
                 raise FatalParsingException(
                     f"Inconsistent data type: {datatypes[dt]} vs {val}"
                 )
-            elif dt not in datatypes and val in datatypes.values():
-                raise FatalParsingException(f"Inconsistent dt: {dt}")
             datatypes[dt] = val
 
     @staticmethod
@@ -1474,6 +1539,43 @@ def make(cls, src):
         return obj
 
 
+class q_ld1_2(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>, <Vb>.<dt>}, [<Xc>]"
+    inputs = ["Xc"]
+    outputs = ["Va", "Vb"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in[0]
+        obj.args_out_combinations = [
+            ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)])
+        ]
+        return obj
+
+
+class q_ld1_4(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>, <Vb>.<dt>, <Vc>.<dt>, <Vd>.<dt>}, [<Xe>]"
+    inputs = ["Xe"]
+    outputs = ["Va", "Vb", "Vc", "Vd"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in[0]
+        obj.args_out_combinations = [
+            (
+                [0, 1, 2, 3],
+                [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)],
+            )
+        ]
+        return obj
+
+
 class prefetch(Ldr_Q):
     pattern = "prfm pld1lkeep, [<Xc>, <imm>]"
     inputs = ["Xc"]
@@ -1767,6 +1869,91 @@ def make(cls, src):
         return obj
 
 
+class q_ld1_2_with_postinc(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>, <Vb>.<dt>}, [<Xc>], <imm>"
+    in_outs = ["Xc"]
+    outputs = ["Va", "Vb"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+
+        obj.args_out_combinations = [
+            ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)])
+        ]
+        return obj
+
+
+class q_ld1_4_with_postinc(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>, <Vb>.<dt>, <Vc>.<dt>, <Vd>.<dt>}, [<Xe>], <imm>"
+    in_outs = ["Xe"]
+    outputs = ["Va", "Vb", "Vc", "Vd"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        obj.args_out_combinations = [
+            (
+                [0, 1, 2, 3],
+                [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)],
+            )
+        ]
+        return obj
+
+
+class q_ld1_2_with_reg_postinc(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>, <Vb>.<dt>}, [<Xc>], <Xd>"
+    inputs = ["Xd"]
+    in_outs = ["Xc"]
+    outputs = ["Va", "Vb"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None  # register-determined at runtime
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        obj.args_out_combinations = [
+            ([0, 1], [[f"v{i}", f"v{i+1}"] for i in range(0, 31)])
+        ]
+        return obj
+
+
+class q_ld1_1_with_reg_postinc(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>}, [<Xb>], <Xc>"
+    inputs = ["Xc"]
+    in_outs = ["Xb"]
+    outputs = ["Va"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        return obj
+
+
+class q_ld1_lane_with_reg_postinc(AArch64Instruction):
+    pattern = "ld1 {<Va>.<dt>}[<index>], [<Xb>], <Xc>"
+    inputs = ["Xc"]
+    in_outs = ["Xb", "Va"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        return obj
+
+
 class q_ldp_with_postinc(Ldp_Q):
     pattern = "ldp <Qa>, <Qb>, [<Xc>], <imm>"
     in_outs = ["Xc"]
@@ -1964,6 +2151,34 @@ def make(cls, src):
         return obj
 
 
+class q_st1_1_with_reg_postinc(AArch64Instruction):
+    pattern = "st1 {<Va>.<dt>}, [<Xb>], <Xc>"
+    inputs = ["Va", "Xc"]
+    in_outs = ["Xb"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        return obj
+
+
+class q_st1_lane_with_reg_postinc(AArch64Instruction):
+    pattern = "st1 {<Va>.<dt>}[<index>], [<Xb>], <Xc>"
+    inputs = ["Va", "Xc"]
+    in_outs = ["Xb"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        return obj
+
+
 class q_stp_with_postinc(Stp_Q):
     pattern = "stp <Qa>, <Qb>, [<Xc>], <imm>"
     inputs = ["Qa", "Qb"]
@@ -1979,7 +2194,7 @@ def make(cls, src):
 
 
 class q_st1_2_with_postinc(Stp_Q):
-    pattern = "st1 {<Va>.<dt0>, <Vb>.<dt1>}, [<Xc>], <imm>"
+    pattern = "st1 {<Va>.<dt>, <Vb>.<dt>}, [<Xc>], <imm>"
     inputs = ["Va", "Vb"]
     in_outs = ["Xc"]
 
@@ -1996,6 +2211,26 @@ def make(cls, src):
         return obj
 
 
+class q_st1_4_with_postinc(AArch64Instruction):
+    pattern = "st1 {<Va>.<dt>, <Vb>.<dt>, <Vc>.<dt>, <Vd>.<dt>}, [<Xe>], <imm>"
+    inputs = ["Va", "Vb", "Vc", "Vd"]
+    in_outs = ["Xe"]
+
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]
+        obj.args_in_combinations = [
+            (
+                [0, 1, 2, 3],
+                [[f"v{i}", f"v{i+1}", f"v{i+2}", f"v{i+3}"] for i in range(0, 29)],
+            )
+        ]
+        return obj
+
+
 class Ldr_X(AArch64Instruction):
     pass
 
@@ -2728,6 +2963,12 @@ class sub(AArch64BasicArithmetic):
     outputs = ["Xd"]
 
 
+class sub_wform(AArch64BasicArithmetic):
+    pattern = "sub <Wd>, <Wa>, <Wb>"
+    inputs = ["Wa", "Wb"]
+    outputs = ["Wd"]
+
+
 class AArch64ShiftedArithmetic(AArch64Instruction):
     pass
 
@@ -2750,6 +2991,12 @@ class add_shifted(AArch64ShiftedArithmetic):
     outputs = ["Xd"]
 
 
+class sub_shifted(AArch64ShiftedArithmetic):
+    pattern = "sub <Xd>, <Xa>, <Xb>, <barrel> <imm>"
+    inputs = ["Xa", "Xb"]
+    outputs = ["Xd"]
+
+
 class adds_shifted(AArch64ShiftedArithmetic):
     pattern = "adds <Xd>, <Xa>, <Xb>, <barrel> <imm>"
     inputs = ["Xa", "Xb"]
@@ -2779,6 +3026,12 @@ class lsl(AArch64Shift):
     outputs = ["Xd"]
 
 
+class lsl_wform(AArch64Shift):
+    pattern = "lsl <Wd>, <Wa>, <imm>"
+    inputs = ["Wa"]
+    outputs = ["Wd"]
+
+
 class ror(AArch64Shift):
     pattern = "ror <Xd>, <Xa>, <imm>"
     inputs = ["Xa"]
@@ -2904,6 +3157,12 @@ class sxtb(AArch64Logical):
     outputs = ["Xd"]
 
 
+class sxtw(AArch64Logical):
+    pattern = "sxtw <Xd>, <Wa>"
+    inputs = ["Wa"]
+    outputs = ["Xd"]
+
+
 class uxtb(AArch64Logical):
     pattern = "uxtb <Wd>, <Wa>"
     inputs = ["Wa"]
@@ -3279,6 +3538,73 @@ class uaddlp(AArch64Instruction):
     outputs = ["Vd"]
 
 
+class uaddl(AArch64Instruction):
+    pattern = "uaddl <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class uaddl2(AArch64Instruction):
+    pattern = "uaddl2 <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class uaddw(AArch64Instruction):
+    pattern = "uaddw <Vd>.<dt0>, <Va>.<dt0>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class uaddw2(AArch64Instruction):
+    pattern = "uaddw2 <Vd>.<dt0>, <Va>.<dt0>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class saddl(AArch64Instruction):
+    pattern = "saddl <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class saddl2(AArch64Instruction):
+    pattern = "saddl2 <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt1>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class rshrn(AArch64Instruction):
+    pattern = "rshrn <Vd>.<dt0>, <Va>.<dt1>, <imm>"
+    inputs = ["Va"]
+    outputs = ["Vd"]
+
+
+class rshrn2(AArch64Instruction):
+    # rshrn2 writes the upper half of Vd, lower half is retained
+    pattern = "rshrn2 <Vd>.<dt0>, <Va>.<dt1>, <imm>"
+    inputs = ["Va"]
+    in_outs = ["Vd"]
+
+
+class sqxtun(AArch64Instruction):
+    pattern = "sqxtun <Vd>.<dt0>, <Va>.<dt1>"
+    inputs = ["Va"]
+    outputs = ["Vd"]
+
+
+class sqrshrun(AArch64Instruction):
+    pattern = "sqrshrun <Vd>.<dt0>, <Va>.<dt1>, <imm>"
+    inputs = ["Va"]
+    outputs = ["Vd"]
+
+
+class urhadd(AArch64Instruction):
+    pattern = "urhadd <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
 class Vzip(AArch64Instruction):
     pass
 
@@ -3366,6 +3692,12 @@ class vins_d(Vins):
     in_outs = ["Vd"]
 
 
+class vins_d_from_v(AArch64Instruction):
+    pattern = "ins <Vd>.d[<index0>], <Va>.d[<index1>]"
+    inputs = ["Va"]
+    in_outs = ["Vd"]
+
+
 class vins_d_force_output(Vins):
     pattern = "ins <Vd>.d[<index>], <Xa>"
     inputs = ["Xa"]
@@ -3433,6 +3765,12 @@ class mov_vtov_d(AArch64Instruction):
     in_outs = ["Vd"]
 
 
+class mov_vtov_s(AArch64Instruction):
+    pattern = "mov <Vd>.s[<index0>], <Va>.s[<index1>]"
+    inputs = ["Va"]
+    in_outs = ["Vd"]
+
+
 class SHA3Instruction(
     AArch64Instruction
 ):  # pylint: disable=missing-docstring,invalid-name
@@ -3607,12 +3945,72 @@ def make(cls, src):
         return obj
 
 
+class fmla(AArch64Instruction):
+    pattern = "fmla <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    in_outs = ["Vd"]
+
+
+class faddp_vec(AArch64Instruction):
+    pattern = "faddp <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class faddp_scalar(AArch64Instruction):
+    pattern = "faddp <Sd>, <Va>.<dt>"
+    inputs = ["Va"]
+    outputs = ["Sd"]
+
+
+class fadd_vec(AArch64Instruction):
+    pattern = "fadd <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class fsub_vec(AArch64Instruction):
+    pattern = "fsub <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class fmul_vec(AArch64Instruction):
+    pattern = "fmul <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
+class fmla_lane(AArch64Instruction):
+    pattern = "fmla <Vd>.<dt0>, <Va>.<dt0>, <Vb>.<dt1>[<index>]"
+    inputs = ["Va", "Vb"]
+    in_outs = ["Vd"]
+
+
+class fmls_vec(AArch64Instruction):
+    pattern = "fmls <Vd>.<dt>, <Va>.<dt>, <Vb>.<dt>"
+    inputs = ["Va", "Vb"]
+    in_outs = ["Vd"]
+
+
+class fmul_lane(AArch64Instruction):
+    pattern = "fmul <Vd>.<dt0>, <Va>.<dt0>, <Vb>.<dt1>[<index>]"
+    inputs = ["Va", "Vb"]
+    outputs = ["Vd"]
+
+
 class vdup(AArch64Instruction):
     pattern = "dup <Vd>.<dt>, <Xa>"
     inputs = ["Xa"]
     outputs = ["Vd"]
 
 
+class vdup_lane(AArch64Instruction):
+    pattern = "dup <Vd>.<dt0>, <Va>.<dt1>[<index>]"
+    inputs = ["Va"]
+    outputs = ["Vd"]
+
+
 class vdup_w(AArch64Instruction):
     pattern = "dup <Vd>.<dt>, <Wa>"
     inputs = ["Wa"]
diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
index 26fe2061..d01145d9 100644
--- a/slothy/targets/aarch64/cortex_a55.py
+++ b/slothy/targets/aarch64/cortex_a55.py
@@ -58,7 +58,9 @@
     Ldr_Q,
     Ldr_D,
     Str_Q,
+    Stp_Q,
     vmov,
+    vmovi,
     vadd,
     vxtn,
     vshrn,
@@ -73,6 +75,12 @@
     vmla_lane,
     vmls,
     vmls_lane,
+    fmla,
+    faddp_vec,
+    faddp_scalar,
+    fadd_vec,
+    fsub_vec,
+    fmul_vec,
     vmul_lane,
     vqrdmulh,
     vqrdmulh_lane,
@@ -161,12 +169,15 @@
     mov,
     ngc_zero,
     subs_wform,
+    subs_imm,
     asr_wform,
     and_imm_wform,
     eor_wform,
     eon_wform,
     lsr_wform,
     lsr,
+    lsl_wform,
+    sub_wform,
     ASimdCompare,
     and_twoarg,
     VShiftImmediateBasic,
@@ -181,6 +192,36 @@
     fmov_s_form,  # from double/single to gen reg
     cmp,
     vdup_w,
+    vdup_lane,
+    fmla_lane,
+    fmls_vec,
+    fmul_lane,
+    mov_vtov_s,
+    Vrev,
+    uaddl,
+    uaddl2,
+    uaddw,
+    uaddw2,
+    saddl,
+    saddl2,
+    rshrn,
+    rshrn2,
+    sqxtun,
+    sqrshrun,
+    urhadd,
+    sub_shifted,
+    sxtw,
+    q_ld1_2,
+    q_ld1_2_with_postinc,
+    q_ld1_2_with_reg_postinc,
+    q_ld1_4,
+    q_ld1_4_with_postinc,
+    q_ld1_1_with_reg_postinc,
+    q_ld1_lane_with_reg_postinc,
+    q_st1_1_with_reg_postinc,
+    q_st1_lane_with_reg_postinc,
+    q_st1_4_with_postinc,
+    vins_d_from_v,
 )
 
 issue_rate = 2
@@ -266,6 +307,7 @@ def get_min_max_objective(slothy):
         vqdmulh_vector,
         Ldr_Q,
         Str_Q,
+        Stp_Q,
         q_ldr1_stack,
         q_ldr1_post_inc,
         Q_Ld2_Lane_Post_Inc,
@@ -279,6 +321,25 @@ def get_min_max_objective(slothy):
         VShiftImmediateRounding,
         AArch64NeonLogical,
         vuaddlv_sform,
+        uaddl,
+        uaddl2,
+        uaddw,
+        uaddw2,
+        saddl,
+        saddl2,
+        rshrn2,
+        q_ld1_2,
+        q_ld1_2_with_postinc,
+        q_ld1_2_with_reg_postinc,
+        q_ld1_4,
+        q_ld1_4_with_postinc,
+        # TODO: revisit -- lane load might also need SCALAR_LOAD; placed here
+        # following the Ldr_Q convention which also omits a load unit.
+        q_ld1_1_with_reg_postinc,
+        q_ld1_lane_with_reg_postinc,
+        q_st1_lane_with_reg_postinc,
+        q_st1_4_with_postinc,
+        vmovi,
     ): [
         [ExecutionUnit.VEC0, ExecutionUnit.VEC1]
     ],  # these instructions use both VEC0 and VEC1
@@ -328,6 +389,7 @@ def get_min_max_objective(slothy):
         mov_d01,
         mov_b00,
         mov_vtov_d,
+        mov_vtov_s,
         fcsel,
         VecToGprMov,
         Mov_xtov_d,
@@ -338,10 +401,17 @@ def get_min_max_objective(slothy):
         d_ldr_stack_with_inc,
         fmov_s_form,  # from double/single to gen reg
         vdup_w,
+        is_dform_form_of(sqrshrun),
+        is_dform_form_of(rshrn),
+        is_dform_form_of(sqxtun),
+        is_dform_form_of(urhadd),
+        vins_d_from_v,
+        q_st1_1_with_reg_postinc,
     ): [
         ExecutionUnit.VEC0,
         ExecutionUnit.VEC1,
     ],  # these instructions use VEC0 or VEC1
+    is_qform_form_of(urhadd): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_qform_form_of(vmov): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_dform_form_of(vmov): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     is_qform_form_of(trn1): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
@@ -358,6 +428,10 @@ def get_min_max_objective(slothy):
     is_dform_form_of(vzip2): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     is_qform_form_of(vext): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_dform_form_of(vext): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(vdup_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(vdup_lane): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(Vrev): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(Vrev): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     is_qform_form_of(vuzp1): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_dform_form_of(vuzp1): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     is_qform_form_of(vuzp2): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
@@ -378,6 +452,23 @@ def get_min_max_objective(slothy):
     is_dform_form_of(vmla): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     is_qform_form_of(vmls): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_dform_form_of(vmls): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fmla): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fmla): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fmla_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fmla_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_qform_form_of(faddp_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(faddp_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    faddp_scalar: [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fadd_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fadd_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fsub_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fsub_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fmul_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fmul_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fmls_vec): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fmls_vec): [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
+    is_qform_form_of(fmul_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+    is_dform_form_of(fmul_lane): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     vshl_d: [ExecutionUnit.VEC0, ExecutionUnit.VEC1],
     vuxtl: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
     is_qform_form_of(AArch64NeonShiftInsert): [
@@ -427,17 +518,22 @@ def get_min_max_objective(slothy):
         movz_imm,
         movz_imm_lsl,
         sub,
+        sub_shifted,
         sub_imm,
+        sxtw,
         cmp,
         sbcs_zero_to_zero,
         cmp_xzr2,
         mov,
         ngc_zero,
         subs_wform,
+        subs_imm,
         asr_wform,
         and_imm_wform,
         lsr_wform,
         lsr,
+        lsl_wform,
+        sub_wform,
         eor_wform,
         eon_wform,
         sxtb,
@@ -455,6 +551,7 @@ def get_min_max_objective(slothy):
         vadd,
         vsub,
         vmov,
+        vmovi,
         vmul,
         vmul_lane,
         vqrdmulh,
@@ -472,13 +569,23 @@ def get_min_max_objective(slothy):
         vmla_lane,
         vmls,
         vmls_lane,
+        fmla,
+        fmla_lane,
+        fmls_vec,
+        fmul_lane,
+        faddp_vec,
+        faddp_scalar,
+        fadd_vec,
+        fsub_vec,
+        fmul_vec,
     ): 1,
     (vshl, vshl_d, vsshr, vushr, vuxtl): 1,
     (trn2, trn1, ASimdCompare): 1,
     (Ldr_D): 1,
-    (Ldr_Q): 2,
+    (Ldr_Q, q_ld1_1_with_reg_postinc): 2,
     (AArch64NeonCount): 1,
-    (Str_Q): 1,
+    (Str_Q, Stp_Q, q_st1_1_with_reg_postinc): 1,
+    q_st1_4_with_postinc: 4,
     (tst_wform): 1,
     (nop, Vins, Ldr_X, Str_X): 1,
     Ldp_X: 2,
@@ -511,7 +620,7 @@ def get_min_max_objective(slothy):
     (ldr_const): 1,
     (ldr_sxtw_wform): 3,
     (lsr_imm, ror): 1,
-    (lsr, lsr_wform): 2,
+    (lsr, lsr_wform, lsl_wform): 2,
     (umull_wform, mul_wform, umaddl_wform): 1,
     (and_twoarg, and_imm, and_imm_wform): 1,
     (
@@ -525,17 +634,42 @@ def get_min_max_objective(slothy):
         adcs_zero_r_to_zero,
         cmn,
     ): 1,
-    (cmp_xzr2, cmp_imm, sub, subs_wform, asr_wform, sbcs_zero_to_zero, ngc_zero): 1,
+    (
+        cmp_xzr2,
+        cmp_imm,
+        sub,
+        sub_wform,
+        sub_shifted,
+        subs_wform,
+        subs_imm,
+        asr_wform,
+        sbcs_zero_to_zero,
+        ngc_zero,
+    ): 1,
     (bfi, ubfx): 1,
     VShiftImmediateRounding: 1,
     AArch64NeonShiftInsert: 1,
     (vusra): 1,
+    (uaddl, uaddl2): 1,
+    (uaddw, uaddw2): 1,
+    (saddl, saddl2): 1,
+    (rshrn, rshrn2, sqxtun): 1,
+    (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 4,
+    (q_ld1_4, q_ld1_4_with_postinc): 8,
+    q_ld1_lane_with_reg_postinc: 1,
+    q_st1_lane_with_reg_postinc: 1,
+    vins_d_from_v: 1,
+    sxtw: 1,
+    sqrshrun: 1,
+    urhadd: 1,
     AArch64NeonLogical: 1,
     vext: 1,
+    Vrev: 1,
+    vdup_lane: 1,
     (vuzp1, vuzp2): 1,
     (q_ldr1_stack, Q_Ld2_Lane_Post_Inc, q_ldr1_post_inc, q_ld2_lane_s): 1,
     (b_ldr_stack_with_inc, d_ldr_stack_with_inc): 1,
-    (mov_d01, mov_b00, mov_vtov_d): 1,
+    (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 1,
     (vzip1, vzip2): 1,
     (eor_wform, eon_wform): 1,
     (eon, eor, bic, bic_reg, eor_shifted, bic_shifted): 1,
@@ -549,6 +684,7 @@ def get_min_max_objective(slothy):
 default_latencies = {
     vdup_w: 3,
     vmov: 2,
+    vmovi: 1,
     is_qform_form_of([vadd, vsub]): 3,
     is_dform_form_of([vadd, vsub]): 2,
     (trn1, trn2, ASimdCompare): 2,
@@ -567,9 +703,27 @@ def get_min_max_objective(slothy):
         vmla_lane,
         vmls,
         vmls_lane,
+        fmla,
+        fmla_lane,
+        fmls_vec,
+        fmul_lane,
+        fadd_vec,
+        fsub_vec,
+        fmul_vec,
     ): 4,
     (Ldr_D): 3,
-    (Ldr_Q, Str_Q): 4,
+    (
+        faddp_vec,
+        faddp_scalar,
+    ): 4,
+    (
+        Ldr_Q,
+        Str_Q,
+        Stp_Q,
+        q_ld1_1_with_reg_postinc,
+        q_st1_1_with_reg_postinc,
+        q_st1_4_with_postinc,
+    ): 4,
     (sub_imm, cmp): 2,
     AArch64NeonCount: 2,
     St4: 5,
@@ -582,12 +738,11 @@ def get_min_max_objective(slothy):
     vxtn: 2,
     vshrn: 2,
     vtbl: 2,  # 2+N-1 cycles (N = number of registers in the table)
-    (vuxtl): 2,
     (Str_X, Ldr_X): 4,
     Ldp_X: 4,
     Ldp_W: 3,
     q_ldp_with_inc: 6,
-    (Vins, umov_d): 2,
+    (Vins, umov_d, vins_d_from_v): 2,
     (tst_wform): 1,
     (fcsel): 2,
     csel: 1,
@@ -606,12 +761,12 @@ def get_min_max_objective(slothy):
     (Stp_X, Stp_W, w_stp_with_imm_sp): 1,
     (ldr_const): 3,
     (ldr_sxtw_wform): 5,
-    (lsr, lsr_wform): 1,
+    (lsr, lsr_wform, lsl_wform, sub_wform): 1,
     lsr_imm: 2,
     (umull_wform, mul_wform, umaddl_wform): 3,
     (vuaddlv_sform): 3,
     (and_imm, and_imm_wform): 1,
-    (add2, add_shifted, add_sp_imm): 2,
+    (add2, add_shifted, sub_shifted, add_sp_imm): 2,
     (
         add,
         add_imm,
@@ -620,7 +775,9 @@ def get_min_max_objective(slothy):
         adcs_zero2,
         cmn,
         sub,
+        sub_wform,
         subs_wform,
+        subs_imm,
         asr_wform,
         sbcs_zero_to_zero,
         cmp_xzr2,
@@ -632,13 +789,28 @@ def get_min_max_objective(slothy):
     VShiftImmediateBasic: 2,
     AArch64NeonShiftInsert: 2,
     (vusra): 3,
+    (uaddl, uaddl2): 3,
+    (uaddw, uaddw2): 3,
+    # LD1 multi-reg Q-form latencies (SWOG section 4.18)
+    (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 6,
+    (q_ld1_4, q_ld1_4_with_postinc): 10,
+    q_ld1_lane_with_reg_postinc: 3,
+    q_st1_lane_with_reg_postinc: 1,
+    sxtw: 2,
+    (saddl, saddl2): 3,
+    (rshrn, rshrn2): 3,
+    sqxtun: 4,
+    sqrshrun: 4,
+    urhadd: 2,
     AArch64NeonLogical: 1,
     vext: 2,
+    Vrev: 2,
+    vdup_lane: 2,
     (vuzp1, vuzp2): 2,
     (q_ldr1_stack, Q_Ld2_Lane_Post_Inc, q_ldr1_post_inc): 3,
     q_ld2_lane_s: 3,
     (b_ldr_stack_with_inc, d_ldr_stack_with_inc): 3,
-    (mov_d01, mov_b00, mov_vtov_d): 2,
+    (mov_d01, mov_b00, mov_vtov_d, mov_vtov_s): 2,
     (vzip1, vzip2): 2,
     (eor_wform, eon_wform): 1,
     # According to SWOG, this is 2 cycles, byt if the output is used as a
@@ -694,6 +867,28 @@ def get_latency(src, out_idx, dst):
             lambda t_src, t_dst: t_dst.program_start_var == t_src.program_start_var + 1,
         )
 
+    # Fast mul->mla forwarding (accumulate_latency=1)
+    if (
+        instclass_src in [vmul, vmul_lane]
+        and instclass_dst in [vmla, vmla_lane, vmls, vmls_lane]
+        and src.args_out[0] == dst.args_in_out[0]
+    ):
+        return 1
+    # Fast mla->mla forwarding (accumulate_latency=1)
+    if (
+        instclass_src in [vmla, vmla_lane, vmls, vmls_lane]
+        and instclass_dst in [vmla, vmla_lane, vmls, vmls_lane]
+        and src.args_in_out[0] == dst.args_in_out[0]
+    ):
+        return 1
+    # Fast mull->mlal forwarding (accumulate_latency=1)
+    if (
+        isinstance(src, Vmull)
+        and isinstance(dst, Vmlal)
+        and src.args_out[0] == dst.args_in_out[0]
+    ):
+        return 1
+
     return latency
 
 
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
index 54fd2be4..407f8a6d 100644
--- a/slothy/targets/aarch64/cortex_a72_frontend.py
+++ b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -125,6 +125,45 @@
     cmp_imm,
     csel,
     q_ldp_with_inc,
+    uaddl,
+    uaddl2,
+    uaddw,
+    uaddw2,
+    saddl,
+    saddl2,
+    urhadd,
+    rshrn,
+    rshrn2,
+    sqxtun,
+    sqrshrun,
+    q_ld1_2,
+    q_ld1_4,
+    q_ld1_2_with_postinc,
+    q_ld1_4_with_postinc,
+    q_ld1_2_with_reg_postinc,
+    q_ld1_1_with_reg_postinc,
+    q_ld1_lane_with_reg_postinc,
+    q_st1_1_with_reg_postinc,
+    q_st1_lane_with_reg_postinc,
+    q_st1_4_with_postinc,
+    q_stp_with_inc,
+    sub_shifted,
+    subs_imm,
+    subs_wform,
+    fadd_vec,
+    fsub_vec,
+    fmul_vec,
+    faddp_vec,
+    faddp_scalar,
+    fmla,
+    fmls_vec,
+    fmla_lane,
+    fmul_lane,
+    vmovi,
+    vdup_lane,
+    rev64,
+    mov_vtov_s,
+    vins_d_from_v,
 )
 
 # From the A72 SWOG, Section "4.1 Dispatch Constraints"
@@ -257,6 +296,8 @@ def get_min_max_objective(slothy):
     fmov_s_form: ExecutionUnit.LOAD(),  # from vec to gen reg
     eor_shifted: ExecutionUnit.SCALAR(),
     bic_shifted: ExecutionUnit.SCALAR(),
+    sub_shifted: ExecutionUnit.SCALAR(),
+    (subs_wform, subs_imm): ExecutionUnit.INT(),
     lsr_imm: ExecutionUnit.INT(),
     lsr: ExecutionUnit.INT(),
     movk_imm_lsl: ExecutionUnit.INT(),
@@ -264,6 +305,43 @@ def get_min_max_objective(slothy):
     Ldp_W: ExecutionUnit.LOAD(),
     q_ldp_with_inc: ExecutionUnit.LOAD(),
     Stp_W: ExecutionUnit.STORE(),
+    q_stp_with_inc: [
+        ExecutionUnit.STORE() + [ExecutionUnit.INT0],
+        ExecutionUnit.STORE() + [ExecutionUnit.INT1],
+    ],
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): [
+        ExecutionUnit.ASIMD0,
+        ExecutionUnit.ASIMD1,
+    ],
+    (rshrn, rshrn2, sqxtun, sqrshrun): [ExecutionUnit.ASIMD1],
+    (fadd_vec, fsub_vec, fmul_vec, faddp_vec, fmla, fmls_vec): [
+        ExecutionUnit.ASIMD0,
+        ExecutionUnit.ASIMD1,
+    ],
+    faddp_scalar: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    (fmla_lane, fmul_lane): [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    vmovi: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    vdup_lane: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    rev64: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    mov_vtov_s: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    vins_d_from_v: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    (
+        q_ld1_2,
+        q_ld1_2_with_postinc,
+        q_ld1_2_with_reg_postinc,
+        q_ld1_4,
+        q_ld1_4_with_postinc,
+        q_ld1_1_with_reg_postinc,
+    ): ExecutionUnit.LOAD(),
+    q_ld1_lane_with_reg_postinc: [
+        [ExecutionUnit.ASIMD0, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1],
+        [ExecutionUnit.ASIMD1, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1],
+    ],
+    q_st1_lane_with_reg_postinc: [
+        [ExecutionUnit.ASIMD0, ExecutionUnit.STORE0, ExecutionUnit.STORE1],
+        [ExecutionUnit.ASIMD1, ExecutionUnit.STORE0, ExecutionUnit.STORE1],
+    ],
+    (q_st1_1_with_reg_postinc, q_st1_4_with_postinc): [ExecutionUnit.STORE()],
 }
 
 inverse_throughput = {
@@ -295,6 +373,7 @@ def get_min_max_objective(slothy):
     umov_d: 1,
     (add, add_imm, add_shifted): 1,
     (Ldr_D, Ldr_Q, Str_Q, Ldr_X, Str_X): 1,
+    q_stp_with_inc: 4,
     (VShiftImmediateRounding, VShiftImmediateBasic): 1,
     # TODO: this seems in accurate; revisiting may improve performance
     St2: 4,
@@ -314,6 +393,15 @@ def get_min_max_objective(slothy):
     fmov_s_form: 1,  # from vec to gen reg
     eor_shifted: 1,
     bic_shifted: 1,
+    sub_shifted: 1,
+    (subs_wform, subs_imm): 1,
+    (fadd_vec, fsub_vec, fmul_vec, faddp_vec, faddp_scalar, fmla, fmls_vec): 1,
+    (fmla_lane, fmul_lane): 1,
+    vmovi: 1,
+    vdup_lane: 1,
+    rev64: 1,
+    mov_vtov_s: 1,
+    vins_d_from_v: 1,
     vdup_w: 1,
     mov_wtov_s: 1,
     mov_vtov_d: 1,
@@ -322,6 +410,14 @@ def get_min_max_objective(slothy):
     movk_imm_lsl: 1,
     Ldp_W: 1,
     Stp_W: 1,
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 1,
+    (rshrn, rshrn2, sqxtun, sqrshrun): 1,
+    (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 2,
+    (q_ld1_4, q_ld1_4_with_postinc): 4,
+    (q_ld1_1_with_reg_postinc, q_ld1_lane_with_reg_postinc): 1,
+    q_st1_1_with_reg_postinc: 2,
+    q_st1_lane_with_reg_postinc: 1,
+    q_st1_4_with_postinc: 8,
 }
 
 # REVISIT
@@ -356,7 +452,7 @@ def get_min_max_objective(slothy):
     csel: 1,
     AArch64ConditionalCompare: 1,
     AArch64Logical: 1,
-    (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4,  # approx
+    (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X, q_stp_with_inc): 4,  # approx
     Vins: 6,  # approx
     umov_d: 4,  # approx
     (add, add_imm, add_shifted): 2,
@@ -381,6 +477,17 @@ def get_min_max_objective(slothy):
     fmov_s_form: 5,  # from vec to gen reg
     eor_shifted: 2,
     bic_shifted: 2,
+    sub_shifted: 2,
+    (subs_wform, subs_imm): 1,
+    (fadd_vec, fsub_vec, faddp_vec, faddp_scalar): 4,
+    fmul_vec: 4,
+    (fmla, fmls_vec, fmla_lane): 7,
+    fmul_lane: 5,
+    vmovi: 3,
+    vdup_lane: 3,
+    rev64: 3,
+    mov_vtov_s: 3,
+    vins_d_from_v: 3,
     vdup_w: 8,
     mov_wtov_s: 8,
     mov_vtov_d: 3,
@@ -389,6 +496,18 @@ def get_min_max_objective(slothy):
     movk_imm_lsl: 1,
     Ldp_W: 4,
     Stp_W: 1,
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 3,
+    (rshrn, rshrn2, sqrshrun, sqxtun): 4,
+    # ld1 with 1, 2 or 4 registers (SWOG: Q-form latencies)
+    q_ld1_1_with_reg_postinc: 5,
+    (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 6,
+    (q_ld1_4, q_ld1_4_with_postinc): 8,
+    # Single-lane ld1 B/H/S (SWOG: 8 cycles)
+    q_ld1_lane_with_reg_postinc: 8,
+    # Single-lane st1 B/H/S (SWOG: 3 cycles)
+    q_st1_lane_with_reg_postinc: 3,
+    q_st1_1_with_reg_postinc: 2,
+    q_st1_4_with_postinc: 8,
 }
 
 
@@ -428,6 +547,20 @@ def get_latency(src, out_idx, dst):
         and src.args_in_out[0] == dst.args_in_out[0]
     ):
         return 1
+    # Fast fmul->fmla forwarding (accumulate_latency=3)
+    if (
+        instclass_src in [fmul_vec, fmul_lane]
+        and instclass_dst in [fmla, fmls_vec, fmla_lane]
+        and src.args_out[0] == dst.args_in_out[0]
+    ):
+        return 3
+    # Fast fmla->fmla forwarding (accumulate_latency=3)
+    if (
+        instclass_src in [fmla, fmls_vec, fmla_lane]
+        and instclass_dst in [fmla, fmls_vec, fmla_lane]
+        and src.args_in_out[0] == dst.args_in_out[0]
+    ):
+        return 3
 
     return latency
 
diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
index 16d538d0..59f628c2 100644
--- a/slothy/targets/aarch64/neoverse_n1_experimental.py
+++ b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -111,6 +111,39 @@
     mov_vtov_d,
     lsr,
     movk_imm_lsl,
+    uaddl,
+    uaddl2,
+    uaddw,
+    uaddw2,
+    saddl,
+    saddl2,
+    urhadd,
+    rshrn,
+    rshrn2,
+    sqxtun,
+    sqrshrun,
+    q_ld1_2,
+    q_ld1_4,
+    q_ld1_2_with_postinc,
+    q_ld1_4_with_postinc,
+    q_ld1_2_with_reg_postinc,
+    q_ld1_1_with_reg_postinc,
+    q_ld1_lane_with_reg_postinc,
+    q_st1_1_with_reg_postinc,
+    q_st1_lane_with_reg_postinc,
+    q_st1_4_with_postinc,
+    fadd_vec,
+    fsub_vec,
+    fmul_vec,
+    faddp_vec,
+    faddp_scalar,
+    fmla,
+    fmls_vec,
+    fmla_lane,
+    fmul_lane,
+    mov_vtov_s,
+    vdup_lane,
+    vins_d_from_v,
 )
 
 issue_rate = 4
@@ -229,6 +262,18 @@ def get_min_max_objective(slothy):
     vusra: ExecutionUnit.V1(),
     AESInstruction: ExecutionUnit.V0(),
     (Vmul, Vmla, Vqdmulh, Vmull, Vmlal): ExecutionUnit.V0(),
+    (
+        fadd_vec,
+        fsub_vec,
+        fmul_vec,
+        faddp_vec,
+        faddp_scalar,
+        fmla,
+        fmls_vec,
+        fmla_lane,
+        fmul_lane,
+    ): ExecutionUnit.V(),
+    (mov_vtov_s, vdup_lane, vins_d_from_v): ExecutionUnit.V(),
     AArch64NeonLogical: ExecutionUnit.V(),
     vext: ExecutionUnit.V(),
     (
@@ -261,6 +306,34 @@ def get_min_max_objective(slothy):
     lsr: ExecutionUnit.I(),
     movk_imm_lsl: ExecutionUnit.I(),
     q_ld2_lane_s: ExecutionUnit.V(),
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): ExecutionUnit.V(),
+    (rshrn, rshrn2, sqxtun, sqrshrun): ExecutionUnit.V1(),
+    (
+        q_ld1_2,
+        q_ld1_2_with_postinc,
+        q_ld1_2_with_reg_postinc,
+        q_ld1_4,
+        q_ld1_4_with_postinc,
+        q_ld1_1_with_reg_postinc,
+    ): ExecutionUnit.LSU(),
+    q_ld1_lane_with_reg_postinc: [
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC1],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC1],
+    ],
+    q_st1_lane_with_reg_postinc: [
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC1],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC1],
+    ],
+    (q_st1_1_with_reg_postinc, q_st1_4_with_postinc): [
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU0, ExecutionUnit.VEC1],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC0],
+        [ExecutionUnit.LSU1, ExecutionUnit.VEC1],
+    ],
 }
 
 inverse_throughput = {
@@ -323,6 +396,25 @@ def get_min_max_objective(slothy):
     mov_vtov_d: 1,
     lsr: 1,
     movk_imm_lsl: 1,
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 1,
+    (rshrn, rshrn2, sqxtun, sqrshrun): 1,
+    (q_ld1_1_with_reg_postinc, q_ld1_lane_with_reg_postinc): 1,
+    (q_ld1_2, q_ld1_2_with_postinc, q_ld1_2_with_reg_postinc): 2,
+    (q_ld1_4, q_ld1_4_with_postinc): 4,
+    (q_st1_1_with_reg_postinc, q_st1_lane_with_reg_postinc): 2,
+    q_st1_4_with_postinc: 8,
+    (
+        fadd_vec,
+        fsub_vec,
+        faddp_vec,
+        faddp_scalar,
+        fmla,
+        fmls_vec,
+        fmla_lane,
+        fmul_vec,
+        fmul_lane,
+    ): 1,
+    (mov_vtov_s, vdup_lane, vins_d_from_v): 1,
 }
 
 default_latencies = {
@@ -386,6 +478,24 @@ def get_min_max_objective(slothy):
     mov_vtov_d: 2,
     lsr: 1,
     movk_imm_lsl: 1,
+    (uaddl, uaddl2, uaddw, uaddw2, saddl, saddl2, urhadd): 2,
+    (rshrn, rshrn2, sqrshrun, sqxtun): 4,
+    (
+        q_ld1_2,
+        q_ld1_2_with_postinc,
+        q_ld1_2_with_reg_postinc,
+        q_ld1_1_with_reg_postinc,
+    ): 5,
+    (q_ld1_4, q_ld1_4_with_postinc): 6,
+    q_ld1_lane_with_reg_postinc: 7,
+    q_st1_1_with_reg_postinc: 2,
+    q_st1_lane_with_reg_postinc: 4,
+    q_st1_4_with_postinc: 5,
+    (fadd_vec, fsub_vec, faddp_vec, faddp_scalar): 2,
+    fmul_vec: 3,
+    (fmla, fmls_vec, fmla_lane): 4,
+    fmul_lane: 3,
+    (mov_vtov_s, vdup_lane, vins_d_from_v): 2,
 }
 
 
diff --git a/tests/naive/aarch64/_test.py b/tests/naive/aarch64/_test.py
index 24869661..4ee9356a 100644
--- a/tests/naive/aarch64/_test.py
+++ b/tests/naive/aarch64/_test.py
@@ -42,6 +42,8 @@ def core(self, slothy):
         slothy.config.constraints.allow_reordering = False
         slothy.config.variable_size = True
         slothy.config.constraints.stalls_first_attempt = 256
+        slothy.config.reserved_regs.add("x12")
+        slothy.config.selftest_initial_register_values = {"x12": 16}
         slothy.optimize(start="start", end="end")
 
 
diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
index a764dd31..4d2b65d7 100644
--- a/tests/naive/aarch64/instructions.s
+++ b/tests/naive/aarch64/instructions.s
@@ -28,6 +28,32 @@ st2 { v0.s, v1.s}[0], [x11], #8
 st2 {v0.s, v1.s}[0], [x11], #8
 st2 { v0.S, v1.S }[1], [x1]
 
+// ld1 multi-register (no post-increment)
+ld1 {v0.4s, v1.4s}, [x1]
+ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x1]
+
+// ld1 multi-register with immediate post-increment
+ld1 {v0.4s, v1.4s}, [x1], #32
+ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x1], #64
+
+// ld1 multi-register, single-register and single-lane with register post-increment
+mov x12, #16
+ld1 {v0.4s, v1.4s}, [x1], x12
+ld1 {v2.4s}, [x1], x12
+ld1 {v3.s}[0], [x1], x12
+ld1 {v3.s}[1], [x1], x12
+
+// st1 with register post-increment
+st1 {v0.4s}, [x1], x12
+st1 {v1.s}[0], [x1], x12
+st1 {v1.s}[1], [x1], x12
+
+// st1 4-register with immediate post-increment
+st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
+
+// Store vector pair Q-form
+stp q0, q1, [x0, #32]
+
 zip1 v5.16b, v6.16b, v7.16b
 zip2 v8.16b, v9.16b, v10.16b
 uzp1 v11.16b, v12.16b, v13.16b
@@ -207,4 +233,59 @@ ld3 {v2.4s, v3.4s, v4.4s}, [x17], #48
 st3 {v5.4s, v6.4s, v7.4s}, [x18], #48
 ld2 {v8.4s, v9.4s}, [x19], #32
 st2 {v10.4s, v11.4s}, [x20], #32
+
+// ASIMD widening / narrowing arithmetic
+uaddl  v0.8h,  v1.8b,  v2.8b
+uaddl2 v3.8h,  v4.16b, v5.16b
+uaddw  v6.8h,  v7.8h,  v8.8b
+uaddw2 v9.8h,  v10.8h, v11.16b
+saddl  v12.8h, v13.8b, v14.8b
+saddl2 v15.8h, v16.16b, v17.16b
+rshrn  v18.8b, v19.8h, #2
+rshrn2 v20.16b, v21.8h, #3
+sqxtun  v22.8b, v23.8h
+sqrshrun v24.8b, v25.8h, #4
+urhadd v26.8b, v27.8b, v28.8b
+
+// sub with shifted register operand
+sub x0, x1, x2, lsl #3
+sub x3, x4, x5, lsr #2
+
+// sign-extend word
+sxtw x6, w7
+
+// ASIMD FP arithmetic (vector)
+fadd v0.4s, v1.4s, v2.4s
+fsub v0.4s, v1.4s, v2.4s
+fmul v0.4s, v1.4s, v2.4s
+faddp v0.4s, v1.4s, v2.4s
+faddp s0, v1.2s
+
+// ASIMD FP multiply accumulate / subtract
+fmla v0.4s, v1.4s, v2.4s
+fmls v0.4s, v1.4s, v2.4s
+
+// ASIMD FP by-element
+fmla v0.4s, v1.4s, v2.s[0]
+fmul v0.4s, v1.4s, v2.s[1]
+
+// ASIMD move immediate
+movi v0.16b, #0
+movi v1.4s, #0
+
+// SUBS immediate
+subs x0, x1, #4
+
+// ASIMD duplicate element
+dup v0.4s, v1.s[0]
+dup v2.2s, v3.s[1]
+
+// ASIMD reverse
+rev64 v0.16b, v1.16b
+rev64 v2.4s, v3.4s
+
+// ASIMD element move / insert
+mov v0.s[0], v1.s[1]
+ins v0.d[0], v1.d[1]
+
 end: