Do vqdmulh optimisation for extractbytes as well

bremoran · mkannwischer · commit 278a9ac84508 · 2026-02-25T18:24:30.000+08:00
Signed-off-by: Brendan Moran <brendan.moran@arm.com>
diff --git a/mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x1_mve_asm.S b/mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x1_mve_asm.S
@@ -25,21 +25,21 @@
 
 // deinterleave_even: inverse of the even-bit compaction. Inflates the even
 // bit-plane stored in \e back into byte positions (odd bits garbage).
-.macro deinterleave_even e, tmp
+.macro deinterleave_even e, tmp, const4, const16
                                 // | e[31:24] | e[23:16] | e[15:8] | e[7:0] |
     vsli.u32    \e, \e, #8      // | e[23:16] | e[15:8]  | e[7:0]  | e[7:0] |
     vsli.u16    \e, \e, #4      // | e[19:12] | e[11:8,11:8]  | e[3:0,7:4]  | e[3:0,3:0] |
     vsli.u8     \e, \e, #1      // | e[18:12,12] | e[10:8,11:8,8]  | e[2:0,7:4,4]  | e[2:0,3:0,0] |
-    vshr.u8     \tmp, \e, #3
+    vqdmulh.s8  \tmp, \e, \const16  // tmp = e asr 3 per byte: (2*e*16) >> 8, assuming \const16 holds 16; sign fill lands in tmp[7:4], which the vsli #4 below shifts out
     vsli.u8     \e, \tmp, #4    // | e[17:14,14:12,12] | e[9:8,11:10,10:8,8]  | e[1:0,7:6,6:4,4]  | e[1:0,3:2,2:0,0] |
-    vshr.u8     \tmp, \e, #5
+    vqdmulh.s8  \tmp, \e, \const4   // tmp = e asr 5 per byte: (2*e*4) >> 8, assuming \const4 holds 4; only tmp[1:0] survives the vsli #6 below
     vsli.u8     \e, \tmp, #6    // | e[16:15,15:14,14:12,12] | e[8,11,11:10,10:8,8]  | e[0,7,7:6,6:4,4]  | e[0,3,3:2,2:0,0] |
                                 // after 0x55 mask
                                 // | e[15,14,13,12] | e[11,10,9,8]  | e[7,6,5,4]  | e[3,2,1,0] |
 .endm
 
 .balign 8
-.macro from_bit_interleaving_x1 tmp
+.macro from_bit_interleaving_x1 tmp, const4, const16
     // Input:  q0 = [e0, o0, e1, o1]
     // Output: q0 = [d0l, d0h, d1l, d1h]
     // Clobbers: r0, q1, q2, q3, q4
@@ -55,8 +55,8 @@
     // construct an o vector
     vpsel       q1, q3, q1    // q0.u16: [o0l, o0h, o0h, o0l, o1l, o1h, o1l, o1h] 
     // expand vectors
-    deinterleave_even q0, q2
-    deinterleave_even q1, q2
+    deinterleave_even q0, q2, \const4, \const16
+    deinterleave_even q1, q2, \const4, \const16
     // Zero garbage bits
     mov         \tmp, #0x55
     vdup.u8     q2, \tmp
@@ -90,12 +90,17 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
     tmp                 .req r4
     nB                  .req lr
     off                 .req r5
+    const4              .req r6
+    const16             .req r7
     lane_offset_bytes   .req r9
     mask                .req r11
     // ---- Vector naming ----
     qd                  .req q0
     qs                  .req q1
 
+    mov     const4, #4
+    mov     const16, #16
+
     cmp     length,  #0             // if len==0 done
     beq     keccak_f1600_x1_state_extract_bytes_asm_exit
 
@@ -123,7 +128,7 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
     // Load state for the partial lane
     vldrw.u32 qd, [state], #16
     // Deinterleave to bytes
-    from_bit_interleaving_x1 tmp
+    from_bit_interleaving_x1 tmp, const4, const16
     // Predicated byte store of up to 16 bytes
     // calculate the predicates
     // mask = (1 << nB) - 1 over 8-bit lanes, then shift by 'off'.
@@ -154,7 +159,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_start:
     // Load 16B (two lanes) from state and bump pointer
     vldrw.u32 qd, [state], #16
     // Deinterleave to bytes
-    from_bit_interleaving_x1 tmp
+    from_bit_interleaving_x1 tmp, const4, const16
     // Store 16B of output bytes (post-increment by 16)
     vstrw.u32 qd, [dp], #16
 
@@ -172,7 +177,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_end:
 
     // Load next state lane, deinterleave, store tail
     vldrw.u32 qd, [state], #16
-    from_bit_interleaving_x1 tmp
+    from_bit_interleaving_x1 tmp, const4, const16
     // Tail via predicated byte stores like prologue, but off=0 (no base adjust)
     vctp.8  length
     vpst
diff --git a/mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x1_mve_asm.S b/mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x1_mve_asm.S
@@ -27,20 +27,15 @@
 // bits of each byte/halfword/word in \t toward the lower half, preparing the
 // even bit-plane. Comments show the equivalent masks after each stage.
 .macro interleave_evens t, u, const8, const16, const32, const128, const32768
-    //vshr.u8     \u, \t, #2       // stage 1 within bytes
-    vqdmulh.s8  \u, \t, \const32   // shift right 2 = shift left 5+1, shift right 8; 1<<5 = 32
+    vqdmulh.s8  \u, \t, \const32       // stage 1 within bytes: u = t asr 2 = (2*t*32) >> 8; assumes each \constN holds N. NOTE(review): sign-filling, unlike the vshr.u8 it replaces - confirm the top bits are don't-care
     vsli.u8     \t, \u, #1       // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101)
-    //vshr.u8     \u, \t, #3       // stage 2 within nibbles
-    vqdmulh.s8    \u, \t, \const16     // shift right 3 = shift left 5, shift right 8; 1<<5 = 32
+    vqdmulh.s8    \u, \t, \const16     // stage 2 within nibbles: u = t asr 3 per byte = (2*t*16) >> 8
     vsli.u8     \t, \u, #2       // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303)
-    //vshr.u8     \u, \t, #4       // stage 3 across bytes
-    vqdmulh.s8    \u, \t, \const8     // shift right 4 = shift left 4, shift right 8; 1<<4 = 16
+    vqdmulh.s8    \u, \t, \const8      // stage 3 across bytes: u = t asr 4 per byte = (2*t*8) >> 8
     vsli.u8     \t, \u, #3       // t = ((t >> 3) & 0x08080808) | (t & 0x07070707)
-    //vshr.u16    \u, \t, #8       // widen within halfwords
-    vqdmulh.s16   \u, \t, \const128 // shift right by 8 = shift left 7+1, shift right 16; 1<<7 = 128
+    vqdmulh.s16   \u, \t, \const128    // widen within halfwords: u = t asr 8 per halfword = (2*t*128) >> 16
     vsli.u8     \t, \u, #4       // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F)
-    //vshr.u32    \u, \t, #16      // widen within words
-    vqdmulh.s32   \u, \t, \const32768   // shift right by 16 = shift left 15+1, shift right 32; 1<<15 = 32768
+    vqdmulh.s32   \u, \t, \const32768  // widen within words: u = t asr 16 per word = (2*t*32768) >> 32
     vsli.u16    \t, \u, #8       // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF)
 .endm