diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 1b284974c..8e27d76ab 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -179,6 +179,7 @@ source code and documentation. - [dev/aarch64_opt/src/ntt.S](dev/aarch64_opt/src/ntt.S) - [mldsa/src/native/aarch64/src/intt.S](mldsa/src/native/aarch64/src/intt.S) - [mldsa/src/native/aarch64/src/ntt.S](mldsa/src/native/aarch64/src/ntt.S) + - [proofs/hol_light/aarch64/mldsa/mldsa_ntt.S](proofs/hol_light/aarch64/mldsa/mldsa_ntt.S) ### `REF` @@ -284,6 +285,7 @@ source code and documentation. - [dev/aarch64_opt/src/ntt.S](dev/aarch64_opt/src/ntt.S) - [mldsa/src/native/aarch64/src/intt.S](mldsa/src/native/aarch64/src/intt.S) - [mldsa/src/native/aarch64/src/ntt.S](mldsa/src/native/aarch64/src/ntt.S) + - [proofs/hol_light/aarch64/mldsa/mldsa_ntt.S](proofs/hol_light/aarch64/mldsa/mldsa_ntt.S) ### `libmceliece` diff --git a/dev/aarch64_clean/src/polyz_unpack_17_asm.S b/dev/aarch64_clean/src/polyz_unpack_17_asm.S index c7b9f00a2..f67609e2c 100644 --- a/dev/aarch64_clean/src/polyz_unpack_17_asm.S +++ b/dev/aarch64_clean/src/polyz_unpack_17_asm.S @@ -69,9 +69,9 @@ polyz_unpack_17_loop: // 3-register ld1 would load 48 bytes, but only 36 are // consumed per iteration. The TBL indices for v2 are // adjusted to account for v2's load offset. - ld1 {v0.16b, v1.16b}, [buf] - add buf, buf, #0x14 - ld1 {v2.16b}, [buf], #0x10 + ld1 {v0.16b, v1.16b}, [buf] + add buf, buf, #0x14 + ld1 {v2.16b}, [buf], #0x10 tbl v4.16b, {v0.16b}, idx0.16b tbl v5.16b, {v0.16b - v1.16b}, idx1.16b diff --git a/dev/aarch64_clean/src/polyz_unpack_19_asm.S b/dev/aarch64_clean/src/polyz_unpack_19_asm.S index ed65193d2..72907e5a8 100644 --- a/dev/aarch64_clean/src/polyz_unpack_19_asm.S +++ b/dev/aarch64_clean/src/polyz_unpack_19_asm.S @@ -66,9 +66,9 @@ polyz_unpack_19_loop: // 3-register ld1 would load 48 bytes, but only 40 are // consumed per iteration. The TBL indices for v2 are // adjusted to account for v2's load offset. - ld1 {v0.16b, v1.16b}, [buf] - add buf, buf, #0x18 - ld1 {v2.16b}, [buf], #0x10 + ld1 {v0.16b, v1.16b}, [buf] + add buf, buf, #0x18 + ld1 {v2.16b}, [buf], #0x10 tbl v4.16b, {v0.16b}, idx0.16b tbl v5.16b, {v0.16b - v1.16b}, idx1.16b diff --git a/dev/aarch64_opt/src/polyz_unpack_17_asm.S b/dev/aarch64_opt/src/polyz_unpack_17_asm.S index c7b9f00a2..f67609e2c 100644 --- a/dev/aarch64_opt/src/polyz_unpack_17_asm.S +++ b/dev/aarch64_opt/src/polyz_unpack_17_asm.S @@ -69,9 +69,9 @@ polyz_unpack_17_loop: // 3-register ld1 would load 48 bytes, but only 36 are // consumed per iteration. The TBL indices for v2 are // adjusted to account for v2's load offset. - ld1 {v0.16b, v1.16b}, [buf] - add buf, buf, #0x14 - ld1 {v2.16b}, [buf], #0x10 + ld1 {v0.16b, v1.16b}, [buf] + add buf, buf, #0x14 + ld1 {v2.16b}, [buf], #0x10 tbl v4.16b, {v0.16b}, idx0.16b tbl v5.16b, {v0.16b - v1.16b}, idx1.16b diff --git a/dev/aarch64_opt/src/polyz_unpack_19_asm.S b/dev/aarch64_opt/src/polyz_unpack_19_asm.S index ed65193d2..72907e5a8 100644 --- a/dev/aarch64_opt/src/polyz_unpack_19_asm.S +++ b/dev/aarch64_opt/src/polyz_unpack_19_asm.S @@ -66,9 +66,9 @@ polyz_unpack_19_loop: // 3-register ld1 would load 48 bytes, but only 40 are // consumed per iteration. The TBL indices for v2 are // adjusted to account for v2's load offset. 
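The TBL-based sequence above vectorizes the standard ML-DSA z-unpacking: for gamma1 = 2^19, every 5 bytes hold two 20-bit values t, each mapped to the coefficient gamma1 - t, so one 16-coefficient iteration consumes exactly 40 bytes. A scalar C model of that computation (a sketch for orientation only; the function name is illustrative, not this repository's API):

#include <stdint.h>

#define GAMMA1 (1 << 19) /* gamma1 for the 20-bit (polyz_unpack_19) case */

/* Scalar model of polyz_unpack_19: two coefficients per 5 bytes,
 * 20 bits each; 128 groups cover all 256 coefficients (640 bytes). */
static void polyz_unpack_19_ref(int32_t coeffs[256], const uint8_t *buf)
{
  unsigned i;
  for (i = 0; i < 128; i++)
  {
    uint32_t t0, t1;
    t0 = buf[5 * i + 0];
    t0 |= (uint32_t)buf[5 * i + 1] << 8;
    t0 |= (uint32_t)buf[5 * i + 2] << 16;
    t0 &= 0xFFFFF; /* keep the low 20 bits */

    t1 = buf[5 * i + 2] >> 4;
    t1 |= (uint32_t)buf[5 * i + 3] << 4;
    t1 |= (uint32_t)buf[5 * i + 4] << 12;
    t1 &= 0xFFFFF;

    coeffs[2 * i + 0] = GAMMA1 - (int32_t)t0;
    coeffs[2 * i + 1] = GAMMA1 - (int32_t)t1;
  }
}

The 18-bit variant (polyz_unpack_17, gamma1 = 2^17) is analogous with four coefficients per 9 bytes, which is why its iteration consumes 36 bytes instead of 40.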
- ld1 {v0.16b, v1.16b}, [buf] - add buf, buf, #0x18 - ld1 {v2.16b}, [buf], #0x10 + ld1 {v0.16b, v1.16b}, [buf] + add buf, buf, #0x18 + ld1 {v2.16b}, [buf], #0x10 tbl v4.16b, {v0.16b}, idx0.16b tbl v5.16b, {v0.16b - v1.16b}, idx1.16b diff --git a/dev/x86_64/src/intt.S b/dev/x86_64/src/intt.S index b610c2752..1aeb88e78 100644 --- a/dev/x86_64/src/intt.S +++ b/dev/x86_64/src/intt.S @@ -26,22 +26,22 @@ #include "consts.h" .macro shuffle8 r0, r1, r2, r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle4 r0, r1, r2, r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0, r1, r2, r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm /* @@ -52,7 +52,7 @@ vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 * (See the end of this file for the exact value of MONTMUL_BOUND) */ .macro butterfly l, h, zl0=1, zl1=1, zh0=2, zh1=2 -vpsubd %ymm\l,%ymm\h,%ymm12 +vpsubd %ymm\l,%ymm\h,%ymm12 /* * VEX Encoding Optimization for Platform-Independent Code * @@ -75,64 +75,64 @@ vpsubd %ymm\l,%ymm\h,%ymm12 * vpaddd %ymm4, %ymm8, %ymm4 -> 2-byte VEX (0xC5 0xBD 0xFE 0xE4) preferred */ .if (\l < 8) && (\h >= 8) -vpaddd %ymm\l,%ymm\h,%ymm\l +vpaddd %ymm\l,%ymm\h,%ymm\l .else -vpaddd %ymm\h,%ymm\l,%ymm\l +vpaddd %ymm\h,%ymm\l,%ymm\l .endif -vpmuldq %ymm\zl0,%ymm12,%ymm13 -vmovshdup %ymm12,%ymm\h -vpmuldq %ymm\zl1,%ymm\h,%ymm14 +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 -vpmuldq %ymm\zh0,%ymm12,%ymm12 -vpmuldq %ymm\zh1,%ymm\h,%ymm\h +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 -vpsubd %ymm13,%ymm12,%ymm12 -vpsubd %ymm14,%ymm\h,%ymm\h +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h -vmovshdup %ymm12,%ymm12 -vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h .endm .macro levels0t5 off -vmovdqa 256*\off+ 0(%rdi),%ymm4 -vmovdqa 256*\off+ 32(%rdi),%ymm5 -vmovdqa 256*\off+ 64(%rdi),%ymm6 -vmovdqa 256*\off+ 96(%rdi),%ymm7 -vmovdqa 256*\off+128(%rdi),%ymm8 -vmovdqa 256*\off+160(%rdi),%ymm9 -vmovdqa 256*\off+192(%rdi),%ymm10 -vmovdqa 256*\off+224(%rdi),%ymm11 +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 /* Bounds: |ymm{i}| < q for i in 4...11 */ /* level 0 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4, 5, 1, 3, 2, 15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup 
%ymm15,%ymm2 -butterfly 6, 7, 1, 3, 2, 15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 8, 9, 1, 3, 2, 15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 10, 11, 1, 3, 2, 15 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4, 5, 1, 3, 2, 15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6, 7, 1, 3, 2, 15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8, 9, 1, 3, 2, 15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10, 11, 1, 3, 2, 15 /* * Bounds: |ymm{i}| < 2q for i in 4, 6, 8, 10 @@ -151,120 +151,120 @@ butterfly 10, 11, 1, 3, 2, 15 */ /* level 1 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4, 6, 1, 3, 2, 15 -butterfly 5, 7, 1, 3, 2, 15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 8, 10, 1, 3, 2, 15 -butterfly 9, 11, 1, 3, 2, 15 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4, 6, 1, 3, 2, 15 +butterfly 5, 7, 1, 3, 2, 15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8, 10, 1, 3, 2, 15 +butterfly 9, 11, 1, 3, 2, 15 /* Bounds: |ymm{i}| < 4q */ /* level 2 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4, 8, 1, 3, 2, 15 -butterfly 5, 9, 1, 3, 2, 15 -butterfly 6, 10, 1, 3, 2, 15 -butterfly 7, 11, 1, 3, 2, 15 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4, 8, 1, 3, 2, 15 +butterfly 5, 9, 1, 3, 2, 15 +butterfly 6, 10, 1, 3, 2, 15 +butterfly 7, 11, 1, 3, 2, 15 /* Bounds: |ymm{i}| < 8q */ /* level 3 */ -shuffle2 4, 5, 3, 5 -shuffle2 6, 7, 4, 7 -shuffle2 8, 9, 6, 9 -shuffle2 10, 11, 8, 11 - -vpermq 
$0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 -butterfly 3, 5 -butterfly 4, 7 -butterfly 6, 9 -butterfly 8, 11 +shuffle2 4, 5, 3, 5 +shuffle2 6, 7, 4, 7 +shuffle2 8, 9, 6, 9 +shuffle2 10, 11, 8, 11 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3, 5 +butterfly 4, 7 +butterfly 6, 9 +butterfly 8, 11 /* Bounds: |ymm{i}| < 16q */ /* level 4 */ -shuffle4 3, 4, 10, 4 -shuffle4 6, 8, 3, 8 -shuffle4 5, 7, 6, 7 -shuffle4 9, 11, 5, 11 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 -butterfly 10, 4 -butterfly 3, 8 -butterfly 6, 7 -butterfly 5, 11 +shuffle4 3, 4, 10, 4 +shuffle4 6, 8, 3, 8 +shuffle4 5, 7, 6, 7 +shuffle4 9, 11, 5, 11 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10, 4 +butterfly 3, 8 +butterfly 6, 7 +butterfly 5, 11 /* Bounds: |ymm{i}| < 32q */ /* level 5 */ -shuffle8 10, 3, 9, 3 -shuffle8 6, 5, 10, 5 -shuffle8 4, 8, 6, 8 -shuffle8 7, 11, 4, 11 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+7-\off)*4(%rsi),%ymm2 -butterfly 9, 3 -butterfly 10, 5 -butterfly 6, 8 -butterfly 4, 11 +shuffle8 10, 3, 9, 3 +shuffle8 6, 5, 10, 5 +shuffle8 4, 8, 6, 8 +shuffle8 7, 11, 4, 11 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9, 3 +butterfly 10, 5 +butterfly 6, 8 +butterfly 4, 11 /* Bounds: |ymm{i}| < 64q */ -vmovdqa %ymm9,256*\off+ 0(%rdi) -vmovdqa %ymm10,256*\off+ 32(%rdi) -vmovdqa %ymm6,256*\off+ 64(%rdi) -vmovdqa %ymm4,256*\off+ 96(%rdi) -vmovdqa %ymm3,256*\off+128(%rdi) -vmovdqa %ymm5,256*\off+160(%rdi) -vmovdqa %ymm8,256*\off+192(%rdi) -vmovdqa %ymm11,256*\off+224(%rdi) +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) .endm .macro levels6t7 off -vmovdqa 0+32*\off(%rdi),%ymm4 -vmovdqa 128+32*\off(%rdi),%ymm5 -vmovdqa 256+32*\off(%rdi),%ymm6 -vmovdqa 384+32*\off(%rdi),%ymm7 -vmovdqa 512+32*\off(%rdi),%ymm8 -vmovdqa 640+32*\off(%rdi),%ymm9 -vmovdqa 768+32*\off(%rdi),%ymm10 -vmovdqa 896+32*\off(%rdi),%ymm11 +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 /* level 6 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 -butterfly 4, 6 -butterfly 5, 7 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4, 6 +butterfly 5, 7 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 -butterfly 8, 10 -butterfly 9, 11 +vpbroadcastd 
(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8, 10 +butterfly 9, 11 /* Bounds: |ymm{i}| < 128q */ /* level 7 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2 -butterfly 4, 8 -butterfly 5, 9 -butterfly 6, 10 -butterfly 7, 11 +butterfly 4, 8 +butterfly 5, 9 +butterfly 6, 10 +butterfly 7, 11 /* * Bounds: |ymm{i}| < 256q for i in 4...7 @@ -289,53 +289,53 @@ vmovdqa %ymm11,896+32*\off(%rdi) * Bounds: |ymm{i}| < 256q for i in 4...7 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2 -vpmuldq %ymm1,%ymm4,%ymm12 -vpmuldq %ymm1,%ymm5,%ymm13 -vmovshdup %ymm4,%ymm8 -vmovshdup %ymm5,%ymm9 -vpmuldq %ymm1,%ymm8,%ymm14 -vpmuldq %ymm1,%ymm9,%ymm15 -vpmuldq %ymm2,%ymm4,%ymm4 -vpmuldq %ymm2,%ymm5,%ymm5 -vpmuldq %ymm2,%ymm8,%ymm8 -vpmuldq %ymm2,%ymm9,%ymm9 -vpmuldq %ymm0,%ymm12,%ymm12 -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 -vpmuldq %ymm0,%ymm15,%ymm15 -vpsubd %ymm12,%ymm4,%ymm4 -vpsubd %ymm13,%ymm5,%ymm5 -vpsubd %ymm14,%ymm8,%ymm8 -vpsubd %ymm15,%ymm9,%ymm9 -vmovshdup %ymm4,%ymm4 -vmovshdup %ymm5,%ymm5 -vpblendd $0xAA,%ymm8,%ymm4,%ymm4 -vpblendd $0xAA,%ymm9,%ymm5,%ymm5 - -vpmuldq %ymm1,%ymm6,%ymm12 -vpmuldq %ymm1,%ymm7,%ymm13 -vmovshdup %ymm6,%ymm8 -vmovshdup %ymm7,%ymm9 -vpmuldq %ymm1,%ymm8,%ymm14 -vpmuldq %ymm1,%ymm9,%ymm15 -vpmuldq %ymm2,%ymm6,%ymm6 -vpmuldq %ymm2,%ymm7,%ymm7 -vpmuldq %ymm2,%ymm8,%ymm8 -vpmuldq %ymm2,%ymm9,%ymm9 -vpmuldq %ymm0,%ymm12,%ymm12 -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 -vpmuldq %ymm0,%ymm15,%ymm15 -vpsubd %ymm12,%ymm6,%ymm6 -vpsubd %ymm13,%ymm7,%ymm7 -vpsubd %ymm14,%ymm8,%ymm8 -vpsubd %ymm15,%ymm9,%ymm9 -vmovshdup %ymm6,%ymm6 -vmovshdup %ymm7,%ymm7 -vpblendd $0xAA,%ymm8,%ymm6,%ymm6 -vpblendd $0xAA,%ymm9,%ymm7,%ymm7 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd $0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 /* Bounds: |ymm{i}| < MONTMUL_BOUND for i in 4...7 */ @@ -372,15 +372,15 @@ MLD_ASM_FN_SYMBOL(invntt_avx2) vmovdqa 
MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 -levels0t5 0 -levels0t5 1 -levels0t5 2 -levels0t5 3 +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 -levels6t7 0 -levels6t7 1 -levels6t7 2 -levels6t7 3 +levels6t7 0 +levels6t7 1 +levels6t7 2 +levels6t7 3 ret /* simpasm: footer-start */ diff --git a/dev/x86_64/src/ntt.S b/dev/x86_64/src/ntt.S index 2da198c0e..45c118d35 100644 --- a/dev/x86_64/src/ntt.S +++ b/dev/x86_64/src/ntt.S @@ -26,22 +26,22 @@ #include "consts.h" .macro shuffle8 r0, r1, r2, r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle4 r0, r1, r2, r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0, r1, r2, r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm /* @@ -55,18 +55,18 @@ vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 * each layer. */ .macro butterfly l, h, zl0=1, zl1=1, zh0=2, zh1=2 -vpmuldq %ymm\zl0,%ymm\h,%ymm13 -vmovshdup %ymm\h,%ymm12 -vpmuldq %ymm\zl1,%ymm12,%ymm14 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 -vpmuldq %ymm\zh0,%ymm\h,%ymm\h -vpmuldq %ymm\zh1,%ymm12,%ymm12 +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 -vmovshdup %ymm\h,%ymm\h -vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */ +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */ /* * Originally, mulhi(h, zh) should be subtracted by mulhi(q, mullo(h, zl)) in @@ -109,200 +109,200 @@ vpaddd %ymm\l,%ymm\h,%ymm\l /* l + mulhi(h, zh) vpaddd %ymm\h,%ymm\l,%ymm\l .endif -vmovshdup %ymm13,%ymm13 -vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q, mullo(h, zl)) */ +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q, mullo(h, zl)) */ /* Finish the delayed task mentioned above */ -vpaddd %ymm13,%ymm12,%ymm\h /* h' */ -vpsubd %ymm13,%ymm\l,%ymm\l /* l' */ +vpaddd %ymm13,%ymm12,%ymm\h /* h' */ +vpsubd %ymm13,%ymm\l,%ymm\l /* l' */ .endm .macro levels0t1 off /* level 0 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2 - -vmovdqa 0+32*\off(%rdi),%ymm4 -vmovdqa 128+32*\off(%rdi),%ymm5 -vmovdqa 256+32*\off(%rdi),%ymm6 -vmovdqa 384+32*\off(%rdi),%ymm7 -vmovdqa 512+32*\off(%rdi),%ymm8 -vmovdqa 640+32*\off(%rdi),%ymm9 -vmovdqa 768+32*\off(%rdi),%ymm10 -vmovdqa 896+32*\off(%rdi),%ymm11 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2 + +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 /* Bounds: |ymm{i}| < q */ -butterfly 4, 8 -butterfly 5, 9 -butterfly 6, 10 -butterfly 7, 11 +butterfly 4, 8 +butterfly 5, 9 +butterfly 6, 
10 +butterfly 7, 11 /* Bounds: |ymm{i}| < 2q */ /* level 1 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 -butterfly 4, 6 -butterfly 5, 7 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4, 6 +butterfly 5, 7 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 -butterfly 8, 10 -butterfly 9, 11 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 +butterfly 8, 10 +butterfly 9, 11 /* Bounds: |ymm{i}| < 3q */ -vmovdqa %ymm4, 0+32*\off(%rdi) -vmovdqa %ymm5,128+32*\off(%rdi) -vmovdqa %ymm6,256+32*\off(%rdi) -vmovdqa %ymm7,384+32*\off(%rdi) -vmovdqa %ymm8,512+32*\off(%rdi) -vmovdqa %ymm9,640+32*\off(%rdi) -vmovdqa %ymm10,768+32*\off(%rdi) -vmovdqa %ymm11,896+32*\off(%rdi) +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) .endm .macro levels2t7 off /* level 2 */ -vmovdqa 256*\off+ 0(%rdi),%ymm4 -vmovdqa 256*\off+ 32(%rdi),%ymm5 -vmovdqa 256*\off+ 64(%rdi),%ymm6 -vmovdqa 256*\off+ 96(%rdi),%ymm7 -vmovdqa 256*\off+128(%rdi),%ymm8 -vmovdqa 256*\off+160(%rdi),%ymm9 -vmovdqa 256*\off+192(%rdi),%ymm10 -vmovdqa 256*\off+224(%rdi),%ymm11 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2 - -butterfly 4, 8 -butterfly 5, 9 -butterfly 6, 10 -butterfly 7, 11 - -shuffle8 4, 8, 3, 8 -shuffle8 5, 9, 4, 9 -shuffle8 6, 10, 5, 10 -shuffle8 7, 11, 6, 11 +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4, 8 +butterfly 5, 9 +butterfly 6, 10 +butterfly 7, 11 + +shuffle8 4, 8, 3, 8 +shuffle8 5, 9, 4, 9 +shuffle8 6, 10, 5, 10 +shuffle8 7, 11, 6, 11 /* Bounds: |ymm{i}| < 4q */ /* level 3 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2 -butterfly 3, 5 -butterfly 8, 10 -butterfly 4, 6 -butterfly 9, 11 +butterfly 3, 5 +butterfly 8, 10 +butterfly 4, 6 +butterfly 9, 11 -shuffle4 3, 5, 7, 5 -shuffle4 8, 10, 3, 10 -shuffle4 4, 6, 8, 6 -shuffle4 9, 11, 4, 11 +shuffle4 3, 5, 7, 5 +shuffle4 8, 10, 3, 10 +shuffle4 4, 6, 8, 6 +shuffle4 9, 11, 4, 11 /* Bounds: |ymm{i}| < 5q */ /* level 4 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2 -butterfly 7, 8 -butterfly 5, 6 -butterfly 3, 4 -butterfly 10, 11 
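Per 32-bit lane, each butterfly multiplies the high half by a twiddle factor using signed Montgomery multiplication; the ZETAS_QINV table appears to hold the twiddles premultiplied by q^-1 mod 2^32, so the low product needs only one vpmuldq. A scalar C model of the reduction (a sketch under those assumptions, not this repository's exact helper):

#include <stdint.h>

#define MLDSA_Q 8380417
#define MLDSA_QINV 58728449u /* q^-1 mod 2^32 */

/* Signed Montgomery multiplication: returns r with
 * r == a*b*2^-32 (mod q) and |r| <= |a*b|/2^32 + q/2.
 * This mirrors the vpmuldq/vpmuldq/vpsubd pattern in the
 * butterfly macro: t = mullo(a*b*qinv); r = mulhi(a*b) - mulhi(t*q). */
static int32_t montmul(int32_t a, int32_t b)
{
  int64_t prod = (int64_t)a * b;
  /* low 32 bits of prod * q^-1 (unsigned to avoid signed-overflow UB) */
  int32_t t = (int32_t)((uint32_t)prod * MLDSA_QINV);
  return (int32_t)((prod - (int64_t)t * MLDSA_Q) >> 32);
}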
+butterfly 7, 8 +butterfly 5, 6 +butterfly 3, 4 +butterfly 10, 11 -shuffle2 7, 8, 9, 8 -shuffle2 5, 6, 7, 6 -shuffle2 3, 4, 5, 4 -shuffle2 10, 11, 3, 11 +shuffle2 7, 8, 9, 8 +shuffle2 5, 6, 7, 6 +shuffle2 3, 4, 5, 4 +shuffle2 10, 11, 3, 11 /* Bounds: |ymm{i}| < 6q */ /* level 5 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 -butterfly 9, 5, 1, 10, 2, 15 -butterfly 8, 4, 1, 10, 2, 15 -butterfly 7, 3, 1, 10, 2, 15 -butterfly 6, 11, 1, 10, 2, 15 +butterfly 9, 5, 1, 10, 2, 15 +butterfly 8, 4, 1, 10, 2, 15 +butterfly 7, 3, 1, 10, 2, 15 +butterfly 6, 11, 1, 10, 2, 15 /* Bounds: |ymm{i}| < 7q */ /* level 6 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 9, 7, 1, 10, 2, 15 -butterfly 8, 6, 1, 10, 2, 15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 5, 3, 1, 10, 2, 15 -butterfly 4, 11, 1, 10, 2, 15 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9, 7, 1, 10, 2, 15 +butterfly 8, 6, 1, 10, 2, 15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5, 3, 1, 10, 2, 15 +butterfly 4, 11, 1, 10, 2, 15 /* Bounds: |ymm{i}| < 8q */ /* level 7 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 9, 8, 1, 10, 2, 15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 7, 6, 1, 10, 2, 15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 5, 4, 1, 10, 2, 15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 3, 11, 1, 10, 2, 15 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9, 8, 1, 10, 2, 15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7, 6, 1, 10, 2, 15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa 
(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5, 4, 1, 10, 2, 15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3, 11, 1, 10, 2, 15 /* Bounds: |ymm{i}| < 9q */ -vmovdqa %ymm9,256*\off+ 0(%rdi) -vmovdqa %ymm8,256*\off+ 32(%rdi) -vmovdqa %ymm7,256*\off+ 64(%rdi) -vmovdqa %ymm6,256*\off+ 96(%rdi) -vmovdqa %ymm5,256*\off+128(%rdi) -vmovdqa %ymm4,256*\off+160(%rdi) -vmovdqa %ymm3,256*\off+192(%rdi) -vmovdqa %ymm11,256*\off+224(%rdi) +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) .endm .text .balign 4 .global MLD_ASM_NAMESPACE(ntt_avx2) MLD_ASM_FN_SYMBOL(ntt_avx2) -vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 -levels0t1 0 -levels0t1 1 -levels0t1 2 -levels0t1 3 +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 -levels2t7 0 -levels2t7 1 -levels2t7 2 -levels2t7 3 +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 ret /* simpasm: footer-start */ diff --git a/dev/x86_64/src/nttunpack.S b/dev/x86_64/src/nttunpack.S index 032fd2343..e6e3eca39 100644 --- a/dev/x86_64/src/nttunpack.S +++ b/dev/x86_64/src/nttunpack.S @@ -25,59 +25,59 @@ /* simpasm: header-end */ .macro shuffle8 r0, r1, r2, r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle4 r0, r1, r2, r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0, r1, r2, r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro nttunpack_64_coefficients offset #load -vmovdqa (\offset + 0)(%rdi), %ymm4 -vmovdqa (\offset + 32)(%rdi), %ymm5 -vmovdqa (\offset + 64)(%rdi), %ymm6 -vmovdqa (\offset + 96)(%rdi), %ymm7 -vmovdqa (\offset + 128)(%rdi), %ymm8 -vmovdqa (\offset + 160)(%rdi), %ymm9 -vmovdqa (\offset + 192)(%rdi), %ymm10 -vmovdqa (\offset + 224)(%rdi), %ymm11 - -shuffle8 4, 8, 3, 8 -shuffle8 5, 9, 4, 9 -shuffle8 6, 10, 5, 10 -shuffle8 7, 11, 6, 11 - -shuffle4 3, 5, 7, 5 -shuffle4 8, 10, 3, 10 -shuffle4 4, 6, 8, 6 -shuffle4 9, 11, 4, 11 - -shuffle2 7, 8, 9, 8 -shuffle2 5, 6, 7, 6 -shuffle2 3, 4, 5, 4 -shuffle2 10, 11, 3, 11 +vmovdqa (\offset + 0)(%rdi), %ymm4 +vmovdqa (\offset + 32)(%rdi), %ymm5 +vmovdqa (\offset + 64)(%rdi), %ymm6 +vmovdqa (\offset + 96)(%rdi), %ymm7 +vmovdqa (\offset + 128)(%rdi), %ymm8 +vmovdqa (\offset + 160)(%rdi), %ymm9 +vmovdqa (\offset + 192)(%rdi), %ymm10 +vmovdqa (\offset + 224)(%rdi), %ymm11 + +shuffle8 4, 8, 3, 8 +shuffle8 5, 9, 4, 9 +shuffle8 6, 10, 5, 10 +shuffle8 7, 11, 6, 11 + +shuffle4 3, 5, 7, 5 +shuffle4 8, 10, 3, 10 +shuffle4 4, 6, 8, 6 +shuffle4 9, 11, 4, 11 + +shuffle2 7, 8, 9, 8 +shuffle2 5, 6, 7, 
6 +shuffle2 3, 4, 5, 4 +shuffle2 10, 11, 3, 11 #store -vmovdqa %ymm9, (\offset + 0)(%rdi) -vmovdqa %ymm8, (\offset + 32)(%rdi) -vmovdqa %ymm7, (\offset + 64)(%rdi) -vmovdqa %ymm6, (\offset + 96)(%rdi) -vmovdqa %ymm5, (\offset + 128)(%rdi) -vmovdqa %ymm4, (\offset + 160)(%rdi) -vmovdqa %ymm3, (\offset + 192)(%rdi) -vmovdqa %ymm11, (\offset + 224)(%rdi) +vmovdqa %ymm9, (\offset + 0)(%rdi) +vmovdqa %ymm8, (\offset + 32)(%rdi) +vmovdqa %ymm7, (\offset + 64)(%rdi) +vmovdqa %ymm6, (\offset + 96)(%rdi) +vmovdqa %ymm5, (\offset + 128)(%rdi) +vmovdqa %ymm4, (\offset + 160)(%rdi) +vmovdqa %ymm3, (\offset + 192)(%rdi) +vmovdqa %ymm11, (\offset + 224)(%rdi) .endm diff --git a/dev/x86_64/src/poly_caddq_avx2.S b/dev/x86_64/src/poly_caddq_avx2.S index 882424451..9625c27a0 100644 --- a/dev/x86_64/src/poly_caddq_avx2.S +++ b/dev/x86_64/src/poly_caddq_avx2.S @@ -42,15 +42,15 @@ vmovdqa \reg, \offset(%rdi) .endm .text -.global MLD_ASM_NAMESPACE(poly_caddq_avx2) +.global MLD_ASM_NAMESPACE(poly_caddq_avx2) .balign 16 MLD_ASM_FN_SYMBOL(poly_caddq_avx2) -mov $8380417, %edx -leaq 1024(%rdi), %rax -vpxor %xmm2, %xmm2, %xmm2 -vmovd %edx, %xmm1 -vpbroadcastd %xmm1, %ymm1 +mov $8380417, %edx +leaq 1024(%rdi), %rax +vpxor %xmm2, %xmm2, %xmm2 +vmovd %edx, %xmm1 +vpbroadcastd %xmm1, %ymm1 poly_caddq_avx2_loop: diff --git a/mldsa/src/cbmc.h b/mldsa/src/cbmc.h index 981755009..b9158863e 100644 --- a/mldsa/src/cbmc.h +++ b/mldsa/src/cbmc.h @@ -96,7 +96,7 @@ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \ } -#define exists(qvar, qvar_lb, qvar_ub, predicate) \ +#define exists(qvar, qvar_lb, qvar_ub, predicate) \ __CPROVER_exists \ { \ unsigned qvar; \ @@ -120,30 +120,30 @@ { \ unsigned qvar; \ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ - (((int)(value_lb) <= ((array_var)[(qvar)])) && \ - (((array_var)[(qvar)]) < (int)(value_ub))) \ + (((int)(value_lb) <= ((array_var)[(qvar)])) && \ + (((array_var)[(qvar)]) < (int)(value_ub))) \ } #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \ - array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \ + array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \ (qvar_ub), (array_var), (value_lb), (value_ub)) -#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \ - __CPROVER_forall \ - { \ - unsigned qvar; \ - ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ +#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \ + __CPROVER_forall \ + { \ + unsigned qvar; \ + ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ ((array_var)[(qvar)]) == (old(* (int32_t (*)[(qvar_ub)])(array_var)))[(qvar)] \ } #define array_unchanged(array_var, N) \ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var)) -#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \ - __CPROVER_forall \ - { \ - unsigned qvar; \ - ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ +#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \ + __CPROVER_forall \ + { \ + unsigned qvar; \ + ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \ } diff --git a/mldsa/src/fips202/keccakf1600.h b/mldsa/src/fips202/keccakf1600.h index c226804a3..d1d75e911 100644 --- a/mldsa/src/fips202/keccakf1600.h +++ b/mldsa/src/fips202/keccakf1600.h @@ -25,7 +25,7 @@ void mld_keccakf1600_extract_bytes(uint64_t *state, unsigned char *data, unsigned offset, unsigned length) __contract__( requires(0 <= offset && offset <= MLD_KECCAK_LANES * 
sizeof(uint64_t) && - 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) + 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) requires(memory_no_alias(state, sizeof(uint64_t) * MLD_KECCAK_LANES)) requires(memory_no_alias(data, length)) assigns(memory_slice(data, length)) @@ -37,7 +37,7 @@ void mld_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data, unsigned offset, unsigned length) __contract__( requires(0 <= offset && offset <= MLD_KECCAK_LANES * sizeof(uint64_t) && - 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) + 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) requires(memory_no_alias(state, sizeof(uint64_t) * MLD_KECCAK_LANES)) requires(memory_no_alias(data, length)) assigns(memory_slice(state, sizeof(uint64_t) * MLD_KECCAK_LANES)) @@ -52,7 +52,7 @@ void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, unsigned length) __contract__( requires(0 <= offset && offset <= MLD_KECCAK_LANES * sizeof(uint64_t) && - 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) + 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) requires(memory_no_alias(state, sizeof(uint64_t) * MLD_KECCAK_LANES * MLD_KECCAK_WAY)) requires(memory_no_alias(data0, length)) requires(memory_no_alias(data1, length)) @@ -73,14 +73,14 @@ void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, unsigned length) __contract__( requires(0 <= offset && offset <= MLD_KECCAK_LANES * sizeof(uint64_t) && - 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) + 0 <= length && length <= MLD_KECCAK_LANES * sizeof(uint64_t) - offset) requires(memory_no_alias(state, sizeof(uint64_t) * MLD_KECCAK_LANES * MLD_KECCAK_WAY)) requires(memory_no_alias(data0, length)) /* Case 1: all input buffers are distinct; Case 2: All input buffers are the same */ requires((data0 == data1 && data0 == data2 && data0 == data3) || - (memory_no_alias(data1, length) && + (memory_no_alias(data1, length) && memory_no_alias(data2, length) && memory_no_alias(data3, length))) assigns(memory_slice(state, sizeof(uint64_t) * MLD_KECCAK_LANES * MLD_KECCAK_WAY)) diff --git a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S index 52f780f05..f3d851495 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +++ b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S @@ -46,322 +46,322 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x1_scalar_asm) .cfi_startproc - sub sp, sp, #0x80 + sub sp, sp, #0x80 .cfi_adjust_cfa_offset 0x80 - stp x19, x20, [sp, #0x20] + stp x19, x20, [sp, #0x20] .cfi_rel_offset x19, 0x20 .cfi_rel_offset x20, 0x28 - stp x21, x22, [sp, #0x30] + stp x21, x22, [sp, #0x30] .cfi_rel_offset x21, 0x30 .cfi_rel_offset x22, 0x38 - stp x23, x24, [sp, #0x40] + stp x23, x24, [sp, #0x40] .cfi_rel_offset x23, 0x40 .cfi_rel_offset x24, 0x48 - stp x25, x26, [sp, #0x50] + stp x25, x26, [sp, #0x50] .cfi_rel_offset x25, 0x50 .cfi_rel_offset x26, 0x58 - stp x27, x28, [sp, #0x60] + stp x27, x28, [sp, #0x60] .cfi_rel_offset x27, 0x60 .cfi_rel_offset x28, 0x68 - stp x29, x30, [sp, #0x70] + stp x29, x30, [sp, #0x70] .cfi_rel_offset x29, 0x70 .cfi_rel_offset x30, 0x78 Lkeccak_f1600_x1_scalar_initial: - mov x26, x1 - str x1, [sp, #0x8] - ldp x1, x6, [x0] - ldp x11, x16, [x0, #0x10] - ldp x21, x2, [x0, #0x20] - ldp x7, x12, [x0, #0x30] - ldp x17, x22, [x0, #0x40] - ldp x3, x8, [x0, 
#0x50] - ldp x13, x28, [x0, #0x60] - ldp x23, x4, [x0, #0x70] - ldp x9, x14, [x0, #0x80] - ldp x19, x24, [x0, #0x90] - ldp x5, x10, [x0, #0xa0] - ldp x15, x20, [x0, #0xb0] - ldr x25, [x0, #0xc0] - str x0, [sp] - eor x30, x24, x25 - eor x27, x9, x10 - eor x0, x30, x21 - eor x26, x27, x6 - eor x27, x26, x7 - eor x29, x0, x22 - eor x26, x29, x23 - eor x29, x4, x5 - eor x30, x29, x1 - eor x0, x27, x8 - eor x29, x30, x2 - eor x30, x19, x20 - eor x30, x30, x16 - eor x27, x26, x0, ror #63 - eor x4, x4, x27 - eor x30, x30, x17 - eor x30, x30, x28 - eor x29, x29, x3 - eor x0, x0, x30, ror #63 - eor x30, x30, x29, ror #63 - eor x22, x22, x30 - eor x23, x23, x30 - str x23, [sp, #0x18] - eor x23, x14, x15 - eor x14, x14, x0 - eor x23, x23, x11 - eor x15, x15, x0 - eor x1, x1, x27 - eor x23, x23, x12 - eor x23, x23, x13 - eor x11, x11, x0 - eor x29, x29, x23, ror #63 - eor x23, x23, x26, ror #63 - eor x26, x13, x0 - eor x13, x28, x23 - eor x28, x24, x30 - eor x24, x16, x23 - eor x16, x21, x30 - eor x21, x25, x30 - eor x30, x19, x23 - eor x19, x20, x23 - eor x20, x17, x23 - eor x17, x12, x0 - eor x0, x2, x27 - eor x2, x6, x29 - eor x6, x8, x29 - bic x8, x28, x13, ror #47 - eor x12, x3, x27 - bic x3, x13, x17, ror #19 - eor x5, x5, x27 - ldr x27, [sp, #0x18] - bic x25, x17, x2, ror #5 - eor x9, x9, x29 - eor x23, x25, x5, ror #52 - eor x3, x3, x2, ror #24 - eor x8, x8, x17, ror #2 - eor x17, x10, x29 - bic x25, x12, x22, ror #47 - eor x29, x7, x29 - bic x10, x4, x27, ror #2 - bic x7, x5, x28, ror #10 - eor x10, x10, x20, ror #50 - eor x13, x7, x13, ror #57 - bic x7, x2, x5, ror #47 - eor x2, x25, x24, ror #39 - bic x25, x20, x11, ror #57 - bic x5, x17, x4, ror #25 - eor x25, x25, x17, ror #53 - bic x17, x11, x17, ror #60 - eor x28, x7, x28, ror #57 - bic x7, x9, x12, ror #42 - eor x7, x7, x22, ror #25 - bic x22, x22, x24, ror #56 - bic x24, x24, x15, ror #31 - eor x22, x22, x15, ror #23 - bic x20, x27, x20, ror #48 - bic x15, x15, x9, ror #16 - eor x12, x15, x12, ror #58 - eor x15, x5, x27, ror #27 - eor x5, x20, x11, ror #41 - ldr x11, [sp, #0x8] - eor x20, x17, x4, ror #21 - eor x17, x24, x9, ror #47 - mov x24, #0x1 // =1 - bic x9, x0, x16, ror #9 - str x24, [sp, #0x10] - bic x24, x29, x1, ror #44 - bic x27, x1, x21, ror #50 - bic x4, x26, x29, ror #63 - eor x1, x1, x4, ror #21 - ldr x11, [x11] - bic x4, x21, x30, ror #57 - eor x21, x24, x21, ror #30 - eor x24, x9, x19, ror #44 - bic x9, x14, x6, ror #5 - eor x9, x9, x0, ror #43 - bic x0, x6, x0, ror #38 - eor x1, x1, x11 - eor x11, x4, x26, ror #35 - eor x4, x0, x16, ror #47 - bic x0, x16, x19, ror #35 - eor x16, x27, x30, ror #43 - bic x27, x30, x26, ror #42 - bic x26, x19, x14, ror #41 - eor x19, x0, x14, ror #12 - eor x14, x26, x6, ror #46 - eor x6, x27, x29, ror #41 + mov x26, x1 + str x1, [sp, #0x8] + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + str x0, [sp] + eor x30, x24, x25 + eor x27, x9, x10 + eor x0, x30, x21 + eor x26, x27, x6 + eor x27, x26, x7 + eor x29, x0, x22 + eor x26, x29, x23 + eor x29, x4, x5 + eor x30, x29, x1 + eor x0, x27, x8 + eor x29, x30, x2 + eor x30, x19, x20 + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor x4, x4, x27 + eor x30, x30, x17 + eor x30, x30, x28 + eor x29, x29, x3 + eor x0, x0, x30, ror #63 + eor x30, x30, x29, 
ror #63 + eor x22, x22, x30 + eor x23, x23, x30 + str x23, [sp, #0x18] + eor x23, x14, x15 + eor x14, x14, x0 + eor x23, x23, x11 + eor x15, x15, x0 + eor x1, x1, x27 + eor x23, x23, x12 + eor x23, x23, x13 + eor x11, x11, x0 + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + eor x26, x13, x0 + eor x13, x28, x23 + eor x28, x24, x30 + eor x24, x16, x23 + eor x16, x21, x30 + eor x21, x25, x30 + eor x30, x19, x23 + eor x19, x20, x23 + eor x20, x17, x23 + eor x17, x12, x0 + eor x0, x2, x27 + eor x2, x6, x29 + eor x6, x8, x29 + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + ldr x27, [sp, #0x18] + bic x25, x17, x2, ror #5 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor x17, x10, x29 + bic x25, x12, x22, ror #47 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + bic x7, x5, x28, ror #10 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor x5, x20, x11, ror #41 + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + bic x9, x0, x16, ror #9 + str x24, [sp, #0x10] + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + ldr x11, [x11] + bic x4, x21, x30, ror #57 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + bic x9, x14, x6, ror #5 + eor x9, x9, x0, ror #43 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + bic x0, x16, x19, ror #35 + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 Lkeccak_f1600_x1_scalar_loop: - eor x0, x15, x11, ror #52 - eor x0, x0, x13, ror #48 - eor x26, x8, x9, ror #57 - eor x27, x0, x14, ror #10 - eor x29, x16, x28, ror #63 - eor x26, x26, x6, ror #51 - eor x30, x23, x22, ror #50 - eor x0, x26, x10, ror #31 - eor x29, x29, x19, ror #37 - eor x27, x27, x12, ror #5 - eor x30, x30, x24, ror #34 - eor x0, x0, x7, ror #27 - eor x26, x30, x21, ror #26 - eor x26, x26, x25, ror #15 - ror x30, x27, #0x3e - eor x30, x30, x26, ror #57 - ror x26, x26, #0x3a - eor x16, x30, x16 - eor x28, x30, x28, ror #63 - str x28, [sp, #0x18] - eor x29, x29, x17, ror #36 - eor x28, x1, x2, ror #61 - eor x19, x30, x19, ror #37 - eor x29, x29, x20, ror #2 - eor x28, x28, x4, ror #54 - eor x26, x26, x0, ror #55 - eor x28, x28, x3, ror #39 - eor x28, x28, x5, ror #25 - ror x0, x0, #0x38 - eor x0, x0, x29, ror #63 - eor x27, x28, x27, ror #61 - eor x13, x0, x13, ror #46 - eor x28, x29, x28, ror #63 - eor x29, x30, x20, ror #2 - eor x20, x26, x3, ror #39 - eor x11, x0, x11, ror #50 - eor x25, x28, x25, ror #9 - eor x3, x28, x21, ror #20 - eor x21, x26, x1 - eor x9, x27, x9, ror #49 - eor x24, x28, x24, ror #28 - eor x1, x30, x17, ror #36 - eor x14, x0, x14, ror #8 - eor x22, x28, x22, ror #44 - eor x8, x27, x8, ror #56 - eor x17, x27, x7, ror #19 - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - eor x4, x26, x4, ror #54 - eor x0, x0, x12, ror #3 - eor x28, x28, 
x23, ror #58 - eor x23, x26, x2, ror #61 - eor x26, x26, x5, ror #25 - eor x2, x7, x16, ror #39 - bic x7, x9, x20, ror #42 - bic x30, x15, x9, ror #16 - eor x7, x7, x22, ror #25 - eor x12, x30, x20, ror #58 - bic x20, x22, x16, ror #56 - eor x30, x27, x6, ror #43 - eor x22, x20, x15, ror #23 - bic x6, x19, x13, ror #42 - eor x6, x6, x17, ror #41 - bic x5, x13, x17, ror #63 - eor x5, x21, x5, ror #21 - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - bic x21, x21, x25, ror #50 - bic x20, x27, x4, ror #25 - bic x10, x16, x15, ror #31 - eor x16, x21, x19, ror #43 - eor x21, x17, x25, ror #30 - bic x19, x25, x19, ror #57 - ldr x25, [sp, #0x10] - eor x17, x10, x9, ror #47 - ldr x9, [sp, #0x8] - eor x15, x20, x28, ror #27 - bic x20, x4, x28, ror #2 - eor x10, x20, x1, ror #50 - bic x20, x11, x27, ror #60 - eor x20, x20, x4, ror #21 - bic x4, x28, x1, ror #48 - bic x1, x1, x11, ror #57 - ldr x28, [x9, x25, lsl #3] - ldr x9, [sp, #0x18] - add x25, x25, #0x1 - str x25, [sp, #0x10] - cmp x25, #0x17 - eor x25, x1, x27, ror #53 - bic x27, x30, x26, ror #47 - eor x1, x5, x28 - eor x5, x4, x11, ror #41 - eor x11, x19, x13, ror #35 - bic x13, x26, x24, ror #10 - eor x28, x27, x24, ror #57 - bic x27, x24, x9, ror #47 - bic x19, x23, x3, ror #9 - bic x4, x29, x14, ror #41 - eor x24, x19, x29, ror #44 - bic x29, x3, x29, ror #35 - eor x13, x13, x9, ror #57 - eor x19, x29, x14, ror #12 - bic x29, x9, x0, ror #19 - bic x14, x14, x8, ror #5 - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - b.le Lkeccak_f1600_x1_scalar_loop - ror x6, x6, #0x2b - ror x11, x11, #0x32 - ror x21, x21, #0x14 - ror x2, x2, #0x3d - ror x7, x7, #0x13 - ror x12, x12, #0x3 - ror x17, x17, #0x24 - ror x22, x22, #0x2c - ror x3, x3, #0x27 - ror x8, x8, #0x38 - ror x13, x13, #0x2e - ror x28, x28, #0x3f - ror x23, x23, #0x3a - ror x4, x4, #0x36 - ror x9, x9, #0x31 - ror x14, x14, #0x8 - ror x19, x19, #0x25 - ror x24, x24, #0x1c - ror x5, x5, #0x19 - ror x10, x10, #0x17 - ror x15, x15, #0x3e - ror x20, x20, #0x2 - ror x25, x25, #0x9 - ldr x0, [sp] - stp x1, x6, [x0] - stp x11, x16, [x0, #0x10] - stp x21, x2, [x0, #0x20] - stp x7, x12, [x0, #0x30] - stp x17, x22, [x0, #0x40] - stp x3, x8, [x0, #0x50] - stp x13, x28, [x0, #0x60] - stp x23, x4, [x0, #0x70] - stp x9, x14, [x0, #0x80] - stp x19, x24, [x0, #0x90] - stp x5, x10, [x0, #0xa0] - stp x15, x20, [x0, #0xb0] - str x25, [x0, #0xc0] - ldp x19, x20, [sp, #0x20] + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + str x28, [sp, #0x18] + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + eor x11, x0, x11, 
ror #50 + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x10] + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0x18] + add x25, x25, #0x1 + str x25, [sp, #0x10] + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + b.le Lkeccak_f1600_x1_scalar_loop + ror x6, x6, #0x2b + ror x11, x11, #0x32 + ror x21, x21, #0x14 + ror x2, x2, #0x3d + ror x7, x7, #0x13 + ror x12, x12, #0x3 + ror x17, x17, #0x24 + ror x22, x22, #0x2c + ror x3, x3, #0x27 + ror x8, x8, #0x38 + ror x13, x13, #0x2e + ror x28, x28, #0x3f + ror x23, x23, #0x3a + ror x4, x4, #0x36 + ror x9, x9, #0x31 + ror x14, x14, #0x8 + ror x19, x19, #0x25 + ror x24, x24, #0x1c + ror x5, x5, #0x19 + ror x10, x10, #0x17 + ror x15, x15, #0x3e + ror x20, x20, #0x2 + ror x25, x25, #0x9 + ldr x0, [sp] + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + ldp x19, x20, [sp, #0x20] .cfi_restore x19 .cfi_restore x20 - ldp x21, x22, [sp, #0x30] + ldp x21, x22, [sp, #0x30] .cfi_restore x21 .cfi_restore x22 - ldp x23, x24, [sp, #0x40] + ldp x23, x24, [sp, #0x40] .cfi_restore x23 .cfi_restore x24 - ldp x25, x26, [sp, #0x50] + ldp x25, x26, [sp, #0x50] .cfi_restore x25 .cfi_restore x26 - ldp x27, x28, [sp, #0x60] + ldp x27, x28, [sp, #0x60] .cfi_restore x27 .cfi_restore x28 - ldp x29, x30, [sp, #0x70] + ldp x29, x30, [sp, #0x70] .cfi_restore x29 .cfi_restore x30 - add sp, sp, #0x80 + add sp, sp, #0x80 .cfi_adjust_cfa_offset -0x80 ret .cfi_endproc 
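The two files that follow (keccak_f1600_x1_v84a_asm.S and keccak_f1600_x2_v84a_asm.S) are built almost entirely from the four Armv8.4-A SHA-3 instructions. Their per-64-bit-lane semantics map directly onto Keccak's theta, rho, and chi steps and can be modeled in C as follows (a reference sketch of the ISA semantics, not code from this repository):

#include <stdint.h>

static uint64_t rol64(uint64_t x, unsigned r) /* r in 1..63 */
{
  return (x << r) | (x >> (64 - r));
}

/* EOR3: three-way XOR; computes theta's column parities C[x]. */
static uint64_t eor3(uint64_t a, uint64_t b, uint64_t c)
{
  return a ^ b ^ c;
}

/* RAX1: XOR with a left-rotate-by-1 operand; finishes theta's
 * D[x] = C[x-1] ^ rol(C[x+1], 1). */
static uint64_t rax1(uint64_t a, uint64_t b)
{
  return a ^ rol64(b, 1);
}

/* XAR: XOR then rotate right by an immediate; fuses theta's lane
 * update with rho's rotation (ror by 64 - rho_offset), imm in 1..63. */
static uint64_t xar(uint64_t a, uint64_t b, unsigned imm)
{
  uint64_t t = a ^ b;
  return (t >> imm) | (t << (64 - imm));
}

/* BCAX: bit-clear and XOR; one lane of chi, since
 * A[x] ^ (~A[x+1] & A[x+2]) == bcax(A[x], A[x+2], A[x+1]). */
static uint64_t bcax(uint64_t a, uint64_t b, uint64_t c)
{
  return a ^ (b & ~c);
}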
diff --git a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S index b85b7aafe..53ba57e89 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +++ b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S @@ -63,131 +63,131 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x1_v84a_asm) .cfi_startproc - sub sp, sp, #0x40 + sub sp, sp, #0x40 .cfi_adjust_cfa_offset 0x40 - stp d8, d9, [sp] + stp d8, d9, [sp] .cfi_rel_offset d8, 0x0 .cfi_rel_offset d9, 0x8 - stp d10, d11, [sp, #0x10] + stp d10, d11, [sp, #0x10] .cfi_rel_offset d10, 0x10 .cfi_rel_offset d11, 0x18 - stp d12, d13, [sp, #0x20] + stp d12, d13, [sp, #0x20] .cfi_rel_offset d12, 0x20 .cfi_rel_offset d13, 0x28 - stp d14, d15, [sp, #0x30] + stp d14, d15, [sp, #0x30] .cfi_rel_offset d14, 0x30 .cfi_rel_offset d15, 0x38 - ldp d0, d1, [x0] - ldp d2, d3, [x0, #0x10] - ldp d4, d5, [x0, #0x20] - ldp d6, d7, [x0, #0x30] - ldp d8, d9, [x0, #0x40] - ldp d10, d11, [x0, #0x50] - ldp d12, d13, [x0, #0x60] - ldp d14, d15, [x0, #0x70] - ldp d16, d17, [x0, #0x80] - ldp d18, d19, [x0, #0x90] - ldp d20, d21, [x0, #0xa0] - ldp d22, d23, [x0, #0xb0] - ldr d24, [x0, #0xc0] - mov x2, #0x18 // =24 + ldp d0, d1, [x0] + ldp d2, d3, [x0, #0x10] + ldp d4, d5, [x0, #0x20] + ldp d6, d7, [x0, #0x30] + ldp d8, d9, [x0, #0x40] + ldp d10, d11, [x0, #0x50] + ldp d12, d13, [x0, #0x60] + ldp d14, d15, [x0, #0x70] + ldp d16, d17, [x0, #0x80] + ldp d18, d19, [x0, #0x90] + ldp d20, d21, [x0, #0xa0] + ldp d22, d23, [x0, #0xb0] + ldr d24, [x0, #0xc0] + mov x2, #0x18 // =24 Lkeccak_f1600_x1_v84a_loop: - eor3 v30.16b, v0.16b, v5.16b, v10.16b - eor3 v29.16b, v1.16b, v6.16b, v11.16b - eor3 v28.16b, v2.16b, v7.16b, v12.16b - eor3 v27.16b, v3.16b, v8.16b, v13.16b - eor3 v26.16b, v4.16b, v9.16b, v14.16b - eor3 v30.16b, v30.16b, v15.16b, v20.16b - eor3 v29.16b, v29.16b, v16.16b, v21.16b - eor3 v28.16b, v28.16b, v17.16b, v22.16b - eor3 v27.16b, v27.16b, v18.16b, v23.16b - eor3 v26.16b, v26.16b, v19.16b, v24.16b - rax1 v25.2d, v30.2d, v28.2d - rax1 v28.2d, v28.2d, v26.2d - rax1 v26.2d, v26.2d, v29.2d - rax1 v29.2d, v29.2d, v27.2d - rax1 v27.2d, v27.2d, v30.2d - eor v30.16b, v0.16b, v26.16b - xar v0.2d, v2.2d, v29.2d, #0x2 - xar v2.2d, v12.2d, v29.2d, #0x15 - xar v12.2d, v13.2d, v28.2d, #0x27 - xar v13.2d, v19.2d, v27.2d, #0x38 - xar v19.2d, v23.2d, v28.2d, #0x8 - xar v23.2d, v15.2d, v26.2d, #0x17 - xar v15.2d, v1.2d, v25.2d, #0x3f - xar v1.2d, v8.2d, v28.2d, #0x9 - xar v8.2d, v16.2d, v25.2d, #0x13 - xar v16.2d, v7.2d, v29.2d, #0x3a - xar v7.2d, v10.2d, v26.2d, #0x3d - xar v10.2d, v3.2d, v28.2d, #0x24 - xar v3.2d, v18.2d, v28.2d, #0x2b - xar v18.2d, v17.2d, v29.2d, #0x31 - xar v17.2d, v11.2d, v25.2d, #0x36 - xar v11.2d, v9.2d, v27.2d, #0x2c - xar v9.2d, v22.2d, v29.2d, #0x3 - xar v22.2d, v14.2d, v27.2d, #0x19 - xar v14.2d, v20.2d, v26.2d, #0x2e - xar v20.2d, v4.2d, v27.2d, #0x25 - xar v4.2d, v24.2d, v27.2d, #0x32 - xar v24.2d, v21.2d, v25.2d, #0x3e - xar v21.2d, v5.2d, v26.2d, #0x1c - xar v27.2d, v6.2d, v25.2d, #0x14 - ld1r { v31.2d }, [x1], #8 - bcax v5.16b, v10.16b, v7.16b, v11.16b - bcax v6.16b, v11.16b, v8.16b, v7.16b - bcax v7.16b, v7.16b, v9.16b, v8.16b - bcax v8.16b, v8.16b, v10.16b, v9.16b - bcax v9.16b, v9.16b, v11.16b, v10.16b - bcax v10.16b, v15.16b, v12.16b, v16.16b - bcax v11.16b, v16.16b, v13.16b, v12.16b - bcax v12.16b, v12.16b, v14.16b, v13.16b - bcax v13.16b, v13.16b, v15.16b, v14.16b - bcax v14.16b, v14.16b, v16.16b, v15.16b - bcax v15.16b, v20.16b, v17.16b, v21.16b - bcax 
v16.16b, v21.16b, v18.16b, v17.16b - bcax v17.16b, v17.16b, v19.16b, v18.16b - bcax v18.16b, v18.16b, v20.16b, v19.16b - bcax v19.16b, v19.16b, v21.16b, v20.16b - bcax v20.16b, v0.16b, v22.16b, v1.16b - bcax v21.16b, v1.16b, v23.16b, v22.16b - bcax v22.16b, v22.16b, v24.16b, v23.16b - bcax v23.16b, v23.16b, v0.16b, v24.16b - bcax v24.16b, v24.16b, v1.16b, v0.16b - bcax v0.16b, v30.16b, v2.16b, v27.16b - bcax v1.16b, v27.16b, v3.16b, v2.16b - bcax v2.16b, v2.16b, v4.16b, v3.16b - bcax v3.16b, v3.16b, v30.16b, v4.16b - bcax v4.16b, v4.16b, v27.16b, v30.16b - eor v0.16b, v0.16b, v31.16b - sub x2, x2, #0x1 - cbnz x2, Lkeccak_f1600_x1_v84a_loop - stp d0, d1, [x0] - stp d2, d3, [x0, #0x10] - stp d4, d5, [x0, #0x20] - stp d6, d7, [x0, #0x30] - stp d8, d9, [x0, #0x40] - stp d10, d11, [x0, #0x50] - stp d12, d13, [x0, #0x60] - stp d14, d15, [x0, #0x70] - stp d16, d17, [x0, #0x80] - stp d18, d19, [x0, #0x90] - stp d20, d21, [x0, #0xa0] - stp d22, d23, [x0, #0xb0] - str d24, [x0, #0xc0] - ldp d8, d9, [sp] + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor3 v30.16b, v30.16b, v15.16b, v20.16b + eor3 v29.16b, v29.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor3 v27.16b, v27.16b, v18.16b, v23.16b + eor3 v26.16b, v26.16b, v19.16b, v24.16b + rax1 v25.2d, v30.2d, v28.2d + rax1 v28.2d, v28.2d, v26.2d + rax1 v26.2d, v26.2d, v29.2d + rax1 v29.2d, v29.2d, v27.2d + rax1 v27.2d, v27.2d, v30.2d + eor v30.16b, v0.16b, v26.16b + xar v0.2d, v2.2d, v29.2d, #0x2 + xar v2.2d, v12.2d, v29.2d, #0x15 + xar v12.2d, v13.2d, v28.2d, #0x27 + xar v13.2d, v19.2d, v27.2d, #0x38 + xar v19.2d, v23.2d, v28.2d, #0x8 + xar v23.2d, v15.2d, v26.2d, #0x17 + xar v15.2d, v1.2d, v25.2d, #0x3f + xar v1.2d, v8.2d, v28.2d, #0x9 + xar v8.2d, v16.2d, v25.2d, #0x13 + xar v16.2d, v7.2d, v29.2d, #0x3a + xar v7.2d, v10.2d, v26.2d, #0x3d + xar v10.2d, v3.2d, v28.2d, #0x24 + xar v3.2d, v18.2d, v28.2d, #0x2b + xar v18.2d, v17.2d, v29.2d, #0x31 + xar v17.2d, v11.2d, v25.2d, #0x36 + xar v11.2d, v9.2d, v27.2d, #0x2c + xar v9.2d, v22.2d, v29.2d, #0x3 + xar v22.2d, v14.2d, v27.2d, #0x19 + xar v14.2d, v20.2d, v26.2d, #0x2e + xar v20.2d, v4.2d, v27.2d, #0x25 + xar v4.2d, v24.2d, v27.2d, #0x32 + xar v24.2d, v21.2d, v25.2d, #0x3e + xar v21.2d, v5.2d, v26.2d, #0x1c + xar v27.2d, v6.2d, v25.2d, #0x14 + ld1r { v31.2d }, [x1], #8 + bcax v5.16b, v10.16b, v7.16b, v11.16b + bcax v6.16b, v11.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, v8.16b + bcax v8.16b, v8.16b, v10.16b, v9.16b + bcax v9.16b, v9.16b, v11.16b, v10.16b + bcax v10.16b, v15.16b, v12.16b, v16.16b + bcax v11.16b, v16.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v15.16b, v14.16b + bcax v14.16b, v14.16b, v16.16b, v15.16b + bcax v15.16b, v20.16b, v17.16b, v21.16b + bcax v16.16b, v21.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v20.16b, v19.16b + bcax v19.16b, v19.16b, v21.16b, v20.16b + bcax v20.16b, v0.16b, v22.16b, v1.16b + bcax v21.16b, v1.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v0.16b, v24.16b + bcax v24.16b, v24.16b, v1.16b, v0.16b + bcax v0.16b, v30.16b, v2.16b, v27.16b + bcax v1.16b, v27.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v30.16b, v4.16b + bcax v4.16b, v4.16b, v27.16b, v30.16b + eor v0.16b, v0.16b, v31.16b + sub x2, x2, #0x1 + cbnz x2, 
Lkeccak_f1600_x1_v84a_loop + stp d0, d1, [x0] + stp d2, d3, [x0, #0x10] + stp d4, d5, [x0, #0x20] + stp d6, d7, [x0, #0x30] + stp d8, d9, [x0, #0x40] + stp d10, d11, [x0, #0x50] + stp d12, d13, [x0, #0x60] + stp d14, d15, [x0, #0x70] + stp d16, d17, [x0, #0x80] + stp d18, d19, [x0, #0x90] + stp d20, d21, [x0, #0xa0] + stp d22, d23, [x0, #0xb0] + str d24, [x0, #0xc0] + ldp d8, d9, [sp] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0x10] + ldp d10, d11, [sp, #0x10] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0x20] + ldp d12, d13, [sp, #0x20] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0x30] + ldp d14, d15, [sp, #0x30] .cfi_restore d14 .cfi_restore d15 - add sp, sp, #0x40 + add sp, sp, #0x40 .cfi_adjust_cfa_offset -0x40 ret .cfi_endproc diff --git a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S index 653bbb692..0acb92a9c 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +++ b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S @@ -63,186 +63,186 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x2_v84a_asm) .cfi_startproc - sub sp, sp, #0x40 + sub sp, sp, #0x40 .cfi_adjust_cfa_offset 0x40 - stp d8, d9, [sp] + stp d8, d9, [sp] .cfi_rel_offset d8, 0x0 .cfi_rel_offset d9, 0x8 - stp d10, d11, [sp, #0x10] + stp d10, d11, [sp, #0x10] .cfi_rel_offset d10, 0x10 .cfi_rel_offset d11, 0x18 - stp d12, d13, [sp, #0x20] + stp d12, d13, [sp, #0x20] .cfi_rel_offset d12, 0x20 .cfi_rel_offset d13, 0x28 - stp d14, d15, [sp, #0x30] + stp d14, d15, [sp, #0x30] .cfi_rel_offset d14, 0x30 .cfi_rel_offset d15, 0x38 - add x2, x0, #0xc8 - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v0.2d, v25.2d, v27.2d - trn2 v1.2d, v25.2d, v27.2d - trn1 v2.2d, v26.2d, v28.2d - trn2 v3.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v4.2d, v25.2d, v27.2d - trn2 v5.2d, v25.2d, v27.2d - trn1 v6.2d, v26.2d, v28.2d - trn2 v7.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v8.2d, v25.2d, v27.2d - trn2 v9.2d, v25.2d, v27.2d - trn1 v10.2d, v26.2d, v28.2d - trn2 v11.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v12.2d, v25.2d, v27.2d - trn2 v13.2d, v25.2d, v27.2d - trn1 v14.2d, v26.2d, v28.2d - trn2 v15.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v16.2d, v25.2d, v27.2d - trn2 v17.2d, v25.2d, v27.2d - trn1 v18.2d, v26.2d, v28.2d - trn2 v19.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x2], #32 - trn1 v20.2d, v25.2d, v27.2d - trn2 v21.2d, v25.2d, v27.2d - trn1 v22.2d, v26.2d, v28.2d - trn2 v23.2d, v26.2d, v28.2d - ldr d25, [x0] - ldr d27, [x2] - trn1 v24.2d, v25.2d, v27.2d - mov x2, #0x18 // =24 + add x2, x0, #0xc8 + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, 
v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x2], #32 + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0] + ldr d27, [x2] + trn1 v24.2d, v25.2d, v27.2d + mov x2, #0x18 // =24 Lkeccak_f1600_x2_v84a_loop: - eor3 v30.16b, v0.16b, v5.16b, v10.16b - eor3 v29.16b, v1.16b, v6.16b, v11.16b - eor3 v28.16b, v2.16b, v7.16b, v12.16b - eor3 v27.16b, v3.16b, v8.16b, v13.16b - eor3 v26.16b, v4.16b, v9.16b, v14.16b - eor3 v30.16b, v30.16b, v15.16b, v20.16b - eor3 v29.16b, v29.16b, v16.16b, v21.16b - eor3 v28.16b, v28.16b, v17.16b, v22.16b - eor3 v27.16b, v27.16b, v18.16b, v23.16b - eor3 v26.16b, v26.16b, v19.16b, v24.16b - rax1 v25.2d, v30.2d, v28.2d - rax1 v28.2d, v28.2d, v26.2d - rax1 v26.2d, v26.2d, v29.2d - rax1 v29.2d, v29.2d, v27.2d - rax1 v27.2d, v27.2d, v30.2d - eor v30.16b, v0.16b, v26.16b - xar v0.2d, v2.2d, v29.2d, #0x2 - xar v2.2d, v12.2d, v29.2d, #0x15 - xar v12.2d, v13.2d, v28.2d, #0x27 - xar v13.2d, v19.2d, v27.2d, #0x38 - xar v19.2d, v23.2d, v28.2d, #0x8 - xar v23.2d, v15.2d, v26.2d, #0x17 - xar v15.2d, v1.2d, v25.2d, #0x3f - xar v1.2d, v8.2d, v28.2d, #0x9 - xar v8.2d, v16.2d, v25.2d, #0x13 - xar v16.2d, v7.2d, v29.2d, #0x3a - xar v7.2d, v10.2d, v26.2d, #0x3d - xar v10.2d, v3.2d, v28.2d, #0x24 - xar v3.2d, v18.2d, v28.2d, #0x2b - xar v18.2d, v17.2d, v29.2d, #0x31 - xar v17.2d, v11.2d, v25.2d, #0x36 - xar v11.2d, v9.2d, v27.2d, #0x2c - xar v9.2d, v22.2d, v29.2d, #0x3 - xar v22.2d, v14.2d, v27.2d, #0x19 - xar v14.2d, v20.2d, v26.2d, #0x2e - xar v20.2d, v4.2d, v27.2d, #0x25 - xar v4.2d, v24.2d, v27.2d, #0x32 - xar v24.2d, v21.2d, v25.2d, #0x3e - xar v21.2d, v5.2d, v26.2d, #0x1c - xar v27.2d, v6.2d, v25.2d, #0x14 - ld1r { v31.2d }, [x1], #8 - bcax v5.16b, v10.16b, v7.16b, v11.16b - bcax v6.16b, v11.16b, v8.16b, v7.16b - bcax v7.16b, v7.16b, v9.16b, v8.16b - bcax v8.16b, v8.16b, v10.16b, v9.16b - bcax v9.16b, v9.16b, v11.16b, v10.16b - bcax v10.16b, v15.16b, v12.16b, v16.16b - bcax v11.16b, v16.16b, v13.16b, v12.16b - bcax v12.16b, v12.16b, v14.16b, v13.16b - bcax v13.16b, v13.16b, v15.16b, v14.16b - bcax v14.16b, v14.16b, v16.16b, v15.16b - bcax v15.16b, v20.16b, v17.16b, v21.16b - bcax v16.16b, v21.16b, v18.16b, v17.16b - bcax v17.16b, v17.16b, v19.16b, v18.16b - bcax v18.16b, v18.16b, v20.16b, v19.16b - bcax v19.16b, v19.16b, v21.16b, v20.16b - bcax v20.16b, v0.16b, v22.16b, v1.16b - bcax v21.16b, v1.16b, v23.16b, v22.16b - bcax v22.16b, v22.16b, v24.16b, v23.16b - bcax v23.16b, v23.16b, v0.16b, v24.16b - bcax v24.16b, v24.16b, v1.16b, v0.16b - bcax v0.16b, v30.16b, v2.16b, v27.16b - bcax v1.16b, v27.16b, v3.16b, v2.16b - bcax v2.16b, v2.16b, v4.16b, v3.16b - bcax v3.16b, v3.16b, v30.16b, v4.16b - bcax v4.16b, v4.16b, v27.16b, v30.16b - eor v0.16b, v0.16b, v31.16b - sub x2, x2, #0x1 - cbnz x2, Lkeccak_f1600_x2_v84a_loop - sub x0, x0, #0xc0 - add x2, x0, #0xc8 - trn1 v25.2d, v0.2d, v1.2d - trn1 v26.2d, v2.2d, v3.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v0.2d, v1.2d - trn2 v28.2d, v2.2d, v3.2d - st1 { v27.2d, v28.2d }, [x2], #32 - trn1 v25.2d, v4.2d, v5.2d - trn1 v26.2d, v6.2d, v7.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v4.2d, v5.2d - trn2 v28.2d, v6.2d, v7.2d - st1 { v27.2d, v28.2d }, [x2], #32 - trn1 v25.2d, v8.2d, v9.2d - trn1 v26.2d, v10.2d, v11.2d - stp q25, 
q26, [x0], #0x20 - trn2 v27.2d, v8.2d, v9.2d - trn2 v28.2d, v10.2d, v11.2d - st1 { v27.2d, v28.2d }, [x2], #32 - trn1 v25.2d, v12.2d, v13.2d - trn1 v26.2d, v14.2d, v15.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v12.2d, v13.2d - trn2 v28.2d, v14.2d, v15.2d - st1 { v27.2d, v28.2d }, [x2], #32 - trn1 v25.2d, v16.2d, v17.2d - trn1 v26.2d, v18.2d, v19.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v16.2d, v17.2d - trn2 v28.2d, v18.2d, v19.2d - st1 { v27.2d, v28.2d }, [x2], #32 - trn1 v25.2d, v20.2d, v21.2d - trn1 v26.2d, v22.2d, v23.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v20.2d, v21.2d - trn2 v28.2d, v22.2d, v23.2d - st1 { v27.2d, v28.2d }, [x2], #32 - str d24, [x0] - trn2 v25.2d, v24.2d, v24.2d - str d25, [x2] - ldp d8, d9, [sp] + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor3 v30.16b, v30.16b, v15.16b, v20.16b + eor3 v29.16b, v29.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor3 v27.16b, v27.16b, v18.16b, v23.16b + eor3 v26.16b, v26.16b, v19.16b, v24.16b + rax1 v25.2d, v30.2d, v28.2d + rax1 v28.2d, v28.2d, v26.2d + rax1 v26.2d, v26.2d, v29.2d + rax1 v29.2d, v29.2d, v27.2d + rax1 v27.2d, v27.2d, v30.2d + eor v30.16b, v0.16b, v26.16b + xar v0.2d, v2.2d, v29.2d, #0x2 + xar v2.2d, v12.2d, v29.2d, #0x15 + xar v12.2d, v13.2d, v28.2d, #0x27 + xar v13.2d, v19.2d, v27.2d, #0x38 + xar v19.2d, v23.2d, v28.2d, #0x8 + xar v23.2d, v15.2d, v26.2d, #0x17 + xar v15.2d, v1.2d, v25.2d, #0x3f + xar v1.2d, v8.2d, v28.2d, #0x9 + xar v8.2d, v16.2d, v25.2d, #0x13 + xar v16.2d, v7.2d, v29.2d, #0x3a + xar v7.2d, v10.2d, v26.2d, #0x3d + xar v10.2d, v3.2d, v28.2d, #0x24 + xar v3.2d, v18.2d, v28.2d, #0x2b + xar v18.2d, v17.2d, v29.2d, #0x31 + xar v17.2d, v11.2d, v25.2d, #0x36 + xar v11.2d, v9.2d, v27.2d, #0x2c + xar v9.2d, v22.2d, v29.2d, #0x3 + xar v22.2d, v14.2d, v27.2d, #0x19 + xar v14.2d, v20.2d, v26.2d, #0x2e + xar v20.2d, v4.2d, v27.2d, #0x25 + xar v4.2d, v24.2d, v27.2d, #0x32 + xar v24.2d, v21.2d, v25.2d, #0x3e + xar v21.2d, v5.2d, v26.2d, #0x1c + xar v27.2d, v6.2d, v25.2d, #0x14 + ld1r { v31.2d }, [x1], #8 + bcax v5.16b, v10.16b, v7.16b, v11.16b + bcax v6.16b, v11.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, v8.16b + bcax v8.16b, v8.16b, v10.16b, v9.16b + bcax v9.16b, v9.16b, v11.16b, v10.16b + bcax v10.16b, v15.16b, v12.16b, v16.16b + bcax v11.16b, v16.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v15.16b, v14.16b + bcax v14.16b, v14.16b, v16.16b, v15.16b + bcax v15.16b, v20.16b, v17.16b, v21.16b + bcax v16.16b, v21.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v20.16b, v19.16b + bcax v19.16b, v19.16b, v21.16b, v20.16b + bcax v20.16b, v0.16b, v22.16b, v1.16b + bcax v21.16b, v1.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v0.16b, v24.16b + bcax v24.16b, v24.16b, v1.16b, v0.16b + bcax v0.16b, v30.16b, v2.16b, v27.16b + bcax v1.16b, v27.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v30.16b, v4.16b + bcax v4.16b, v4.16b, v27.16b, v30.16b + eor v0.16b, v0.16b, v31.16b + sub x2, x2, #0x1 + cbnz x2, Lkeccak_f1600_x2_v84a_loop + sub x0, x0, #0xc0 + add x2, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + st1 { v27.2d, v28.2d }, [x2], #32 + trn1 v25.2d, v4.2d, v5.2d + 
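
The x2 variant advances two independent Keccak-f[1600] states at once, one per 64-bit lane of each vector register. The second state sits 0xc8 = 200 bytes after the first (add x2, x0, #0xc8); on entry, trn1/trn2 transpose the pair so that lane i of both states shares one Q register, and the mirror-image trn1/trn2 sequence here writes the two states back out. The same layout change sketched in C (illustrative types and names, not from the source):

    #include <stdint.h>

    /* Pack two 25-lane states so that v[i] = { a[i], b[i] }:
     * trn1 selects the even (low) 64-bit halves, trn2 the odd (high) ones. */
    typedef struct { uint64_t d[2]; } vreg128;

    static void keccak_x2_pack(vreg128 v[25],
                               const uint64_t a[25], const uint64_t b[25])
    {
        for (int i = 0; i < 25; i++) {
            v[i].d[0] = a[i]; /* trn1 */
            v[i].d[1] = b[i]; /* trn2 */
        }
    }

    static void keccak_x2_unpack(uint64_t a[25], uint64_t b[25],
                                 const vreg128 v[25])
    {
        for (int i = 0; i < 25; i++) {
            a[i] = v[i].d[0];
            b[i] = v[i].d[1];
        }
    }

With this layout every round instruction (eor3, rax1, xar, bcax) operates on both states simultaneously, so the two permutations cost close to the instruction count of one.
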
trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + st1 { v27.2d, v28.2d }, [x2], #32 + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + st1 { v27.2d, v28.2d }, [x2], #32 + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + st1 { v27.2d, v28.2d }, [x2], #32 + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + st1 { v27.2d, v28.2d }, [x2], #32 + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + st1 { v27.2d, v28.2d }, [x2], #32 + str d24, [x0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x2] + ldp d8, d9, [sp] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0x10] + ldp d10, d11, [sp, #0x10] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0x20] + ldp d12, d13, [sp, #0x20] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0x30] + ldp d14, d15, [sp, #0x30] .cfi_restore d14 .cfi_restore d15 - add sp, sp, #0x40 + add sp, sp, #0x40 .cfi_adjust_cfa_offset -0x40 ret .cfi_endproc diff --git a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S index 2c0897e3e..2512bfb8d 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +++ b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S @@ -46,1023 +46,1023 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x4_v8a_scalar_hybrid_asm) .cfi_startproc - sub sp, sp, #0xe0 + sub sp, sp, #0xe0 .cfi_adjust_cfa_offset 0xe0 - stp x19, x20, [sp, #0x30] + stp x19, x20, [sp, #0x30] .cfi_rel_offset x19, 0x30 .cfi_rel_offset x20, 0x38 - stp x21, x22, [sp, #0x40] + stp x21, x22, [sp, #0x40] .cfi_rel_offset x21, 0x40 .cfi_rel_offset x22, 0x48 - stp x23, x24, [sp, #0x50] + stp x23, x24, [sp, #0x50] .cfi_rel_offset x23, 0x50 .cfi_rel_offset x24, 0x58 - stp x25, x26, [sp, #0x60] + stp x25, x26, [sp, #0x60] .cfi_rel_offset x25, 0x60 .cfi_rel_offset x26, 0x68 - stp x27, x28, [sp, #0x70] + stp x27, x28, [sp, #0x70] .cfi_rel_offset x27, 0x70 .cfi_rel_offset x28, 0x78 - stp x29, x30, [sp, #0x80] + stp x29, x30, [sp, #0x80] .cfi_rel_offset x29, 0x80 .cfi_rel_offset x30, 0x88 - stp d8, d9, [sp, #0x90] + stp d8, d9, [sp, #0x90] .cfi_rel_offset d8, 0x90 .cfi_rel_offset d9, 0x98 - stp d10, d11, [sp, #0xa0] + stp d10, d11, [sp, #0xa0] .cfi_rel_offset d10, 0xa0 .cfi_rel_offset d11, 0xa8 - stp d12, d13, [sp, #0xb0] + stp d12, d13, [sp, #0xb0] .cfi_rel_offset d12, 0xb0 .cfi_rel_offset d13, 0xb8 - stp d14, d15, [sp, #0xc0] + stp d14, d15, [sp, #0xc0] .cfi_rel_offset d14, 0xc0 .cfi_rel_offset d15, 0xc8 - mov x29, x1 - mov x30, #0x0 // =0 - str x30, [sp, #0x20] - str x29, [sp, #0x8] - str x29, [sp, #0x10] - str x0, [sp] - add x4, x0, #0xc8 - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v0.2d, v25.2d, v27.2d - trn2 v1.2d, v25.2d, v27.2d - trn1 v2.2d, v26.2d, v28.2d - trn2 v3.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v4.2d, v25.2d, v27.2d - trn2 v5.2d, v25.2d, v27.2d - trn1 v6.2d, v26.2d, v28.2d - trn2 v7.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v8.2d, v25.2d, v27.2d - trn2 
v9.2d, v25.2d, v27.2d - trn1 v10.2d, v26.2d, v28.2d - trn2 v11.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v12.2d, v25.2d, v27.2d - trn2 v13.2d, v25.2d, v27.2d - trn1 v14.2d, v26.2d, v28.2d - trn2 v15.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v16.2d, v25.2d, v27.2d - trn2 v17.2d, v25.2d, v27.2d - trn1 v18.2d, v26.2d, v28.2d - trn2 v19.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v20.2d, v25.2d, v27.2d - trn2 v21.2d, v25.2d, v27.2d - trn1 v22.2d, v26.2d, v28.2d - trn2 v23.2d, v26.2d, v28.2d - ldr d25, [x0] - ldr d27, [x4] - trn1 v24.2d, v25.2d, v27.2d - sub x0, x0, #0xc0 - add x0, x0, #0x190 - ldp x1, x6, [x0] - ldp x11, x16, [x0, #0x10] - ldp x21, x2, [x0, #0x20] - ldp x7, x12, [x0, #0x30] - ldp x17, x22, [x0, #0x40] - ldp x3, x8, [x0, #0x50] - ldp x13, x28, [x0, #0x60] - ldp x23, x4, [x0, #0x70] - ldp x9, x14, [x0, #0x80] - ldp x19, x24, [x0, #0x90] - ldp x5, x10, [x0, #0xa0] - ldp x15, x20, [x0, #0xb0] - ldr x25, [x0, #0xc0] - sub x0, x0, #0x190 + mov x29, x1 + mov x30, #0x0 // =0 + str x30, [sp, #0x20] + str x29, [sp, #0x8] + str x29, [sp, #0x10] + str x0, [sp] + add x4, x0, #0xc8 + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0] + ldr d27, [x4] + trn1 v24.2d, v25.2d, v27.2d + sub x0, x0, #0xc0 + add x0, x0, #0x190 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x190 Lkeccak_f1600_x4_v8a_scalar_hybrid_initial: - eor x30, x24, x25 - eor x27, x9, x10 - eor v30.16b, v0.16b, v5.16b - eor v30.16b, v30.16b, v10.16b - eor x0, x30, x21 - eor v30.16b, v30.16b, v15.16b - eor x26, x27, x6 - eor x27, x26, x7 - eor v30.16b, v30.16b, v20.16b - eor x29, x0, x22 - eor v29.16b, v1.16b, v6.16b - eor x26, x29, x23 - eor v29.16b, v29.16b, v11.16b - eor x29, x4, x5 - eor x30, x29, x1 - eor v29.16b, v29.16b, v16.16b - eor x0, x27, x8 - eor v29.16b, v29.16b, v21.16b - eor x29, x30, x2 - eor v28.16b, v2.16b, v7.16b - eor x30, x19, x20 - eor x30, x30, x16 - eor v28.16b, v28.16b, v12.16b - eor x27, x26, x0, ror #63 - eor v28.16b, v28.16b, v17.16b - eor x4, x4, x27 - eor v28.16b, v28.16b, v22.16b - eor x30, x30, x17 - eor x30, x30, 
x28 - eor v27.16b, v3.16b, v8.16b - eor x29, x29, x3 - eor v27.16b, v27.16b, v13.16b - eor x0, x0, x30, ror #63 - eor v27.16b, v27.16b, v18.16b - eor x30, x30, x29, ror #63 - eor x22, x22, x30 - eor v27.16b, v27.16b, v23.16b - eor x23, x23, x30 - eor v26.16b, v4.16b, v9.16b - str x23, [sp, #0xd0] - eor v26.16b, v26.16b, v14.16b - eor x23, x14, x15 - eor x14, x14, x0 - eor v26.16b, v26.16b, v19.16b - eor x23, x23, x11 - eor v26.16b, v26.16b, v24.16b - eor x15, x15, x0 - eor x1, x1, x27 - add v31.2d, v28.2d, v28.2d - eor x23, x23, x12 - sri v31.2d, v28.2d, #0x3f - eor x23, x23, x13 - eor v25.16b, v31.16b, v30.16b - eor x11, x11, x0 - eor x29, x29, x23, ror #63 - add v31.2d, v26.2d, v26.2d - eor x23, x23, x26, ror #63 - sri v31.2d, v26.2d, #0x3f - eor x26, x13, x0 - eor v28.16b, v31.16b, v28.16b - eor x13, x28, x23 - eor x28, x24, x30 - add v31.2d, v29.2d, v29.2d - eor x24, x16, x23 - sri v31.2d, v29.2d, #0x3f - eor x16, x21, x30 - eor v26.16b, v31.16b, v26.16b - eor x21, x25, x30 - eor x30, x19, x23 - add v31.2d, v27.2d, v27.2d - eor x19, x20, x23 - sri v31.2d, v27.2d, #0x3f - eor x20, x17, x23 - eor v29.16b, v31.16b, v29.16b - eor x17, x12, x0 - eor x0, x2, x27 - add v31.2d, v30.2d, v30.2d - eor x2, x6, x29 - sri v31.2d, v30.2d, #0x3f - eor x6, x8, x29 - eor v27.16b, v31.16b, v27.16b - bic x8, x28, x13, ror #47 - eor x12, x3, x27 - eor v30.16b, v0.16b, v26.16b - bic x3, x13, x17, ror #19 - eor v31.16b, v2.16b, v29.16b - eor x5, x5, x27 - ldr x27, [sp, #0xd0] - shl v0.2d, v31.2d, #0x3e - bic x25, x17, x2, ror #5 - sri v0.2d, v31.2d, #0x2 - eor x9, x9, x29 - eor v31.16b, v12.16b, v29.16b - eor x23, x25, x5, ror #52 - eor x3, x3, x2, ror #24 - shl v2.2d, v31.2d, #0x2b - eor x8, x8, x17, ror #2 - sri v2.2d, v31.2d, #0x15 - eor x17, x10, x29 - eor v31.16b, v13.16b, v28.16b - bic x25, x12, x22, ror #47 - eor x29, x7, x29 - shl v12.2d, v31.2d, #0x19 - bic x10, x4, x27, ror #2 - sri v12.2d, v31.2d, #0x27 - bic x7, x5, x28, ror #10 - eor v31.16b, v19.16b, v27.16b - eor x10, x10, x20, ror #50 - eor x13, x7, x13, ror #57 - shl v13.2d, v31.2d, #0x8 - bic x7, x2, x5, ror #47 - sri v13.2d, v31.2d, #0x38 - eor x2, x25, x24, ror #39 - eor v31.16b, v23.16b, v28.16b - bic x25, x20, x11, ror #57 - bic x5, x17, x4, ror #25 - shl v19.2d, v31.2d, #0x38 - eor x25, x25, x17, ror #53 - sri v19.2d, v31.2d, #0x8 - bic x17, x11, x17, ror #60 - eor v31.16b, v15.16b, v26.16b - eor x28, x7, x28, ror #57 - bic x7, x9, x12, ror #42 - shl v23.2d, v31.2d, #0x29 - eor x7, x7, x22, ror #25 - sri v23.2d, v31.2d, #0x17 - bic x22, x22, x24, ror #56 - bic x24, x24, x15, ror #31 - eor v31.16b, v1.16b, v25.16b - eor x22, x22, x15, ror #23 - shl v15.2d, v31.2d, #0x1 - bic x20, x27, x20, ror #48 - sri v15.2d, v31.2d, #0x3f - bic x15, x15, x9, ror #16 - eor x12, x15, x12, ror #58 - eor v31.16b, v8.16b, v28.16b - eor x15, x5, x27, ror #27 - shl v1.2d, v31.2d, #0x37 - eor x5, x20, x11, ror #41 - sri v1.2d, v31.2d, #0x9 - ldr x11, [sp, #0x8] - eor x20, x17, x4, ror #21 - eor v31.16b, v16.16b, v25.16b - eor x17, x24, x9, ror #47 - shl v8.2d, v31.2d, #0x2d - mov x24, #0x1 // =1 - sri v8.2d, v31.2d, #0x13 - bic x9, x0, x16, ror #9 - str x24, [sp, #0x18] - eor v31.16b, v7.16b, v29.16b - bic x24, x29, x1, ror #44 - shl v16.2d, v31.2d, #0x6 - bic x27, x1, x21, ror #50 - sri v16.2d, v31.2d, #0x3a - bic x4, x26, x29, ror #63 - eor x1, x1, x4, ror #21 - eor v31.16b, v10.16b, v26.16b - ldr x11, [x11] - shl v7.2d, v31.2d, #0x3 - bic x4, x21, x30, ror #57 - sri v7.2d, v31.2d, #0x3d - eor x21, x24, x21, ror #30 - eor x24, x9, x19, ror #44 - eor 
v31.16b, v3.16b, v28.16b - bic x9, x14, x6, ror #5 - shl v10.2d, v31.2d, #0x1c - eor x9, x9, x0, ror #43 - sri v10.2d, v31.2d, #0x24 - bic x0, x6, x0, ror #38 - eor x1, x1, x11 - eor v31.16b, v18.16b, v28.16b - eor x11, x4, x26, ror #35 - shl v3.2d, v31.2d, #0x15 - eor x4, x0, x16, ror #47 - bic x0, x16, x19, ror #35 - sri v3.2d, v31.2d, #0x2b - eor x16, x27, x30, ror #43 - eor v31.16b, v17.16b, v29.16b - bic x27, x30, x26, ror #42 - shl v18.2d, v31.2d, #0xf - bic x26, x19, x14, ror #41 - eor x19, x0, x14, ror #12 - sri v18.2d, v31.2d, #0x31 - eor x14, x26, x6, ror #46 - eor v31.16b, v11.16b, v25.16b - eor x6, x27, x29, ror #41 - shl v17.2d, v31.2d, #0xa - eor x0, x15, x11, ror #52 - eor x0, x0, x13, ror #48 - sri v17.2d, v31.2d, #0x36 - eor x26, x8, x9, ror #57 - eor v31.16b, v9.16b, v27.16b - eor x27, x0, x14, ror #10 - shl v11.2d, v31.2d, #0x14 - eor x29, x16, x28, ror #63 - eor x26, x26, x6, ror #51 - sri v11.2d, v31.2d, #0x2c - eor x30, x23, x22, ror #50 - eor v31.16b, v22.16b, v29.16b - eor x0, x26, x10, ror #31 - shl v9.2d, v31.2d, #0x3d - eor x29, x29, x19, ror #37 - eor x27, x27, x12, ror #5 - sri v9.2d, v31.2d, #0x3 - eor x30, x30, x24, ror #34 - eor v31.16b, v14.16b, v27.16b - eor x0, x0, x7, ror #27 - shl v22.2d, v31.2d, #0x27 - eor x26, x30, x21, ror #26 - eor x26, x26, x25, ror #15 - sri v22.2d, v31.2d, #0x19 - ror x30, x27, #0x3e - eor v31.16b, v20.16b, v26.16b - eor x30, x30, x26, ror #57 - ror x26, x26, #0x3a - shl v14.2d, v31.2d, #0x12 - eor x16, x30, x16 - sri v14.2d, v31.2d, #0x2e - eor x28, x30, x28, ror #63 - eor v31.16b, v4.16b, v27.16b - str x28, [sp, #0xd0] - eor x29, x29, x17, ror #36 - shl v20.2d, v31.2d, #0x1b - eor x28, x1, x2, ror #61 - sri v20.2d, v31.2d, #0x25 - eor x19, x30, x19, ror #37 - eor v31.16b, v24.16b, v27.16b - eor x29, x29, x20, ror #2 - eor x28, x28, x4, ror #54 - shl v4.2d, v31.2d, #0xe - eor x26, x26, x0, ror #55 - sri v4.2d, v31.2d, #0x32 - eor x28, x28, x3, ror #39 - eor v31.16b, v21.16b, v25.16b - eor x28, x28, x5, ror #25 - ror x0, x0, #0x38 - shl v24.2d, v31.2d, #0x2 - eor x0, x0, x29, ror #63 - sri v24.2d, v31.2d, #0x3e - eor x27, x28, x27, ror #61 - eor v31.16b, v5.16b, v26.16b - eor x13, x0, x13, ror #46 - eor x28, x29, x28, ror #63 - shl v21.2d, v31.2d, #0x24 - eor x29, x30, x20, ror #2 - sri v21.2d, v31.2d, #0x1c - eor x20, x26, x3, ror #39 - eor v31.16b, v6.16b, v25.16b - eor x11, x0, x11, ror #50 - eor x25, x28, x25, ror #9 - shl v27.2d, v31.2d, #0x2c - eor x3, x28, x21, ror #20 - sri v27.2d, v31.2d, #0x14 - eor x21, x26, x1 - eor x9, x27, x9, ror #49 - bic v31.16b, v7.16b, v11.16b - eor x24, x28, x24, ror #28 - eor v5.16b, v31.16b, v10.16b - eor x1, x30, x17, ror #36 - bic v31.16b, v8.16b, v7.16b - eor x14, x0, x14, ror #8 - eor x22, x28, x22, ror #44 - eor v6.16b, v31.16b, v11.16b - eor x8, x27, x8, ror #56 - bic v31.16b, v9.16b, v8.16b - eor x17, x27, x7, ror #19 - eor v7.16b, v31.16b, v7.16b - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - bic v31.16b, v10.16b, v9.16b - eor x4, x26, x4, ror #54 - eor v8.16b, v31.16b, v8.16b - eor x0, x0, x12, ror #3 - bic v31.16b, v11.16b, v10.16b - eor x28, x28, x23, ror #58 - eor x23, x26, x2, ror #61 - eor v9.16b, v31.16b, v9.16b - eor x26, x26, x5, ror #25 - bic v31.16b, v12.16b, v16.16b - eor x2, x7, x16, ror #39 - eor v10.16b, v31.16b, v15.16b - bic x7, x9, x20, ror #42 - bic x30, x15, x9, ror #16 - bic v31.16b, v13.16b, v12.16b - eor x7, x7, x22, ror #25 - eor v11.16b, v31.16b, v16.16b - eor x12, x30, x20, ror #58 - bic v31.16b, v14.16b, v13.16b - bic x20, x22, x16, ror #56 - 
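
Unlike the v8.4-A paths above, this x4 hybrid targets plain v8-A NEON, which has no xar (nor rax1/eor3/bcax) instructions. Each rotate is therefore expanded into a shl/sri pair: shl dst, src, #n writes the left-shifted bits, and sri dst, src, #(64-n) keeps the top 64-n bits of dst while inserting src >> (64-n) underneath, together forming a 64-bit rotate left by n; the rotate-by-one in theta is cheaper still, an add v, v, v (shift left by one) followed by sri #63. A C model of the two-instruction idiom (illustrative helper name):

    #include <stdint.h>

    /* shl v_dst, v_src, #n        : v_dst = v_src << n  (per 64-bit lane)
     * sri v_dst, v_src, #(64 - n) : the low n bits of v_dst are replaced
     *                               by v_src >> (64 - n); the top 64 - n
     *                               bits of v_dst are left intact.
     * After the shl, the replaced bits are zero anyway, so the pair
     * computes a rotate left by n without a dedicated rotate insn.     */
    static uint64_t rol_via_shl_sri(uint64_t src, unsigned n) /* 1..63 */
    {
        uint64_t dst = src << n;      /* shl #n        */
        dst |= src >> (64 - n);       /* sri #(64 - n) */
        return dst;
    }
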
eor x30, x27, x6, ror #43 - eor v12.16b, v31.16b, v12.16b - eor x22, x20, x15, ror #23 - bic v31.16b, v15.16b, v14.16b - bic x6, x19, x13, ror #42 - eor v13.16b, v31.16b, v13.16b - eor x6, x6, x17, ror #41 - bic x5, x13, x17, ror #63 - bic v31.16b, v16.16b, v15.16b - eor x5, x21, x5, ror #21 - eor v14.16b, v31.16b, v14.16b - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - bic v31.16b, v17.16b, v21.16b - bic x21, x21, x25, ror #50 - eor v15.16b, v31.16b, v20.16b - bic x20, x27, x4, ror #25 - bic v31.16b, v18.16b, v17.16b - bic x10, x16, x15, ror #31 - eor x16, x21, x19, ror #43 - eor v16.16b, v31.16b, v21.16b - eor x21, x17, x25, ror #30 - bic v31.16b, v19.16b, v18.16b - bic x19, x25, x19, ror #57 - eor v17.16b, v31.16b, v17.16b - ldr x25, [sp, #0x18] - eor x17, x10, x9, ror #47 - bic v31.16b, v20.16b, v19.16b - ldr x9, [sp, #0x8] - eor v18.16b, v31.16b, v18.16b - eor x15, x20, x28, ror #27 - bic v31.16b, v21.16b, v20.16b - bic x20, x4, x28, ror #2 - eor x10, x20, x1, ror #50 - eor v19.16b, v31.16b, v19.16b - bic x20, x11, x27, ror #60 - bic v31.16b, v22.16b, v1.16b - eor x20, x20, x4, ror #21 - eor v20.16b, v31.16b, v0.16b - bic x4, x28, x1, ror #48 - bic x1, x1, x11, ror #57 - bic v31.16b, v23.16b, v22.16b - ldr x28, [x9, x25, lsl #3] - eor v21.16b, v31.16b, v1.16b - ldr x9, [sp, #0xd0] - bic v31.16b, v24.16b, v23.16b - add x25, x25, #0x1 - str x25, [sp, #0x18] - eor v22.16b, v31.16b, v22.16b - cmp x25, #0x17 - bic v31.16b, v0.16b, v24.16b - eor x25, x1, x27, ror #53 - bic x27, x30, x26, ror #47 - eor v23.16b, v31.16b, v23.16b - eor x1, x5, x28 - bic v31.16b, v1.16b, v0.16b - eor x5, x4, x11, ror #41 - eor v24.16b, v31.16b, v24.16b - eor x11, x19, x13, ror #35 - bic x13, x26, x24, ror #10 - bic v31.16b, v2.16b, v27.16b - eor x28, x27, x24, ror #57 - eor v0.16b, v31.16b, v30.16b - bic x27, x24, x9, ror #47 - bic v31.16b, v3.16b, v2.16b - bic x19, x23, x3, ror #9 - bic x4, x29, x14, ror #41 - eor v1.16b, v31.16b, v27.16b - eor x24, x19, x29, ror #44 - bic v31.16b, v4.16b, v3.16b - bic x29, x3, x29, ror #35 - eor v2.16b, v31.16b, v2.16b - eor x13, x13, x9, ror #57 - eor x19, x29, x14, ror #12 - bic v31.16b, v30.16b, v4.16b - bic x29, x9, x0, ror #19 - eor v3.16b, v31.16b, v3.16b - bic x14, x14, x8, ror #5 - bic v31.16b, v27.16b, v30.16b - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - eor v4.16b, v31.16b, v4.16b - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - ldr x30, [sp, #0x10] - ld1r { v28.2d }, [x30], #8 - str x30, [sp, #0x10] - eor v0.16b, v0.16b, v28.16b + eor x30, x24, x25 + eor x27, x9, x10 + eor v30.16b, v0.16b, v5.16b + eor v30.16b, v30.16b, v10.16b + eor x0, x30, x21 + eor v30.16b, v30.16b, v15.16b + eor x26, x27, x6 + eor x27, x26, x7 + eor v30.16b, v30.16b, v20.16b + eor x29, x0, x22 + eor v29.16b, v1.16b, v6.16b + eor x26, x29, x23 + eor v29.16b, v29.16b, v11.16b + eor x29, x4, x5 + eor x30, x29, x1 + eor v29.16b, v29.16b, v16.16b + eor x0, x27, x8 + eor v29.16b, v29.16b, v21.16b + eor x29, x30, x2 + eor v28.16b, v2.16b, v7.16b + eor x30, x19, x20 + eor x30, x30, x16 + eor v28.16b, v28.16b, v12.16b + eor x27, x26, x0, ror #63 + eor v28.16b, v28.16b, v17.16b + eor x4, x4, x27 + eor v28.16b, v28.16b, v22.16b + eor x30, x30, x17 + eor x30, x30, x28 + eor v27.16b, v3.16b, v8.16b + eor x29, x29, x3 + eor v27.16b, v27.16b, v13.16b + eor x0, x0, x30, ror #63 + eor v27.16b, v27.16b, v18.16b + eor x30, x30, x29, ror #63 + eor x22, x22, x30 + eor v27.16b, 
v27.16b, v23.16b + eor x23, x23, x30 + eor v26.16b, v4.16b, v9.16b + str x23, [sp, #0xd0] + eor v26.16b, v26.16b, v14.16b + eor x23, x14, x15 + eor x14, x14, x0 + eor v26.16b, v26.16b, v19.16b + eor x23, x23, x11 + eor v26.16b, v26.16b, v24.16b + eor x15, x15, x0 + eor x1, x1, x27 + add v31.2d, v28.2d, v28.2d + eor x23, x23, x12 + sri v31.2d, v28.2d, #0x3f + eor x23, x23, x13 + eor v25.16b, v31.16b, v30.16b + eor x11, x11, x0 + eor x29, x29, x23, ror #63 + add v31.2d, v26.2d, v26.2d + eor x23, x23, x26, ror #63 + sri v31.2d, v26.2d, #0x3f + eor x26, x13, x0 + eor v28.16b, v31.16b, v28.16b + eor x13, x28, x23 + eor x28, x24, x30 + add v31.2d, v29.2d, v29.2d + eor x24, x16, x23 + sri v31.2d, v29.2d, #0x3f + eor x16, x21, x30 + eor v26.16b, v31.16b, v26.16b + eor x21, x25, x30 + eor x30, x19, x23 + add v31.2d, v27.2d, v27.2d + eor x19, x20, x23 + sri v31.2d, v27.2d, #0x3f + eor x20, x17, x23 + eor v29.16b, v31.16b, v29.16b + eor x17, x12, x0 + eor x0, x2, x27 + add v31.2d, v30.2d, v30.2d + eor x2, x6, x29 + sri v31.2d, v30.2d, #0x3f + eor x6, x8, x29 + eor v27.16b, v31.16b, v27.16b + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + eor v30.16b, v0.16b, v26.16b + bic x3, x13, x17, ror #19 + eor v31.16b, v2.16b, v29.16b + eor x5, x5, x27 + ldr x27, [sp, #0xd0] + shl v0.2d, v31.2d, #0x3e + bic x25, x17, x2, ror #5 + sri v0.2d, v31.2d, #0x2 + eor x9, x9, x29 + eor v31.16b, v12.16b, v29.16b + eor x23, x25, x5, ror #52 + eor x3, x3, x2, ror #24 + shl v2.2d, v31.2d, #0x2b + eor x8, x8, x17, ror #2 + sri v2.2d, v31.2d, #0x15 + eor x17, x10, x29 + eor v31.16b, v13.16b, v28.16b + bic x25, x12, x22, ror #47 + eor x29, x7, x29 + shl v12.2d, v31.2d, #0x19 + bic x10, x4, x27, ror #2 + sri v12.2d, v31.2d, #0x27 + bic x7, x5, x28, ror #10 + eor v31.16b, v19.16b, v27.16b + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + shl v13.2d, v31.2d, #0x8 + bic x7, x2, x5, ror #47 + sri v13.2d, v31.2d, #0x38 + eor x2, x25, x24, ror #39 + eor v31.16b, v23.16b, v28.16b + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + shl v19.2d, v31.2d, #0x38 + eor x25, x25, x17, ror #53 + sri v19.2d, v31.2d, #0x8 + bic x17, x11, x17, ror #60 + eor v31.16b, v15.16b, v26.16b + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + shl v23.2d, v31.2d, #0x29 + eor x7, x7, x22, ror #25 + sri v23.2d, v31.2d, #0x17 + bic x22, x22, x24, ror #56 + bic x24, x24, x15, ror #31 + eor v31.16b, v1.16b, v25.16b + eor x22, x22, x15, ror #23 + shl v15.2d, v31.2d, #0x1 + bic x20, x27, x20, ror #48 + sri v15.2d, v31.2d, #0x3f + bic x15, x15, x9, ror #16 + eor x12, x15, x12, ror #58 + eor v31.16b, v8.16b, v28.16b + eor x15, x5, x27, ror #27 + shl v1.2d, v31.2d, #0x37 + eor x5, x20, x11, ror #41 + sri v1.2d, v31.2d, #0x9 + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + eor v31.16b, v16.16b, v25.16b + eor x17, x24, x9, ror #47 + shl v8.2d, v31.2d, #0x2d + mov x24, #0x1 // =1 + sri v8.2d, v31.2d, #0x13 + bic x9, x0, x16, ror #9 + str x24, [sp, #0x18] + eor v31.16b, v7.16b, v29.16b + bic x24, x29, x1, ror #44 + shl v16.2d, v31.2d, #0x6 + bic x27, x1, x21, ror #50 + sri v16.2d, v31.2d, #0x3a + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + eor v31.16b, v10.16b, v26.16b + ldr x11, [x11] + shl v7.2d, v31.2d, #0x3 + bic x4, x21, x30, ror #57 + sri v7.2d, v31.2d, #0x3d + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + eor v31.16b, v3.16b, v28.16b + bic x9, x14, x6, ror #5 + shl v10.2d, v31.2d, #0x1c + eor x9, x9, x0, ror #43 + sri v10.2d, v31.2d, #0x24 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + eor v31.16b, v18.16b, v28.16b + eor 
x11, x4, x26, ror #35 + shl v3.2d, v31.2d, #0x15 + eor x4, x0, x16, ror #47 + bic x0, x16, x19, ror #35 + sri v3.2d, v31.2d, #0x2b + eor x16, x27, x30, ror #43 + eor v31.16b, v17.16b, v29.16b + bic x27, x30, x26, ror #42 + shl v18.2d, v31.2d, #0xf + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + sri v18.2d, v31.2d, #0x31 + eor x14, x26, x6, ror #46 + eor v31.16b, v11.16b, v25.16b + eor x6, x27, x29, ror #41 + shl v17.2d, v31.2d, #0xa + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + sri v17.2d, v31.2d, #0x36 + eor x26, x8, x9, ror #57 + eor v31.16b, v9.16b, v27.16b + eor x27, x0, x14, ror #10 + shl v11.2d, v31.2d, #0x14 + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + sri v11.2d, v31.2d, #0x2c + eor x30, x23, x22, ror #50 + eor v31.16b, v22.16b, v29.16b + eor x0, x26, x10, ror #31 + shl v9.2d, v31.2d, #0x3d + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + sri v9.2d, v31.2d, #0x3 + eor x30, x30, x24, ror #34 + eor v31.16b, v14.16b, v27.16b + eor x0, x0, x7, ror #27 + shl v22.2d, v31.2d, #0x27 + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + sri v22.2d, v31.2d, #0x19 + ror x30, x27, #0x3e + eor v31.16b, v20.16b, v26.16b + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + shl v14.2d, v31.2d, #0x12 + eor x16, x30, x16 + sri v14.2d, v31.2d, #0x2e + eor x28, x30, x28, ror #63 + eor v31.16b, v4.16b, v27.16b + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + shl v20.2d, v31.2d, #0x1b + eor x28, x1, x2, ror #61 + sri v20.2d, v31.2d, #0x25 + eor x19, x30, x19, ror #37 + eor v31.16b, v24.16b, v27.16b + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + shl v4.2d, v31.2d, #0xe + eor x26, x26, x0, ror #55 + sri v4.2d, v31.2d, #0x32 + eor x28, x28, x3, ror #39 + eor v31.16b, v21.16b, v25.16b + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + shl v24.2d, v31.2d, #0x2 + eor x0, x0, x29, ror #63 + sri v24.2d, v31.2d, #0x3e + eor x27, x28, x27, ror #61 + eor v31.16b, v5.16b, v26.16b + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + shl v21.2d, v31.2d, #0x24 + eor x29, x30, x20, ror #2 + sri v21.2d, v31.2d, #0x1c + eor x20, x26, x3, ror #39 + eor v31.16b, v6.16b, v25.16b + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + shl v27.2d, v31.2d, #0x2c + eor x3, x28, x21, ror #20 + sri v27.2d, v31.2d, #0x14 + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + bic v31.16b, v7.16b, v11.16b + eor x24, x28, x24, ror #28 + eor v5.16b, v31.16b, v10.16b + eor x1, x30, x17, ror #36 + bic v31.16b, v8.16b, v7.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor v6.16b, v31.16b, v11.16b + eor x8, x27, x8, ror #56 + bic v31.16b, v9.16b, v8.16b + eor x17, x27, x7, ror #19 + eor v7.16b, v31.16b, v7.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bic v31.16b, v10.16b, v9.16b + eor x4, x26, x4, ror #54 + eor v8.16b, v31.16b, v8.16b + eor x0, x0, x12, ror #3 + bic v31.16b, v11.16b, v10.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor v9.16b, v31.16b, v9.16b + eor x26, x26, x5, ror #25 + bic v31.16b, v12.16b, v16.16b + eor x2, x7, x16, ror #39 + eor v10.16b, v31.16b, v15.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + bic v31.16b, v13.16b, v12.16b + eor x7, x7, x22, ror #25 + eor v11.16b, v31.16b, v16.16b + eor x12, x30, x20, ror #58 + bic v31.16b, v14.16b, v13.16b + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor v12.16b, v31.16b, v12.16b + eor x22, x20, x15, ror #23 + bic v31.16b, v15.16b, v14.16b + bic x6, x19, x13, ror #42 + eor v13.16b, v31.16b, v13.16b + eor x6, x6, x17, ror #41 + bic x5, 
x13, x17, ror #63 + bic v31.16b, v16.16b, v15.16b + eor x5, x21, x5, ror #21 + eor v14.16b, v31.16b, v14.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + bic v31.16b, v17.16b, v21.16b + bic x21, x21, x25, ror #50 + eor v15.16b, v31.16b, v20.16b + bic x20, x27, x4, ror #25 + bic v31.16b, v18.16b, v17.16b + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + eor v16.16b, v31.16b, v21.16b + eor x21, x17, x25, ror #30 + bic v31.16b, v19.16b, v18.16b + bic x19, x25, x19, ror #57 + eor v17.16b, v31.16b, v17.16b + ldr x25, [sp, #0x18] + eor x17, x10, x9, ror #47 + bic v31.16b, v20.16b, v19.16b + ldr x9, [sp, #0x8] + eor v18.16b, v31.16b, v18.16b + eor x15, x20, x28, ror #27 + bic v31.16b, v21.16b, v20.16b + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + eor v19.16b, v31.16b, v19.16b + bic x20, x11, x27, ror #60 + bic v31.16b, v22.16b, v1.16b + eor x20, x20, x4, ror #21 + eor v20.16b, v31.16b, v0.16b + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + bic v31.16b, v23.16b, v22.16b + ldr x28, [x9, x25, lsl #3] + eor v21.16b, v31.16b, v1.16b + ldr x9, [sp, #0xd0] + bic v31.16b, v24.16b, v23.16b + add x25, x25, #0x1 + str x25, [sp, #0x18] + eor v22.16b, v31.16b, v22.16b + cmp x25, #0x17 + bic v31.16b, v0.16b, v24.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor v23.16b, v31.16b, v23.16b + eor x1, x5, x28 + bic v31.16b, v1.16b, v0.16b + eor x5, x4, x11, ror #41 + eor v24.16b, v31.16b, v24.16b + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + bic v31.16b, v2.16b, v27.16b + eor x28, x27, x24, ror #57 + eor v0.16b, v31.16b, v30.16b + bic x27, x24, x9, ror #47 + bic v31.16b, v3.16b, v2.16b + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + eor v1.16b, v31.16b, v27.16b + eor x24, x19, x29, ror #44 + bic v31.16b, v4.16b, v3.16b + bic x29, x3, x29, ror #35 + eor v2.16b, v31.16b, v2.16b + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + bic v31.16b, v30.16b, v4.16b + bic x29, x9, x0, ror #19 + eor v3.16b, v31.16b, v3.16b + bic x14, x14, x8, ror #5 + bic v31.16b, v27.16b, v30.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + eor v4.16b, v31.16b, v4.16b + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b Lkeccak_f1600_x4_v8a_scalar_hybrid_loop: - eor x0, x15, x11, ror #52 - eor x0, x0, x13, ror #48 - eor v30.16b, v0.16b, v5.16b - eor v30.16b, v30.16b, v10.16b - eor x26, x8, x9, ror #57 - eor v30.16b, v30.16b, v15.16b - eor x27, x0, x14, ror #10 - eor x29, x16, x28, ror #63 - eor v30.16b, v30.16b, v20.16b - eor x26, x26, x6, ror #51 - eor v29.16b, v1.16b, v6.16b - eor x30, x23, x22, ror #50 - eor v29.16b, v29.16b, v11.16b - eor x0, x26, x10, ror #31 - eor x29, x29, x19, ror #37 - eor v29.16b, v29.16b, v16.16b - eor x27, x27, x12, ror #5 - eor v29.16b, v29.16b, v21.16b - eor x30, x30, x24, ror #34 - eor x0, x0, x7, ror #27 - eor v28.16b, v2.16b, v7.16b - eor x26, x30, x21, ror #26 - eor v28.16b, v28.16b, v12.16b - eor x26, x26, x25, ror #15 - eor v28.16b, v28.16b, v17.16b - ror x30, x27, #0x3e - eor x30, x30, x26, ror #57 - eor v28.16b, v28.16b, v22.16b - ror x26, x26, #0x3a - eor v27.16b, v3.16b, v8.16b - eor x16, x30, x16 - eor v27.16b, v27.16b, v13.16b - eor x28, x30, x28, ror #63 - str x28, [sp, #0xd0] - eor v27.16b, v27.16b, v18.16b - eor x29, x29, x17, ror #36 - eor v27.16b, v27.16b, v23.16b - eor x28, x1, x2, 
ror #61 - eor x19, x30, x19, ror #37 - eor v26.16b, v4.16b, v9.16b - eor x29, x29, x20, ror #2 - eor v26.16b, v26.16b, v14.16b - eor x28, x28, x4, ror #54 - eor v26.16b, v26.16b, v19.16b - eor x26, x26, x0, ror #55 - eor x28, x28, x3, ror #39 - eor v26.16b, v26.16b, v24.16b - eor x28, x28, x5, ror #25 - add v31.2d, v28.2d, v28.2d - ror x0, x0, #0x38 - eor x0, x0, x29, ror #63 - sri v31.2d, v28.2d, #0x3f - eor x27, x28, x27, ror #61 - eor v25.16b, v31.16b, v30.16b - eor x13, x0, x13, ror #46 - add v31.2d, v26.2d, v26.2d - eor x28, x29, x28, ror #63 - eor x29, x30, x20, ror #2 - sri v31.2d, v26.2d, #0x3f - eor x20, x26, x3, ror #39 - eor v28.16b, v31.16b, v28.16b - eor x11, x0, x11, ror #50 - add v31.2d, v29.2d, v29.2d - eor x25, x28, x25, ror #9 - eor x3, x28, x21, ror #20 - sri v31.2d, v29.2d, #0x3f - eor x21, x26, x1 - eor v26.16b, v31.16b, v26.16b - eor x9, x27, x9, ror #49 - eor x24, x28, x24, ror #28 - add v31.2d, v27.2d, v27.2d - eor x1, x30, x17, ror #36 - sri v31.2d, v27.2d, #0x3f - eor x14, x0, x14, ror #8 - eor v29.16b, v31.16b, v29.16b - eor x22, x28, x22, ror #44 - eor x8, x27, x8, ror #56 - add v31.2d, v30.2d, v30.2d - eor x17, x27, x7, ror #19 - sri v31.2d, v30.2d, #0x3f - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - eor v27.16b, v31.16b, v27.16b - eor x4, x26, x4, ror #54 - eor v30.16b, v0.16b, v26.16b - eor x0, x0, x12, ror #3 - eor v31.16b, v2.16b, v29.16b - eor x28, x28, x23, ror #58 - eor x23, x26, x2, ror #61 - shl v0.2d, v31.2d, #0x3e - eor x26, x26, x5, ror #25 - sri v0.2d, v31.2d, #0x2 - eor x2, x7, x16, ror #39 - eor v31.16b, v12.16b, v29.16b - bic x7, x9, x20, ror #42 - bic x30, x15, x9, ror #16 - shl v2.2d, v31.2d, #0x2b - eor x7, x7, x22, ror #25 - sri v2.2d, v31.2d, #0x15 - eor x12, x30, x20, ror #58 - bic x20, x22, x16, ror #56 - eor v31.16b, v13.16b, v28.16b - eor x30, x27, x6, ror #43 - shl v12.2d, v31.2d, #0x19 - eor x22, x20, x15, ror #23 - sri v12.2d, v31.2d, #0x27 - bic x6, x19, x13, ror #42 - eor x6, x6, x17, ror #41 - eor v31.16b, v19.16b, v27.16b - bic x5, x13, x17, ror #63 - shl v13.2d, v31.2d, #0x8 - eor x5, x21, x5, ror #21 - sri v13.2d, v31.2d, #0x38 - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - eor v31.16b, v23.16b, v28.16b - bic x21, x21, x25, ror #50 - shl v19.2d, v31.2d, #0x38 - bic x20, x27, x4, ror #25 - bic x10, x16, x15, ror #31 - sri v19.2d, v31.2d, #0x8 - eor x16, x21, x19, ror #43 - eor v31.16b, v15.16b, v26.16b - eor x21, x17, x25, ror #30 - shl v23.2d, v31.2d, #0x29 - bic x19, x25, x19, ror #57 - ldr x25, [sp, #0x18] - sri v23.2d, v31.2d, #0x17 - eor x17, x10, x9, ror #47 - eor v31.16b, v1.16b, v25.16b - ldr x9, [sp, #0x8] - eor x15, x20, x28, ror #27 - shl v15.2d, v31.2d, #0x1 - bic x20, x4, x28, ror #2 - sri v15.2d, v31.2d, #0x3f - eor x10, x20, x1, ror #50 - eor v31.16b, v8.16b, v28.16b - bic x20, x11, x27, ror #60 - eor x20, x20, x4, ror #21 - shl v1.2d, v31.2d, #0x37 - bic x4, x28, x1, ror #48 - sri v1.2d, v31.2d, #0x9 - bic x1, x1, x11, ror #57 - eor v31.16b, v16.16b, v25.16b - ldr x28, [x9, x25, lsl #3] - ldr x9, [sp, #0xd0] - shl v8.2d, v31.2d, #0x2d - add x25, x25, #0x1 - sri v8.2d, v31.2d, #0x13 - str x25, [sp, #0x18] - cmp x25, #0x17 - eor v31.16b, v7.16b, v29.16b - eor x25, x1, x27, ror #53 - shl v16.2d, v31.2d, #0x6 - bic x27, x30, x26, ror #47 - sri v16.2d, v31.2d, #0x3a - eor x1, x5, x28 - eor x5, x4, x11, ror #41 - eor v31.16b, v10.16b, v26.16b - eor x11, x19, x13, ror #35 - shl v7.2d, v31.2d, #0x3 - bic x13, x26, x24, ror #10 - eor x28, x27, x24, ror #57 - sri v7.2d, v31.2d, #0x3d - bic x27, 
x24, x9, ror #47 - eor v31.16b, v3.16b, v28.16b - bic x19, x23, x3, ror #9 - shl v10.2d, v31.2d, #0x1c - bic x4, x29, x14, ror #41 - eor x24, x19, x29, ror #44 - sri v10.2d, v31.2d, #0x24 - bic x29, x3, x29, ror #35 - eor v31.16b, v18.16b, v28.16b - eor x13, x13, x9, ror #57 - shl v3.2d, v31.2d, #0x15 - eor x19, x29, x14, ror #12 - bic x29, x9, x0, ror #19 - sri v3.2d, v31.2d, #0x2b - bic x14, x14, x8, ror #5 - eor v31.16b, v17.16b, v29.16b - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - shl v18.2d, v31.2d, #0xf - bic x23, x8, x23, ror #38 - sri v18.2d, v31.2d, #0x31 - eor x8, x27, x0, ror #2 - eor v31.16b, v11.16b, v25.16b - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - shl v17.2d, v31.2d, #0xa - eor x23, x3, x26, ror #52 - sri v17.2d, v31.2d, #0x36 - eor x3, x29, x30, ror #24 - eor x0, x15, x11, ror #52 - eor v31.16b, v9.16b, v27.16b - eor x0, x0, x13, ror #48 - shl v11.2d, v31.2d, #0x14 - eor x26, x8, x9, ror #57 - sri v11.2d, v31.2d, #0x2c - eor x27, x0, x14, ror #10 - eor x29, x16, x28, ror #63 - eor v31.16b, v22.16b, v29.16b - eor x26, x26, x6, ror #51 - shl v9.2d, v31.2d, #0x3d - eor x30, x23, x22, ror #50 - sri v9.2d, v31.2d, #0x3 - eor x0, x26, x10, ror #31 - eor x29, x29, x19, ror #37 - eor v31.16b, v14.16b, v27.16b - eor x27, x27, x12, ror #5 - shl v22.2d, v31.2d, #0x27 - eor x30, x30, x24, ror #34 - eor x0, x0, x7, ror #27 - sri v22.2d, v31.2d, #0x19 - eor x26, x30, x21, ror #26 - eor v31.16b, v20.16b, v26.16b - eor x26, x26, x25, ror #15 - shl v14.2d, v31.2d, #0x12 - ror x30, x27, #0x3e - eor x30, x30, x26, ror #57 - sri v14.2d, v31.2d, #0x2e - ror x26, x26, #0x3a - eor v31.16b, v4.16b, v27.16b - eor x16, x30, x16 - shl v20.2d, v31.2d, #0x1b - eor x28, x30, x28, ror #63 - str x28, [sp, #0xd0] - sri v20.2d, v31.2d, #0x25 - eor x29, x29, x17, ror #36 - eor v31.16b, v24.16b, v27.16b - eor x28, x1, x2, ror #61 - eor x19, x30, x19, ror #37 - shl v4.2d, v31.2d, #0xe - eor x29, x29, x20, ror #2 - sri v4.2d, v31.2d, #0x32 - eor x28, x28, x4, ror #54 - eor v31.16b, v21.16b, v25.16b - eor x26, x26, x0, ror #55 - eor x28, x28, x3, ror #39 - shl v24.2d, v31.2d, #0x2 - eor x28, x28, x5, ror #25 - sri v24.2d, v31.2d, #0x3e - ror x0, x0, #0x38 - eor x0, x0, x29, ror #63 - eor v31.16b, v5.16b, v26.16b - eor x27, x28, x27, ror #61 - shl v21.2d, v31.2d, #0x24 - eor x13, x0, x13, ror #46 - sri v21.2d, v31.2d, #0x1c - eor x28, x29, x28, ror #63 - eor x29, x30, x20, ror #2 - eor v31.16b, v6.16b, v25.16b - eor x20, x26, x3, ror #39 - shl v27.2d, v31.2d, #0x2c - eor x11, x0, x11, ror #50 - sri v27.2d, v31.2d, #0x14 - eor x25, x28, x25, ror #9 - eor x3, x28, x21, ror #20 - bic v31.16b, v7.16b, v11.16b - eor x21, x26, x1 - eor v5.16b, v31.16b, v10.16b - eor x9, x27, x9, ror #49 - eor x24, x28, x24, ror #28 - bic v31.16b, v8.16b, v7.16b - eor x1, x30, x17, ror #36 - eor v6.16b, v31.16b, v11.16b - eor x14, x0, x14, ror #8 - bic v31.16b, v9.16b, v8.16b - eor x22, x28, x22, ror #44 - eor x8, x27, x8, ror #56 - eor v7.16b, v31.16b, v7.16b - eor x17, x27, x7, ror #19 - bic v31.16b, v10.16b, v9.16b - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - eor v8.16b, v31.16b, v8.16b - eor x4, x26, x4, ror #54 - bic v31.16b, v11.16b, v10.16b - eor x0, x0, x12, ror #3 - eor v9.16b, v31.16b, v9.16b - eor x28, x28, x23, ror #58 - eor x23, x26, x2, ror #61 - bic v31.16b, v12.16b, v16.16b - eor x26, x26, x5, ror #25 - eor v10.16b, v31.16b, v15.16b - eor x2, x7, x16, ror #39 - bic v31.16b, v13.16b, v12.16b - bic x7, x9, x20, ror #42 - bic x30, x15, x9, ror #16 - eor v11.16b, v31.16b, v16.16b - eor 
x7, x7, x22, ror #25 - bic v31.16b, v14.16b, v13.16b - eor x12, x30, x20, ror #58 - bic x20, x22, x16, ror #56 - eor v12.16b, v31.16b, v12.16b - eor x30, x27, x6, ror #43 - bic v31.16b, v15.16b, v14.16b - eor x22, x20, x15, ror #23 - eor v13.16b, v31.16b, v13.16b - bic x6, x19, x13, ror #42 - eor x6, x6, x17, ror #41 - bic v31.16b, v16.16b, v15.16b - bic x5, x13, x17, ror #63 - eor v14.16b, v31.16b, v14.16b - eor x5, x21, x5, ror #21 - bic v31.16b, v17.16b, v21.16b - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - eor v15.16b, v31.16b, v20.16b - bic x21, x21, x25, ror #50 - bic v31.16b, v18.16b, v17.16b - bic x20, x27, x4, ror #25 - bic x10, x16, x15, ror #31 - eor v16.16b, v31.16b, v21.16b - eor x16, x21, x19, ror #43 - bic v31.16b, v19.16b, v18.16b - eor x21, x17, x25, ror #30 - eor v17.16b, v31.16b, v17.16b - bic x19, x25, x19, ror #57 - ldr x25, [sp, #0x18] - bic v31.16b, v20.16b, v19.16b - eor x17, x10, x9, ror #47 - eor v18.16b, v31.16b, v18.16b - ldr x9, [sp, #0x8] - eor x15, x20, x28, ror #27 - bic v31.16b, v21.16b, v20.16b - bic x20, x4, x28, ror #2 - eor v19.16b, v31.16b, v19.16b - eor x10, x20, x1, ror #50 - bic v31.16b, v22.16b, v1.16b - bic x20, x11, x27, ror #60 - eor x20, x20, x4, ror #21 - eor v20.16b, v31.16b, v0.16b - bic x4, x28, x1, ror #48 - bic v31.16b, v23.16b, v22.16b - bic x1, x1, x11, ror #57 - eor v21.16b, v31.16b, v1.16b - ldr x28, [x9, x25, lsl #3] - ldr x9, [sp, #0xd0] - bic v31.16b, v24.16b, v23.16b - add x25, x25, #0x1 - eor v22.16b, v31.16b, v22.16b - str x25, [sp, #0x18] - cmp x25, #0x17 - bic v31.16b, v0.16b, v24.16b - eor x25, x1, x27, ror #53 - eor v23.16b, v31.16b, v23.16b - bic x27, x30, x26, ror #47 - bic v31.16b, v1.16b, v0.16b - eor x1, x5, x28 - eor x5, x4, x11, ror #41 - eor v24.16b, v31.16b, v24.16b - eor x11, x19, x13, ror #35 - bic v31.16b, v2.16b, v27.16b - bic x13, x26, x24, ror #10 - eor x28, x27, x24, ror #57 - eor v0.16b, v31.16b, v30.16b - bic x27, x24, x9, ror #47 - bic v31.16b, v3.16b, v2.16b - bic x19, x23, x3, ror #9 - eor v1.16b, v31.16b, v27.16b - bic x4, x29, x14, ror #41 - eor x24, x19, x29, ror #44 - bic v31.16b, v4.16b, v3.16b - bic x29, x3, x29, ror #35 - eor v2.16b, v31.16b, v2.16b - eor x13, x13, x9, ror #57 - bic v31.16b, v30.16b, v4.16b - eor x19, x29, x14, ror #12 - bic x29, x9, x0, ror #19 - eor v3.16b, v31.16b, v3.16b - bic x14, x14, x8, ror #5 - bic v31.16b, v27.16b, v30.16b - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - eor v4.16b, v31.16b, v4.16b - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - ldr x30, [sp, #0x10] - ld1r { v28.2d }, [x30], #8 - str x30, [sp, #0x10] - eor v0.16b, v0.16b, v28.16b + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor v30.16b, v0.16b, v5.16b + eor v30.16b, v30.16b, v10.16b + eor x26, x8, x9, ror #57 + eor v30.16b, v30.16b, v15.16b + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor v30.16b, v30.16b, v20.16b + eor x26, x26, x6, ror #51 + eor v29.16b, v1.16b, v6.16b + eor x30, x23, x22, ror #50 + eor v29.16b, v29.16b, v11.16b + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor v29.16b, v29.16b, v16.16b + eor x27, x27, x12, ror #5 + eor v29.16b, v29.16b, v21.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor v28.16b, v2.16b, v7.16b + eor x26, x30, x21, ror #26 + eor v28.16b, v28.16b, v12.16b + eor x26, x26, x25, ror #15 + eor v28.16b, v28.16b, v17.16b + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + eor 
v28.16b, v28.16b, v22.16b + ror x26, x26, #0x3a + eor v27.16b, v3.16b, v8.16b + eor x16, x30, x16 + eor v27.16b, v27.16b, v13.16b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + eor v27.16b, v27.16b, v18.16b + eor x29, x29, x17, ror #36 + eor v27.16b, v27.16b, v23.16b + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor v26.16b, v4.16b, v9.16b + eor x29, x29, x20, ror #2 + eor v26.16b, v26.16b, v14.16b + eor x28, x28, x4, ror #54 + eor v26.16b, v26.16b, v19.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + eor v26.16b, v26.16b, v24.16b + eor x28, x28, x5, ror #25 + add v31.2d, v28.2d, v28.2d + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + sri v31.2d, v28.2d, #0x3f + eor x27, x28, x27, ror #61 + eor v25.16b, v31.16b, v30.16b + eor x13, x0, x13, ror #46 + add v31.2d, v26.2d, v26.2d + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + sri v31.2d, v26.2d, #0x3f + eor x20, x26, x3, ror #39 + eor v28.16b, v31.16b, v28.16b + eor x11, x0, x11, ror #50 + add v31.2d, v29.2d, v29.2d + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + sri v31.2d, v29.2d, #0x3f + eor x21, x26, x1 + eor v26.16b, v31.16b, v26.16b + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + add v31.2d, v27.2d, v27.2d + eor x1, x30, x17, ror #36 + sri v31.2d, v27.2d, #0x3f + eor x14, x0, x14, ror #8 + eor v29.16b, v31.16b, v29.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + add v31.2d, v30.2d, v30.2d + eor x17, x27, x7, ror #19 + sri v31.2d, v30.2d, #0x3f + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor v27.16b, v31.16b, v27.16b + eor x4, x26, x4, ror #54 + eor v30.16b, v0.16b, v26.16b + eor x0, x0, x12, ror #3 + eor v31.16b, v2.16b, v29.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + shl v0.2d, v31.2d, #0x3e + eor x26, x26, x5, ror #25 + sri v0.2d, v31.2d, #0x2 + eor x2, x7, x16, ror #39 + eor v31.16b, v12.16b, v29.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + shl v2.2d, v31.2d, #0x2b + eor x7, x7, x22, ror #25 + sri v2.2d, v31.2d, #0x15 + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor v31.16b, v13.16b, v28.16b + eor x30, x27, x6, ror #43 + shl v12.2d, v31.2d, #0x19 + eor x22, x20, x15, ror #23 + sri v12.2d, v31.2d, #0x27 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + eor v31.16b, v19.16b, v27.16b + bic x5, x13, x17, ror #63 + shl v13.2d, v31.2d, #0x8 + eor x5, x21, x5, ror #21 + sri v13.2d, v31.2d, #0x38 + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v31.16b, v23.16b, v28.16b + bic x21, x21, x25, ror #50 + shl v19.2d, v31.2d, #0x38 + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + sri v19.2d, v31.2d, #0x8 + eor x16, x21, x19, ror #43 + eor v31.16b, v15.16b, v26.16b + eor x21, x17, x25, ror #30 + shl v23.2d, v31.2d, #0x29 + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + sri v23.2d, v31.2d, #0x17 + eor x17, x10, x9, ror #47 + eor v31.16b, v1.16b, v25.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + shl v15.2d, v31.2d, #0x1 + bic x20, x4, x28, ror #2 + sri v15.2d, v31.2d, #0x3f + eor x10, x20, x1, ror #50 + eor v31.16b, v8.16b, v28.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + shl v1.2d, v31.2d, #0x37 + bic x4, x28, x1, ror #48 + sri v1.2d, v31.2d, #0x9 + bic x1, x1, x11, ror #57 + eor v31.16b, v16.16b, v25.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + shl v8.2d, v31.2d, #0x2d + add x25, x25, #0x1 + sri v8.2d, v31.2d, #0x13 + str x25, [sp, #0x18] + cmp x25, #0x17 + eor v31.16b, v7.16b, v29.16b + eor x25, x1, x27, ror #53 + shl v16.2d, v31.2d, 
#0x6 + bic x27, x30, x26, ror #47 + sri v16.2d, v31.2d, #0x3a + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor v31.16b, v10.16b, v26.16b + eor x11, x19, x13, ror #35 + shl v7.2d, v31.2d, #0x3 + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + sri v7.2d, v31.2d, #0x3d + bic x27, x24, x9, ror #47 + eor v31.16b, v3.16b, v28.16b + bic x19, x23, x3, ror #9 + shl v10.2d, v31.2d, #0x1c + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + sri v10.2d, v31.2d, #0x24 + bic x29, x3, x29, ror #35 + eor v31.16b, v18.16b, v28.16b + eor x13, x13, x9, ror #57 + shl v3.2d, v31.2d, #0x15 + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + sri v3.2d, v31.2d, #0x2b + bic x14, x14, x8, ror #5 + eor v31.16b, v17.16b, v29.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + shl v18.2d, v31.2d, #0xf + bic x23, x8, x23, ror #38 + sri v18.2d, v31.2d, #0x31 + eor x8, x27, x0, ror #2 + eor v31.16b, v11.16b, v25.16b + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + shl v17.2d, v31.2d, #0xa + eor x23, x3, x26, ror #52 + sri v17.2d, v31.2d, #0x36 + eor x3, x29, x30, ror #24 + eor x0, x15, x11, ror #52 + eor v31.16b, v9.16b, v27.16b + eor x0, x0, x13, ror #48 + shl v11.2d, v31.2d, #0x14 + eor x26, x8, x9, ror #57 + sri v11.2d, v31.2d, #0x2c + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor v31.16b, v22.16b, v29.16b + eor x26, x26, x6, ror #51 + shl v9.2d, v31.2d, #0x3d + eor x30, x23, x22, ror #50 + sri v9.2d, v31.2d, #0x3 + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor v31.16b, v14.16b, v27.16b + eor x27, x27, x12, ror #5 + shl v22.2d, v31.2d, #0x27 + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + sri v22.2d, v31.2d, #0x19 + eor x26, x30, x21, ror #26 + eor v31.16b, v20.16b, v26.16b + eor x26, x26, x25, ror #15 + shl v14.2d, v31.2d, #0x12 + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + sri v14.2d, v31.2d, #0x2e + ror x26, x26, #0x3a + eor v31.16b, v4.16b, v27.16b + eor x16, x30, x16 + shl v20.2d, v31.2d, #0x1b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + sri v20.2d, v31.2d, #0x25 + eor x29, x29, x17, ror #36 + eor v31.16b, v24.16b, v27.16b + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + shl v4.2d, v31.2d, #0xe + eor x29, x29, x20, ror #2 + sri v4.2d, v31.2d, #0x32 + eor x28, x28, x4, ror #54 + eor v31.16b, v21.16b, v25.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + shl v24.2d, v31.2d, #0x2 + eor x28, x28, x5, ror #25 + sri v24.2d, v31.2d, #0x3e + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + eor v31.16b, v5.16b, v26.16b + eor x27, x28, x27, ror #61 + shl v21.2d, v31.2d, #0x24 + eor x13, x0, x13, ror #46 + sri v21.2d, v31.2d, #0x1c + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + eor v31.16b, v6.16b, v25.16b + eor x20, x26, x3, ror #39 + shl v27.2d, v31.2d, #0x2c + eor x11, x0, x11, ror #50 + sri v27.2d, v31.2d, #0x14 + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + bic v31.16b, v7.16b, v11.16b + eor x21, x26, x1 + eor v5.16b, v31.16b, v10.16b + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + bic v31.16b, v8.16b, v7.16b + eor x1, x30, x17, ror #36 + eor v6.16b, v31.16b, v11.16b + eor x14, x0, x14, ror #8 + bic v31.16b, v9.16b, v8.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + eor v7.16b, v31.16b, v7.16b + eor x17, x27, x7, ror #19 + bic v31.16b, v10.16b, v9.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor v8.16b, v31.16b, v8.16b + eor x4, x26, x4, ror #54 + bic v31.16b, v11.16b, v10.16b + eor x0, x0, x12, ror #3 + eor v9.16b, v31.16b, v9.16b + eor x28, 
x28, x23, ror #58 + eor x23, x26, x2, ror #61 + bic v31.16b, v12.16b, v16.16b + eor x26, x26, x5, ror #25 + eor v10.16b, v31.16b, v15.16b + eor x2, x7, x16, ror #39 + bic v31.16b, v13.16b, v12.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor v11.16b, v31.16b, v16.16b + eor x7, x7, x22, ror #25 + bic v31.16b, v14.16b, v13.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor v12.16b, v31.16b, v12.16b + eor x30, x27, x6, ror #43 + bic v31.16b, v15.16b, v14.16b + eor x22, x20, x15, ror #23 + eor v13.16b, v31.16b, v13.16b + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bic v31.16b, v16.16b, v15.16b + bic x5, x13, x17, ror #63 + eor v14.16b, v31.16b, v14.16b + eor x5, x21, x5, ror #21 + bic v31.16b, v17.16b, v21.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v15.16b, v31.16b, v20.16b + bic x21, x21, x25, ror #50 + bic v31.16b, v18.16b, v17.16b + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + eor v16.16b, v31.16b, v21.16b + eor x16, x21, x19, ror #43 + bic v31.16b, v19.16b, v18.16b + eor x21, x17, x25, ror #30 + eor v17.16b, v31.16b, v17.16b + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + bic v31.16b, v20.16b, v19.16b + eor x17, x10, x9, ror #47 + eor v18.16b, v31.16b, v18.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + bic v31.16b, v21.16b, v20.16b + bic x20, x4, x28, ror #2 + eor v19.16b, v31.16b, v19.16b + eor x10, x20, x1, ror #50 + bic v31.16b, v22.16b, v1.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + eor v20.16b, v31.16b, v0.16b + bic x4, x28, x1, ror #48 + bic v31.16b, v23.16b, v22.16b + bic x1, x1, x11, ror #57 + eor v21.16b, v31.16b, v1.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + bic v31.16b, v24.16b, v23.16b + add x25, x25, #0x1 + eor v22.16b, v31.16b, v22.16b + str x25, [sp, #0x18] + cmp x25, #0x17 + bic v31.16b, v0.16b, v24.16b + eor x25, x1, x27, ror #53 + eor v23.16b, v31.16b, v23.16b + bic x27, x30, x26, ror #47 + bic v31.16b, v1.16b, v0.16b + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor v24.16b, v31.16b, v24.16b + eor x11, x19, x13, ror #35 + bic v31.16b, v2.16b, v27.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v0.16b, v31.16b, v30.16b + bic x27, x24, x9, ror #47 + bic v31.16b, v3.16b, v2.16b + bic x19, x23, x3, ror #9 + eor v1.16b, v31.16b, v27.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v4.16b, v3.16b + bic x29, x3, x29, ror #35 + eor v2.16b, v31.16b, v2.16b + eor x13, x13, x9, ror #57 + bic v31.16b, v30.16b, v4.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + eor v3.16b, v31.16b, v3.16b + bic x14, x14, x8, ror #5 + bic v31.16b, v27.16b, v30.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + eor v4.16b, v31.16b, v4.16b + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b Lkeccak_f1600_x4_v8a_scalar_hybrid_loop_end: - b.le Lkeccak_f1600_x4_v8a_scalar_hybrid_loop - ror x2, x2, #0x3d - ror x3, x3, #0x27 - ror x4, x4, #0x36 - ror x5, x5, #0x19 - ror x6, x6, #0x2b - ror x7, x7, #0x13 - ror x8, x8, #0x38 - ror x9, x9, #0x31 - ror x10, x10, #0x17 - ror x11, x11, #0x32 - ror x12, x12, #0x3 - ror x13, x13, #0x2e - ror x14, x14, #0x8 - ror x15, x15, #0x3e - ror x17, x17, #0x24 - ror x28, x28, #0x3f - ror x19, x19, #0x25 - ror x20, x20, #0x2 - ror x21, x21, #0x14 - ror x22, x22, #0x2c 
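
This run of plain ror instructions is the deferred-rotation fix-up of the scalar half. The hybrid keeps two further states in general-purpose registers (at offsets 0x190 and 0x258 from x0, selected via the flag at [sp, #0x20]) and interleaves their rounds instruction-by-instruction with the NEON stream so the integer and vector pipelines run concurrently. In the scalar rounds, rho never issues a rotate of its own: lanes are kept in a rotated representation, and since AArch64 logic instructions rotate their second operand for free (eor x8, x8, x17, ror #2; bic x8, x28, x13, ror #47; and so on), every consumer compensates for the producer's pending rotation on the barrel shifter. Only after the last round are the still-outstanding rotations materialized, which is what this ror block does. A minimal C model of the idea (hypothetical helpers, not code from the source):

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        n &= 63;
        return n ? (x >> n) | (x << (64 - n)) : x;
    }

    /* A lane held "lazily rotated": its true value is ROL64(raw, pending).
     * Consuming it costs the same single instruction as a plain operand,
     * because the compensation rides on the shifted second operand:     */
    static inline uint64_t xor_with_lazy(uint64_t acc, uint64_t raw,
                                         unsigned pending)
    {
        return acc ^ ror64(raw, (64 - pending) & 63); /* eor ..., ror #k */
    }

    /* The post-loop fix-up: materialize whatever rotation is pending.   */
    static inline uint64_t settle(uint64_t raw, unsigned pending)
    {
        return ror64(raw, (64 - pending) & 63);       /* ror xN, xN, #k */
    }
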
- ror x23, x23, #0x3a - ror x24, x24, #0x1c - ror x25, x25, #0x9 - ldr x30, [sp, #0x20] - cmp x30, #0x1 - b.eq Lkeccak_f1600_x4_v8a_scalar_hybrid_done - mov x30, #0x1 // =1 - str x30, [sp, #0x20] - ldr x0, [sp] - add x0, x0, #0x190 - stp x1, x6, [x0] - stp x11, x16, [x0, #0x10] - stp x21, x2, [x0, #0x20] - stp x7, x12, [x0, #0x30] - stp x17, x22, [x0, #0x40] - stp x3, x8, [x0, #0x50] - stp x13, x28, [x0, #0x60] - stp x23, x4, [x0, #0x70] - stp x9, x14, [x0, #0x80] - stp x19, x24, [x0, #0x90] - stp x5, x10, [x0, #0xa0] - stp x15, x20, [x0, #0xb0] - str x25, [x0, #0xc0] - sub x0, x0, #0x190 - add x0, x0, #0x258 - ldp x1, x6, [x0] - ldp x11, x16, [x0, #0x10] - ldp x21, x2, [x0, #0x20] - ldp x7, x12, [x0, #0x30] - ldp x17, x22, [x0, #0x40] - ldp x3, x8, [x0, #0x50] - ldp x13, x28, [x0, #0x60] - ldp x23, x4, [x0, #0x70] - ldp x9, x14, [x0, #0x80] - ldp x19, x24, [x0, #0x90] - ldp x5, x10, [x0, #0xa0] - ldp x15, x20, [x0, #0xb0] - ldr x25, [x0, #0xc0] - sub x0, x0, #0x258 - b Lkeccak_f1600_x4_v8a_scalar_hybrid_initial + b.le Lkeccak_f1600_x4_v8a_scalar_hybrid_loop + ror x2, x2, #0x3d + ror x3, x3, #0x27 + ror x4, x4, #0x36 + ror x5, x5, #0x19 + ror x6, x6, #0x2b + ror x7, x7, #0x13 + ror x8, x8, #0x38 + ror x9, x9, #0x31 + ror x10, x10, #0x17 + ror x11, x11, #0x32 + ror x12, x12, #0x3 + ror x13, x13, #0x2e + ror x14, x14, #0x8 + ror x15, x15, #0x3e + ror x17, x17, #0x24 + ror x28, x28, #0x3f + ror x19, x19, #0x25 + ror x20, x20, #0x2 + ror x21, x21, #0x14 + ror x22, x22, #0x2c + ror x23, x23, #0x3a + ror x24, x24, #0x1c + ror x25, x25, #0x9 + ldr x30, [sp, #0x20] + cmp x30, #0x1 + b.eq Lkeccak_f1600_x4_v8a_scalar_hybrid_done + mov x30, #0x1 // =1 + str x30, [sp, #0x20] + ldr x0, [sp] + add x0, x0, #0x190 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x190 + add x0, x0, #0x258 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x258 + b Lkeccak_f1600_x4_v8a_scalar_hybrid_initial Lkeccak_f1600_x4_v8a_scalar_hybrid_done: - ldr x0, [sp] - add x0, x0, #0x258 - stp x1, x6, [x0] - stp x11, x16, [x0, #0x10] - stp x21, x2, [x0, #0x20] - stp x7, x12, [x0, #0x30] - stp x17, x22, [x0, #0x40] - stp x3, x8, [x0, #0x50] - stp x13, x28, [x0, #0x60] - stp x23, x4, [x0, #0x70] - stp x9, x14, [x0, #0x80] - stp x19, x24, [x0, #0x90] - stp x5, x10, [x0, #0xa0] - stp x15, x20, [x0, #0xb0] - str x25, [x0, #0xc0] - sub x0, x0, #0x258 - add x4, x0, #0xc8 - trn1 v25.2d, v0.2d, v1.2d - trn1 v26.2d, v2.2d, v3.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v0.2d, v1.2d - trn2 v28.2d, v2.2d, v3.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v4.2d, v5.2d - trn1 v26.2d, v6.2d, v7.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v4.2d, v5.2d - trn2 v28.2d, v6.2d, v7.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v8.2d, v9.2d - trn1 v26.2d, v10.2d, v11.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v8.2d, v9.2d - trn2 v28.2d, v10.2d, v11.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v12.2d, v13.2d - trn1 
v26.2d, v14.2d, v15.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v12.2d, v13.2d - trn2 v28.2d, v14.2d, v15.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v16.2d, v17.2d - trn1 v26.2d, v18.2d, v19.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v16.2d, v17.2d - trn2 v28.2d, v18.2d, v19.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v20.2d, v21.2d - trn1 v26.2d, v22.2d, v23.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v20.2d, v21.2d - trn2 v28.2d, v22.2d, v23.2d - st1 { v27.2d, v28.2d }, [x4], #32 - str d24, [x0] - trn2 v25.2d, v24.2d, v24.2d - str d25, [x4] - ldp d8, d9, [sp, #0x90] + ldr x0, [sp] + add x0, x0, #0x258 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x258 + add x4, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + st1 { v27.2d, v28.2d }, [x4], #32 + str d24, [x0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x4] + ldp d8, d9, [sp, #0x90] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0xa0] + ldp d10, d11, [sp, #0xa0] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0xb0] + ldp d12, d13, [sp, #0xb0] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0xc0] + ldp d14, d15, [sp, #0xc0] .cfi_restore d14 .cfi_restore d15 - ldp x19, x20, [sp, #0x30] + ldp x19, x20, [sp, #0x30] .cfi_restore x19 .cfi_restore x20 - ldp x21, x22, [sp, #0x40] + ldp x21, x22, [sp, #0x40] .cfi_restore x21 .cfi_restore x22 - ldp x23, x24, [sp, #0x50] + ldp x23, x24, [sp, #0x50] .cfi_restore x23 .cfi_restore x24 - ldp x25, x26, [sp, #0x60] + ldp x25, x26, [sp, #0x60] .cfi_restore x25 .cfi_restore x26 - ldp x27, x28, [sp, #0x70] + ldp x27, x28, [sp, #0x70] .cfi_restore x27 .cfi_restore x28 - ldp x29, x30, [sp, #0x80] + ldp x29, x30, [sp, #0x80] .cfi_restore x29 .cfi_restore x30 - add sp, sp, #0xe0 + add sp, sp, #0xe0 .cfi_adjust_cfa_offset -0xe0 ret .cfi_endproc diff --git a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S index c48c3cf3e..0aa955dc8 100644 --- a/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +++ b/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S @@ -48,929 +48,929 @@ 
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm) .cfi_startproc - sub sp, sp, #0xe0 + sub sp, sp, #0xe0 .cfi_adjust_cfa_offset 0xe0 - stp x19, x20, [sp, #0x30] + stp x19, x20, [sp, #0x30] .cfi_rel_offset x19, 0x30 .cfi_rel_offset x20, 0x38 - stp x21, x22, [sp, #0x40] + stp x21, x22, [sp, #0x40] .cfi_rel_offset x21, 0x40 .cfi_rel_offset x22, 0x48 - stp x23, x24, [sp, #0x50] + stp x23, x24, [sp, #0x50] .cfi_rel_offset x23, 0x50 .cfi_rel_offset x24, 0x58 - stp x25, x26, [sp, #0x60] + stp x25, x26, [sp, #0x60] .cfi_rel_offset x25, 0x60 .cfi_rel_offset x26, 0x68 - stp x27, x28, [sp, #0x70] + stp x27, x28, [sp, #0x70] .cfi_rel_offset x27, 0x70 .cfi_rel_offset x28, 0x78 - stp x29, x30, [sp, #0x80] + stp x29, x30, [sp, #0x80] .cfi_rel_offset x29, 0x80 .cfi_rel_offset x30, 0x88 - stp d8, d9, [sp, #0x90] + stp d8, d9, [sp, #0x90] .cfi_rel_offset d8, 0x90 .cfi_rel_offset d9, 0x98 - stp d10, d11, [sp, #0xa0] + stp d10, d11, [sp, #0xa0] .cfi_rel_offset d10, 0xa0 .cfi_rel_offset d11, 0xa8 - stp d12, d13, [sp, #0xb0] + stp d12, d13, [sp, #0xb0] .cfi_rel_offset d12, 0xb0 .cfi_rel_offset d13, 0xb8 - stp d14, d15, [sp, #0xc0] + stp d14, d15, [sp, #0xc0] .cfi_rel_offset d14, 0xc0 .cfi_rel_offset d15, 0xc8 - mov x29, x1 - mov x30, #0x0 // =0 - str x30, [sp, #0x20] - str x29, [sp, #0x8] - str x29, [sp, #0x10] - str x0, [sp] - add x4, x0, #0xc8 - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v0.2d, v25.2d, v27.2d - trn2 v1.2d, v25.2d, v27.2d - trn1 v2.2d, v26.2d, v28.2d - trn2 v3.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v4.2d, v25.2d, v27.2d - trn2 v5.2d, v25.2d, v27.2d - trn1 v6.2d, v26.2d, v28.2d - trn2 v7.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v8.2d, v25.2d, v27.2d - trn2 v9.2d, v25.2d, v27.2d - trn1 v10.2d, v26.2d, v28.2d - trn2 v11.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v12.2d, v25.2d, v27.2d - trn2 v13.2d, v25.2d, v27.2d - trn1 v14.2d, v26.2d, v28.2d - trn2 v15.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v16.2d, v25.2d, v27.2d - trn2 v17.2d, v25.2d, v27.2d - trn1 v18.2d, v26.2d, v28.2d - trn2 v19.2d, v26.2d, v28.2d - ldp q25, q26, [x0], #0x20 - ld1 { v27.2d, v28.2d }, [x4], #32 - trn1 v20.2d, v25.2d, v27.2d - trn2 v21.2d, v25.2d, v27.2d - trn1 v22.2d, v26.2d, v28.2d - trn2 v23.2d, v26.2d, v28.2d - ldr d25, [x0] - ldr d27, [x4] - trn1 v24.2d, v25.2d, v27.2d - sub x0, x0, #0xc0 - add x0, x0, #0x190 - ldp x1, x6, [x0] - ldp x11, x16, [x0, #0x10] - ldp x21, x2, [x0, #0x20] - ldp x7, x12, [x0, #0x30] - ldp x17, x22, [x0, #0x40] - ldp x3, x8, [x0, #0x50] - ldp x13, x28, [x0, #0x60] - ldp x23, x4, [x0, #0x70] - ldp x9, x14, [x0, #0x80] - ldp x19, x24, [x0, #0x90] - ldp x5, x10, [x0, #0xa0] - ldp x15, x20, [x0, #0xb0] - ldr x25, [x0, #0xc0] - sub x0, x0, #0x190 + mov x29, x1 + mov x30, #0x0 // =0 + str x30, [sp, #0x20] + str x29, [sp, #0x8] + str x29, [sp, #0x10] + str x0, [sp] + add x4, x0, #0xc8 + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, 
v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0] + ldr d27, [x4] + trn1 v24.2d, v25.2d, v27.2d + sub x0, x0, #0xc0 + add x0, x0, #0x190 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x190 Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_initial: - eor x30, x24, x25 - eor x27, x9, x10 - eor3 v30.16b, v0.16b, v5.16b, v10.16b - eor v30.16b, v30.16b, v15.16b - eor x0, x30, x21 - eor x26, x27, x6 - eor v30.16b, v30.16b, v20.16b - eor x27, x26, x7 - eor x29, x0, x22 - eor3 v29.16b, v1.16b, v6.16b, v11.16b - eor x26, x29, x23 - eor x29, x4, x5 - eor v29.16b, v29.16b, v16.16b - eor x30, x29, x1 - eor x0, x27, x8 - eor v29.16b, v29.16b, v21.16b - eor x29, x30, x2 - eor x30, x19, x20 - eor3 v28.16b, v2.16b, v7.16b, v12.16b - eor x30, x30, x16 - eor x27, x26, x0, ror #63 - eor v28.16b, v28.16b, v17.16b - eor x4, x4, x27 - eor x30, x30, x17 - eor v28.16b, v28.16b, v22.16b - eor x30, x30, x28 - eor x29, x29, x3 - eor3 v27.16b, v3.16b, v8.16b, v13.16b - eor x0, x0, x30, ror #63 - eor x30, x30, x29, ror #63 - eor v27.16b, v27.16b, v18.16b - eor x22, x22, x30 - eor v27.16b, v27.16b, v23.16b - eor x23, x23, x30 - str x23, [sp, #0xd0] - eor3 v26.16b, v4.16b, v9.16b, v14.16b - eor x23, x14, x15 - eor x14, x14, x0 - eor v26.16b, v26.16b, v19.16b - eor x23, x23, x11 - eor x15, x15, x0 - eor v26.16b, v26.16b, v24.16b - eor x1, x1, x27 - eor x23, x23, x12 - rax1 v25.2d, v30.2d, v28.2d - eor x23, x23, x13 - eor x11, x11, x0 - add v31.2d, v26.2d, v26.2d - eor x29, x29, x23, ror #63 - eor x23, x23, x26, ror #63 - sri v31.2d, v26.2d, #0x3f - eor x26, x13, x0 - eor x13, x28, x23 - eor v28.16b, v31.16b, v28.16b - eor x28, x24, x30 - eor x24, x16, x23 - rax1 v26.2d, v26.2d, v29.2d - eor x16, x21, x30 - eor x21, x25, x30 - add v31.2d, v27.2d, v27.2d - eor x30, x19, x23 - sri v31.2d, v27.2d, #0x3f - eor x19, x20, x23 - eor x20, x17, x23 - eor v29.16b, v31.16b, v29.16b - eor x17, x12, x0 - eor x0, x2, x27 - rax1 v27.2d, v27.2d, v30.2d - eor x2, x6, x29 - eor x6, x8, x29 - eor v30.16b, v0.16b, v26.16b - bic x8, x28, x13, ror #47 - eor x12, x3, x27 - eor v31.16b, v2.16b, v29.16b - bic x3, x13, x17, ror #19 - eor x5, x5, x27 - shl v0.2d, v31.2d, #0x3e - ldr x27, [sp, #0xd0] - bic x25, x17, x2, ror #5 - sri v0.2d, v31.2d, #0x2 - eor x9, x9, x29 - eor x23, x25, x5, ror #52 - xar v2.2d, v12.2d, v29.2d, #0x15 - eor x3, x3, x2, ror #24 - eor x8, x8, x17, ror #2 - eor v31.16b, v13.16b, v28.16b - eor x17, x10, x29 - bic x25, x12, x22, ror #47 - shl v12.2d, v31.2d, #0x19 - eor x29, x7, x29 - bic x10, x4, x27, ror #2 - sri v12.2d, v31.2d, #0x27 - bic x7, x5, x28, ror #10 - xar v13.2d, v19.2d, v27.2d, #0x38 - eor x10, x10, x20, ror #50 - eor x13, x7, x13, ror #57 - eor v31.16b, 
v23.16b, v28.16b - bic x7, x2, x5, ror #47 - eor x2, x25, x24, ror #39 - shl v19.2d, v31.2d, #0x38 - bic x25, x20, x11, ror #57 - bic x5, x17, x4, ror #25 - sri v19.2d, v31.2d, #0x8 - eor x25, x25, x17, ror #53 - bic x17, x11, x17, ror #60 - xar v23.2d, v15.2d, v26.2d, #0x17 - eor x28, x7, x28, ror #57 - bic x7, x9, x12, ror #42 - eor v31.16b, v1.16b, v25.16b - eor x7, x7, x22, ror #25 - bic x22, x22, x24, ror #56 - shl v15.2d, v31.2d, #0x1 - bic x24, x24, x15, ror #31 - eor x22, x22, x15, ror #23 - sri v15.2d, v31.2d, #0x3f - bic x20, x27, x20, ror #48 - bic x15, x15, x9, ror #16 - xar v1.2d, v8.2d, v28.2d, #0x9 - eor x12, x15, x12, ror #58 - eor x15, x5, x27, ror #27 - eor v31.16b, v16.16b, v25.16b - eor x5, x20, x11, ror #41 - shl v8.2d, v31.2d, #0x2d - ldr x11, [sp, #0x8] - eor x20, x17, x4, ror #21 - sri v8.2d, v31.2d, #0x13 - eor x17, x24, x9, ror #47 - mov x24, #0x1 // =1 - xar v16.2d, v7.2d, v29.2d, #0x3a - bic x9, x0, x16, ror #9 - str x24, [sp, #0x18] - eor v31.16b, v10.16b, v26.16b - bic x24, x29, x1, ror #44 - bic x27, x1, x21, ror #50 - shl v7.2d, v31.2d, #0x3 - bic x4, x26, x29, ror #63 - eor x1, x1, x4, ror #21 - sri v7.2d, v31.2d, #0x3d - ldr x11, [x11] - bic x4, x21, x30, ror #57 - xar v10.2d, v3.2d, v28.2d, #0x24 - eor x21, x24, x21, ror #30 - eor x24, x9, x19, ror #44 - eor v31.16b, v18.16b, v28.16b - bic x9, x14, x6, ror #5 - eor x9, x9, x0, ror #43 - shl v3.2d, v31.2d, #0x15 - bic x0, x6, x0, ror #38 - eor x1, x1, x11 - sri v3.2d, v31.2d, #0x2b - eor x11, x4, x26, ror #35 - eor x4, x0, x16, ror #47 - xar v18.2d, v17.2d, v29.2d, #0x31 - bic x0, x16, x19, ror #35 - eor v31.16b, v11.16b, v25.16b - eor x16, x27, x30, ror #43 - bic x27, x30, x26, ror #42 - shl v17.2d, v31.2d, #0xa - bic x26, x19, x14, ror #41 - eor x19, x0, x14, ror #12 - sri v17.2d, v31.2d, #0x36 - eor x14, x26, x6, ror #46 - eor x6, x27, x29, ror #41 - xar v11.2d, v9.2d, v27.2d, #0x2c - eor x0, x15, x11, ror #52 - eor x0, x0, x13, ror #48 - eor v31.16b, v22.16b, v29.16b - eor x26, x8, x9, ror #57 - eor x27, x0, x14, ror #10 - shl v9.2d, v31.2d, #0x3d - eor x29, x16, x28, ror #63 - eor x26, x26, x6, ror #51 - sri v9.2d, v31.2d, #0x3 - eor x30, x23, x22, ror #50 - eor x0, x26, x10, ror #31 - xar v22.2d, v14.2d, v27.2d, #0x19 - eor x29, x29, x19, ror #37 - eor x27, x27, x12, ror #5 - eor v31.16b, v20.16b, v26.16b - eor x30, x30, x24, ror #34 - eor x0, x0, x7, ror #27 - shl v14.2d, v31.2d, #0x12 - eor x26, x30, x21, ror #26 - sri v14.2d, v31.2d, #0x2e - eor x26, x26, x25, ror #15 - ror x30, x27, #0x3e - xar v20.2d, v4.2d, v27.2d, #0x25 - eor x30, x30, x26, ror #57 - ror x26, x26, #0x3a - eor v31.16b, v24.16b, v27.16b - eor x16, x30, x16 - eor x28, x30, x28, ror #63 - shl v4.2d, v31.2d, #0xe - str x28, [sp, #0xd0] - eor x29, x29, x17, ror #36 - sri v4.2d, v31.2d, #0x32 - eor x28, x1, x2, ror #61 - eor x19, x30, x19, ror #37 - xar v24.2d, v21.2d, v25.2d, #0x3e - eor x29, x29, x20, ror #2 - eor x28, x28, x4, ror #54 - eor v31.16b, v5.16b, v26.16b - eor x26, x26, x0, ror #55 - eor x28, x28, x3, ror #39 - shl v21.2d, v31.2d, #0x24 - eor x28, x28, x5, ror #25 - ror x0, x0, #0x38 - sri v21.2d, v31.2d, #0x1c - eor x0, x0, x29, ror #63 - eor x27, x28, x27, ror #61 - xar v27.2d, v6.2d, v25.2d, #0x14 - eor x13, x0, x13, ror #46 - eor x28, x29, x28, ror #63 - bic v31.16b, v7.16b, v11.16b - eor x29, x30, x20, ror #2 - eor v5.16b, v31.16b, v10.16b - eor x20, x26, x3, ror #39 - eor x11, x0, x11, ror #50 - bcax v6.16b, v11.16b, v8.16b, v7.16b - eor x25, x28, x25, ror #9 - eor x3, x28, x21, ror #20 - bic v31.16b, v9.16b, 
v8.16b - eor x21, x26, x1 - eor x9, x27, x9, ror #49 - eor v7.16b, v31.16b, v7.16b - eor x24, x28, x24, ror #28 - eor x1, x30, x17, ror #36 - bcax v8.16b, v8.16b, v10.16b, v9.16b - eor x14, x0, x14, ror #8 - eor x22, x28, x22, ror #44 - bic v31.16b, v11.16b, v10.16b - eor x8, x27, x8, ror #56 - eor x17, x27, x7, ror #19 - eor v9.16b, v31.16b, v9.16b - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - bcax v10.16b, v15.16b, v12.16b, v16.16b - eor x4, x26, x4, ror #54 - eor x0, x0, x12, ror #3 - bic v31.16b, v13.16b, v12.16b - eor x28, x28, x23, ror #58 - eor x23, x26, x2, ror #61 - eor v11.16b, v31.16b, v16.16b - eor x26, x26, x5, ror #25 - bcax v12.16b, v12.16b, v14.16b, v13.16b - eor x2, x7, x16, ror #39 - bic x7, x9, x20, ror #42 - bic v31.16b, v15.16b, v14.16b - bic x30, x15, x9, ror #16 - eor x7, x7, x22, ror #25 - eor v13.16b, v31.16b, v13.16b - eor x12, x30, x20, ror #58 - bic x20, x22, x16, ror #56 - bic v31.16b, v16.16b, v15.16b - eor x30, x27, x6, ror #43 - eor x22, x20, x15, ror #23 - eor v14.16b, v31.16b, v14.16b - bic x6, x19, x13, ror #42 - eor x6, x6, x17, ror #41 - bcax v15.16b, v20.16b, v17.16b, v21.16b - bic x5, x13, x17, ror #63 - eor x5, x21, x5, ror #21 - bic v31.16b, v18.16b, v17.16b - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - eor v16.16b, v31.16b, v21.16b - bic x21, x21, x25, ror #50 - bic x20, x27, x4, ror #25 - bcax v17.16b, v17.16b, v19.16b, v18.16b - bic x10, x16, x15, ror #31 - eor x16, x21, x19, ror #43 - bic v31.16b, v20.16b, v19.16b - eor x21, x17, x25, ror #30 - bic x19, x25, x19, ror #57 - eor v18.16b, v31.16b, v18.16b - ldr x25, [sp, #0x18] - bcax v19.16b, v19.16b, v21.16b, v20.16b - eor x17, x10, x9, ror #47 - ldr x9, [sp, #0x8] - bic v31.16b, v22.16b, v1.16b - eor x15, x20, x28, ror #27 - bic x20, x4, x28, ror #2 - eor v20.16b, v31.16b, v0.16b - eor x10, x20, x1, ror #50 - bic x20, x11, x27, ror #60 - bcax v21.16b, v1.16b, v23.16b, v22.16b - eor x20, x20, x4, ror #21 - bic x4, x28, x1, ror #48 - bic v31.16b, v24.16b, v23.16b - bic x1, x1, x11, ror #57 - ldr x28, [x9, x25, lsl #3] - eor v22.16b, v31.16b, v22.16b - ldr x9, [sp, #0xd0] - add x25, x25, #0x1 - bcax v23.16b, v23.16b, v0.16b, v24.16b - str x25, [sp, #0x18] - cmp x25, #0x17 - bic v31.16b, v1.16b, v0.16b - eor x25, x1, x27, ror #53 - bic x27, x30, x26, ror #47 - eor v24.16b, v31.16b, v24.16b - eor x1, x5, x28 - eor x5, x4, x11, ror #41 - bcax v0.16b, v30.16b, v2.16b, v27.16b - eor x11, x19, x13, ror #35 - bic v31.16b, v3.16b, v2.16b - bic x13, x26, x24, ror #10 - eor x28, x27, x24, ror #57 - eor v1.16b, v31.16b, v27.16b - bic x27, x24, x9, ror #47 - bic x19, x23, x3, ror #9 - bcax v2.16b, v2.16b, v4.16b, v3.16b - bic x4, x29, x14, ror #41 - eor x24, x19, x29, ror #44 - bic v31.16b, v30.16b, v4.16b - bic x29, x3, x29, ror #35 - eor x13, x13, x9, ror #57 - eor v3.16b, v31.16b, v3.16b - eor x19, x29, x14, ror #12 - bic x29, x9, x0, ror #19 - bcax v4.16b, v4.16b, v27.16b, v30.16b - bic x14, x14, x8, ror #5 - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - ldr x30, [sp, #0x10] - ld1r { v28.2d }, [x30], #8 - str x30, [sp, #0x10] - eor v0.16b, v0.16b, v28.16b + eor x30, x24, x25 + eor x27, x9, x10 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x0, x30, x21 + eor x26, x27, x6 + eor v30.16b, v30.16b, v20.16b + eor x27, x26, x7 + eor x29, x0, x22 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + 
eor x26, x29, x23 + eor x29, x4, x5 + eor v29.16b, v29.16b, v16.16b + eor x30, x29, x1 + eor x0, x27, x8 + eor v29.16b, v29.16b, v21.16b + eor x29, x30, x2 + eor x30, x19, x20 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor v28.16b, v28.16b, v17.16b + eor x4, x4, x27 + eor x30, x30, x17 + eor v28.16b, v28.16b, v22.16b + eor x30, x30, x28 + eor x29, x29, x3 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x30, ror #63 + eor x30, x30, x29, ror #63 + eor v27.16b, v27.16b, v18.16b + eor x22, x22, x30 + eor v27.16b, v27.16b, v23.16b + eor x23, x23, x30 + str x23, [sp, #0xd0] + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x23, x14, x15 + eor x14, x14, x0 + eor v26.16b, v26.16b, v19.16b + eor x23, x23, x11 + eor x15, x15, x0 + eor v26.16b, v26.16b, v24.16b + eor x1, x1, x27 + eor x23, x23, x12 + rax1 v25.2d, v30.2d, v28.2d + eor x23, x23, x13 + eor x11, x11, x0 + add v31.2d, v26.2d, v26.2d + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + sri v31.2d, v26.2d, #0x3f + eor x26, x13, x0 + eor x13, x28, x23 + eor v28.16b, v31.16b, v28.16b + eor x28, x24, x30 + eor x24, x16, x23 + rax1 v26.2d, v26.2d, v29.2d + eor x16, x21, x30 + eor x21, x25, x30 + add v31.2d, v27.2d, v27.2d + eor x30, x19, x23 + sri v31.2d, v27.2d, #0x3f + eor x19, x20, x23 + eor x20, x17, x23 + eor v29.16b, v31.16b, v29.16b + eor x17, x12, x0 + eor x0, x2, x27 + rax1 v27.2d, v27.2d, v30.2d + eor x2, x6, x29 + eor x6, x8, x29 + eor v30.16b, v0.16b, v26.16b + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + eor v31.16b, v2.16b, v29.16b + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + shl v0.2d, v31.2d, #0x3e + ldr x27, [sp, #0xd0] + bic x25, x17, x2, ror #5 + sri v0.2d, v31.2d, #0x2 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + xar v2.2d, v12.2d, v29.2d, #0x15 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor v31.16b, v13.16b, v28.16b + eor x17, x10, x29 + bic x25, x12, x22, ror #47 + shl v12.2d, v31.2d, #0x19 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + sri v12.2d, v31.2d, #0x27 + bic x7, x5, x28, ror #10 + xar v13.2d, v19.2d, v27.2d, #0x38 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + eor v31.16b, v23.16b, v28.16b + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + shl v19.2d, v31.2d, #0x38 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor v31.16b, v1.16b, v25.16b + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + shl v15.2d, v31.2d, #0x1 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + sri v15.2d, v31.2d, #0x3f + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor v31.16b, v16.16b, v25.16b + eor x5, x20, x11, ror #41 + shl v8.2d, v31.2d, #0x2d + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + sri v8.2d, v31.2d, #0x13 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + xar v16.2d, v7.2d, v29.2d, #0x3a + bic x9, x0, x16, ror #9 + str x24, [sp, #0x18] + eor v31.16b, v10.16b, v26.16b + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + shl v7.2d, v31.2d, #0x3 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + sri v7.2d, v31.2d, #0x3d + ldr x11, [x11] + bic x4, x21, x30, ror #57 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + eor v31.16b, v18.16b, v28.16b + bic x9, x14, x6, ror #5 + eor x9, x9, 
x0, ror #43 + shl v3.2d, v31.2d, #0x15 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + sri v3.2d, v31.2d, #0x2b + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + xar v18.2d, v17.2d, v29.2d, #0x31 + bic x0, x16, x19, ror #35 + eor v31.16b, v11.16b, v25.16b + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + shl v17.2d, v31.2d, #0xa + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + sri v17.2d, v31.2d, #0x36 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor v31.16b, v22.16b, v29.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + shl v9.2d, v31.2d, #0x3d + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + sri v9.2d, v31.2d, #0x3 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v31.16b, v20.16b, v26.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + shl v14.2d, v31.2d, #0x12 + eor x26, x30, x21, ror #26 + sri v14.2d, v31.2d, #0x2e + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor v31.16b, v24.16b, v27.16b + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + shl v4.2d, v31.2d, #0xe + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + sri v4.2d, v31.2d, #0x32 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + eor v31.16b, v5.16b, v26.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + shl v21.2d, v31.2d, #0x24 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + sri v21.2d, v31.2d, #0x1c + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + bic v31.16b, v7.16b, v11.16b + eor x29, x30, x20, ror #2 + eor v5.16b, v31.16b, v10.16b + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + bic v31.16b, v9.16b, v8.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor v7.16b, v31.16b, v7.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + bic v31.16b, v11.16b, v10.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor v9.16b, v31.16b, v9.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + bic v31.16b, v13.16b, v12.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor v11.16b, v31.16b, v16.16b + eor x26, x26, x5, ror #25 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic v31.16b, v15.16b, v14.16b + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor v13.16b, v31.16b, v13.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + bic v31.16b, v16.16b, v15.16b + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + eor v14.16b, v31.16b, v14.16b + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bcax v15.16b, v20.16b, v17.16b, v21.16b + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic v31.16b, v18.16b, v17.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v16.16b, v31.16b, v21.16b + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bcax 
v17.16b, v17.16b, v19.16b, v18.16b + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + bic v31.16b, v20.16b, v19.16b + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v18.16b, v31.16b, v18.16b + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + bic v31.16b, v22.16b, v1.16b + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor v20.16b, v31.16b, v0.16b + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + bcax v21.16b, v1.16b, v23.16b, v22.16b + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic v31.16b, v24.16b, v23.16b + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + eor v22.16b, v31.16b, v22.16b + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + bcax v23.16b, v23.16b, v0.16b, v24.16b + str x25, [sp, #0x18] + cmp x25, #0x17 + bic v31.16b, v1.16b, v0.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor v24.16b, v31.16b, v24.16b + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop: - eor x0, x15, x11, ror #52 - eor x0, x0, x13, ror #48 - eor3 v30.16b, v0.16b, v5.16b, v10.16b - eor v30.16b, v30.16b, v15.16b - eor x26, x8, x9, ror #57 - eor x27, x0, x14, ror #10 - eor v30.16b, v30.16b, v20.16b - eor x29, x16, x28, ror #63 - eor x26, x26, x6, ror #51 - eor3 v29.16b, v1.16b, v6.16b, v11.16b - eor x30, x23, x22, ror #50 - eor x0, x26, x10, ror #31 - eor v29.16b, v29.16b, v16.16b - eor x29, x29, x19, ror #37 - eor x27, x27, x12, ror #5 - eor v29.16b, v29.16b, v21.16b - eor x30, x30, x24, ror #34 - eor x0, x0, x7, ror #27 - eor3 v28.16b, v2.16b, v7.16b, v12.16b - eor x26, x30, x21, ror #26 - eor x26, x26, x25, ror #15 - eor v28.16b, v28.16b, v17.16b - ror x30, x27, #0x3e - eor x30, x30, x26, ror #57 - eor v28.16b, v28.16b, v22.16b - ror x26, x26, #0x3a - eor x16, x30, x16 - eor3 v27.16b, v3.16b, v8.16b, v13.16b - eor x28, x30, x28, ror #63 - str x28, [sp, #0xd0] - eor v27.16b, v27.16b, v18.16b - eor x29, x29, x17, ror #36 - eor x28, x1, x2, ror #61 - eor v27.16b, v27.16b, v23.16b - eor x19, x30, x19, ror #37 - eor x29, x29, x20, ror #2 - eor3 v26.16b, v4.16b, v9.16b, v14.16b - eor x28, x28, x4, ror #54 - eor x26, x26, x0, ror #55 - eor v26.16b, v26.16b, v19.16b - eor x28, x28, x3, ror #39 - eor x28, x28, x5, ror #25 - eor v26.16b, v26.16b, v24.16b - ror x0, x0, #0x38 - eor x0, x0, x29, ror #63 - rax1 v25.2d, v30.2d, v28.2d - eor x27, x28, x27, ror #61 - eor x13, x0, x13, ror #46 - add v31.2d, v26.2d, v26.2d - eor x28, x29, x28, ror #63 - eor x29, x30, x20, ror #2 - sri v31.2d, v26.2d, #0x3f - eor x20, x26, x3, ror #39 - eor x11, x0, x11, ror #50 - eor 
v28.16b, v31.16b, v28.16b - eor x25, x28, x25, ror #9 - eor x3, x28, x21, ror #20 - rax1 v26.2d, v26.2d, v29.2d - eor x21, x26, x1 - add v31.2d, v27.2d, v27.2d - eor x9, x27, x9, ror #49 - eor x24, x28, x24, ror #28 - sri v31.2d, v27.2d, #0x3f - eor x1, x30, x17, ror #36 - eor x14, x0, x14, ror #8 - eor v29.16b, v31.16b, v29.16b - eor x22, x28, x22, ror #44 - eor x8, x27, x8, ror #56 - rax1 v27.2d, v27.2d, v30.2d - eor x17, x27, x7, ror #19 - eor x15, x0, x15, ror #62 - eor v30.16b, v0.16b, v26.16b - bic x7, x20, x22, ror #47 - eor x4, x26, x4, ror #54 - eor v31.16b, v2.16b, v29.16b - eor x0, x0, x12, ror #3 - eor x28, x28, x23, ror #58 - shl v0.2d, v31.2d, #0x3e - eor x23, x26, x2, ror #61 - eor x26, x26, x5, ror #25 - sri v0.2d, v31.2d, #0x2 - eor x2, x7, x16, ror #39 - bic x7, x9, x20, ror #42 - xar v2.2d, v12.2d, v29.2d, #0x15 - bic x30, x15, x9, ror #16 - eor x7, x7, x22, ror #25 - eor v31.16b, v13.16b, v28.16b - eor x12, x30, x20, ror #58 - bic x20, x22, x16, ror #56 - shl v12.2d, v31.2d, #0x19 - eor x30, x27, x6, ror #43 - eor x22, x20, x15, ror #23 - sri v12.2d, v31.2d, #0x27 - bic x6, x19, x13, ror #42 - eor x6, x6, x17, ror #41 - xar v13.2d, v19.2d, v27.2d, #0x38 - bic x5, x13, x17, ror #63 - eor x5, x21, x5, ror #21 - eor v31.16b, v23.16b, v28.16b - bic x17, x17, x21, ror #44 - eor x27, x27, x10, ror #23 - shl v19.2d, v31.2d, #0x38 - bic x21, x21, x25, ror #50 - bic x20, x27, x4, ror #25 - sri v19.2d, v31.2d, #0x8 - bic x10, x16, x15, ror #31 - eor x16, x21, x19, ror #43 - xar v23.2d, v15.2d, v26.2d, #0x17 - eor x21, x17, x25, ror #30 - bic x19, x25, x19, ror #57 - eor v31.16b, v1.16b, v25.16b - ldr x25, [sp, #0x18] - eor x17, x10, x9, ror #47 - shl v15.2d, v31.2d, #0x1 - ldr x9, [sp, #0x8] - sri v15.2d, v31.2d, #0x3f - eor x15, x20, x28, ror #27 - bic x20, x4, x28, ror #2 - xar v1.2d, v8.2d, v28.2d, #0x9 - eor x10, x20, x1, ror #50 - bic x20, x11, x27, ror #60 - eor v31.16b, v16.16b, v25.16b - eor x20, x20, x4, ror #21 - bic x4, x28, x1, ror #48 - shl v8.2d, v31.2d, #0x2d - bic x1, x1, x11, ror #57 - ldr x28, [x9, x25, lsl #3] - sri v8.2d, v31.2d, #0x13 - ldr x9, [sp, #0xd0] - add x25, x25, #0x1 - xar v16.2d, v7.2d, v29.2d, #0x3a - str x25, [sp, #0x18] - cmp x25, #0x17 - eor v31.16b, v10.16b, v26.16b - eor x25, x1, x27, ror #53 - bic x27, x30, x26, ror #47 - shl v7.2d, v31.2d, #0x3 - eor x1, x5, x28 - eor x5, x4, x11, ror #41 - sri v7.2d, v31.2d, #0x3d - eor x11, x19, x13, ror #35 - bic x13, x26, x24, ror #10 - xar v10.2d, v3.2d, v28.2d, #0x24 - eor x28, x27, x24, ror #57 - bic x27, x24, x9, ror #47 - eor v31.16b, v18.16b, v28.16b - bic x19, x23, x3, ror #9 - bic x4, x29, x14, ror #41 - shl v3.2d, v31.2d, #0x15 - eor x24, x19, x29, ror #44 - bic x29, x3, x29, ror #35 - sri v3.2d, v31.2d, #0x2b - eor x13, x13, x9, ror #57 - eor x19, x29, x14, ror #12 - xar v18.2d, v17.2d, v29.2d, #0x31 - bic x29, x9, x0, ror #19 - bic x14, x14, x8, ror #5 - eor v31.16b, v11.16b, v25.16b - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - shl v17.2d, v31.2d, #0xa - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - sri v17.2d, v31.2d, #0x36 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - xar v11.2d, v9.2d, v27.2d, #0x2c - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - eor v31.16b, v22.16b, v29.16b - eor x0, x15, x11, ror #52 - shl v9.2d, v31.2d, #0x3d - eor x0, x0, x13, ror #48 - eor x26, x8, x9, ror #57 - sri v9.2d, v31.2d, #0x3 - eor x27, x0, x14, ror #10 - eor x29, x16, x28, ror #63 - xar v22.2d, v14.2d, v27.2d, #0x19 - eor x26, x26, x6, ror #51 - eor x30, x23, x22, ror 
#50 - eor v31.16b, v20.16b, v26.16b - eor x0, x26, x10, ror #31 - eor x29, x29, x19, ror #37 - shl v14.2d, v31.2d, #0x12 - eor x27, x27, x12, ror #5 - eor x30, x30, x24, ror #34 - sri v14.2d, v31.2d, #0x2e - eor x0, x0, x7, ror #27 - eor x26, x30, x21, ror #26 - xar v20.2d, v4.2d, v27.2d, #0x25 - eor x26, x26, x25, ror #15 - ror x30, x27, #0x3e - eor v31.16b, v24.16b, v27.16b - eor x30, x30, x26, ror #57 - ror x26, x26, #0x3a - shl v4.2d, v31.2d, #0xe - eor x16, x30, x16 - eor x28, x30, x28, ror #63 - sri v4.2d, v31.2d, #0x32 - str x28, [sp, #0xd0] - eor x29, x29, x17, ror #36 - xar v24.2d, v21.2d, v25.2d, #0x3e - eor x28, x1, x2, ror #61 - eor x19, x30, x19, ror #37 - eor v31.16b, v5.16b, v26.16b - eor x29, x29, x20, ror #2 - eor x28, x28, x4, ror #54 - shl v21.2d, v31.2d, #0x24 - eor x26, x26, x0, ror #55 - eor x28, x28, x3, ror #39 - sri v21.2d, v31.2d, #0x1c - eor x28, x28, x5, ror #25 - ror x0, x0, #0x38 - xar v27.2d, v6.2d, v25.2d, #0x14 - eor x0, x0, x29, ror #63 - eor x27, x28, x27, ror #61 - bic v31.16b, v7.16b, v11.16b - eor x13, x0, x13, ror #46 - eor x28, x29, x28, ror #63 - eor v5.16b, v31.16b, v10.16b - eor x29, x30, x20, ror #2 - eor x20, x26, x3, ror #39 - bcax v6.16b, v11.16b, v8.16b, v7.16b - eor x11, x0, x11, ror #50 - eor x25, x28, x25, ror #9 - bic v31.16b, v9.16b, v8.16b - eor x3, x28, x21, ror #20 - eor v7.16b, v31.16b, v7.16b - eor x21, x26, x1 - eor x9, x27, x9, ror #49 - bcax v8.16b, v8.16b, v10.16b, v9.16b - eor x24, x28, x24, ror #28 - eor x1, x30, x17, ror #36 - bic v31.16b, v11.16b, v10.16b - eor x14, x0, x14, ror #8 - eor x22, x28, x22, ror #44 - eor v9.16b, v31.16b, v9.16b - eor x8, x27, x8, ror #56 - eor x17, x27, x7, ror #19 - bcax v10.16b, v15.16b, v12.16b, v16.16b - eor x15, x0, x15, ror #62 - bic x7, x20, x22, ror #47 - bic v31.16b, v13.16b, v12.16b - eor x4, x26, x4, ror #54 - eor x0, x0, x12, ror #3 - eor v11.16b, v31.16b, v16.16b - eor x28, x28, x23, ror #58 - eor x23, x26, x2, ror #61 - bcax v12.16b, v12.16b, v14.16b, v13.16b - eor x26, x26, x5, ror #25 - eor x2, x7, x16, ror #39 - bic v31.16b, v15.16b, v14.16b - bic x7, x9, x20, ror #42 - bic x30, x15, x9, ror #16 - eor v13.16b, v31.16b, v13.16b - eor x7, x7, x22, ror #25 - eor x12, x30, x20, ror #58 - bic v31.16b, v16.16b, v15.16b - bic x20, x22, x16, ror #56 - eor x30, x27, x6, ror #43 - eor v14.16b, v31.16b, v14.16b - eor x22, x20, x15, ror #23 - bic x6, x19, x13, ror #42 - bcax v15.16b, v20.16b, v17.16b, v21.16b - eor x6, x6, x17, ror #41 - bic x5, x13, x17, ror #63 - bic v31.16b, v18.16b, v17.16b - eor x5, x21, x5, ror #21 - bic x17, x17, x21, ror #44 - eor v16.16b, v31.16b, v21.16b - eor x27, x27, x10, ror #23 - bic x21, x21, x25, ror #50 - bcax v17.16b, v17.16b, v19.16b, v18.16b - bic x20, x27, x4, ror #25 - bic x10, x16, x15, ror #31 - bic v31.16b, v20.16b, v19.16b - eor x16, x21, x19, ror #43 - eor x21, x17, x25, ror #30 - eor v18.16b, v31.16b, v18.16b - bic x19, x25, x19, ror #57 - ldr x25, [sp, #0x18] - bcax v19.16b, v19.16b, v21.16b, v20.16b - eor x17, x10, x9, ror #47 - bic v31.16b, v22.16b, v1.16b - ldr x9, [sp, #0x8] - eor x15, x20, x28, ror #27 - eor v20.16b, v31.16b, v0.16b - bic x20, x4, x28, ror #2 - eor x10, x20, x1, ror #50 - bcax v21.16b, v1.16b, v23.16b, v22.16b - bic x20, x11, x27, ror #60 - eor x20, x20, x4, ror #21 - bic v31.16b, v24.16b, v23.16b - bic x4, x28, x1, ror #48 - bic x1, x1, x11, ror #57 - eor v22.16b, v31.16b, v22.16b - ldr x28, [x9, x25, lsl #3] - ldr x9, [sp, #0xd0] - bcax v23.16b, v23.16b, v0.16b, v24.16b - add x25, x25, #0x1 - str x25, [sp, #0x18] - bic 
v31.16b, v1.16b, v0.16b - cmp x25, #0x17 - eor x25, x1, x27, ror #53 - eor v24.16b, v31.16b, v24.16b - bic x27, x30, x26, ror #47 - eor x1, x5, x28 - bcax v0.16b, v30.16b, v2.16b, v27.16b - eor x5, x4, x11, ror #41 - eor x11, x19, x13, ror #35 - bic v31.16b, v3.16b, v2.16b - bic x13, x26, x24, ror #10 - eor x28, x27, x24, ror #57 - eor v1.16b, v31.16b, v27.16b - bic x27, x24, x9, ror #47 - bic x19, x23, x3, ror #9 - bcax v2.16b, v2.16b, v4.16b, v3.16b - bic x4, x29, x14, ror #41 - eor x24, x19, x29, ror #44 - bic v31.16b, v30.16b, v4.16b - bic x29, x3, x29, ror #35 - eor x13, x13, x9, ror #57 - eor v3.16b, v31.16b, v3.16b - eor x19, x29, x14, ror #12 - bic x29, x9, x0, ror #19 - bcax v4.16b, v4.16b, v27.16b, v30.16b - bic x14, x14, x8, ror #5 - eor x9, x14, x23, ror #43 - eor x14, x4, x8, ror #46 - bic x23, x8, x23, ror #38 - eor x8, x27, x0, ror #2 - eor x4, x23, x3, ror #47 - bic x3, x0, x30, ror #5 - eor x23, x3, x26, ror #52 - eor x3, x29, x30, ror #24 - ldr x30, [sp, #0x10] - ld1r { v28.2d }, [x30], #8 - str x30, [sp, #0x10] - eor v0.16b, v0.16b, v28.16b + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor v30.16b, v30.16b, v20.16b + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor v29.16b, v29.16b, v16.16b + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v29.16b, v29.16b, v21.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + eor v28.16b, v28.16b, v17.16b + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + eor v28.16b, v28.16b, v22.16b + ror x26, x26, #0x3a + eor x16, x30, x16 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + eor v27.16b, v27.16b, v18.16b + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor v27.16b, v27.16b, v23.16b + eor x19, x30, x19, ror #37 + eor x29, x29, x20, ror #2 + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor v26.16b, v26.16b, v19.16b + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + eor v26.16b, v26.16b, v24.16b + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + rax1 v25.2d, v30.2d, v28.2d + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + add v31.2d, v26.2d, v26.2d + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + sri v31.2d, v26.2d, #0x3f + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + eor v28.16b, v31.16b, v28.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + rax1 v26.2d, v26.2d, v29.2d + eor x21, x26, x1 + add v31.2d, v27.2d, v27.2d + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + sri v31.2d, v27.2d, #0x3f + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor v29.16b, v31.16b, v29.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + rax1 v27.2d, v27.2d, v30.2d + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + eor v30.16b, v0.16b, v26.16b + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror #54 + eor v31.16b, v2.16b, v29.16b + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + shl v0.2d, v31.2d, #0x3e + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + sri v0.2d, v31.2d, #0x2 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + xar v2.2d, v12.2d, v29.2d, #0x15 + bic x30, x15, x9, ror #16 + eor x7, x7, 
x22, ror #25 + eor v31.16b, v13.16b, v28.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + shl v12.2d, v31.2d, #0x19 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + sri v12.2d, v31.2d, #0x27 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + xar v13.2d, v19.2d, v27.2d, #0x38 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + eor v31.16b, v23.16b, v28.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + shl v19.2d, v31.2d, #0x38 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v31.16b, v1.16b, v25.16b + ldr x25, [sp, #0x18] + eor x17, x10, x9, ror #47 + shl v15.2d, v31.2d, #0x1 + ldr x9, [sp, #0x8] + sri v15.2d, v31.2d, #0x3f + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor v31.16b, v16.16b, v25.16b + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + shl v8.2d, v31.2d, #0x2d + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + sri v8.2d, v31.2d, #0x13 + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + xar v16.2d, v7.2d, v29.2d, #0x3a + str x25, [sp, #0x18] + cmp x25, #0x17 + eor v31.16b, v10.16b, v26.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + shl v7.2d, v31.2d, #0x3 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + sri v7.2d, v31.2d, #0x3d + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + eor v31.16b, v18.16b, v28.16b + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + shl v3.2d, v31.2d, #0x15 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + sri v3.2d, v31.2d, #0x2b + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + xar v18.2d, v17.2d, v29.2d, #0x31 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor v31.16b, v11.16b, v25.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + shl v17.2d, v31.2d, #0xa + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + sri v17.2d, v31.2d, #0x36 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + eor v31.16b, v22.16b, v29.16b + eor x0, x15, x11, ror #52 + shl v9.2d, v31.2d, #0x3d + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + sri v9.2d, v31.2d, #0x3 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor v31.16b, v20.16b, v26.16b + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + shl v14.2d, v31.2d, #0x12 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + sri v14.2d, v31.2d, #0x2e + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor v31.16b, v24.16b, v27.16b + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + shl v4.2d, v31.2d, #0xe + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + sri v4.2d, v31.2d, #0x32 + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor v31.16b, v5.16b, v26.16b + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + shl v21.2d, v31.2d, #0x24 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + sri v21.2d, v31.2d, #0x1c + eor 
x28, x28, x5, ror #25 + ror x0, x0, #0x38 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + bic v31.16b, v7.16b, v11.16b + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor v5.16b, v31.16b, v10.16b + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + bic v31.16b, v9.16b, v8.16b + eor x3, x28, x21, ror #20 + eor v7.16b, v31.16b, v7.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bic v31.16b, v11.16b, v10.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor v9.16b, v31.16b, v9.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bic v31.16b, v13.16b, v12.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor v11.16b, v31.16b, v16.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic v31.16b, v15.16b, v14.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor v13.16b, v31.16b, v13.16b + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic v31.16b, v16.16b, v15.16b + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor v14.16b, v31.16b, v14.16b + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + bcax v15.16b, v20.16b, v17.16b, v21.16b + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + bic v31.16b, v18.16b, v17.16b + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor v16.16b, v31.16b, v21.16b + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bcax v17.16b, v17.16b, v19.16b, v18.16b + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + bic v31.16b, v20.16b, v19.16b + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + eor v18.16b, v31.16b, v18.16b + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + bic v31.16b, v22.16b, v1.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + eor v20.16b, v31.16b, v0.16b + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bcax v21.16b, v1.16b, v23.16b, v22.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + bic v31.16b, v24.16b, v23.16b + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + eor v22.16b, v31.16b, v22.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + bcax v23.16b, v23.16b, v0.16b, v24.16b + add x25, x25, #0x1 + str x25, [sp, #0x18] + bic v31.16b, v1.16b, v0.16b + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + eor v24.16b, v31.16b, v24.16b + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, 
ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop_end: - b.le Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop - ror x2, x2, #0x3d - ror x3, x3, #0x27 - ror x4, x4, #0x36 - ror x5, x5, #0x19 - ror x6, x6, #0x2b - ror x7, x7, #0x13 - ror x8, x8, #0x38 - ror x9, x9, #0x31 - ror x10, x10, #0x17 - ror x11, x11, #0x32 - ror x12, x12, #0x3 - ror x13, x13, #0x2e - ror x14, x14, #0x8 - ror x15, x15, #0x3e - ror x17, x17, #0x24 - ror x28, x28, #0x3f - ror x19, x19, #0x25 - ror x20, x20, #0x2 - ror x21, x21, #0x14 - ror x22, x22, #0x2c - ror x23, x23, #0x3a - ror x24, x24, #0x1c - ror x25, x25, #0x9 - ldr x30, [sp, #0x20] - cmp x30, #0x1 - b.eq Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_done - mov x30, #0x1 // =1 - str x30, [sp, #0x20] - ldr x0, [sp] - add x0, x0, #0x190 - stp x1, x6, [x0] - stp x11, x16, [x0, #0x10] - stp x21, x2, [x0, #0x20] - stp x7, x12, [x0, #0x30] - stp x17, x22, [x0, #0x40] - stp x3, x8, [x0, #0x50] - stp x13, x28, [x0, #0x60] - stp x23, x4, [x0, #0x70] - stp x9, x14, [x0, #0x80] - stp x19, x24, [x0, #0x90] - stp x5, x10, [x0, #0xa0] - stp x15, x20, [x0, #0xb0] - str x25, [x0, #0xc0] - sub x0, x0, #0x190 - add x0, x0, #0x258 - ldp x1, x6, [x0] - ldp x11, x16, [x0, #0x10] - ldp x21, x2, [x0, #0x20] - ldp x7, x12, [x0, #0x30] - ldp x17, x22, [x0, #0x40] - ldp x3, x8, [x0, #0x50] - ldp x13, x28, [x0, #0x60] - ldp x23, x4, [x0, #0x70] - ldp x9, x14, [x0, #0x80] - ldp x19, x24, [x0, #0x90] - ldp x5, x10, [x0, #0xa0] - ldp x15, x20, [x0, #0xb0] - ldr x25, [x0, #0xc0] - sub x0, x0, #0x258 - b Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_initial + b.le Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop + ror x2, x2, #0x3d + ror x3, x3, #0x27 + ror x4, x4, #0x36 + ror x5, x5, #0x19 + ror x6, x6, #0x2b + ror x7, x7, #0x13 + ror x8, x8, #0x38 + ror x9, x9, #0x31 + ror x10, x10, #0x17 + ror x11, x11, #0x32 + ror x12, x12, #0x3 + ror x13, x13, #0x2e + ror x14, x14, #0x8 + ror x15, x15, #0x3e + ror x17, x17, #0x24 + ror x28, x28, #0x3f + ror x19, x19, #0x25 + ror x20, x20, #0x2 + ror x21, x21, #0x14 + ror x22, x22, #0x2c + ror x23, x23, #0x3a + ror x24, x24, #0x1c + ror x25, x25, #0x9 + ldr x30, [sp, #0x20] + cmp x30, #0x1 + b.eq Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_done + mov x30, #0x1 // =1 + str x30, [sp, #0x20] + ldr x0, [sp] + add x0, x0, #0x190 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x190 + add x0, x0, #0x258 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x258 + b Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_initial Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_done: - ldr x0, [sp] - add x0, x0, #0x258 - stp x1, x6, [x0] - stp x11, x16, [x0, #0x10] - stp x21, x2, [x0, #0x20] - stp x7, x12, [x0, #0x30] - stp x17, x22, [x0, #0x40] - stp x3, x8, [x0, #0x50] - stp x13, x28, [x0, #0x60] - stp x23, x4, [x0, 
#0x70] - stp x9, x14, [x0, #0x80] - stp x19, x24, [x0, #0x90] - stp x5, x10, [x0, #0xa0] - stp x15, x20, [x0, #0xb0] - str x25, [x0, #0xc0] - sub x0, x0, #0x258 - add x4, x0, #0xc8 - trn1 v25.2d, v0.2d, v1.2d - trn1 v26.2d, v2.2d, v3.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v0.2d, v1.2d - trn2 v28.2d, v2.2d, v3.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v4.2d, v5.2d - trn1 v26.2d, v6.2d, v7.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v4.2d, v5.2d - trn2 v28.2d, v6.2d, v7.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v8.2d, v9.2d - trn1 v26.2d, v10.2d, v11.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v8.2d, v9.2d - trn2 v28.2d, v10.2d, v11.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v12.2d, v13.2d - trn1 v26.2d, v14.2d, v15.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v12.2d, v13.2d - trn2 v28.2d, v14.2d, v15.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v16.2d, v17.2d - trn1 v26.2d, v18.2d, v19.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v16.2d, v17.2d - trn2 v28.2d, v18.2d, v19.2d - st1 { v27.2d, v28.2d }, [x4], #32 - trn1 v25.2d, v20.2d, v21.2d - trn1 v26.2d, v22.2d, v23.2d - stp q25, q26, [x0], #0x20 - trn2 v27.2d, v20.2d, v21.2d - trn2 v28.2d, v22.2d, v23.2d - st1 { v27.2d, v28.2d }, [x4], #32 - str d24, [x0] - trn2 v25.2d, v24.2d, v24.2d - str d25, [x4] - ldp d8, d9, [sp, #0x90] + ldr x0, [sp] + add x0, x0, #0x258 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x258 + add x4, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + st1 { v27.2d, v28.2d }, [x4], #32 + str d24, [x0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x4] + ldp d8, d9, [sp, #0x90] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0xa0] + ldp d10, d11, [sp, #0xa0] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0xb0] + ldp d12, d13, [sp, #0xb0] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0xc0] + ldp d14, d15, [sp, #0xc0] .cfi_restore d14 .cfi_restore d15 - ldp x19, x20, [sp, #0x30] + ldp x19, x20, [sp, #0x30] .cfi_restore x19 .cfi_restore x20 - ldp x21, x22, [sp, #0x40] + ldp x21, x22, [sp, #0x40] .cfi_restore x21 .cfi_restore x22 - ldp x23, x24, [sp, #0x50] + ldp x23, x24, [sp, #0x50] .cfi_restore x23 .cfi_restore x24 - ldp x25, x26, [sp, #0x60] + ldp 
x25, x26, [sp, #0x60] .cfi_restore x25 .cfi_restore x26 - ldp x27, x28, [sp, #0x70] + ldp x27, x28, [sp, #0x70] .cfi_restore x27 .cfi_restore x28 - ldp x29, x30, [sp, #0x80] + ldp x29, x30, [sp, #0x80] .cfi_restore x29 .cfi_restore x30 - add sp, sp, #0xe0 + add sp, sp, #0xe0 .cfi_adjust_cfa_offset -0xe0 ret .cfi_endproc diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S index 7db0aac2b..ca2e98188 100644 --- a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S @@ -50,587 +50,587 @@ .global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) - push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} - vpush {d8, d9, d10, d11, d12, d13, d14, d15} - sub sp, #0x80 - mov r6, r2 - mov.w lr, #0x18 - mov r2, r0 - mov r4, r1 - add.w r3, r2, #0x190 - vldrw.u32 q0, [r3] - vldrw.u32 q1, [r2] - vldrw.u32 q2, [r2, #32] - wls lr, lr, keccak_f1600_x4_mve_asm_roundend @ imm = #0x8c0 + push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + sub sp, #0x80 + mov r6, r2 + mov.w lr, #0x18 + mov r2, r0 + mov r4, r1 + add.w r3, r2, #0x190 + vldrw.u32 q0, [r3] + vldrw.u32 q1, [r2] + vldrw.u32 q2, [r2, #32] + wls lr, lr, keccak_f1600_x4_mve_asm_roundend @ imm = #0x8c0 keccak_f1600_x4_mve_asm_roundstart: - vldrw.u32 q6, [r2, #112] - veor q7, q6, q2 - vldrw.u32 q2, [r2, #80] - veor q1, q2, q1 - add.w r5, r2, #0x190 - vldrw.u32 q5, [r5, #80] - veor q4, q5, q0 - vldrw.u32 q0, [r2, #192] - veor q3, q7, q0 - vldrw.u32 q0, [r2, #160] - veor q1, q1, q0 - vldrw.u32 q0, [r5, #160] - veor q0, q4, q0 - vldrw.u32 q6, [r2, #272] - veor q2, q3, q6 - vldrw.u32 q7, [r2, #240] - veor q5, q1, q7 - vldrw.u32 q4, [r5, #240] - veor q4, q0, q4 - vldrw.u32 q6, [r2, #352] - veor q3, q2, q6 - vldrw.u32 q0, [r2, #320] - veor q2, q5, q0 - vldrw.u32 q1, [r5, #320] - veor q5, q4, q1 - vldrw.u32 q4, [r5, #32] - veor q0, q3, q5 - vldrw.u32 q1, [r5, #16] - veor q6, q1, q0 - vstrw.32 q5, [sp] - vshr.u32 q7, q6, #0x1f - add.w r10, r4, #0x190 - vsli.32 q7, q6, #0x1 - vldrw.u32 q6, [r5, #112] - veor q6, q4, q6 - vldrw.u32 q4, [r5, #192] - veor q4, q6, q4 - vldrw.u32 q6, [r5, #272] - veor q4, q4, q6 - vldrw.u32 q6, [r5, #352] - veor q5, q4, q6 - vstrw.32 q7, [r4, #160] - vshr.u32 q4, q5, #0x1f - vsli.32 q4, q5, #0x1 - vldrw.u32 q6, [r2, #16] - veor q7, q4, q2 - veor q1, q6, q7 - vldrw.u32 q6, [r5, #96] - veor q6, q6, q0 - vstrw.32 q1, [r10, #160] - vshr.u32 q1, q6, #0xa - vsli.32 q1, q6, #0x16 - vldrw.u32 q6, [r2, #96] - veor q4, q6, q7 - vstrw.32 q1, [r10, #16] - vshr.u32 q6, q4, #0xa - vsli.32 q6, q4, #0x16 - vldrw.u32 q1, [r5, #336] - veor q4, q1, q0 - vldrw.u32 q1, [r2, #176] - veor q1, q1, q7 - vstrw.32 q6, [r4, #16] - vshr.u32 q6, q1, #0x1b - vsli.32 q6, q1, #0x5 - vldrw.u32 q1, [r2, #256] - veor q1, q1, q7 - vstrw.32 q6, [r4, #272] - vshr.u32 q6, q1, #0xa - vsli.32 q6, q1, #0x16 - vldrw.u32 q1, [r2, #336] - veor q1, q1, q7 - vstrw.32 q6, [r10, #128] - vshr.u32 q6, q1, #0x1f - vsli.32 q6, q1, #0x1 - vldrw.u32 q7, [r5, #176] - veor q7, q7, q0 - vstrw.32 q6, [r4, #384] - vshr.u32 q1, q7, #0x1b - vsli.32 q1, q7, #0x5 - vldrw.u32 q6, [r5, #256] - veor q0, q6, q0 - vstrw.32 q1, [r10, #272] - vshr.u32 q1, q4, #0x1f - vldrw.u32 q7, [r5, #64] - vsli.32 q1, q4, #0x1 - vldrw.u32 q4, [r5, #144] - vshr.u32 q6, q0, #0x9 - vstrw.32 q1, [r10, #384] - vsli.32 q6, q0, #0x17 - veor q7, q7, q4 - vldrw.u32 q1, [r5, #224] - veor q4, q7, 
q1 - vldrw.u32 q7, [r5, #304] - veor q1, q4, q7 - vldrw.u32 q0, [r5, #384] - veor q7, q1, q0 - vstrw.32 q6, [r4, #128] - vshr.u32 q1, q7, #0x1f - vsli.32 q1, q7, #0x1 - vldrw.u32 q6, [r2, #144] - veor q0, q1, q3 - vldrw.u32 q3, [r2, #64] - veor q1, q3, q6 - vldrw.u32 q6, [r2, #224] - veor q1, q1, q6 - vldrw.u32 q3, [r2, #304] - veor q6, q1, q3 - vldrw.u32 q4, [r2, #384] - veor q3, q6, q4 - vldrw.u32 q4, [r2, #48] - veor q5, q3, q5 - vldrw.u32 q1, [r5, #48] - veor q1, q1, q5 - vshr.u32 q6, q1, #0x12 - vsli.32 q6, q1, #0xe - vldrw.u32 q1, [r2, #128] - veor q1, q1, q0 - vstrw.32 q6, [r10, #80] - vshr.u32 q6, q1, #0x5 - vsli.32 q6, q1, #0x1b - vldrw.u32 q1, [r5, #128] - veor q1, q1, q5 - vstrw.32 q6, [r10, #336] - vshr.u32 q6, q1, #0x4 - vsli.32 q6, q1, #0x1c - veor q1, q4, q0 - vstrw.32 q6, [r4, #336] - vshr.u32 q4, q1, #0x12 - vsli.32 q4, q1, #0xe - vldrw.u32 q6, [r2, #208] - veor q6, q6, q0 - vstrw.32 q4, [r4, #80] - vshr.u32 q1, q6, #0x14 - vsli.32 q1, q6, #0xc - vldrw.u32 q4, [r2, #288] - veor q4, q4, q0 - vldrw.u32 q6, [r2, #368] - veor q0, q6, q0 - vshr.u32 q6, q0, #0x4 - vstrw.32 q1, [r10, #192] - vsli.32 q6, q0, #0x1c - vshr.u32 q0, q4, #0x16 - vldrw.u32 q1, [r5, #368] - vsli.32 q0, q4, #0xa - vstrw.32 q6, [r4, #304] - veor q4, q1, q5 - vstrw.32 q0, [r10, #48] - vshr.u32 q1, q4, #0x4 - vsli.32 q1, q4, #0x1c - vldrw.u32 q6, [r5, #208] - veor q6, q6, q5 - vldrw.u32 q0, [r5, #288] - veor q5, q0, q5 - vstrw.32 q1, [r10, #304] - vshr.u32 q0, q6, #0x13 - vsli.32 q0, q6, #0xd - vldrw.u32 q1, [r5, #96] - vshr.u32 q6, q5, #0x15 - vldrw.u32 q4, [r5, #16] - vsli.32 q6, q5, #0xb - vldrw.u32 q5, [r5, #176] - veor q1, q4, q1 - vldrw.u32 q4, [r5, #256] - veor q5, q1, q5 - vldrw.u32 q1, [r5, #336] - veor q5, q5, q4 - vstrw.32 q0, [r4, #192] - veor q0, q5, q1 - vstrw.32 q6, [r4, #48] - vshr.u32 q5, q0, #0x1f - vsli.32 q5, q0, #0x1 - vldrw.u32 q4, [r2, #16] - veor q3, q5, q3 - vldrw.u32 q6, [r2, #96] - veor q4, q4, q6 - vldrw.u32 q1, [r2, #176] - veor q5, q4, q1 - vldrw.u32 q6, [r2, #256] - veor q6, q5, q6 - vldrw.u32 q4, [r2, #336] - veor q5, q6, q4 - vldrw.u32 q1, [r5] - veor q7, q5, q7 - vldrw.u32 q4, [r2] - veor q1, q1, q7 - veor q4, q4, q3 - vshr.u32 q6, q1, #0x20 - vsli.32 q6, q1, #0x0 - vldrw.u32 q1, [r2, #80] - veor q1, q1, q3 - vstrw.32 q6, [r10] - vshr.u32 q6, q4, #0x20 - vsli.32 q6, q4, #0x0 - vldrw.u32 q4, [r5, #80] - veor q4, q4, q7 - vstrw.32 q6, [r4] - vshr.u32 q6, q1, #0xe - vsli.32 q6, q1, #0x12 - vldrw.u32 q1, [r2, #160] - veor q1, q1, q3 - vstrw.32 q6, [r4, #256] - vshr.u32 q6, q4, #0xe - vsli.32 q6, q4, #0x12 - vldrw.u32 q4, [r2, #240] - veor q4, q4, q3 - vstrw.32 q6, [r10, #256] - vshr.u32 q6, q1, #0x1f - vsli.32 q6, q1, #0x1 - vldrw.u32 q1, [r2, #320] - veor q1, q1, q3 - vstrw.32 q6, [r10, #112] - vshr.u32 q6, q4, #0xc - vsli.32 q6, q4, #0x14 - vldrw.u32 q3, [r5, #240] - veor q3, q3, q7 - vstrw.32 q6, [r10, #368] - vshr.u32 q4, q3, #0xb - vsli.32 q4, q3, #0x15 - vldrw.u32 q3, [r5, #160] - veor q6, q3, q7 - vstrw.32 q4, [r4, #368] - vshr.u32 q3, q6, #0x1e - vsli.32 q3, q6, #0x2 - vldrw.u32 q6, [r5, #320] - veor q7, q6, q7 - vldrw.u32 q4, [r2, #368] - vshr.u32 q6, q1, #0x17 - vstrw.32 q3, [r4, #112] - vsli.32 q6, q1, #0x9 - vshr.u32 q1, q7, #0x17 - vldrw.u32 q3, [r2, #48] - vsli.32 q1, q7, #0x9 - vldrw.u32 q7, [r2, #128] - veor q3, q3, q7 - vldrw.u32 q7, [r2, #208] - veor q7, q3, q7 - vldrw.u32 q3, [r2, #288] - veor q3, q7, q3 - vldrw.u32 q7, [r5, #128] - veor q3, q3, q4 - vldrw.u32 q4, [r5, #48] - veor q0, q3, q0 - veor q4, q4, q7 - vldrw.u32 q7, [r5, #208] - veor q4, q4, q7 - 
vldrw.u32 q7, [r5, #288] - veor q4, q4, q7 - vldrw.u32 q7, [r5, #368] - veor q7, q4, q7 - vstrw.32 q6, [r4, #224] - vshr.u32 q4, q7, #0x1f - vstrw.32 q1, [r10, #224] - vsli.32 q4, q7, #0x1 - veor q5, q4, q5 - vldrw.u32 q6, [r2, #192] - veor q1, q6, q5 - vldrw.u32 q4, [r5, #112] - veor q7, q2, q7 - vldrw.u32 q6, [r5, #32] - vshr.u32 q2, q1, #0xb - vsli.32 q2, q1, #0x15 - veor q1, q6, q0 - vstrw.32 q2, [r10, #32] - vshr.u32 q6, q1, #0x1 - vsli.32 q6, q1, #0x1f - vldrw.u32 q2, [r2, #112] - veor q2, q2, q5 - vstrw.32 q6, [r10, #320] - vshr.u32 q1, q2, #0x1d - vsli.32 q1, q2, #0x3 - vldrw.u32 q6, [r2, #32] - veor q4, q4, q0 - vstrw.32 q1, [r4, #176] - veor q2, q6, q5 - vshr.u32 q6, q2, #0x1 - vldrw.u32 q1, [r5, #352] - vsli.32 q6, q2, #0x1f - veor q1, q1, q0 - vstrw.32 q6, [r4, #320] - vshr.u32 q6, q1, #0x1 - vsli.32 q6, q1, #0x1f - vldrw.u32 q2, [r5, #192] - vshr.u32 q1, q4, #0x1d - vstrw.32 q6, [r4, #144] - vsli.32 q1, q4, #0x3 - veor q2, q2, q0 - vldrw.u32 q6, [r5, #272] - veor q0, q6, q0 - vldrw.u32 q4, [r2, #352] - veor q6, q4, q5 - vldrw.u32 q4, [r2, #272] - veor q4, q4, q5 - vstrw.32 q1, [r10, #176] - vshr.u32 q1, q2, #0xa - vsli.32 q1, q2, #0x16 - vldrw.u32 q5, [sp] - vshr.u32 q2, q0, #0x18 - vstrw.32 q1, [r4, #32] - vsli.32 q2, q0, #0x8 - vshr.u32 q1, q6, #0x2 - vstrw.32 q2, [r4, #288] - vsli.32 q1, q6, #0x1e - vshr.u32 q6, q4, #0x19 - vstrw.32 q1, [r10, #144] - vsli.32 q6, q4, #0x7 - vshr.u32 q0, q5, #0x1f - vstrw.32 q6, [r10, #288] - vsli.32 q0, q5, #0x1 - veor q5, q0, q3 - vldrw.u32 q6, [r2, #64] - veor q3, q6, q5 - vldrw.u32 q1, [r5, #64] - vshr.u32 q4, q3, #0x13 - vldrw.u32 q2, [r2, #384] - vsli.32 q4, q3, #0xd - vldrw.u32 q0, [r5, #224] - veor q6, q1, q7 - vstrw.32 q4, [r10, #240] - veor q2, q2, q5 - veor q3, q0, q7 - vldrw.u32 q0, [r2, #224] - vshr.u32 q4, q6, #0x12 - vldrw.u32 q1, [r5, #384] - vsli.32 q4, q6, #0xe - vshr.u32 q6, q2, #0x19 - vstrw.32 q4, [r4, #240] - vsli.32 q6, q2, #0x7 - vshr.u32 q2, q3, #0xc - vstrw.32 q6, [r4, #64] - vsli.32 q2, q3, #0x14 - veor q0, q0, q5 - vldrw.u32 q6, [r2, #144] - veor q4, q1, q7 - veor q6, q6, q5 - vstrw.32 q2, [r4, #352] - vshr.u32 q2, q4, #0x19 - vsli.32 q2, q4, #0x7 - vldrw.u32 q1, [r2, #304] - veor q5, q1, q5 - vldrw.u32 q1, [r5, #144] - veor q4, q1, q7 - vldrw.u32 q3, [r5, #304] - veor q1, q3, q7 - vstrw.32 q2, [r10, #64] - vshr.u32 q3, q0, #0xd - vsli.32 q3, q0, #0x13 - vldrw.u32 q7, [r4, #80] - vshr.u32 q0, q6, #0x16 - vstrw.32 q3, [r10, #352] - vsli.32 q0, q6, #0xa - vshr.u32 q2, q5, #0x1c - vsli.32 q2, q5, #0x4 - vldrw.u32 q5, [r4, #112] - vshr.u32 q3, q1, #0x1c - vsli.32 q3, q1, #0x4 - vldrw.u32 q1, [r4, #128] - vbic q6, q5, q0 - vstrw.32 q3, [r10, #208] - vbic q3, q1, q5 - veor q3, q0, q3 - vstrw.32 q3, [r2, #96] - vbic q3, q0, q7 - veor q0, q7, q6 - vldrw.u32 q6, [r4, #144] - vbic q7, q7, q6 - vstrw.32 q0, [r2, #80] - veor q3, q6, q3 - vstrw.32 q3, [r2, #144] - veor q0, q1, q7 - vstrw.32 q0, [r2, #128] - vbic q1, q6, q1 - vshr.u32 q6, q4, #0x16 - vldrw.u32 q3, [r10, #112] - vsli.32 q6, q4, #0xa - vldrw.u32 q4, [r10, #80] - veor q1, q5, q1 - vldrw.u32 q0, [r10, #144] - vbic q7, q4, q0 - vldrw.u32 q5, [r10, #128] - veor q7, q5, q7 - vstrw.32 q1, [r2, #112] - vbic q1, q0, q5 - vstrw.32 q7, [r5, #128] - veor q7, q3, q1 - vstrw.32 q7, [r5, #112] - vbic q7, q5, q3 - vbic q1, q3, q6 - vldrw.u32 q3, [r4, #176] - veor q5, q4, q1 - vbic q4, q6, q4 - vldrw.u32 q1, [r4, #160] - veor q0, q0, q4 - vldrw.u32 q4, [r4, #224] - veor q7, q6, q7 - vstrw.32 q0, [r5, #144] - vbic q0, q1, q4 - vstrw.32 q7, [r5, #96] - veor q0, q2, q0 - vstrw.32 
q0, [r2, #208] - vbic q6, q3, q1 - vstrw.32 q5, [r5, #80] - vbic q7, q4, q2 - vldrw.u32 q0, [r10, #160] - veor q6, q4, q6 - vldrw.u32 q5, [r4, #192] - vbic q4, q2, q5 - vldrw.u32 q2, [r10, #224] - veor q4, q3, q4 - vstrw.32 q4, [r2, #176] - vbic q4, q5, q3 - vstrw.32 q6, [r2, #224] - veor q4, q1, q4 - vldrw.u32 q1, [r10, #208] - veor q3, q5, q7 - vldrw.u32 q5, [r10, #192] - vbic q6, q1, q5 - vldrw.u32 q7, [r10, #176] - veor q6, q7, q6 - vstrw.32 q3, [r2, #192] - vbic q3, q0, q2 - vstrw.32 q6, [r5, #176] - veor q3, q1, q3 - vstrw.32 q3, [r5, #208] - vbic q3, q5, q7 - vstrw.32 q4, [r2, #160] - veor q3, q0, q3 - vstrw.32 q3, [r5, #160] - vbic q6, q2, q1 - vldrw.u32 q1, [r4, #288] - vbic q7, q7, q0 - vldrw.u32 q3, [r4, #272] - veor q0, q5, q6 - vldrw.u32 q4, [r4, #304] - veor q6, q2, q7 - vldrw.u32 q7, [r4, #256] - vbic q5, q4, q1 - vstrw.32 q0, [r5, #192] - veor q5, q3, q5 - vstrw.32 q6, [r5, #224] - vbic q0, q3, q7 - vstrw.32 q5, [r2, #272] - vbic q6, q1, q3 - veor q5, q7, q6 - vldrw.u32 q3, [r4, #240] - veor q6, q3, q0 - vldrw.u32 q2, [r10, #288] - vbic q0, q3, q4 - vstrw.32 q6, [r2, #240] - vbic q7, q7, q3 - vstrw.32 q5, [r2, #256] - veor q7, q4, q7 - vstrw.32 q7, [r2, #304] - veor q7, q1, q0 - vstrw.32 q7, [r2, #288] - vldrw.u32 q5, [r10, #304] - vbic q7, q5, q2 - vldrw.u32 q3, [r10, #272] - veor q1, q3, q7 - vldrw.u32 q7, [r4, #336] - vbic q4, q2, q3 - vldrw.u32 q6, [r10, #256] - vbic q3, q3, q6 - vldrw.u32 q0, [r10, #240] - veor q3, q0, q3 - vstrw.32 q1, [r5, #272] - vbic q1, q0, q5 - vstrw.32 q3, [r5, #240] - veor q1, q2, q1 - vldrw.u32 q3, [r4, #384] - vbic q2, q6, q0 - vldrw.u32 q0, [r4, #320] - veor q2, q5, q2 - vldrw.u32 q5, [r4, #352] - veor q4, q6, q4 - vstrw.32 q2, [r5, #304] - vbic q2, q7, q0 - vstrw.32 q1, [r5, #288] - veor q1, q3, q2 - vstrw.32 q1, [r2, #384] - vbic q2, q5, q7 - vstrw.32 q4, [r5, #256] - veor q4, q0, q2 - vstrw.32 q4, [r2, #320] - vbic q2, q0, q3 - vldrw.u32 q4, [r4, #368] - vbic q3, q3, q4 - vldrw.u32 q0, [r10, #320] - veor q1, q5, q3 - vldrw.u32 q6, [r10, #336] - vbic q5, q4, q5 - vstrw.32 q1, [r2, #352] - veor q5, q7, q5 - vstrw.32 q5, [r2, #336] - veor q3, q4, q2 - vstrw.32 q3, [r2, #368] - vbic q7, q6, q0 - vldrw.u32 q5, [r10, #352] - vbic q3, q5, q6 - vldrw.u32 q1, [r10, #368] - vbic q4, q1, q5 - vldrw.u32 q2, [r4, #16] - veor q6, q6, q4 - vldrw.u32 q4, [r10, #384] - veor q3, q0, q3 - vstrw.32 q3, [r5, #320] - veor q3, q4, q7 - vstrw.32 q3, [r5, #384] - vbic q0, q0, q4 - vstrw.32 q6, [r5, #336] - veor q3, q1, q0 - vstrw.32 q3, [r5, #368] - vbic q7, q4, q1 - veor q5, q5, q7 - vldrw.u32 q6, [r4, #32] - vbic q3, q6, q2 - vldrw.u32 q4, [r4, #48] - vbic q0, q4, q6 - vldrw.u32 q1, [r4] - veor q0, q2, q0 - vldrw.u32 q7, [r4, #64] - veor q3, q1, q3 - vstrw.32 q5, [r5, #352] - vbic q5, q1, q7 - vstrw.32 q0, [r2, #16] - veor q0, q4, q5 - vstrw.32 q0, [r2, #48] - vbic q5, q2, q1 - veor q2, q7, q5 - vldrw.u32 q0, [r10, #16] - vbic q5, q7, q4 - vldrw.u32 q4, [r10] - vbic q1, q0, q4 - vldrw.u32 q7, [r10, #64] - veor q1, q7, q1 - vstrw.32 q2, [r2, #64] - veor q2, q6, q5 - vbic q6, q4, q7 - vldrw.u32 q5, [r10, #48] - veor q6, q5, q6 - ldrd r7, r8, [r6] - vbic q7, q7, q5 - vstrw.32 q1, [r5, #64] - vdup.32 q1, r7 - veor q1, q3, q1 - vldrw.u32 q3, [r10, #32] - veor q7, q3, q7 - add.w r6, r6, #0x8 - vbic q5, q5, q3 - vstrw.32 q6, [r5, #48] - vbic q6, q3, q0 - vstrw.32 q1, [r2] - veor q5, q0, q5 - vstrw.32 q7, [r5, #32] - veor q4, q4, q6 - vstrw.32 q5, [r5, #16] - vdup.32 q6, r8 - vstrw.32 q2, [r2, #32] - veor q0, q4, q6 - vstrw.32 q0, [r5] + vldrw.u32 q6, [r2, #112] + veor 
q7, q6, q2 + vldrw.u32 q2, [r2, #80] + veor q1, q2, q1 + add.w r5, r2, #0x190 + vldrw.u32 q5, [r5, #80] + veor q4, q5, q0 + vldrw.u32 q0, [r2, #192] + veor q3, q7, q0 + vldrw.u32 q0, [r2, #160] + veor q1, q1, q0 + vldrw.u32 q0, [r5, #160] + veor q0, q4, q0 + vldrw.u32 q6, [r2, #272] + veor q2, q3, q6 + vldrw.u32 q7, [r2, #240] + veor q5, q1, q7 + vldrw.u32 q4, [r5, #240] + veor q4, q0, q4 + vldrw.u32 q6, [r2, #352] + veor q3, q2, q6 + vldrw.u32 q0, [r2, #320] + veor q2, q5, q0 + vldrw.u32 q1, [r5, #320] + veor q5, q4, q1 + vldrw.u32 q4, [r5, #32] + veor q0, q3, q5 + vldrw.u32 q1, [r5, #16] + veor q6, q1, q0 + vstrw.32 q5, [sp] + vshr.u32 q7, q6, #0x1f + add.w r10, r4, #0x190 + vsli.32 q7, q6, #0x1 + vldrw.u32 q6, [r5, #112] + veor q6, q4, q6 + vldrw.u32 q4, [r5, #192] + veor q4, q6, q4 + vldrw.u32 q6, [r5, #272] + veor q4, q4, q6 + vldrw.u32 q6, [r5, #352] + veor q5, q4, q6 + vstrw.32 q7, [r4, #160] + vshr.u32 q4, q5, #0x1f + vsli.32 q4, q5, #0x1 + vldrw.u32 q6, [r2, #16] + veor q7, q4, q2 + veor q1, q6, q7 + vldrw.u32 q6, [r5, #96] + veor q6, q6, q0 + vstrw.32 q1, [r10, #160] + vshr.u32 q1, q6, #0xa + vsli.32 q1, q6, #0x16 + vldrw.u32 q6, [r2, #96] + veor q4, q6, q7 + vstrw.32 q1, [r10, #16] + vshr.u32 q6, q4, #0xa + vsli.32 q6, q4, #0x16 + vldrw.u32 q1, [r5, #336] + veor q4, q1, q0 + vldrw.u32 q1, [r2, #176] + veor q1, q1, q7 + vstrw.32 q6, [r4, #16] + vshr.u32 q6, q1, #0x1b + vsli.32 q6, q1, #0x5 + vldrw.u32 q1, [r2, #256] + veor q1, q1, q7 + vstrw.32 q6, [r4, #272] + vshr.u32 q6, q1, #0xa + vsli.32 q6, q1, #0x16 + vldrw.u32 q1, [r2, #336] + veor q1, q1, q7 + vstrw.32 q6, [r10, #128] + vshr.u32 q6, q1, #0x1f + vsli.32 q6, q1, #0x1 + vldrw.u32 q7, [r5, #176] + veor q7, q7, q0 + vstrw.32 q6, [r4, #384] + vshr.u32 q1, q7, #0x1b + vsli.32 q1, q7, #0x5 + vldrw.u32 q6, [r5, #256] + veor q0, q6, q0 + vstrw.32 q1, [r10, #272] + vshr.u32 q1, q4, #0x1f + vldrw.u32 q7, [r5, #64] + vsli.32 q1, q4, #0x1 + vldrw.u32 q4, [r5, #144] + vshr.u32 q6, q0, #0x9 + vstrw.32 q1, [r10, #384] + vsli.32 q6, q0, #0x17 + veor q7, q7, q4 + vldrw.u32 q1, [r5, #224] + veor q4, q7, q1 + vldrw.u32 q7, [r5, #304] + veor q1, q4, q7 + vldrw.u32 q0, [r5, #384] + veor q7, q1, q0 + vstrw.32 q6, [r4, #128] + vshr.u32 q1, q7, #0x1f + vsli.32 q1, q7, #0x1 + vldrw.u32 q6, [r2, #144] + veor q0, q1, q3 + vldrw.u32 q3, [r2, #64] + veor q1, q3, q6 + vldrw.u32 q6, [r2, #224] + veor q1, q1, q6 + vldrw.u32 q3, [r2, #304] + veor q6, q1, q3 + vldrw.u32 q4, [r2, #384] + veor q3, q6, q4 + vldrw.u32 q4, [r2, #48] + veor q5, q3, q5 + vldrw.u32 q1, [r5, #48] + veor q1, q1, q5 + vshr.u32 q6, q1, #0x12 + vsli.32 q6, q1, #0xe + vldrw.u32 q1, [r2, #128] + veor q1, q1, q0 + vstrw.32 q6, [r10, #80] + vshr.u32 q6, q1, #0x5 + vsli.32 q6, q1, #0x1b + vldrw.u32 q1, [r5, #128] + veor q1, q1, q5 + vstrw.32 q6, [r10, #336] + vshr.u32 q6, q1, #0x4 + vsli.32 q6, q1, #0x1c + veor q1, q4, q0 + vstrw.32 q6, [r4, #336] + vshr.u32 q4, q1, #0x12 + vsli.32 q4, q1, #0xe + vldrw.u32 q6, [r2, #208] + veor q6, q6, q0 + vstrw.32 q4, [r4, #80] + vshr.u32 q1, q6, #0x14 + vsli.32 q1, q6, #0xc + vldrw.u32 q4, [r2, #288] + veor q4, q4, q0 + vldrw.u32 q6, [r2, #368] + veor q0, q6, q0 + vshr.u32 q6, q0, #0x4 + vstrw.32 q1, [r10, #192] + vsli.32 q6, q0, #0x1c + vshr.u32 q0, q4, #0x16 + vldrw.u32 q1, [r5, #368] + vsli.32 q0, q4, #0xa + vstrw.32 q6, [r4, #304] + veor q4, q1, q5 + vstrw.32 q0, [r10, #48] + vshr.u32 q1, q4, #0x4 + vsli.32 q1, q4, #0x1c + vldrw.u32 q6, [r5, #208] + veor q6, q6, q5 + vldrw.u32 q0, [r5, #288] + veor q5, q0, q5 + vstrw.32 q1, [r10, #304] + vshr.u32 q0, 
q6, #0x13 + vsli.32 q0, q6, #0xd + vldrw.u32 q1, [r5, #96] + vshr.u32 q6, q5, #0x15 + vldrw.u32 q4, [r5, #16] + vsli.32 q6, q5, #0xb + vldrw.u32 q5, [r5, #176] + veor q1, q4, q1 + vldrw.u32 q4, [r5, #256] + veor q5, q1, q5 + vldrw.u32 q1, [r5, #336] + veor q5, q5, q4 + vstrw.32 q0, [r4, #192] + veor q0, q5, q1 + vstrw.32 q6, [r4, #48] + vshr.u32 q5, q0, #0x1f + vsli.32 q5, q0, #0x1 + vldrw.u32 q4, [r2, #16] + veor q3, q5, q3 + vldrw.u32 q6, [r2, #96] + veor q4, q4, q6 + vldrw.u32 q1, [r2, #176] + veor q5, q4, q1 + vldrw.u32 q6, [r2, #256] + veor q6, q5, q6 + vldrw.u32 q4, [r2, #336] + veor q5, q6, q4 + vldrw.u32 q1, [r5] + veor q7, q5, q7 + vldrw.u32 q4, [r2] + veor q1, q1, q7 + veor q4, q4, q3 + vshr.u32 q6, q1, #0x20 + vsli.32 q6, q1, #0x0 + vldrw.u32 q1, [r2, #80] + veor q1, q1, q3 + vstrw.32 q6, [r10] + vshr.u32 q6, q4, #0x20 + vsli.32 q6, q4, #0x0 + vldrw.u32 q4, [r5, #80] + veor q4, q4, q7 + vstrw.32 q6, [r4] + vshr.u32 q6, q1, #0xe + vsli.32 q6, q1, #0x12 + vldrw.u32 q1, [r2, #160] + veor q1, q1, q3 + vstrw.32 q6, [r4, #256] + vshr.u32 q6, q4, #0xe + vsli.32 q6, q4, #0x12 + vldrw.u32 q4, [r2, #240] + veor q4, q4, q3 + vstrw.32 q6, [r10, #256] + vshr.u32 q6, q1, #0x1f + vsli.32 q6, q1, #0x1 + vldrw.u32 q1, [r2, #320] + veor q1, q1, q3 + vstrw.32 q6, [r10, #112] + vshr.u32 q6, q4, #0xc + vsli.32 q6, q4, #0x14 + vldrw.u32 q3, [r5, #240] + veor q3, q3, q7 + vstrw.32 q6, [r10, #368] + vshr.u32 q4, q3, #0xb + vsli.32 q4, q3, #0x15 + vldrw.u32 q3, [r5, #160] + veor q6, q3, q7 + vstrw.32 q4, [r4, #368] + vshr.u32 q3, q6, #0x1e + vsli.32 q3, q6, #0x2 + vldrw.u32 q6, [r5, #320] + veor q7, q6, q7 + vldrw.u32 q4, [r2, #368] + vshr.u32 q6, q1, #0x17 + vstrw.32 q3, [r4, #112] + vsli.32 q6, q1, #0x9 + vshr.u32 q1, q7, #0x17 + vldrw.u32 q3, [r2, #48] + vsli.32 q1, q7, #0x9 + vldrw.u32 q7, [r2, #128] + veor q3, q3, q7 + vldrw.u32 q7, [r2, #208] + veor q7, q3, q7 + vldrw.u32 q3, [r2, #288] + veor q3, q7, q3 + vldrw.u32 q7, [r5, #128] + veor q3, q3, q4 + vldrw.u32 q4, [r5, #48] + veor q0, q3, q0 + veor q4, q4, q7 + vldrw.u32 q7, [r5, #208] + veor q4, q4, q7 + vldrw.u32 q7, [r5, #288] + veor q4, q4, q7 + vldrw.u32 q7, [r5, #368] + veor q7, q4, q7 + vstrw.32 q6, [r4, #224] + vshr.u32 q4, q7, #0x1f + vstrw.32 q1, [r10, #224] + vsli.32 q4, q7, #0x1 + veor q5, q4, q5 + vldrw.u32 q6, [r2, #192] + veor q1, q6, q5 + vldrw.u32 q4, [r5, #112] + veor q7, q2, q7 + vldrw.u32 q6, [r5, #32] + vshr.u32 q2, q1, #0xb + vsli.32 q2, q1, #0x15 + veor q1, q6, q0 + vstrw.32 q2, [r10, #32] + vshr.u32 q6, q1, #0x1 + vsli.32 q6, q1, #0x1f + vldrw.u32 q2, [r2, #112] + veor q2, q2, q5 + vstrw.32 q6, [r10, #320] + vshr.u32 q1, q2, #0x1d + vsli.32 q1, q2, #0x3 + vldrw.u32 q6, [r2, #32] + veor q4, q4, q0 + vstrw.32 q1, [r4, #176] + veor q2, q6, q5 + vshr.u32 q6, q2, #0x1 + vldrw.u32 q1, [r5, #352] + vsli.32 q6, q2, #0x1f + veor q1, q1, q0 + vstrw.32 q6, [r4, #320] + vshr.u32 q6, q1, #0x1 + vsli.32 q6, q1, #0x1f + vldrw.u32 q2, [r5, #192] + vshr.u32 q1, q4, #0x1d + vstrw.32 q6, [r4, #144] + vsli.32 q1, q4, #0x3 + veor q2, q2, q0 + vldrw.u32 q6, [r5, #272] + veor q0, q6, q0 + vldrw.u32 q4, [r2, #352] + veor q6, q4, q5 + vldrw.u32 q4, [r2, #272] + veor q4, q4, q5 + vstrw.32 q1, [r10, #176] + vshr.u32 q1, q2, #0xa + vsli.32 q1, q2, #0x16 + vldrw.u32 q5, [sp] + vshr.u32 q2, q0, #0x18 + vstrw.32 q1, [r4, #32] + vsli.32 q2, q0, #0x8 + vshr.u32 q1, q6, #0x2 + vstrw.32 q2, [r4, #288] + vsli.32 q1, q6, #0x1e + vshr.u32 q6, q4, #0x19 + vstrw.32 q1, [r10, #144] + vsli.32 q6, q4, #0x7 + vshr.u32 q0, q5, #0x1f + vstrw.32 q6, [r10, #288] + 
vsli.32 q0, q5, #0x1 + veor q5, q0, q3 + vldrw.u32 q6, [r2, #64] + veor q3, q6, q5 + vldrw.u32 q1, [r5, #64] + vshr.u32 q4, q3, #0x13 + vldrw.u32 q2, [r2, #384] + vsli.32 q4, q3, #0xd + vldrw.u32 q0, [r5, #224] + veor q6, q1, q7 + vstrw.32 q4, [r10, #240] + veor q2, q2, q5 + veor q3, q0, q7 + vldrw.u32 q0, [r2, #224] + vshr.u32 q4, q6, #0x12 + vldrw.u32 q1, [r5, #384] + vsli.32 q4, q6, #0xe + vshr.u32 q6, q2, #0x19 + vstrw.32 q4, [r4, #240] + vsli.32 q6, q2, #0x7 + vshr.u32 q2, q3, #0xc + vstrw.32 q6, [r4, #64] + vsli.32 q2, q3, #0x14 + veor q0, q0, q5 + vldrw.u32 q6, [r2, #144] + veor q4, q1, q7 + veor q6, q6, q5 + vstrw.32 q2, [r4, #352] + vshr.u32 q2, q4, #0x19 + vsli.32 q2, q4, #0x7 + vldrw.u32 q1, [r2, #304] + veor q5, q1, q5 + vldrw.u32 q1, [r5, #144] + veor q4, q1, q7 + vldrw.u32 q3, [r5, #304] + veor q1, q3, q7 + vstrw.32 q2, [r10, #64] + vshr.u32 q3, q0, #0xd + vsli.32 q3, q0, #0x13 + vldrw.u32 q7, [r4, #80] + vshr.u32 q0, q6, #0x16 + vstrw.32 q3, [r10, #352] + vsli.32 q0, q6, #0xa + vshr.u32 q2, q5, #0x1c + vsli.32 q2, q5, #0x4 + vldrw.u32 q5, [r4, #112] + vshr.u32 q3, q1, #0x1c + vsli.32 q3, q1, #0x4 + vldrw.u32 q1, [r4, #128] + vbic q6, q5, q0 + vstrw.32 q3, [r10, #208] + vbic q3, q1, q5 + veor q3, q0, q3 + vstrw.32 q3, [r2, #96] + vbic q3, q0, q7 + veor q0, q7, q6 + vldrw.u32 q6, [r4, #144] + vbic q7, q7, q6 + vstrw.32 q0, [r2, #80] + veor q3, q6, q3 + vstrw.32 q3, [r2, #144] + veor q0, q1, q7 + vstrw.32 q0, [r2, #128] + vbic q1, q6, q1 + vshr.u32 q6, q4, #0x16 + vldrw.u32 q3, [r10, #112] + vsli.32 q6, q4, #0xa + vldrw.u32 q4, [r10, #80] + veor q1, q5, q1 + vldrw.u32 q0, [r10, #144] + vbic q7, q4, q0 + vldrw.u32 q5, [r10, #128] + veor q7, q5, q7 + vstrw.32 q1, [r2, #112] + vbic q1, q0, q5 + vstrw.32 q7, [r5, #128] + veor q7, q3, q1 + vstrw.32 q7, [r5, #112] + vbic q7, q5, q3 + vbic q1, q3, q6 + vldrw.u32 q3, [r4, #176] + veor q5, q4, q1 + vbic q4, q6, q4 + vldrw.u32 q1, [r4, #160] + veor q0, q0, q4 + vldrw.u32 q4, [r4, #224] + veor q7, q6, q7 + vstrw.32 q0, [r5, #144] + vbic q0, q1, q4 + vstrw.32 q7, [r5, #96] + veor q0, q2, q0 + vstrw.32 q0, [r2, #208] + vbic q6, q3, q1 + vstrw.32 q5, [r5, #80] + vbic q7, q4, q2 + vldrw.u32 q0, [r10, #160] + veor q6, q4, q6 + vldrw.u32 q5, [r4, #192] + vbic q4, q2, q5 + vldrw.u32 q2, [r10, #224] + veor q4, q3, q4 + vstrw.32 q4, [r2, #176] + vbic q4, q5, q3 + vstrw.32 q6, [r2, #224] + veor q4, q1, q4 + vldrw.u32 q1, [r10, #208] + veor q3, q5, q7 + vldrw.u32 q5, [r10, #192] + vbic q6, q1, q5 + vldrw.u32 q7, [r10, #176] + veor q6, q7, q6 + vstrw.32 q3, [r2, #192] + vbic q3, q0, q2 + vstrw.32 q6, [r5, #176] + veor q3, q1, q3 + vstrw.32 q3, [r5, #208] + vbic q3, q5, q7 + vstrw.32 q4, [r2, #160] + veor q3, q0, q3 + vstrw.32 q3, [r5, #160] + vbic q6, q2, q1 + vldrw.u32 q1, [r4, #288] + vbic q7, q7, q0 + vldrw.u32 q3, [r4, #272] + veor q0, q5, q6 + vldrw.u32 q4, [r4, #304] + veor q6, q2, q7 + vldrw.u32 q7, [r4, #256] + vbic q5, q4, q1 + vstrw.32 q0, [r5, #192] + veor q5, q3, q5 + vstrw.32 q6, [r5, #224] + vbic q0, q3, q7 + vstrw.32 q5, [r2, #272] + vbic q6, q1, q3 + veor q5, q7, q6 + vldrw.u32 q3, [r4, #240] + veor q6, q3, q0 + vldrw.u32 q2, [r10, #288] + vbic q0, q3, q4 + vstrw.32 q6, [r2, #240] + vbic q7, q7, q3 + vstrw.32 q5, [r2, #256] + veor q7, q4, q7 + vstrw.32 q7, [r2, #304] + veor q7, q1, q0 + vstrw.32 q7, [r2, #288] + vldrw.u32 q5, [r10, #304] + vbic q7, q5, q2 + vldrw.u32 q3, [r10, #272] + veor q1, q3, q7 + vldrw.u32 q7, [r4, #336] + vbic q4, q2, q3 + vldrw.u32 q6, [r10, #256] + vbic q3, q3, q6 + vldrw.u32 q0, [r10, #240] + veor q3, q0, q3 
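@
@ Editorial sketch (not produced by the assembler tooling): the long
@ straight-line block above fuses the theta, rho and chi steps of one
@ Keccak-f1600 round for four parallel states. MVE has no vector rotate,
@ so every lane rotation is emitted as a vshr.u32 by (32 - n) followed by
@ a vsli.32 by n, which together realize a 32-bit rotate-left by n; the
@ half-width rotation amounts suggest a bit-interleaved representation in
@ which each 64-bit Keccak lane is split into two 32-bit words. The chi
@ step uses vbic (AND-NOT) plus veor. A minimal C model of both idioms;
@ the helper names rotl32/chi_plane are illustrative, not from this code:
@
@   #include <stdint.h>
@
@   /* vshr.u32 qd, qx, #(32-n) ; vsli.32 qd, qx, #n */
@   static inline uint32_t rotl32(uint32_t x, unsigned n)
@   {
@     /* n == 0: vshr #32 zeroes qd, vsli #0 copies qx back in */
@     return n == 0 ? x : (x << n) | (x >> (32 - n));
@   }
@
@   /* chi on one 5-lane plane: vbic qd, a2, a1 computes a2 & ~a1 */
@   static void chi_plane(uint32_t b[5], const uint32_t a[5])
@   {
@     for (unsigned x = 0; x < 5; x++)
@       b[x] = a[x] ^ (a[(x + 2) % 5] & ~a[(x + 1) % 5]);
@   }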
+ vstrw.32 q1, [r5, #272] + vbic q1, q0, q5 + vstrw.32 q3, [r5, #240] + veor q1, q2, q1 + vldrw.u32 q3, [r4, #384] + vbic q2, q6, q0 + vldrw.u32 q0, [r4, #320] + veor q2, q5, q2 + vldrw.u32 q5, [r4, #352] + veor q4, q6, q4 + vstrw.32 q2, [r5, #304] + vbic q2, q7, q0 + vstrw.32 q1, [r5, #288] + veor q1, q3, q2 + vstrw.32 q1, [r2, #384] + vbic q2, q5, q7 + vstrw.32 q4, [r5, #256] + veor q4, q0, q2 + vstrw.32 q4, [r2, #320] + vbic q2, q0, q3 + vldrw.u32 q4, [r4, #368] + vbic q3, q3, q4 + vldrw.u32 q0, [r10, #320] + veor q1, q5, q3 + vldrw.u32 q6, [r10, #336] + vbic q5, q4, q5 + vstrw.32 q1, [r2, #352] + veor q5, q7, q5 + vstrw.32 q5, [r2, #336] + veor q3, q4, q2 + vstrw.32 q3, [r2, #368] + vbic q7, q6, q0 + vldrw.u32 q5, [r10, #352] + vbic q3, q5, q6 + vldrw.u32 q1, [r10, #368] + vbic q4, q1, q5 + vldrw.u32 q2, [r4, #16] + veor q6, q6, q4 + vldrw.u32 q4, [r10, #384] + veor q3, q0, q3 + vstrw.32 q3, [r5, #320] + veor q3, q4, q7 + vstrw.32 q3, [r5, #384] + vbic q0, q0, q4 + vstrw.32 q6, [r5, #336] + veor q3, q1, q0 + vstrw.32 q3, [r5, #368] + vbic q7, q4, q1 + veor q5, q5, q7 + vldrw.u32 q6, [r4, #32] + vbic q3, q6, q2 + vldrw.u32 q4, [r4, #48] + vbic q0, q4, q6 + vldrw.u32 q1, [r4] + veor q0, q2, q0 + vldrw.u32 q7, [r4, #64] + veor q3, q1, q3 + vstrw.32 q5, [r5, #352] + vbic q5, q1, q7 + vstrw.32 q0, [r2, #16] + veor q0, q4, q5 + vstrw.32 q0, [r2, #48] + vbic q5, q2, q1 + veor q2, q7, q5 + vldrw.u32 q0, [r10, #16] + vbic q5, q7, q4 + vldrw.u32 q4, [r10] + vbic q1, q0, q4 + vldrw.u32 q7, [r10, #64] + veor q1, q7, q1 + vstrw.32 q2, [r2, #64] + veor q2, q6, q5 + vbic q6, q4, q7 + vldrw.u32 q5, [r10, #48] + veor q6, q5, q6 + ldrd r7, r8, [r6] + vbic q7, q7, q5 + vstrw.32 q1, [r5, #64] + vdup.32 q1, r7 + veor q1, q3, q1 + vldrw.u32 q3, [r10, #32] + veor q7, q3, q7 + add.w r6, r6, #0x8 + vbic q5, q5, q3 + vstrw.32 q6, [r5, #48] + vbic q6, q3, q0 + vstrw.32 q1, [r2] + veor q5, q0, q5 + vstrw.32 q7, [r5, #32] + veor q4, q4, q6 + vstrw.32 q5, [r5, #16] + vdup.32 q6, r8 + vstrw.32 q2, [r2, #32] + veor q0, q4, q6 + vstrw.32 q0, [r5] keccak_f1600_x4_mve_asm_roundend_pre: - le lr, keccak_f1600_x4_mve_asm_roundstart @ imm = #-0x8c0 + le lr, keccak_f1600_x4_mve_asm_roundstart @ imm = #-0x8c0 keccak_f1600_x4_mve_asm_roundend: - add sp, #0x80 - vpop {d8, d9, d10, d11, d12, d13, d14, d15} - pop.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + add sp, #0x80 + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} nop MLD_ASM_FN_SIZE(keccak_f1600_x4_mve_asm) diff --git a/mldsa/src/native/aarch64/src/intt.S b/mldsa/src/native/aarch64/src/intt.S index 7eeac5779..39e946e83 100644 --- a/mldsa/src/native/aarch64/src/intt.S +++ b/mldsa/src/native/aarch64/src/intt.S @@ -36,710 +36,710 @@ MLD_ASM_FN_SYMBOL(intt_asm) .cfi_startproc - sub sp, sp, #0x40 + sub sp, sp, #0x40 .cfi_adjust_cfa_offset 0x40 - stp d8, d9, [sp] + stp d8, d9, [sp] .cfi_rel_offset d8, 0x0 .cfi_rel_offset d9, 0x8 - stp d10, d11, [sp, #0x10] + stp d10, d11, [sp, #0x10] .cfi_rel_offset d10, 0x10 .cfi_rel_offset d11, 0x18 - stp d12, d13, [sp, #0x20] + stp d12, d13, [sp, #0x20] .cfi_rel_offset d12, 0x20 .cfi_rel_offset d13, 0x28 - stp d14, d15, [sp, #0x30] + stp d14, d15, [sp, #0x30] .cfi_rel_offset d14, 0x30 .cfi_rel_offset d15, 0x38 - mov w5, #0xe001 // =57345 - movk w5, #0x7f, lsl #16 - dup v31.4s, w5 - mov x3, x0 - mov x4, #0x10 // =16 - ldr q27, [x3, #0x10] - ldr q18, [x3] - ldr q3, [x3, #0x20] - ldr q13, [x3, #0x30] - ldr q2, [x1, #0x30] - ldr q8, [x3, #0x70] - ldr q21, [x3, #0x60] - trn1 v10.4s, 
v18.4s, v27.4s - trn2 v23.4s, v18.4s, v27.4s - trn1 v6.4s, v3.4s, v13.4s - trn2 v18.4s, v3.4s, v13.4s - ldr q12, [x1, #0x50] - ldr q17, [x1, #0x40] - trn2 v29.2d, v10.2d, v6.2d - trn2 v14.2d, v23.2d, v18.2d - trn1 v10.2d, v10.2d, v6.2d - trn1 v26.2d, v23.2d, v18.2d - sub v3.4s, v29.4s, v14.4s - trn1 v13.4s, v21.4s, v8.4s - ldr q6, [x1, #0x10] - add v24.4s, v10.4s, v26.4s - sub v30.4s, v10.4s, v26.4s - sqrdmulh v12.4s, v3.4s, v12.4s - ldr q9, [x1, #0x20] - sqrdmulh v5.4s, v30.4s, v2.4s - mul v4.4s, v3.4s, v17.4s - ldr q3, [x3, #0x50] - add v10.4s, v29.4s, v14.4s - ldr q26, [x3, #0x40] - mul v15.4s, v30.4s, v9.4s - trn2 v30.4s, v21.4s, v8.4s - sub v21.4s, v24.4s, v10.4s - add v29.4s, v24.4s, v10.4s - mls v15.4s, v5.4s, v31.s[0] - ldr q16, [x1], #0x60 - trn2 v10.4s, v26.4s, v3.4s - mls v4.4s, v12.4s, v31.s[0] - trn1 v3.4s, v26.4s, v3.4s - ldr q0, [x1, #0x50] - trn2 v25.2d, v10.2d, v30.2d - sqrdmulh v12.4s, v21.4s, v6.4s - trn2 v1.2d, v3.2d, v13.2d - mul v21.4s, v21.4s, v16.4s - sub v23.4s, v1.4s, v25.4s - sub v2.4s, v15.4s, v4.4s - add v20.4s, v15.4s, v4.4s - sqrdmulh v4.4s, v23.4s, v0.4s - trn1 v7.2d, v3.2d, v13.2d - sqrdmulh v3.4s, v2.4s, v6.4s - trn1 v15.2d, v10.2d, v30.2d - mls v21.4s, v12.4s, v31.s[0] - ldr q12, [x1, #0x30] - sub v13.4s, v7.4s, v15.4s - mul v10.4s, v2.4s, v16.4s - ldr q16, [x1, #0x40] - ldr q5, [x1, #0x20] - mls v10.4s, v3.4s, v31.s[0] - trn2 v11.4s, v29.4s, v20.4s - sqrdmulh v2.4s, v13.4s, v12.4s - ldr d9, [x2], #0x20 - mul v17.4s, v13.4s, v5.4s - trn1 v24.4s, v29.4s, v20.4s - trn1 v12.4s, v21.4s, v10.4s - trn2 v10.4s, v21.4s, v10.4s - mul v22.4s, v23.4s, v16.4s - ldur q26, [x2, #-0x10] - trn1 v13.2d, v24.2d, v12.2d - trn1 v3.2d, v11.2d, v10.2d - mls v17.4s, v2.4s, v31.s[0] - mls v22.4s, v4.4s, v31.s[0] - sub v23.4s, v13.4s, v3.4s - trn2 v10.2d, v11.2d, v10.2d - sqrdmulh v0.4s, v23.4s, v26.s[1] - trn2 v21.2d, v24.2d, v12.2d - mul v29.4s, v23.4s, v26.s[0] - sub v23.4s, v21.4s, v10.4s - sqrdmulh v8.4s, v23.4s, v26.s[3] - add v6.4s, v1.4s, v25.4s - add v11.4s, v21.4s, v10.4s - mls v29.4s, v0.4s, v31.s[0] - add v30.4s, v13.4s, v3.4s - ldr q2, [x1, #0x10] - mul v3.4s, v23.4s, v26.s[2] - add v12.4s, v7.4s, v15.4s - mls v3.4s, v8.4s, v31.s[0] - sub v13.4s, v30.4s, v11.4s - sub v21.4s, v12.4s, v6.4s - add v26.4s, v17.4s, v22.4s - sqrdmulh v28.4s, v13.4s, v9.s[1] - mul v18.4s, v13.4s, v9.s[0] - sub v8.4s, v29.4s, v3.4s - sqrdmulh v24.4s, v21.4s, v2.4s - add v15.4s, v30.4s, v11.4s - add v10.4s, v29.4s, v3.4s - add v14.4s, v12.4s, v6.4s - sqrdmulh v16.4s, v8.4s, v9.s[1] - sub v3.4s, v17.4s, v22.4s - mul v8.4s, v8.4s, v9.s[0] - ldr q17, [x1], #0x60 - sub x4, x4, #0x2 + mov w5, #0xe001 // =57345 + movk w5, #0x7f, lsl #16 + dup v31.4s, w5 + mov x3, x0 + mov x4, #0x10 // =16 + ldr q27, [x3, #0x10] + ldr q18, [x3] + ldr q3, [x3, #0x20] + ldr q13, [x3, #0x30] + ldr q2, [x1, #0x30] + ldr q8, [x3, #0x70] + ldr q21, [x3, #0x60] + trn1 v10.4s, v18.4s, v27.4s + trn2 v23.4s, v18.4s, v27.4s + trn1 v6.4s, v3.4s, v13.4s + trn2 v18.4s, v3.4s, v13.4s + ldr q12, [x1, #0x50] + ldr q17, [x1, #0x40] + trn2 v29.2d, v10.2d, v6.2d + trn2 v14.2d, v23.2d, v18.2d + trn1 v10.2d, v10.2d, v6.2d + trn1 v26.2d, v23.2d, v18.2d + sub v3.4s, v29.4s, v14.4s + trn1 v13.4s, v21.4s, v8.4s + ldr q6, [x1, #0x10] + add v24.4s, v10.4s, v26.4s + sub v30.4s, v10.4s, v26.4s + sqrdmulh v12.4s, v3.4s, v12.4s + ldr q9, [x1, #0x20] + sqrdmulh v5.4s, v30.4s, v2.4s + mul v4.4s, v3.4s, v17.4s + ldr q3, [x3, #0x50] + add v10.4s, v29.4s, v14.4s + ldr q26, [x3, #0x40] + mul v15.4s, v30.4s, v9.4s + trn2 v30.4s, v21.4s, v8.4s + sub 
v21.4s, v24.4s, v10.4s + add v29.4s, v24.4s, v10.4s + mls v15.4s, v5.4s, v31.s[0] + ldr q16, [x1], #0x60 + trn2 v10.4s, v26.4s, v3.4s + mls v4.4s, v12.4s, v31.s[0] + trn1 v3.4s, v26.4s, v3.4s + ldr q0, [x1, #0x50] + trn2 v25.2d, v10.2d, v30.2d + sqrdmulh v12.4s, v21.4s, v6.4s + trn2 v1.2d, v3.2d, v13.2d + mul v21.4s, v21.4s, v16.4s + sub v23.4s, v1.4s, v25.4s + sub v2.4s, v15.4s, v4.4s + add v20.4s, v15.4s, v4.4s + sqrdmulh v4.4s, v23.4s, v0.4s + trn1 v7.2d, v3.2d, v13.2d + sqrdmulh v3.4s, v2.4s, v6.4s + trn1 v15.2d, v10.2d, v30.2d + mls v21.4s, v12.4s, v31.s[0] + ldr q12, [x1, #0x30] + sub v13.4s, v7.4s, v15.4s + mul v10.4s, v2.4s, v16.4s + ldr q16, [x1, #0x40] + ldr q5, [x1, #0x20] + mls v10.4s, v3.4s, v31.s[0] + trn2 v11.4s, v29.4s, v20.4s + sqrdmulh v2.4s, v13.4s, v12.4s + ldr d9, [x2], #0x20 + mul v17.4s, v13.4s, v5.4s + trn1 v24.4s, v29.4s, v20.4s + trn1 v12.4s, v21.4s, v10.4s + trn2 v10.4s, v21.4s, v10.4s + mul v22.4s, v23.4s, v16.4s + ldur q26, [x2, #-0x10] + trn1 v13.2d, v24.2d, v12.2d + trn1 v3.2d, v11.2d, v10.2d + mls v17.4s, v2.4s, v31.s[0] + mls v22.4s, v4.4s, v31.s[0] + sub v23.4s, v13.4s, v3.4s + trn2 v10.2d, v11.2d, v10.2d + sqrdmulh v0.4s, v23.4s, v26.s[1] + trn2 v21.2d, v24.2d, v12.2d + mul v29.4s, v23.4s, v26.s[0] + sub v23.4s, v21.4s, v10.4s + sqrdmulh v8.4s, v23.4s, v26.s[3] + add v6.4s, v1.4s, v25.4s + add v11.4s, v21.4s, v10.4s + mls v29.4s, v0.4s, v31.s[0] + add v30.4s, v13.4s, v3.4s + ldr q2, [x1, #0x10] + mul v3.4s, v23.4s, v26.s[2] + add v12.4s, v7.4s, v15.4s + mls v3.4s, v8.4s, v31.s[0] + sub v13.4s, v30.4s, v11.4s + sub v21.4s, v12.4s, v6.4s + add v26.4s, v17.4s, v22.4s + sqrdmulh v28.4s, v13.4s, v9.s[1] + mul v18.4s, v13.4s, v9.s[0] + sub v8.4s, v29.4s, v3.4s + sqrdmulh v24.4s, v21.4s, v2.4s + add v15.4s, v30.4s, v11.4s + add v10.4s, v29.4s, v3.4s + add v14.4s, v12.4s, v6.4s + sqrdmulh v16.4s, v8.4s, v9.s[1] + sub v3.4s, v17.4s, v22.4s + mul v8.4s, v8.4s, v9.s[0] + ldr q17, [x1], #0x60 + sub x4, x4, #0x2 Lintt_layer5678_start: - ldr d4, [x2], #0x20 - mul v0.4s, v21.4s, v17.4s - ldr q7, [x3, #0xa0] - ldr q27, [x3, #0xb0] - mls v8.4s, v16.4s, v31.s[0] - ldr q29, [x3, #0x80] - trn1 v9.4s, v14.4s, v26.4s - ldr q25, [x3, #0x90] - mls v0.4s, v24.4s, v31.s[0] - str q15, [x3], #0x40 - trn1 v1.4s, v7.4s, v27.4s - ldr q15, [x1, #0x50] - trn2 v12.4s, v7.4s, v27.4s - sqrdmulh v16.4s, v3.4s, v2.4s - trn1 v21.4s, v29.4s, v25.4s - stur q8, [x3, #-0x10] - trn2 v19.4s, v29.4s, v25.4s - mls v18.4s, v28.4s, v31.s[0] - ldr q22, [x1, #0x30] - trn2 v13.2d, v21.2d, v1.2d - trn2 v5.2d, v19.2d, v12.2d - mul v23.4s, v3.4s, v17.4s - trn1 v17.2d, v21.2d, v1.2d - ldr q29, [x1, #0x40] - mls v23.4s, v16.4s, v31.s[0] - sub v30.4s, v13.4s, v5.4s - trn1 v24.2d, v19.2d, v12.2d - stur q10, [x3, #-0x30] - sqrdmulh v16.4s, v30.4s, v15.4s - ldr q10, [x1, #0x20] - trn2 v12.4s, v14.4s, v26.4s - stur q18, [x3, #-0x20] - mul v11.4s, v30.4s, v29.4s - sub v8.4s, v17.4s, v24.4s - trn1 v20.4s, v0.4s, v23.4s - ldr q2, [x1, #0x10] - trn2 v26.4s, v0.4s, v23.4s - sqrdmulh v19.4s, v8.4s, v22.4s - ldur q14, [x2, #-0x10] - trn1 v21.2d, v9.2d, v20.2d - mul v8.4s, v8.4s, v10.4s - trn1 v6.2d, v12.2d, v26.2d - trn2 v0.2d, v9.2d, v20.2d - mls v11.4s, v16.4s, v31.s[0] - sub v7.4s, v21.4s, v6.4s - trn2 v28.2d, v12.2d, v26.2d - add v10.4s, v21.4s, v6.4s - mul v18.4s, v7.4s, v14.s[0] - add v20.4s, v13.4s, v5.4s - sqrdmulh v30.4s, v7.4s, v14.s[1] - sub v21.4s, v0.4s, v28.4s - add v13.4s, v0.4s, v28.4s - add v9.4s, v17.4s, v24.4s - sqrdmulh v1.4s, v21.4s, v14.s[3] - ldr q17, [x1], #0x60 - mul v14.4s, v21.4s, v14.s[2] - sub 
v21.4s, v9.4s, v20.4s - mls v18.4s, v30.4s, v31.s[0] - mls v14.4s, v1.4s, v31.s[0] - mls v8.4s, v19.4s, v31.s[0] - sub v3.4s, v10.4s, v13.4s - add v15.4s, v10.4s, v13.4s - sqrdmulh v28.4s, v3.4s, v4.s[1] - add v10.4s, v18.4s, v14.4s - sub v19.4s, v18.4s, v14.4s - mul v18.4s, v3.4s, v4.s[0] - sub v3.4s, v8.4s, v11.4s - add v26.4s, v8.4s, v11.4s - sqrdmulh v16.4s, v19.4s, v4.s[1] - add v14.4s, v9.4s, v20.4s - mul v8.4s, v19.4s, v4.s[0] - sqrdmulh v24.4s, v21.4s, v2.4s - subs x4, x4, #0x1 - cbnz x4, Lintt_layer5678_start - sqrdmulh v19.4s, v3.4s, v2.4s - trn2 v0.4s, v14.4s, v26.4s - str q15, [x3], #0x40 - trn1 v30.4s, v14.4s, v26.4s - mul v26.4s, v21.4s, v17.4s - stur q10, [x3, #-0x30] - ldr d29, [x2], #0x20 - mls v26.4s, v24.4s, v31.s[0] - mul v10.4s, v3.4s, v17.4s - ldur q14, [x2, #-0x10] - mls v10.4s, v19.4s, v31.s[0] - mls v8.4s, v16.4s, v31.s[0] - trn2 v22.4s, v26.4s, v10.4s - trn1 v25.4s, v26.4s, v10.4s - trn1 v1.2d, v30.2d, v25.2d - trn1 v2.2d, v0.2d, v22.2d - trn2 v13.2d, v30.2d, v25.2d - trn2 v7.2d, v0.2d, v22.2d - mls v18.4s, v28.4s, v31.s[0] - sub v22.4s, v1.4s, v2.4s - add v3.4s, v13.4s, v7.4s - sqrdmulh v27.4s, v22.4s, v14.s[1] - sub v4.4s, v13.4s, v7.4s - stur q8, [x3, #-0x10] - sqrdmulh v0.4s, v4.4s, v14.s[3] - add v16.4s, v1.4s, v2.4s - stur q18, [x3, #-0x20] - mul v23.4s, v4.4s, v14.s[2] - add v4.4s, v16.4s, v3.4s - sub v17.4s, v16.4s, v3.4s - mul v14.4s, v22.4s, v14.s[0] - str q4, [x3], #0x40 - mls v23.4s, v0.4s, v31.s[0] - mls v14.4s, v27.4s, v31.s[0] - mul v11.4s, v17.4s, v29.s[0] - sqrdmulh v10.4s, v17.4s, v29.s[1] - sub v0.4s, v14.4s, v23.4s - add v6.4s, v14.4s, v23.4s - sqrdmulh v12.4s, v0.4s, v29.s[1] - mul v0.4s, v0.4s, v29.s[0] - stur q6, [x3, #-0x30] - mls v11.4s, v10.4s, v31.s[0] - mls v0.4s, v12.4s, v31.s[0] - stur q11, [x3, #-0x20] - stur q0, [x3, #-0x10] - mov w5, #0x3ffe // =16382 - dup v29.4s, w5 - mov w5, #0xe03 // =3587 - movk w5, #0x40, lsl #16 - dup v30.4s, w5 - mov x4, #0x4 // =4 - ldr q0, [x2], #0x80 - ldur q1, [x2, #-0x70] - ldur q2, [x2, #-0x60] - ldur q3, [x2, #-0x50] - ldur q4, [x2, #-0x40] - ldur q5, [x2, #-0x30] - ldur q6, [x2, #-0x20] - ldur q7, [x2, #-0x10] - ldr q8, [x0, #0xc0] - ldr q27, [x0, #0x80] - ldr q20, [x0, #0x1c0] - ldr q23, [x0, #0x180] - ldr q24, [x0, #0x3c0] - ldr q28, [x0, #0x40] - ldr q25, [x0, #0x340] - ldr q10, [x0, #0x380] - sub v15.4s, v27.4s, v8.4s - ldr q26, [x0] - ldr q18, [x0, #0x300] - add v9.4s, v23.4s, v20.4s - mul v11.4s, v15.4s, v4.s[0] - sub v19.4s, v23.4s, v20.4s - sub v22.4s, v10.4s, v24.4s - ldr q13, [x0, #0x240] - sqrdmulh v12.4s, v15.4s, v4.s[1] - sub v14.4s, v26.4s, v28.4s - ldr q17, [x0, #0x200] - add v23.4s, v18.4s, v25.4s - sqrdmulh v15.4s, v14.4s, v3.s[3] - sub v18.4s, v18.4s, v25.4s - mul v25.4s, v14.4s, v3.s[2] - sub v16.4s, v17.4s, v13.4s - mls v11.4s, v12.4s, v31.s[0] - add v21.4s, v10.4s, v24.4s - mls v25.4s, v15.4s, v31.s[0] - mul v12.4s, v16.4s, v5.s[2] - sub v10.4s, v23.4s, v21.4s - sqrdmulh v14.4s, v10.4s, v3.s[1] - add v23.4s, v23.4s, v21.4s - add v27.4s, v27.4s, v8.4s - mul v15.4s, v10.4s, v3.s[0] - add v24.4s, v25.4s, v11.4s - sub v10.4s, v25.4s, v11.4s - ldr q25, [x0, #0x100] - sqrdmulh v8.4s, v18.4s, v6.s[3] - add v21.4s, v26.4s, v28.4s - ldr q26, [x0, #0x140] - sqrdmulh v20.4s, v16.4s, v5.s[3] - sqrdmulh v28.4s, v22.4s, v7.s[1] - add v11.4s, v25.4s, v26.4s - mls v15.4s, v14.4s, v31.s[0] - add v17.4s, v17.4s, v13.4s - mul v13.4s, v22.4s, v7.s[0] - ldr q14, [x0, #0x280] - ldr q22, [x0, #0x2c0] - mls v13.4s, v28.4s, v31.s[0] - sub v16.4s, v25.4s, v26.4s - sub v28.4s, v21.4s, v27.4s - mls v12.4s, 
v20.4s, v31.s[0] - add v20.4s, v11.4s, v9.4s - add v26.4s, v14.4s, v22.4s - sub v14.4s, v14.4s, v22.4s - sqrdmulh v22.4s, v28.4s, v1.s[3] - sub v9.4s, v11.4s, v9.4s - mul v25.4s, v14.4s, v6.s[0] - sub v11.4s, v17.4s, v26.4s - add v17.4s, v17.4s, v26.4s - add v26.4s, v21.4s, v27.4s - sqrdmulh v27.4s, v11.4s, v2.s[3] - mul v21.4s, v11.4s, v2.s[2] - mul v11.4s, v18.4s, v6.s[2] - mls v11.4s, v8.4s, v31.s[0] - sqrdmulh v8.4s, v14.4s, v6.s[1] - add v18.4s, v26.4s, v20.4s - mls v21.4s, v27.4s, v31.s[0] - sub v27.4s, v17.4s, v23.4s - mul v14.4s, v28.4s, v1.s[2] - sub v28.4s, v26.4s, v20.4s - add v23.4s, v17.4s, v23.4s - mls v25.4s, v8.4s, v31.s[0] - add v26.4s, v21.4s, v15.4s - sub v15.4s, v21.4s, v15.4s - mul v8.4s, v19.4s, v5.s[0] - mls v14.4s, v22.4s, v31.s[0] - sub v22.4s, v11.4s, v13.4s - sub v20.4s, v12.4s, v25.4s - add v21.4s, v12.4s, v25.4s - sqrdmulh v12.4s, v22.4s, v3.s[1] - mul v17.4s, v16.4s, v4.s[2] - sqrdmulh v25.4s, v16.4s, v4.s[3] - add v16.4s, v11.4s, v13.4s - mul v11.4s, v22.4s, v3.s[0] - sqrdmulh v19.4s, v19.4s, v5.s[1] - sqrdmulh v22.4s, v15.4s, v1.s[1] - sqrdmulh v13.4s, v20.4s, v2.s[3] - mul v20.4s, v20.4s, v2.s[2] - mls v11.4s, v12.4s, v31.s[0] - mls v20.4s, v13.4s, v31.s[0] - sub v13.4s, v18.4s, v23.4s - add v23.4s, v18.4s, v23.4s - mls v17.4s, v25.4s, v31.s[0] - mls v8.4s, v19.4s, v31.s[0] - sub v18.4s, v20.4s, v11.4s - add v20.4s, v20.4s, v11.4s - sqrdmulh v12.4s, v9.4s, v2.s[1] - mul v15.4s, v15.4s, v1.s[0] - sub v11.4s, v17.4s, v8.4s - mls v15.4s, v22.4s, v31.s[0] - add v19.4s, v17.4s, v8.4s - sqrdmulh v25.4s, v11.4s, v2.s[1] - add v17.4s, v24.4s, v19.4s - sub v24.4s, v24.4s, v19.4s - mul v19.4s, v11.4s, v2.s[0] - sqrdmulh v8.4s, v24.4s, v0.s[3] - mls v19.4s, v25.4s, v31.s[0] - mul v25.4s, v9.4s, v2.s[0] - sub x4, x4, #0x1 + ldr d4, [x2], #0x20 + mul v0.4s, v21.4s, v17.4s + ldr q7, [x3, #0xa0] + ldr q27, [x3, #0xb0] + mls v8.4s, v16.4s, v31.s[0] + ldr q29, [x3, #0x80] + trn1 v9.4s, v14.4s, v26.4s + ldr q25, [x3, #0x90] + mls v0.4s, v24.4s, v31.s[0] + str q15, [x3], #0x40 + trn1 v1.4s, v7.4s, v27.4s + ldr q15, [x1, #0x50] + trn2 v12.4s, v7.4s, v27.4s + sqrdmulh v16.4s, v3.4s, v2.4s + trn1 v21.4s, v29.4s, v25.4s + stur q8, [x3, #-0x10] + trn2 v19.4s, v29.4s, v25.4s + mls v18.4s, v28.4s, v31.s[0] + ldr q22, [x1, #0x30] + trn2 v13.2d, v21.2d, v1.2d + trn2 v5.2d, v19.2d, v12.2d + mul v23.4s, v3.4s, v17.4s + trn1 v17.2d, v21.2d, v1.2d + ldr q29, [x1, #0x40] + mls v23.4s, v16.4s, v31.s[0] + sub v30.4s, v13.4s, v5.4s + trn1 v24.2d, v19.2d, v12.2d + stur q10, [x3, #-0x30] + sqrdmulh v16.4s, v30.4s, v15.4s + ldr q10, [x1, #0x20] + trn2 v12.4s, v14.4s, v26.4s + stur q18, [x3, #-0x20] + mul v11.4s, v30.4s, v29.4s + sub v8.4s, v17.4s, v24.4s + trn1 v20.4s, v0.4s, v23.4s + ldr q2, [x1, #0x10] + trn2 v26.4s, v0.4s, v23.4s + sqrdmulh v19.4s, v8.4s, v22.4s + ldur q14, [x2, #-0x10] + trn1 v21.2d, v9.2d, v20.2d + mul v8.4s, v8.4s, v10.4s + trn1 v6.2d, v12.2d, v26.2d + trn2 v0.2d, v9.2d, v20.2d + mls v11.4s, v16.4s, v31.s[0] + sub v7.4s, v21.4s, v6.4s + trn2 v28.2d, v12.2d, v26.2d + add v10.4s, v21.4s, v6.4s + mul v18.4s, v7.4s, v14.s[0] + add v20.4s, v13.4s, v5.4s + sqrdmulh v30.4s, v7.4s, v14.s[1] + sub v21.4s, v0.4s, v28.4s + add v13.4s, v0.4s, v28.4s + add v9.4s, v17.4s, v24.4s + sqrdmulh v1.4s, v21.4s, v14.s[3] + ldr q17, [x1], #0x60 + mul v14.4s, v21.4s, v14.s[2] + sub v21.4s, v9.4s, v20.4s + mls v18.4s, v30.4s, v31.s[0] + mls v14.4s, v1.4s, v31.s[0] + mls v8.4s, v19.4s, v31.s[0] + sub v3.4s, v10.4s, v13.4s + add v15.4s, v10.4s, v13.4s + sqrdmulh v28.4s, v3.4s, v4.s[1] + add v10.4s, 
v18.4s, v14.4s + sub v19.4s, v18.4s, v14.4s + mul v18.4s, v3.4s, v4.s[0] + sub v3.4s, v8.4s, v11.4s + add v26.4s, v8.4s, v11.4s + sqrdmulh v16.4s, v19.4s, v4.s[1] + add v14.4s, v9.4s, v20.4s + mul v8.4s, v19.4s, v4.s[0] + sqrdmulh v24.4s, v21.4s, v2.4s + subs x4, x4, #0x1 + cbnz x4, Lintt_layer5678_start + sqrdmulh v19.4s, v3.4s, v2.4s + trn2 v0.4s, v14.4s, v26.4s + str q15, [x3], #0x40 + trn1 v30.4s, v14.4s, v26.4s + mul v26.4s, v21.4s, v17.4s + stur q10, [x3, #-0x30] + ldr d29, [x2], #0x20 + mls v26.4s, v24.4s, v31.s[0] + mul v10.4s, v3.4s, v17.4s + ldur q14, [x2, #-0x10] + mls v10.4s, v19.4s, v31.s[0] + mls v8.4s, v16.4s, v31.s[0] + trn2 v22.4s, v26.4s, v10.4s + trn1 v25.4s, v26.4s, v10.4s + trn1 v1.2d, v30.2d, v25.2d + trn1 v2.2d, v0.2d, v22.2d + trn2 v13.2d, v30.2d, v25.2d + trn2 v7.2d, v0.2d, v22.2d + mls v18.4s, v28.4s, v31.s[0] + sub v22.4s, v1.4s, v2.4s + add v3.4s, v13.4s, v7.4s + sqrdmulh v27.4s, v22.4s, v14.s[1] + sub v4.4s, v13.4s, v7.4s + stur q8, [x3, #-0x10] + sqrdmulh v0.4s, v4.4s, v14.s[3] + add v16.4s, v1.4s, v2.4s + stur q18, [x3, #-0x20] + mul v23.4s, v4.4s, v14.s[2] + add v4.4s, v16.4s, v3.4s + sub v17.4s, v16.4s, v3.4s + mul v14.4s, v22.4s, v14.s[0] + str q4, [x3], #0x40 + mls v23.4s, v0.4s, v31.s[0] + mls v14.4s, v27.4s, v31.s[0] + mul v11.4s, v17.4s, v29.s[0] + sqrdmulh v10.4s, v17.4s, v29.s[1] + sub v0.4s, v14.4s, v23.4s + add v6.4s, v14.4s, v23.4s + sqrdmulh v12.4s, v0.4s, v29.s[1] + mul v0.4s, v0.4s, v29.s[0] + stur q6, [x3, #-0x30] + mls v11.4s, v10.4s, v31.s[0] + mls v0.4s, v12.4s, v31.s[0] + stur q11, [x3, #-0x20] + stur q0, [x3, #-0x10] + mov w5, #0x3ffe // =16382 + dup v29.4s, w5 + mov w5, #0xe03 // =3587 + movk w5, #0x40, lsl #16 + dup v30.4s, w5 + mov x4, #0x4 // =4 + ldr q0, [x2], #0x80 + ldur q1, [x2, #-0x70] + ldur q2, [x2, #-0x60] + ldur q3, [x2, #-0x50] + ldur q4, [x2, #-0x40] + ldur q5, [x2, #-0x30] + ldur q6, [x2, #-0x20] + ldur q7, [x2, #-0x10] + ldr q8, [x0, #0xc0] + ldr q27, [x0, #0x80] + ldr q20, [x0, #0x1c0] + ldr q23, [x0, #0x180] + ldr q24, [x0, #0x3c0] + ldr q28, [x0, #0x40] + ldr q25, [x0, #0x340] + ldr q10, [x0, #0x380] + sub v15.4s, v27.4s, v8.4s + ldr q26, [x0] + ldr q18, [x0, #0x300] + add v9.4s, v23.4s, v20.4s + mul v11.4s, v15.4s, v4.s[0] + sub v19.4s, v23.4s, v20.4s + sub v22.4s, v10.4s, v24.4s + ldr q13, [x0, #0x240] + sqrdmulh v12.4s, v15.4s, v4.s[1] + sub v14.4s, v26.4s, v28.4s + ldr q17, [x0, #0x200] + add v23.4s, v18.4s, v25.4s + sqrdmulh v15.4s, v14.4s, v3.s[3] + sub v18.4s, v18.4s, v25.4s + mul v25.4s, v14.4s, v3.s[2] + sub v16.4s, v17.4s, v13.4s + mls v11.4s, v12.4s, v31.s[0] + add v21.4s, v10.4s, v24.4s + mls v25.4s, v15.4s, v31.s[0] + mul v12.4s, v16.4s, v5.s[2] + sub v10.4s, v23.4s, v21.4s + sqrdmulh v14.4s, v10.4s, v3.s[1] + add v23.4s, v23.4s, v21.4s + add v27.4s, v27.4s, v8.4s + mul v15.4s, v10.4s, v3.s[0] + add v24.4s, v25.4s, v11.4s + sub v10.4s, v25.4s, v11.4s + ldr q25, [x0, #0x100] + sqrdmulh v8.4s, v18.4s, v6.s[3] + add v21.4s, v26.4s, v28.4s + ldr q26, [x0, #0x140] + sqrdmulh v20.4s, v16.4s, v5.s[3] + sqrdmulh v28.4s, v22.4s, v7.s[1] + add v11.4s, v25.4s, v26.4s + mls v15.4s, v14.4s, v31.s[0] + add v17.4s, v17.4s, v13.4s + mul v13.4s, v22.4s, v7.s[0] + ldr q14, [x0, #0x280] + ldr q22, [x0, #0x2c0] + mls v13.4s, v28.4s, v31.s[0] + sub v16.4s, v25.4s, v26.4s + sub v28.4s, v21.4s, v27.4s + mls v12.4s, v20.4s, v31.s[0] + add v20.4s, v11.4s, v9.4s + add v26.4s, v14.4s, v22.4s + sub v14.4s, v14.4s, v22.4s + sqrdmulh v22.4s, v28.4s, v1.s[3] + sub v9.4s, v11.4s, v9.4s + mul v25.4s, v14.4s, v6.s[0] + sub v11.4s, v17.4s, 
v26.4s + add v17.4s, v17.4s, v26.4s + add v26.4s, v21.4s, v27.4s + sqrdmulh v27.4s, v11.4s, v2.s[3] + mul v21.4s, v11.4s, v2.s[2] + mul v11.4s, v18.4s, v6.s[2] + mls v11.4s, v8.4s, v31.s[0] + sqrdmulh v8.4s, v14.4s, v6.s[1] + add v18.4s, v26.4s, v20.4s + mls v21.4s, v27.4s, v31.s[0] + sub v27.4s, v17.4s, v23.4s + mul v14.4s, v28.4s, v1.s[2] + sub v28.4s, v26.4s, v20.4s + add v23.4s, v17.4s, v23.4s + mls v25.4s, v8.4s, v31.s[0] + add v26.4s, v21.4s, v15.4s + sub v15.4s, v21.4s, v15.4s + mul v8.4s, v19.4s, v5.s[0] + mls v14.4s, v22.4s, v31.s[0] + sub v22.4s, v11.4s, v13.4s + sub v20.4s, v12.4s, v25.4s + add v21.4s, v12.4s, v25.4s + sqrdmulh v12.4s, v22.4s, v3.s[1] + mul v17.4s, v16.4s, v4.s[2] + sqrdmulh v25.4s, v16.4s, v4.s[3] + add v16.4s, v11.4s, v13.4s + mul v11.4s, v22.4s, v3.s[0] + sqrdmulh v19.4s, v19.4s, v5.s[1] + sqrdmulh v22.4s, v15.4s, v1.s[1] + sqrdmulh v13.4s, v20.4s, v2.s[3] + mul v20.4s, v20.4s, v2.s[2] + mls v11.4s, v12.4s, v31.s[0] + mls v20.4s, v13.4s, v31.s[0] + sub v13.4s, v18.4s, v23.4s + add v23.4s, v18.4s, v23.4s + mls v17.4s, v25.4s, v31.s[0] + mls v8.4s, v19.4s, v31.s[0] + sub v18.4s, v20.4s, v11.4s + add v20.4s, v20.4s, v11.4s + sqrdmulh v12.4s, v9.4s, v2.s[1] + mul v15.4s, v15.4s, v1.s[0] + sub v11.4s, v17.4s, v8.4s + mls v15.4s, v22.4s, v31.s[0] + add v19.4s, v17.4s, v8.4s + sqrdmulh v25.4s, v11.4s, v2.s[1] + add v17.4s, v24.4s, v19.4s + sub v24.4s, v24.4s, v19.4s + mul v19.4s, v11.4s, v2.s[0] + sqrdmulh v8.4s, v24.4s, v0.s[3] + mls v19.4s, v25.4s, v31.s[0] + mul v25.4s, v9.4s, v2.s[0] + sub x4, x4, #0x1 Lintt_layer1234_start: - sub v22.4s, v21.4s, v16.4s - mls v25.4s, v12.4s, v31.s[0] - add v12.4s, v21.4s, v16.4s - mul v9.4s, v24.4s, v0.s[2] - mls v9.4s, v8.4s, v31.s[0] - sub v16.4s, v14.4s, v25.4s - add v14.4s, v14.4s, v25.4s - sqrdmulh v24.4s, v22.4s, v1.s[1] - sub v11.4s, v14.4s, v26.4s - sqrdmulh v21.4s, v16.4s, v0.s[3] - add v25.4s, v14.4s, v26.4s - mul v14.4s, v22.4s, v1.s[0] - mls v14.4s, v24.4s, v31.s[0] - mul v16.4s, v16.4s, v0.s[2] - mls v16.4s, v21.4s, v31.s[0] - add v24.4s, v9.4s, v14.4s - sqrdmulh v21.4s, v25.4s, v30.4s - sqrdmulh v26.4s, v11.4s, v0.s[1] - mul v8.4s, v25.4s, v29.4s - mls v8.4s, v21.4s, v31.s[0] - mul v25.4s, v24.4s, v29.4s - mul v22.4s, v11.4s, v0.s[0] - sub v11.4s, v17.4s, v12.4s - str q8, [x0, #0x80] - sub v8.4s, v9.4s, v14.4s - add v14.4s, v16.4s, v15.4s - sqrdmulh v9.4s, v11.4s, v0.s[1] - sub v21.4s, v16.4s, v15.4s - mls v22.4s, v26.4s, v31.s[0] - mul v15.4s, v11.4s, v0.s[0] - mls v15.4s, v9.4s, v31.s[0] - add v11.4s, v17.4s, v12.4s - mul v16.4s, v10.4s, v1.s[2] - str q22, [x0, #0x280] - sqrdmulh v26.4s, v10.4s, v1.s[3] - str q15, [x0, #0x240] - sqrdmulh v9.4s, v8.4s, v0.s[1] - mul v12.4s, v8.4s, v0.s[0] - mls v16.4s, v26.4s, v31.s[0] - sqrdmulh v10.4s, v27.4s, v1.s[1] - sqrdmulh v15.4s, v14.4s, v30.4s - add v22.4s, v16.4s, v19.4s - sub v19.4s, v16.4s, v19.4s - mls v12.4s, v9.4s, v31.s[0] - add v8.4s, v22.4s, v20.4s - sub v26.4s, v22.4s, v20.4s - mul v27.4s, v27.4s, v1.s[0] - sqrdmulh v17.4s, v11.4s, v30.4s - mls v27.4s, v10.4s, v31.s[0] - mul v20.4s, v11.4s, v29.4s - sqrdmulh v22.4s, v24.4s, v30.4s - mls v20.4s, v17.4s, v31.s[0] - mul v14.4s, v14.4s, v29.4s - sqrdmulh v16.4s, v23.4s, v30.4s - str q20, [x0, #0x40] - sqrdmulh v11.4s, v21.4s, v0.s[1] - sqrdmulh v20.4s, v28.4s, v0.s[3] - mul v17.4s, v23.4s, v29.4s - mul v23.4s, v28.4s, v0.s[2] - mls v17.4s, v16.4s, v31.s[0] - mul v10.4s, v21.4s, v0.s[0] - str q12, [x0, #0x340] - mls v25.4s, v22.4s, v31.s[0] - str q17, [x0], #0x10 - ldr q17, [x0, #0x2c0] - mls v14.4s, v15.4s, 
v31.s[0] - ldr q12, [x0, #0x200] - sqrdmulh v16.4s, v13.4s, v0.s[1] - str q25, [x0, #0x130] - mul v28.4s, v13.4s, v0.s[0] - ldr q13, [x0, #0x240] - str q14, [x0, #0x170] - mul v14.4s, v18.4s, v1.s[0] - mls v28.4s, v16.4s, v31.s[0] - ldr q21, [x0, #0xc0] - mls v10.4s, v11.4s, v31.s[0] - mls v23.4s, v20.4s, v31.s[0] - str q28, [x0, #0x1f0] - ldr q11, [x0, #0x300] - sqrdmulh v28.4s, v18.4s, v1.s[1] - str q10, [x0, #0x370] - mul v16.4s, v19.4s, v0.s[2] - sqrdmulh v19.4s, v19.4s, v0.s[3] - sqrdmulh v10.4s, v26.4s, v0.s[1] - add v18.4s, v12.4s, v13.4s - sub v12.4s, v12.4s, v13.4s - mls v16.4s, v19.4s, v31.s[0] - ldr q9, [x0, #0x280] - ldr q20, [x0, #0x380] - add v24.4s, v23.4s, v27.4s - sub v25.4s, v23.4s, v27.4s - mul v19.4s, v26.4s, v0.s[0] - mls v14.4s, v28.4s, v31.s[0] - add v13.4s, v9.4s, v17.4s - sub v28.4s, v18.4s, v13.4s - mls v19.4s, v10.4s, v31.s[0] - sqrdmulh v23.4s, v25.4s, v0.s[1] - mul v15.4s, v25.4s, v0.s[0] - str q19, [x0, #0x2b0] - ldr q19, [x0, #0x80] - sqrdmulh v22.4s, v8.4s, v30.4s - mls v15.4s, v23.4s, v31.s[0] - add v26.4s, v16.4s, v14.4s - mul v10.4s, v8.4s, v29.4s - add v8.4s, v18.4s, v13.4s - sqrdmulh v13.4s, v26.4s, v30.4s - str q15, [x0, #0x2f0] - sub v18.4s, v9.4s, v17.4s - mul v27.4s, v26.4s, v29.4s - sqrdmulh v23.4s, v12.4s, v5.s[3] - mls v27.4s, v13.4s, v31.s[0] - mul v25.4s, v18.4s, v6.s[0] - sqrdmulh v13.4s, v18.4s, v6.s[1] - str q27, [x0, #0x1b0] - sub v27.4s, v19.4s, v21.4s - add v21.4s, v19.4s, v21.4s - mul v18.4s, v12.4s, v5.s[2] - sub v9.4s, v16.4s, v14.4s - mul v26.4s, v24.4s, v29.4s - ldr q12, [x0, #0x3c0] - mls v25.4s, v13.4s, v31.s[0] - sub v16.4s, v20.4s, v12.4s - sqrdmulh v17.4s, v24.4s, v30.4s - mul v13.4s, v16.4s, v7.s[0] - mls v18.4s, v23.4s, v31.s[0] - mls v26.4s, v17.4s, v31.s[0] - sqrdmulh v17.4s, v9.4s, v0.s[1] - sqrdmulh v24.4s, v16.4s, v7.s[1] - ldr q16, [x0, #0x340] - str q26, [x0, #0xf0] - sub v15.4s, v18.4s, v25.4s - mul v23.4s, v9.4s, v0.s[0] - ldr q19, [x0, #0x180] - ldr q14, [x0, #0x1c0] - sqrdmulh v9.4s, v15.4s, v2.s[3] - add v20.4s, v20.4s, v12.4s - add v12.4s, v11.4s, v16.4s - mul v26.4s, v15.4s, v2.s[2] - sub v16.4s, v11.4s, v16.4s - sub v11.4s, v12.4s, v20.4s - mls v23.4s, v17.4s, v31.s[0] - add v20.4s, v12.4s, v20.4s - sub v12.4s, v19.4s, v14.4s - mul v17.4s, v27.4s, v4.s[0] - mls v10.4s, v22.4s, v31.s[0] - add v22.4s, v19.4s, v14.4s - ldr q14, [x0, #0x40] - str q23, [x0, #0x3b0] - sqrdmulh v23.4s, v27.4s, v4.s[1] - ldr q19, [x0, #0x100] - ldr q15, [x0, #0x140] - mls v26.4s, v9.4s, v31.s[0] - str q10, [x0, #0xb0] - sqrdmulh v10.4s, v11.4s, v3.s[1] - sub v27.4s, v8.4s, v20.4s - mls v13.4s, v24.4s, v31.s[0] - add v8.4s, v8.4s, v20.4s - sub v20.4s, v19.4s, v15.4s - sqrdmulh v9.4s, v16.4s, v6.s[3] - add v19.4s, v19.4s, v15.4s - ldr q15, [x0] - mul v24.4s, v20.4s, v4.s[2] - mls v17.4s, v23.4s, v31.s[0] - add v23.4s, v15.4s, v14.4s - sub v14.4s, v15.4s, v14.4s - mul v15.4s, v11.4s, v3.s[0] - sub v11.4s, v23.4s, v21.4s - mul v16.4s, v16.4s, v6.s[2] - add v23.4s, v23.4s, v21.4s - add v21.4s, v18.4s, v25.4s - sqrdmulh v25.4s, v20.4s, v4.s[3] - sqrdmulh v18.4s, v12.4s, v5.s[1] - sqrdmulh v20.4s, v14.4s, v3.s[3] - mul v12.4s, v12.4s, v5.s[0] - mls v16.4s, v9.4s, v31.s[0] - mls v12.4s, v18.4s, v31.s[0] - mls v24.4s, v25.4s, v31.s[0] - sub v25.4s, v16.4s, v13.4s - add v16.4s, v16.4s, v13.4s - mul v13.4s, v28.4s, v2.s[2] - sqrdmulh v9.4s, v25.4s, v3.s[1] - mul v18.4s, v25.4s, v3.s[0] - sqrdmulh v25.4s, v28.4s, v2.s[3] - mls v18.4s, v9.4s, v31.s[0] - mls v15.4s, v10.4s, v31.s[0] - sub v9.4s, v24.4s, v12.4s - mul v28.4s, v14.4s, v3.s[2] - add 
v12.4s, v24.4s, v12.4s - mls v28.4s, v20.4s, v31.s[0] - add v20.4s, v26.4s, v18.4s - add v24.4s, v19.4s, v22.4s - mls v13.4s, v25.4s, v31.s[0] - sub v18.4s, v26.4s, v18.4s - sqrdmulh v26.4s, v11.4s, v1.s[3] - add v25.4s, v23.4s, v24.4s - mul v14.4s, v11.4s, v1.s[2] - sub v10.4s, v28.4s, v17.4s - add v17.4s, v28.4s, v17.4s - sub v28.4s, v23.4s, v24.4s - sqrdmulh v11.4s, v9.4s, v2.s[1] - sub v22.4s, v19.4s, v22.4s - sub v24.4s, v17.4s, v12.4s - mls v14.4s, v26.4s, v31.s[0] - add v17.4s, v17.4s, v12.4s - sqrdmulh v12.4s, v22.4s, v2.s[1] - sub v23.4s, v13.4s, v15.4s - add v26.4s, v13.4s, v15.4s - mul v19.4s, v9.4s, v2.s[0] - sqrdmulh v9.4s, v23.4s, v1.s[1] - mul v15.4s, v23.4s, v1.s[0] - mls v19.4s, v11.4s, v31.s[0] - sub v13.4s, v25.4s, v8.4s - mls v15.4s, v9.4s, v31.s[0] - add v23.4s, v25.4s, v8.4s - sqrdmulh v8.4s, v24.4s, v0.s[3] - mul v25.4s, v22.4s, v2.s[0] - subs x4, x4, #0x1 - cbnz x4, Lintt_layer1234_start - mul v22.4s, v24.4s, v0.s[2] - sqrdmulh v9.4s, v23.4s, v30.4s - mls v25.4s, v12.4s, v31.s[0] - mul v23.4s, v23.4s, v29.4s - mls v23.4s, v9.4s, v31.s[0] - mls v22.4s, v8.4s, v31.s[0] - sqrdmulh v11.4s, v10.4s, v1.s[3] - sub v9.4s, v14.4s, v25.4s - str q23, [x0], #0x10 - mul v23.4s, v10.4s, v1.s[2] - sqrdmulh v24.4s, v9.4s, v0.s[3] - mls v23.4s, v11.4s, v31.s[0] - mul v9.4s, v9.4s, v0.s[2] - mls v9.4s, v24.4s, v31.s[0] - sub v24.4s, v23.4s, v19.4s - add v19.4s, v23.4s, v19.4s - mul v8.4s, v28.4s, v0.s[2] - add v23.4s, v14.4s, v25.4s - mul v11.4s, v13.4s, v0.s[0] - sub v25.4s, v21.4s, v16.4s - sub v10.4s, v9.4s, v15.4s - add v15.4s, v9.4s, v15.4s - add v16.4s, v21.4s, v16.4s - sqrdmulh v28.4s, v28.4s, v0.s[3] - add v9.4s, v23.4s, v26.4s - add v14.4s, v17.4s, v16.4s - sqrdmulh v21.4s, v13.4s, v0.s[1] - sub v13.4s, v17.4s, v16.4s - mul v16.4s, v14.4s, v29.4s - sub v12.4s, v23.4s, v26.4s - mul v17.4s, v13.4s, v0.s[0] - mls v8.4s, v28.4s, v31.s[0] - sqrdmulh v28.4s, v14.4s, v30.4s - sub v14.4s, v19.4s, v20.4s - add v20.4s, v19.4s, v20.4s - sqrdmulh v19.4s, v13.4s, v0.s[1] - sqrdmulh v26.4s, v27.4s, v1.s[1] - mls v16.4s, v28.4s, v31.s[0] - sqrdmulh v28.4s, v12.4s, v0.s[1] - mul v27.4s, v27.4s, v1.s[0] - str q16, [x0, #0x30] - mul v12.4s, v12.4s, v0.s[0] - mls v12.4s, v28.4s, v31.s[0] - mls v27.4s, v26.4s, v31.s[0] - sqrdmulh v26.4s, v25.4s, v1.s[1] - str q12, [x0, #0x270] - sqrdmulh v12.4s, v14.4s, v0.s[1] - sub v13.4s, v8.4s, v27.4s - add v27.4s, v8.4s, v27.4s - mul v16.4s, v14.4s, v0.s[0] - mul v8.4s, v20.4s, v29.4s - mls v16.4s, v12.4s, v31.s[0] - mul v12.4s, v18.4s, v1.s[0] - sqrdmulh v14.4s, v20.4s, v30.4s - str q16, [x0, #0x2b0] - sqrdmulh v20.4s, v18.4s, v1.s[1] - mul v25.4s, v25.4s, v1.s[0] - mls v25.4s, v26.4s, v31.s[0] - mls v17.4s, v19.4s, v31.s[0] - mul v18.4s, v9.4s, v29.4s - add v28.4s, v22.4s, v25.4s - sqrdmulh v9.4s, v9.4s, v30.4s - sub v23.4s, v22.4s, v25.4s - str q17, [x0, #0x230] - mls v11.4s, v21.4s, v31.s[0] - mls v8.4s, v14.4s, v31.s[0] - sqrdmulh v21.4s, v28.4s, v30.4s - str q11, [x0, #0x1f0] - mul v17.4s, v28.4s, v29.4s - str q8, [x0, #0xb0] - mul v28.4s, v10.4s, v0.s[0] - mls v17.4s, v21.4s, v31.s[0] - sqrdmulh v21.4s, v10.4s, v0.s[1] - mul v8.4s, v15.4s, v29.4s - str q17, [x0, #0x130] - sqrdmulh v15.4s, v15.4s, v30.4s - mul v10.4s, v24.4s, v0.s[2] - sqrdmulh v24.4s, v24.4s, v0.s[3] - mls v12.4s, v20.4s, v31.s[0] - mls v8.4s, v15.4s, v31.s[0] - mls v10.4s, v24.4s, v31.s[0] - sqrdmulh v24.4s, v27.4s, v30.4s - str q8, [x0, #0x170] - mul v14.4s, v13.4s, v0.s[0] - sub v16.4s, v10.4s, v12.4s - sqrdmulh v17.4s, v13.4s, v0.s[1] - add v10.4s, v10.4s, v12.4s - mul v13.4s, 
v27.4s, v29.4s - sqrdmulh v20.4s, v10.4s, v30.4s - mls v14.4s, v17.4s, v31.s[0] - mls v13.4s, v24.4s, v31.s[0] - sqrdmulh v24.4s, v23.4s, v0.s[1] - str q14, [x0, #0x2f0] - sqrdmulh v8.4s, v16.4s, v0.s[1] - str q13, [x0, #0xf0] - mls v18.4s, v9.4s, v31.s[0] - mul v9.4s, v23.4s, v0.s[0] - mul v14.4s, v16.4s, v0.s[0] - str q18, [x0, #0x70] - mul v27.4s, v10.4s, v29.4s - mls v28.4s, v21.4s, v31.s[0] - mls v14.4s, v8.4s, v31.s[0] - mls v27.4s, v20.4s, v31.s[0] - str q28, [x0, #0x370] - mls v9.4s, v24.4s, v31.s[0] - str q14, [x0, #0x3b0] - str q27, [x0, #0x1b0] - str q9, [x0, #0x330] - ldp d8, d9, [sp] + sub v22.4s, v21.4s, v16.4s + mls v25.4s, v12.4s, v31.s[0] + add v12.4s, v21.4s, v16.4s + mul v9.4s, v24.4s, v0.s[2] + mls v9.4s, v8.4s, v31.s[0] + sub v16.4s, v14.4s, v25.4s + add v14.4s, v14.4s, v25.4s + sqrdmulh v24.4s, v22.4s, v1.s[1] + sub v11.4s, v14.4s, v26.4s + sqrdmulh v21.4s, v16.4s, v0.s[3] + add v25.4s, v14.4s, v26.4s + mul v14.4s, v22.4s, v1.s[0] + mls v14.4s, v24.4s, v31.s[0] + mul v16.4s, v16.4s, v0.s[2] + mls v16.4s, v21.4s, v31.s[0] + add v24.4s, v9.4s, v14.4s + sqrdmulh v21.4s, v25.4s, v30.4s + sqrdmulh v26.4s, v11.4s, v0.s[1] + mul v8.4s, v25.4s, v29.4s + mls v8.4s, v21.4s, v31.s[0] + mul v25.4s, v24.4s, v29.4s + mul v22.4s, v11.4s, v0.s[0] + sub v11.4s, v17.4s, v12.4s + str q8, [x0, #0x80] + sub v8.4s, v9.4s, v14.4s + add v14.4s, v16.4s, v15.4s + sqrdmulh v9.4s, v11.4s, v0.s[1] + sub v21.4s, v16.4s, v15.4s + mls v22.4s, v26.4s, v31.s[0] + mul v15.4s, v11.4s, v0.s[0] + mls v15.4s, v9.4s, v31.s[0] + add v11.4s, v17.4s, v12.4s + mul v16.4s, v10.4s, v1.s[2] + str q22, [x0, #0x280] + sqrdmulh v26.4s, v10.4s, v1.s[3] + str q15, [x0, #0x240] + sqrdmulh v9.4s, v8.4s, v0.s[1] + mul v12.4s, v8.4s, v0.s[0] + mls v16.4s, v26.4s, v31.s[0] + sqrdmulh v10.4s, v27.4s, v1.s[1] + sqrdmulh v15.4s, v14.4s, v30.4s + add v22.4s, v16.4s, v19.4s + sub v19.4s, v16.4s, v19.4s + mls v12.4s, v9.4s, v31.s[0] + add v8.4s, v22.4s, v20.4s + sub v26.4s, v22.4s, v20.4s + mul v27.4s, v27.4s, v1.s[0] + sqrdmulh v17.4s, v11.4s, v30.4s + mls v27.4s, v10.4s, v31.s[0] + mul v20.4s, v11.4s, v29.4s + sqrdmulh v22.4s, v24.4s, v30.4s + mls v20.4s, v17.4s, v31.s[0] + mul v14.4s, v14.4s, v29.4s + sqrdmulh v16.4s, v23.4s, v30.4s + str q20, [x0, #0x40] + sqrdmulh v11.4s, v21.4s, v0.s[1] + sqrdmulh v20.4s, v28.4s, v0.s[3] + mul v17.4s, v23.4s, v29.4s + mul v23.4s, v28.4s, v0.s[2] + mls v17.4s, v16.4s, v31.s[0] + mul v10.4s, v21.4s, v0.s[0] + str q12, [x0, #0x340] + mls v25.4s, v22.4s, v31.s[0] + str q17, [x0], #0x10 + ldr q17, [x0, #0x2c0] + mls v14.4s, v15.4s, v31.s[0] + ldr q12, [x0, #0x200] + sqrdmulh v16.4s, v13.4s, v0.s[1] + str q25, [x0, #0x130] + mul v28.4s, v13.4s, v0.s[0] + ldr q13, [x0, #0x240] + str q14, [x0, #0x170] + mul v14.4s, v18.4s, v1.s[0] + mls v28.4s, v16.4s, v31.s[0] + ldr q21, [x0, #0xc0] + mls v10.4s, v11.4s, v31.s[0] + mls v23.4s, v20.4s, v31.s[0] + str q28, [x0, #0x1f0] + ldr q11, [x0, #0x300] + sqrdmulh v28.4s, v18.4s, v1.s[1] + str q10, [x0, #0x370] + mul v16.4s, v19.4s, v0.s[2] + sqrdmulh v19.4s, v19.4s, v0.s[3] + sqrdmulh v10.4s, v26.4s, v0.s[1] + add v18.4s, v12.4s, v13.4s + sub v12.4s, v12.4s, v13.4s + mls v16.4s, v19.4s, v31.s[0] + ldr q9, [x0, #0x280] + ldr q20, [x0, #0x380] + add v24.4s, v23.4s, v27.4s + sub v25.4s, v23.4s, v27.4s + mul v19.4s, v26.4s, v0.s[0] + mls v14.4s, v28.4s, v31.s[0] + add v13.4s, v9.4s, v17.4s + sub v28.4s, v18.4s, v13.4s + mls v19.4s, v10.4s, v31.s[0] + sqrdmulh v23.4s, v25.4s, v0.s[1] + mul v15.4s, v25.4s, v0.s[0] + str q19, [x0, #0x2b0] + ldr q19, [x0, #0x80] + 
sqrdmulh v22.4s, v8.4s, v30.4s + mls v15.4s, v23.4s, v31.s[0] + add v26.4s, v16.4s, v14.4s + mul v10.4s, v8.4s, v29.4s + add v8.4s, v18.4s, v13.4s + sqrdmulh v13.4s, v26.4s, v30.4s + str q15, [x0, #0x2f0] + sub v18.4s, v9.4s, v17.4s + mul v27.4s, v26.4s, v29.4s + sqrdmulh v23.4s, v12.4s, v5.s[3] + mls v27.4s, v13.4s, v31.s[0] + mul v25.4s, v18.4s, v6.s[0] + sqrdmulh v13.4s, v18.4s, v6.s[1] + str q27, [x0, #0x1b0] + sub v27.4s, v19.4s, v21.4s + add v21.4s, v19.4s, v21.4s + mul v18.4s, v12.4s, v5.s[2] + sub v9.4s, v16.4s, v14.4s + mul v26.4s, v24.4s, v29.4s + ldr q12, [x0, #0x3c0] + mls v25.4s, v13.4s, v31.s[0] + sub v16.4s, v20.4s, v12.4s + sqrdmulh v17.4s, v24.4s, v30.4s + mul v13.4s, v16.4s, v7.s[0] + mls v18.4s, v23.4s, v31.s[0] + mls v26.4s, v17.4s, v31.s[0] + sqrdmulh v17.4s, v9.4s, v0.s[1] + sqrdmulh v24.4s, v16.4s, v7.s[1] + ldr q16, [x0, #0x340] + str q26, [x0, #0xf0] + sub v15.4s, v18.4s, v25.4s + mul v23.4s, v9.4s, v0.s[0] + ldr q19, [x0, #0x180] + ldr q14, [x0, #0x1c0] + sqrdmulh v9.4s, v15.4s, v2.s[3] + add v20.4s, v20.4s, v12.4s + add v12.4s, v11.4s, v16.4s + mul v26.4s, v15.4s, v2.s[2] + sub v16.4s, v11.4s, v16.4s + sub v11.4s, v12.4s, v20.4s + mls v23.4s, v17.4s, v31.s[0] + add v20.4s, v12.4s, v20.4s + sub v12.4s, v19.4s, v14.4s + mul v17.4s, v27.4s, v4.s[0] + mls v10.4s, v22.4s, v31.s[0] + add v22.4s, v19.4s, v14.4s + ldr q14, [x0, #0x40] + str q23, [x0, #0x3b0] + sqrdmulh v23.4s, v27.4s, v4.s[1] + ldr q19, [x0, #0x100] + ldr q15, [x0, #0x140] + mls v26.4s, v9.4s, v31.s[0] + str q10, [x0, #0xb0] + sqrdmulh v10.4s, v11.4s, v3.s[1] + sub v27.4s, v8.4s, v20.4s + mls v13.4s, v24.4s, v31.s[0] + add v8.4s, v8.4s, v20.4s + sub v20.4s, v19.4s, v15.4s + sqrdmulh v9.4s, v16.4s, v6.s[3] + add v19.4s, v19.4s, v15.4s + ldr q15, [x0] + mul v24.4s, v20.4s, v4.s[2] + mls v17.4s, v23.4s, v31.s[0] + add v23.4s, v15.4s, v14.4s + sub v14.4s, v15.4s, v14.4s + mul v15.4s, v11.4s, v3.s[0] + sub v11.4s, v23.4s, v21.4s + mul v16.4s, v16.4s, v6.s[2] + add v23.4s, v23.4s, v21.4s + add v21.4s, v18.4s, v25.4s + sqrdmulh v25.4s, v20.4s, v4.s[3] + sqrdmulh v18.4s, v12.4s, v5.s[1] + sqrdmulh v20.4s, v14.4s, v3.s[3] + mul v12.4s, v12.4s, v5.s[0] + mls v16.4s, v9.4s, v31.s[0] + mls v12.4s, v18.4s, v31.s[0] + mls v24.4s, v25.4s, v31.s[0] + sub v25.4s, v16.4s, v13.4s + add v16.4s, v16.4s, v13.4s + mul v13.4s, v28.4s, v2.s[2] + sqrdmulh v9.4s, v25.4s, v3.s[1] + mul v18.4s, v25.4s, v3.s[0] + sqrdmulh v25.4s, v28.4s, v2.s[3] + mls v18.4s, v9.4s, v31.s[0] + mls v15.4s, v10.4s, v31.s[0] + sub v9.4s, v24.4s, v12.4s + mul v28.4s, v14.4s, v3.s[2] + add v12.4s, v24.4s, v12.4s + mls v28.4s, v20.4s, v31.s[0] + add v20.4s, v26.4s, v18.4s + add v24.4s, v19.4s, v22.4s + mls v13.4s, v25.4s, v31.s[0] + sub v18.4s, v26.4s, v18.4s + sqrdmulh v26.4s, v11.4s, v1.s[3] + add v25.4s, v23.4s, v24.4s + mul v14.4s, v11.4s, v1.s[2] + sub v10.4s, v28.4s, v17.4s + add v17.4s, v28.4s, v17.4s + sub v28.4s, v23.4s, v24.4s + sqrdmulh v11.4s, v9.4s, v2.s[1] + sub v22.4s, v19.4s, v22.4s + sub v24.4s, v17.4s, v12.4s + mls v14.4s, v26.4s, v31.s[0] + add v17.4s, v17.4s, v12.4s + sqrdmulh v12.4s, v22.4s, v2.s[1] + sub v23.4s, v13.4s, v15.4s + add v26.4s, v13.4s, v15.4s + mul v19.4s, v9.4s, v2.s[0] + sqrdmulh v9.4s, v23.4s, v1.s[1] + mul v15.4s, v23.4s, v1.s[0] + mls v19.4s, v11.4s, v31.s[0] + sub v13.4s, v25.4s, v8.4s + mls v15.4s, v9.4s, v31.s[0] + add v23.4s, v25.4s, v8.4s + sqrdmulh v8.4s, v24.4s, v0.s[3] + mul v25.4s, v22.4s, v2.s[0] + subs x4, x4, #0x1 + cbnz x4, Lintt_layer1234_start + mul v22.4s, v24.4s, v0.s[2] + sqrdmulh v9.4s, v23.4s, 
v30.4s + mls v25.4s, v12.4s, v31.s[0] + mul v23.4s, v23.4s, v29.4s + mls v23.4s, v9.4s, v31.s[0] + mls v22.4s, v8.4s, v31.s[0] + sqrdmulh v11.4s, v10.4s, v1.s[3] + sub v9.4s, v14.4s, v25.4s + str q23, [x0], #0x10 + mul v23.4s, v10.4s, v1.s[2] + sqrdmulh v24.4s, v9.4s, v0.s[3] + mls v23.4s, v11.4s, v31.s[0] + mul v9.4s, v9.4s, v0.s[2] + mls v9.4s, v24.4s, v31.s[0] + sub v24.4s, v23.4s, v19.4s + add v19.4s, v23.4s, v19.4s + mul v8.4s, v28.4s, v0.s[2] + add v23.4s, v14.4s, v25.4s + mul v11.4s, v13.4s, v0.s[0] + sub v25.4s, v21.4s, v16.4s + sub v10.4s, v9.4s, v15.4s + add v15.4s, v9.4s, v15.4s + add v16.4s, v21.4s, v16.4s + sqrdmulh v28.4s, v28.4s, v0.s[3] + add v9.4s, v23.4s, v26.4s + add v14.4s, v17.4s, v16.4s + sqrdmulh v21.4s, v13.4s, v0.s[1] + sub v13.4s, v17.4s, v16.4s + mul v16.4s, v14.4s, v29.4s + sub v12.4s, v23.4s, v26.4s + mul v17.4s, v13.4s, v0.s[0] + mls v8.4s, v28.4s, v31.s[0] + sqrdmulh v28.4s, v14.4s, v30.4s + sub v14.4s, v19.4s, v20.4s + add v20.4s, v19.4s, v20.4s + sqrdmulh v19.4s, v13.4s, v0.s[1] + sqrdmulh v26.4s, v27.4s, v1.s[1] + mls v16.4s, v28.4s, v31.s[0] + sqrdmulh v28.4s, v12.4s, v0.s[1] + mul v27.4s, v27.4s, v1.s[0] + str q16, [x0, #0x30] + mul v12.4s, v12.4s, v0.s[0] + mls v12.4s, v28.4s, v31.s[0] + mls v27.4s, v26.4s, v31.s[0] + sqrdmulh v26.4s, v25.4s, v1.s[1] + str q12, [x0, #0x270] + sqrdmulh v12.4s, v14.4s, v0.s[1] + sub v13.4s, v8.4s, v27.4s + add v27.4s, v8.4s, v27.4s + mul v16.4s, v14.4s, v0.s[0] + mul v8.4s, v20.4s, v29.4s + mls v16.4s, v12.4s, v31.s[0] + mul v12.4s, v18.4s, v1.s[0] + sqrdmulh v14.4s, v20.4s, v30.4s + str q16, [x0, #0x2b0] + sqrdmulh v20.4s, v18.4s, v1.s[1] + mul v25.4s, v25.4s, v1.s[0] + mls v25.4s, v26.4s, v31.s[0] + mls v17.4s, v19.4s, v31.s[0] + mul v18.4s, v9.4s, v29.4s + add v28.4s, v22.4s, v25.4s + sqrdmulh v9.4s, v9.4s, v30.4s + sub v23.4s, v22.4s, v25.4s + str q17, [x0, #0x230] + mls v11.4s, v21.4s, v31.s[0] + mls v8.4s, v14.4s, v31.s[0] + sqrdmulh v21.4s, v28.4s, v30.4s + str q11, [x0, #0x1f0] + mul v17.4s, v28.4s, v29.4s + str q8, [x0, #0xb0] + mul v28.4s, v10.4s, v0.s[0] + mls v17.4s, v21.4s, v31.s[0] + sqrdmulh v21.4s, v10.4s, v0.s[1] + mul v8.4s, v15.4s, v29.4s + str q17, [x0, #0x130] + sqrdmulh v15.4s, v15.4s, v30.4s + mul v10.4s, v24.4s, v0.s[2] + sqrdmulh v24.4s, v24.4s, v0.s[3] + mls v12.4s, v20.4s, v31.s[0] + mls v8.4s, v15.4s, v31.s[0] + mls v10.4s, v24.4s, v31.s[0] + sqrdmulh v24.4s, v27.4s, v30.4s + str q8, [x0, #0x170] + mul v14.4s, v13.4s, v0.s[0] + sub v16.4s, v10.4s, v12.4s + sqrdmulh v17.4s, v13.4s, v0.s[1] + add v10.4s, v10.4s, v12.4s + mul v13.4s, v27.4s, v29.4s + sqrdmulh v20.4s, v10.4s, v30.4s + mls v14.4s, v17.4s, v31.s[0] + mls v13.4s, v24.4s, v31.s[0] + sqrdmulh v24.4s, v23.4s, v0.s[1] + str q14, [x0, #0x2f0] + sqrdmulh v8.4s, v16.4s, v0.s[1] + str q13, [x0, #0xf0] + mls v18.4s, v9.4s, v31.s[0] + mul v9.4s, v23.4s, v0.s[0] + mul v14.4s, v16.4s, v0.s[0] + str q18, [x0, #0x70] + mul v27.4s, v10.4s, v29.4s + mls v28.4s, v21.4s, v31.s[0] + mls v14.4s, v8.4s, v31.s[0] + mls v27.4s, v20.4s, v31.s[0] + str q28, [x0, #0x370] + mls v9.4s, v24.4s, v31.s[0] + str q14, [x0, #0x3b0] + str q27, [x0, #0x1b0] + str q9, [x0, #0x330] + ldp d8, d9, [sp] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0x10] + ldp d10, d11, [sp, #0x10] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0x20] + ldp d12, d13, [sp, #0x20] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0x30] + ldp d14, d15, [sp, #0x30] .cfi_restore d14 .cfi_restore d15 - add sp, sp, #0x40 + add sp, sp, #0x40 .cfi_adjust_cfa_offset -0x40 ret 
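The inverse-NTT kernel above leans throughout on one three-instruction primitive for multiplying a vector of coefficients by a fixed constant modulo q = 8380417: mul forms the low 32 bits of the product, sqrdmulh forms a rounded high half against a precomputed "twisted" copy of the constant, and mls against v31.s[0] (holding q) subtracts the estimated multiple of q. A minimal scalar sketch of one lane follows; the pairing (b, b_tw) with b_tw = round(b * 2^31 / q) is an assumption about the twiddle-table layout, not something visible in this hunk.

#include <stdint.h>

#define MLDSA_Q 8380417 /* q = 2^23 - 2^13 + 1, kept in v31.s[0] above */

/* Sketch of one lane of the mul/sqrdmulh/mls sequence. Assumes the table
 * stores (b, b_tw) with b_tw = round(b * 2^31 / q). Returns a value
 * congruent to a*b mod q with magnitude below q, for suitably bounded a. */
static int32_t mul_const_modq(int32_t a, int32_t b, int32_t b_tw)
{
    uint32_t lo = (uint32_t)a * (uint32_t)b;                        /* mul      */
    int32_t hi = (int32_t)(((int64_t)a * b_tw + (1 << 30)) >> 31);  /* sqrdmulh */
    return (int32_t)(lo - (uint32_t)hi * (uint32_t)MLDSA_Q);        /* mls      */
}

Since hi is within one of round(a*b/q), the subtraction leaves a*b minus a near-exact multiple of q; the true result fits in 32 bits, so the wrapped low-half arithmetic recovers it exactly.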
 .cfi_endproc
diff --git a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
index e0dd66f99..7d59e3f1e 100644
--- a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
+++ b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
@@ -17,105 +17,105 @@ MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l4_asm)
 .cfi_startproc
- mov w3, #0xe001 // =57345
- movk w3, #0x7f, lsl #16
- dup v0.4s, w3
- mov w3, #0x2001 // =8193
- movk w3, #0x380, lsl #16
- dup v1.4s, w3
- mov x3, #0x40 // =64
+ mov w3, #0xe001 // =57345
+ movk w3, #0x7f, lsl #16
+ dup v0.4s, w3
+ mov w3, #0x2001 // =8193
+ movk w3, #0x380, lsl #16
+ dup v1.4s, w3
+ mov x3, #0x40 // =64
 Lpolyvecl_pointwise_acc_montgomery_l4_loop_start:
- ldr q17, [x1, #0x10]
- ldr q18, [x1, #0x20]
- ldr q19, [x1, #0x30]
- ldr q16, [x1], #0x40
- ldr q21, [x2, #0x10]
- ldr q22, [x2, #0x20]
- ldr q23, [x2, #0x30]
- ldr q20, [x2], #0x40
- smull v24.2d, v16.2s, v20.2s
- smull2 v25.2d, v16.4s, v20.4s
- smull v26.2d, v17.2s, v21.2s
- smull2 v27.2d, v17.4s, v21.4s
- smull v28.2d, v18.2s, v22.2s
- smull2 v29.2d, v18.4s, v22.4s
- smull v30.2d, v19.2s, v23.2s
- smull2 v31.2d, v19.4s, v23.4s
- ldr q16, [x1, #0x3c0]
- ldr q17, [x1, #0x3d0]
- ldr q18, [x1, #0x3e0]
- ldr q19, [x1, #0x3f0]
- ldr q20, [x2, #0x3c0]
- ldr q21, [x2, #0x3d0]
- ldr q22, [x2, #0x3e0]
- ldr q23, [x2, #0x3f0]
- smlal v24.2d, v16.2s, v20.2s
- smlal2 v25.2d, v16.4s, v20.4s
- smlal v26.2d, v17.2s, v21.2s
- smlal2 v27.2d, v17.4s, v21.4s
- smlal v28.2d, v18.2s, v22.2s
- smlal2 v29.2d, v18.4s, v22.4s
- smlal v30.2d, v19.2s, v23.2s
- smlal2 v31.2d, v19.4s, v23.4s
- ldr q16, [x1, #0x7c0]
- ldr q17, [x1, #0x7d0]
- ldr q18, [x1, #0x7e0]
- ldr q19, [x1, #0x7f0]
- ldr q20, [x2, #0x7c0]
- ldr q21, [x2, #0x7d0]
- ldr q22, [x2, #0x7e0]
- ldr q23, [x2, #0x7f0]
- smlal v24.2d, v16.2s, v20.2s
- smlal2 v25.2d, v16.4s, v20.4s
- smlal v26.2d, v17.2s, v21.2s
- smlal2 v27.2d, v17.4s, v21.4s
- smlal v28.2d, v18.2s, v22.2s
- smlal2 v29.2d, v18.4s, v22.4s
- smlal v30.2d, v19.2s, v23.2s
- smlal2 v31.2d, v19.4s, v23.4s
- ldr q16, [x1, #0xbc0]
- ldr q17, [x1, #0xbd0]
- ldr q18, [x1, #0xbe0]
- ldr q19, [x1, #0xbf0]
- ldr q20, [x2, #0xbc0]
- ldr q21, [x2, #0xbd0]
- ldr q22, [x2, #0xbe0]
- ldr q23, [x2, #0xbf0]
- smlal v24.2d, v16.2s, v20.2s
- smlal2 v25.2d, v16.4s, v20.4s
- smlal v26.2d, v17.2s, v21.2s
- smlal2 v27.2d, v17.4s, v21.4s
- smlal v28.2d, v18.2s, v22.2s
- smlal2 v29.2d, v18.4s, v22.4s
- smlal v30.2d, v19.2s, v23.2s
- smlal2 v31.2d, v19.4s, v23.4s
- uzp1 v16.4s, v24.4s, v25.4s
- mul v16.4s, v16.4s, v1.4s
- smlsl v24.2d, v16.2s, v0.2s
- smlsl2 v25.2d, v16.4s, v0.4s
- uzp2 v16.4s, v24.4s, v25.4s
- uzp1 v17.4s, v26.4s, v27.4s
- mul v17.4s, v17.4s, v1.4s
- smlsl v26.2d, v17.2s, v0.2s
- smlsl2 v27.2d, v17.4s, v0.4s
- uzp2 v17.4s, v26.4s, v27.4s
- uzp1 v18.4s, v28.4s, v29.4s
- mul v18.4s, v18.4s, v1.4s
- smlsl v28.2d, v18.2s, v0.2s
- smlsl2 v29.2d, v18.4s, v0.4s
- uzp2 v18.4s, v28.4s, v29.4s
- uzp1 v19.4s, v30.4s, v31.4s
- mul v19.4s, v19.4s, v1.4s
- smlsl v30.2d, v19.2s, v0.2s
- smlsl2 v31.2d, v19.4s, v0.4s
- uzp2 v19.4s, v30.4s, v31.4s
- str q17, [x0, #0x10]
- str q18, [x0, #0x20]
- str q19, [x0, #0x30]
- str q16, [x0], #0x40
- subs x3, x3, #0x4
- cbnz x3, Lpolyvecl_pointwise_acc_montgomery_l4_loop_start
+ ldr q17, [x1, #0x10]
+ ldr q18, [x1, #0x20]
+ ldr q19, [x1, #0x30]
+ ldr q16, [x1], #0x40
+ ldr q21, [x2, #0x10]
+ ldr q22, [x2, #0x20]
+ ldr q23, [x2, #0x30]
+ ldr q20, [x2], #0x40
+ smull v24.2d, v16.2s, v20.2s
+ smull2 v25.2d, v16.4s, v20.4s
+ smull v26.2d, v17.2s, v21.2s
+ smull2 v27.2d, v17.4s, v21.4s
+ smull v28.2d, v18.2s, v22.2s
+ smull2 v29.2d, v18.4s, v22.4s
+ smull v30.2d, v19.2s, v23.2s
+ smull2 v31.2d, v19.4s, v23.4s
+ ldr q16, [x1, #0x3c0]
+ ldr q17, [x1, #0x3d0]
+ ldr q18, [x1, #0x3e0]
+ ldr q19, [x1, #0x3f0]
+ ldr q20, [x2, #0x3c0]
+ ldr q21, [x2, #0x3d0]
+ ldr q22, [x2, #0x3e0]
+ ldr q23, [x2, #0x3f0]
+ smlal v24.2d, v16.2s, v20.2s
+ smlal2 v25.2d, v16.4s, v20.4s
+ smlal v26.2d, v17.2s, v21.2s
+ smlal2 v27.2d, v17.4s, v21.4s
+ smlal v28.2d, v18.2s, v22.2s
+ smlal2 v29.2d, v18.4s, v22.4s
+ smlal v30.2d, v19.2s, v23.2s
+ smlal2 v31.2d, v19.4s, v23.4s
+ ldr q16, [x1, #0x7c0]
+ ldr q17, [x1, #0x7d0]
+ ldr q18, [x1, #0x7e0]
+ ldr q19, [x1, #0x7f0]
+ ldr q20, [x2, #0x7c0]
+ ldr q21, [x2, #0x7d0]
+ ldr q22, [x2, #0x7e0]
+ ldr q23, [x2, #0x7f0]
+ smlal v24.2d, v16.2s, v20.2s
+ smlal2 v25.2d, v16.4s, v20.4s
+ smlal v26.2d, v17.2s, v21.2s
+ smlal2 v27.2d, v17.4s, v21.4s
+ smlal v28.2d, v18.2s, v22.2s
+ smlal2 v29.2d, v18.4s, v22.4s
+ smlal v30.2d, v19.2s, v23.2s
+ smlal2 v31.2d, v19.4s, v23.4s
+ ldr q16, [x1, #0xbc0]
+ ldr q17, [x1, #0xbd0]
+ ldr q18, [x1, #0xbe0]
+ ldr q19, [x1, #0xbf0]
+ ldr q20, [x2, #0xbc0]
+ ldr q21, [x2, #0xbd0]
+ ldr q22, [x2, #0xbe0]
+ ldr q23, [x2, #0xbf0]
+ smlal v24.2d, v16.2s, v20.2s
+ smlal2 v25.2d, v16.4s, v20.4s
+ smlal v26.2d, v17.2s, v21.2s
+ smlal2 v27.2d, v17.4s, v21.4s
+ smlal v28.2d, v18.2s, v22.2s
+ smlal2 v29.2d, v18.4s, v22.4s
+ smlal v30.2d, v19.2s, v23.2s
+ smlal2 v31.2d, v19.4s, v23.4s
+ uzp1 v16.4s, v24.4s, v25.4s
+ mul v16.4s, v16.4s, v1.4s
+ smlsl v24.2d, v16.2s, v0.2s
+ smlsl2 v25.2d, v16.4s, v0.4s
+ uzp2 v16.4s, v24.4s, v25.4s
+ uzp1 v17.4s, v26.4s, v27.4s
+ mul v17.4s, v17.4s, v1.4s
+ smlsl v26.2d, v17.2s, v0.2s
+ smlsl2 v27.2d, v17.4s, v0.4s
+ uzp2 v17.4s, v26.4s, v27.4s
+ uzp1 v18.4s, v28.4s, v29.4s
+ mul v18.4s, v18.4s, v1.4s
+ smlsl v28.2d, v18.2s, v0.2s
+ smlsl2 v29.2d, v18.4s, v0.4s
+ uzp2 v18.4s, v28.4s, v29.4s
+ uzp1 v19.4s, v30.4s, v31.4s
+ mul v19.4s, v19.4s, v1.4s
+ smlsl v30.2d, v19.2s, v0.2s
+ smlsl2 v31.2d, v19.4s, v0.4s
+ uzp2 v19.4s, v30.4s, v31.4s
+ str q17, [x0, #0x10]
+ str q18, [x0, #0x20]
+ str q19, [x0, #0x30]
+ str q16, [x0], #0x40
+ subs x3, x3, #0x4
+ cbnz x3, Lpolyvecl_pointwise_acc_montgomery_l4_loop_start
 ret
 .cfi_endproc
diff --git a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
index 1e87762ba..8cf241926 100644
--- a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
+++ b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
@@ -17,121 +17,121 @@ MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l5_asm)
 .cfi_startproc
- mov w3, #0xe001 // =57345
- movk w3, #0x7f, lsl #16
- dup v0.4s, w3
- mov w3, #0x2001 // =8193
- movk w3, #0x380, lsl #16
- dup v1.4s, w3
- mov x3, #0x40 // =64
+ mov w3, #0xe001 // =57345
+ movk w3, #0x7f, lsl #16
+ dup v0.4s, w3
+ mov w3, #0x2001 // =8193
+ movk w3, #0x380, lsl #16
+ dup v1.4s, w3
+ mov x3, #0x40 // =64
 Lpolyvecl_pointwise_acc_montgomery_l5_loop_start:
- ldr q17, [x1, #0x10]
- ldr q18, [x1, #0x20]
- ldr q19, [x1, #0x30]
- ldr q16, [x1], #0x40
- ldr q21, [x2, #0x10]
- ldr q22, [x2, #0x20]
- ldr q23, [x2, #0x30]
- ldr q20, [x2], #0x40
- smull v24.2d, v16.2s, v20.2s
- smull2 v25.2d, v16.4s, v20.4s
- smull v26.2d, v17.2s, v21.2s
- smull2 v27.2d, v17.4s, v21.4s
- smull v28.2d, v18.2s, v22.2s
- smull2 v29.2d,
v18.4s, v22.4s - smull v30.2d, v19.2s, v23.2s - smull2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x3c0] - ldr q17, [x1, #0x3d0] - ldr q18, [x1, #0x3e0] - ldr q19, [x1, #0x3f0] - ldr q20, [x2, #0x3c0] - ldr q21, [x2, #0x3d0] - ldr q22, [x2, #0x3e0] - ldr q23, [x2, #0x3f0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x7c0] - ldr q17, [x1, #0x7d0] - ldr q18, [x1, #0x7e0] - ldr q19, [x1, #0x7f0] - ldr q20, [x2, #0x7c0] - ldr q21, [x2, #0x7d0] - ldr q22, [x2, #0x7e0] - ldr q23, [x2, #0x7f0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0xbc0] - ldr q17, [x1, #0xbd0] - ldr q18, [x1, #0xbe0] - ldr q19, [x1, #0xbf0] - ldr q20, [x2, #0xbc0] - ldr q21, [x2, #0xbd0] - ldr q22, [x2, #0xbe0] - ldr q23, [x2, #0xbf0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0xfc0] - ldr q17, [x1, #0xfd0] - ldr q18, [x1, #0xfe0] - ldr q19, [x1, #0xff0] - ldr q20, [x2, #0xfc0] - ldr q21, [x2, #0xfd0] - ldr q22, [x2, #0xfe0] - ldr q23, [x2, #0xff0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - uzp1 v16.4s, v24.4s, v25.4s - mul v16.4s, v16.4s, v1.4s - smlsl v24.2d, v16.2s, v0.2s - smlsl2 v25.2d, v16.4s, v0.4s - uzp2 v16.4s, v24.4s, v25.4s - uzp1 v17.4s, v26.4s, v27.4s - mul v17.4s, v17.4s, v1.4s - smlsl v26.2d, v17.2s, v0.2s - smlsl2 v27.2d, v17.4s, v0.4s - uzp2 v17.4s, v26.4s, v27.4s - uzp1 v18.4s, v28.4s, v29.4s - mul v18.4s, v18.4s, v1.4s - smlsl v28.2d, v18.2s, v0.2s - smlsl2 v29.2d, v18.4s, v0.4s - uzp2 v18.4s, v28.4s, v29.4s - uzp1 v19.4s, v30.4s, v31.4s - mul v19.4s, v19.4s, v1.4s - smlsl v30.2d, v19.2s, v0.2s - smlsl2 v31.2d, v19.4s, v0.4s - uzp2 v19.4s, v30.4s, v31.4s - str q17, [x0, #0x10] - str q18, [x0, #0x20] - str q19, [x0, #0x30] - str q16, [x0], #0x40 - subs x3, x3, #0x4 - cbnz x3, Lpolyvecl_pointwise_acc_montgomery_l5_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x3c0] + ldr q17, [x1, #0x3d0] + ldr q18, [x1, #0x3e0] + ldr q19, [x1, #0x3f0] + ldr q20, [x2, #0x3c0] + ldr q21, [x2, #0x3d0] + ldr q22, [x2, #0x3e0] + ldr q23, [x2, #0x3f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x7c0] + ldr q17, [x1, #0x7d0] + 
ldr q18, [x1, #0x7e0] + ldr q19, [x1, #0x7f0] + ldr q20, [x2, #0x7c0] + ldr q21, [x2, #0x7d0] + ldr q22, [x2, #0x7e0] + ldr q23, [x2, #0x7f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xbc0] + ldr q17, [x1, #0xbd0] + ldr q18, [x1, #0xbe0] + ldr q19, [x1, #0xbf0] + ldr q20, [x2, #0xbc0] + ldr q21, [x2, #0xbd0] + ldr q22, [x2, #0xbe0] + ldr q23, [x2, #0xbf0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xfc0] + ldr q17, [x1, #0xfd0] + ldr q18, [x1, #0xfe0] + ldr q19, [x1, #0xff0] + ldr q20, [x2, #0xfc0] + ldr q21, [x2, #0xfd0] + ldr q22, [x2, #0xfe0] + ldr q23, [x2, #0xff0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlsl v24.2d, v16.2s, v0.2s + smlsl2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlsl v26.2d, v17.2s, v0.2s + smlsl2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlsl v28.2d, v18.2s, v0.2s + smlsl2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlsl v30.2d, v19.2s, v0.2s + smlsl2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, Lpolyvecl_pointwise_acc_montgomery_l5_loop_start ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S index c7e1f5489..d100a0ce7 100644 --- a/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +++ b/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S @@ -17,153 +17,153 @@ MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l7_asm) .cfi_startproc - mov w3, #0xe001 // =57345 - movk w3, #0x7f, lsl #16 - dup v0.4s, w3 - mov w3, #0x2001 // =8193 - movk w3, #0x380, lsl #16 - dup v1.4s, w3 - mov x3, #0x40 // =64 + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0x2001 // =8193 + movk w3, #0x380, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 Lpolyvecl_pointwise_acc_montgomery_l7_loop_start: - ldr q17, [x1, #0x10] - ldr q18, [x1, #0x20] - ldr q19, [x1, #0x30] - ldr q16, [x1], #0x40 - ldr q21, [x2, #0x10] - ldr q22, [x2, #0x20] - ldr q23, [x2, #0x30] - ldr q20, [x2], #0x40 - smull v24.2d, v16.2s, v20.2s - smull2 v25.2d, v16.4s, v20.4s - smull v26.2d, v17.2s, v21.2s - smull2 v27.2d, v17.4s, v21.4s - smull v28.2d, v18.2s, v22.2s - smull2 v29.2d, v18.4s, v22.4s - smull v30.2d, v19.2s, v23.2s - smull2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x3c0] - ldr q17, [x1, #0x3d0] - ldr q18, [x1, #0x3e0] - ldr q19, [x1, #0x3f0] - ldr q20, [x2, #0x3c0] - ldr q21, [x2, #0x3d0] - ldr q22, [x2, #0x3e0] - ldr q23, [x2, #0x3f0] - smlal v24.2d, v16.2s, v20.2s - 
smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x7c0] - ldr q17, [x1, #0x7d0] - ldr q18, [x1, #0x7e0] - ldr q19, [x1, #0x7f0] - ldr q20, [x2, #0x7c0] - ldr q21, [x2, #0x7d0] - ldr q22, [x2, #0x7e0] - ldr q23, [x2, #0x7f0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0xbc0] - ldr q17, [x1, #0xbd0] - ldr q18, [x1, #0xbe0] - ldr q19, [x1, #0xbf0] - ldr q20, [x2, #0xbc0] - ldr q21, [x2, #0xbd0] - ldr q22, [x2, #0xbe0] - ldr q23, [x2, #0xbf0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0xfc0] - ldr q17, [x1, #0xfd0] - ldr q18, [x1, #0xfe0] - ldr q19, [x1, #0xff0] - ldr q20, [x2, #0xfc0] - ldr q21, [x2, #0xfd0] - ldr q22, [x2, #0xfe0] - ldr q23, [x2, #0xff0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x13c0] - ldr q17, [x1, #0x13d0] - ldr q18, [x1, #0x13e0] - ldr q19, [x1, #0x13f0] - ldr q20, [x2, #0x13c0] - ldr q21, [x2, #0x13d0] - ldr q22, [x2, #0x13e0] - ldr q23, [x2, #0x13f0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - ldr q16, [x1, #0x17c0] - ldr q17, [x1, #0x17d0] - ldr q18, [x1, #0x17e0] - ldr q19, [x1, #0x17f0] - ldr q20, [x2, #0x17c0] - ldr q21, [x2, #0x17d0] - ldr q22, [x2, #0x17e0] - ldr q23, [x2, #0x17f0] - smlal v24.2d, v16.2s, v20.2s - smlal2 v25.2d, v16.4s, v20.4s - smlal v26.2d, v17.2s, v21.2s - smlal2 v27.2d, v17.4s, v21.4s - smlal v28.2d, v18.2s, v22.2s - smlal2 v29.2d, v18.4s, v22.4s - smlal v30.2d, v19.2s, v23.2s - smlal2 v31.2d, v19.4s, v23.4s - uzp1 v16.4s, v24.4s, v25.4s - mul v16.4s, v16.4s, v1.4s - smlsl v24.2d, v16.2s, v0.2s - smlsl2 v25.2d, v16.4s, v0.4s - uzp2 v16.4s, v24.4s, v25.4s - uzp1 v17.4s, v26.4s, v27.4s - mul v17.4s, v17.4s, v1.4s - smlsl v26.2d, v17.2s, v0.2s - smlsl2 v27.2d, v17.4s, v0.4s - uzp2 v17.4s, v26.4s, v27.4s - uzp1 v18.4s, v28.4s, v29.4s - mul v18.4s, v18.4s, v1.4s - smlsl v28.2d, v18.2s, v0.2s - smlsl2 v29.2d, v18.4s, v0.4s - uzp2 v18.4s, v28.4s, v29.4s - uzp1 v19.4s, v30.4s, v31.4s - mul v19.4s, v19.4s, v1.4s - smlsl v30.2d, v19.2s, v0.2s - smlsl2 v31.2d, v19.4s, v0.4s - uzp2 v19.4s, v30.4s, v31.4s - str q17, [x0, #0x10] - str q18, [x0, #0x20] - str q19, [x0, #0x30] - str q16, [x0], #0x40 - subs x3, x3, #0x4 - cbnz x3, Lpolyvecl_pointwise_acc_montgomery_l7_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, 
v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x3c0] + ldr q17, [x1, #0x3d0] + ldr q18, [x1, #0x3e0] + ldr q19, [x1, #0x3f0] + ldr q20, [x2, #0x3c0] + ldr q21, [x2, #0x3d0] + ldr q22, [x2, #0x3e0] + ldr q23, [x2, #0x3f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x7c0] + ldr q17, [x1, #0x7d0] + ldr q18, [x1, #0x7e0] + ldr q19, [x1, #0x7f0] + ldr q20, [x2, #0x7c0] + ldr q21, [x2, #0x7d0] + ldr q22, [x2, #0x7e0] + ldr q23, [x2, #0x7f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xbc0] + ldr q17, [x1, #0xbd0] + ldr q18, [x1, #0xbe0] + ldr q19, [x1, #0xbf0] + ldr q20, [x2, #0xbc0] + ldr q21, [x2, #0xbd0] + ldr q22, [x2, #0xbe0] + ldr q23, [x2, #0xbf0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xfc0] + ldr q17, [x1, #0xfd0] + ldr q18, [x1, #0xfe0] + ldr q19, [x1, #0xff0] + ldr q20, [x2, #0xfc0] + ldr q21, [x2, #0xfd0] + ldr q22, [x2, #0xfe0] + ldr q23, [x2, #0xff0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x13c0] + ldr q17, [x1, #0x13d0] + ldr q18, [x1, #0x13e0] + ldr q19, [x1, #0x13f0] + ldr q20, [x2, #0x13c0] + ldr q21, [x2, #0x13d0] + ldr q22, [x2, #0x13e0] + ldr q23, [x2, #0x13f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x17c0] + ldr q17, [x1, #0x17d0] + ldr q18, [x1, #0x17e0] + ldr q19, [x1, #0x17f0] + ldr q20, [x2, #0x17c0] + ldr q21, [x2, #0x17d0] + ldr q22, [x2, #0x17e0] + ldr q23, [x2, #0x17f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlsl v24.2d, v16.2s, v0.2s + smlsl2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlsl v26.2d, v17.2s, v0.2s + smlsl2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlsl v28.2d, v18.2s, v0.2s + smlsl2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlsl v30.2d, v19.2s, v0.2s + smlsl2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, 
Lpolyvecl_pointwise_acc_montgomery_l7_loop_start ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/ntt.S b/mldsa/src/native/aarch64/src/ntt.S index 2df0b0f3a..a4a53bea2 100644 --- a/mldsa/src/native/aarch64/src/ntt.S +++ b/mldsa/src/native/aarch64/src/ntt.S @@ -36,610 +36,610 @@ MLD_ASM_FN_SYMBOL(ntt_asm) .cfi_startproc - sub sp, sp, #0x40 + sub sp, sp, #0x40 .cfi_adjust_cfa_offset 0x40 - stp d8, d9, [sp] + stp d8, d9, [sp] .cfi_rel_offset d8, 0x0 .cfi_rel_offset d9, 0x8 - stp d10, d11, [sp, #0x10] + stp d10, d11, [sp, #0x10] .cfi_rel_offset d10, 0x10 .cfi_rel_offset d11, 0x18 - stp d12, d13, [sp, #0x20] + stp d12, d13, [sp, #0x20] .cfi_rel_offset d12, 0x20 .cfi_rel_offset d13, 0x28 - stp d14, d15, [sp, #0x30] + stp d14, d15, [sp, #0x30] .cfi_rel_offset d14, 0x30 .cfi_rel_offset d15, 0x38 - mov w5, #0xe001 // =57345 - movk w5, #0x7f, lsl #16 - dup v7.4s, w5 - mov x3, x0 - mov x4, #0x8 // =8 - ldr q0, [x1], #0x40 - ldur q1, [x1, #-0x30] - ldur q2, [x1, #-0x20] - ldur q3, [x1, #-0x10] - ldr q23, [x0, #0x390] - ldr q13, [x0, #0x380] - ldr q22, [x0, #0x80] - ldr q26, [x0, #0x190] - ldr q8, [x0, #0x280] - ldr q6, [x0, #0x210] - mul v10.4s, v13.4s, v0.s[0] - sqrdmulh v13.4s, v13.4s, v0.s[1] - mul v12.4s, v8.4s, v0.s[0] - sqrdmulh v27.4s, v8.4s, v0.s[1] - mul v4.4s, v6.4s, v0.s[0] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x180] - sqrdmulh v14.4s, v23.4s, v0.s[1] - mls v12.4s, v27.4s, v7.s[0] - add v31.4s, v13.4s, v10.4s - sub v13.4s, v13.4s, v10.4s - mul v10.4s, v23.4s, v0.s[0] - sqrdmulh v8.4s, v13.4s, v1.s[1] - sub v18.4s, v22.4s, v12.4s - mls v10.4s, v14.4s, v7.s[0] - mul v13.4s, v13.4s, v1.s[0] - mls v13.4s, v8.4s, v7.s[0] - sub v29.4s, v26.4s, v10.4s - add v25.4s, v26.4s, v10.4s - mul v10.4s, v31.4s, v0.s[2] - mul v14.4s, v25.4s, v0.s[2] - add v17.4s, v18.4s, v13.4s - sub v15.4s, v18.4s, v13.4s - sqrdmulh v13.4s, v31.4s, v0.s[3] - sqrdmulh v20.4s, v15.4s, v3.s[1] - sqrdmulh v5.4s, v17.4s, v2.s[3] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x300] - mul v18.4s, v17.4s, v2.s[2] - add v31.4s, v22.4s, v12.4s - mul v23.4s, v15.4s, v3.s[0] - ldr q17, [x0, #0x90] - add v19.4s, v31.4s, v10.4s - sub v16.4s, v31.4s, v10.4s - mul v10.4s, v13.4s, v0.s[0] - sqrdmulh v13.4s, v13.4s, v0.s[1] - sqrdmulh v27.4s, v16.4s, v2.s[1] - mul v11.4s, v16.4s, v2.s[0] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x290] - ldr q22, [x0, #0x100] - mls v11.4s, v27.4s, v7.s[0] - sqrdmulh v15.4s, v13.4s, v0.s[1] - sub v12.4s, v22.4s, v10.4s - add v30.4s, v22.4s, v10.4s - mul v10.4s, v13.4s, v0.s[0] - ldr q28, [x0] - sqrdmulh v13.4s, v25.4s, v0.s[3] - sqrdmulh v27.4s, v30.4s, v0.s[3] - mls v10.4s, v15.4s, v7.s[0] - mls v14.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x200] - sqrdmulh v25.4s, v12.4s, v1.s[1] - add v24.4s, v17.4s, v10.4s - sub v21.4s, v17.4s, v10.4s - sqrdmulh v8.4s, v13.4s, v0.s[1] - sub v9.4s, v24.4s, v14.4s - mul v26.4s, v12.4s, v1.s[0] - mul v13.4s, v13.4s, v0.s[0] - mls v13.4s, v8.4s, v7.s[0] - mul v8.4s, v30.4s, v0.s[2] - mls v8.4s, v27.4s, v7.s[0] - add v16.4s, v28.4s, v13.4s - sub v10.4s, v28.4s, v13.4s - mls v26.4s, v25.4s, v7.s[0] - sqrdmulh v12.4s, v19.4s, v1.s[3] - sub v25.4s, v16.4s, v8.4s - mls v23.4s, v20.4s, v7.s[0] - sub v22.4s, v25.4s, v11.4s - sqrdmulh v20.4s, v9.4s, v2.s[1] - sub v15.4s, v10.4s, v26.4s - sub x4, x4, #0x2 + mov w5, #0xe001 // =57345 + movk w5, #0x7f, lsl #16 + dup v7.4s, w5 + mov x3, x0 + mov x4, #0x8 // =8 + ldr q0, [x1], #0x40 + ldur q1, [x1, #-0x30] + ldur q2, [x1, #-0x20] + ldur q3, [x1, #-0x10] + ldr q23, [x0, #0x390] + ldr q13, [x0, #0x380] + ldr q22, [x0, 
#0x80] + ldr q26, [x0, #0x190] + ldr q8, [x0, #0x280] + ldr q6, [x0, #0x210] + mul v10.4s, v13.4s, v0.s[0] + sqrdmulh v13.4s, v13.4s, v0.s[1] + mul v12.4s, v8.4s, v0.s[0] + sqrdmulh v27.4s, v8.4s, v0.s[1] + mul v4.4s, v6.4s, v0.s[0] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x180] + sqrdmulh v14.4s, v23.4s, v0.s[1] + mls v12.4s, v27.4s, v7.s[0] + add v31.4s, v13.4s, v10.4s + sub v13.4s, v13.4s, v10.4s + mul v10.4s, v23.4s, v0.s[0] + sqrdmulh v8.4s, v13.4s, v1.s[1] + sub v18.4s, v22.4s, v12.4s + mls v10.4s, v14.4s, v7.s[0] + mul v13.4s, v13.4s, v1.s[0] + mls v13.4s, v8.4s, v7.s[0] + sub v29.4s, v26.4s, v10.4s + add v25.4s, v26.4s, v10.4s + mul v10.4s, v31.4s, v0.s[2] + mul v14.4s, v25.4s, v0.s[2] + add v17.4s, v18.4s, v13.4s + sub v15.4s, v18.4s, v13.4s + sqrdmulh v13.4s, v31.4s, v0.s[3] + sqrdmulh v20.4s, v15.4s, v3.s[1] + sqrdmulh v5.4s, v17.4s, v2.s[3] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x300] + mul v18.4s, v17.4s, v2.s[2] + add v31.4s, v22.4s, v12.4s + mul v23.4s, v15.4s, v3.s[0] + ldr q17, [x0, #0x90] + add v19.4s, v31.4s, v10.4s + sub v16.4s, v31.4s, v10.4s + mul v10.4s, v13.4s, v0.s[0] + sqrdmulh v13.4s, v13.4s, v0.s[1] + sqrdmulh v27.4s, v16.4s, v2.s[1] + mul v11.4s, v16.4s, v2.s[0] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x290] + ldr q22, [x0, #0x100] + mls v11.4s, v27.4s, v7.s[0] + sqrdmulh v15.4s, v13.4s, v0.s[1] + sub v12.4s, v22.4s, v10.4s + add v30.4s, v22.4s, v10.4s + mul v10.4s, v13.4s, v0.s[0] + ldr q28, [x0] + sqrdmulh v13.4s, v25.4s, v0.s[3] + sqrdmulh v27.4s, v30.4s, v0.s[3] + mls v10.4s, v15.4s, v7.s[0] + mls v14.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x200] + sqrdmulh v25.4s, v12.4s, v1.s[1] + add v24.4s, v17.4s, v10.4s + sub v21.4s, v17.4s, v10.4s + sqrdmulh v8.4s, v13.4s, v0.s[1] + sub v9.4s, v24.4s, v14.4s + mul v26.4s, v12.4s, v1.s[0] + mul v13.4s, v13.4s, v0.s[0] + mls v13.4s, v8.4s, v7.s[0] + mul v8.4s, v30.4s, v0.s[2] + mls v8.4s, v27.4s, v7.s[0] + add v16.4s, v28.4s, v13.4s + sub v10.4s, v28.4s, v13.4s + mls v26.4s, v25.4s, v7.s[0] + sqrdmulh v12.4s, v19.4s, v1.s[3] + sub v25.4s, v16.4s, v8.4s + mls v23.4s, v20.4s, v7.s[0] + sub v22.4s, v25.4s, v11.4s + sqrdmulh v20.4s, v9.4s, v2.s[1] + sub v15.4s, v10.4s, v26.4s + sub x4, x4, #0x2 Lntt_layer123_start: - add v31.4s, v10.4s, v26.4s - mul v17.4s, v19.4s, v1.s[2] - add v26.4s, v15.4s, v23.4s - ldr q30, [x0, #0x2a0] - sub v13.4s, v15.4s, v23.4s - mul v23.4s, v29.4s, v1.s[0] - add v25.4s, v25.4s, v11.4s - str q22, [x0, #0x180] - mul v11.4s, v9.4s, v2.s[0] - str q13, [x0, #0x380] - ldr q28, [x0, #0x10] - add v10.4s, v16.4s, v8.4s - mls v17.4s, v12.4s, v7.s[0] - ldr q13, [x0, #0x3a0] - str q26, [x0, #0x300] - sqrdmulh v27.4s, v30.4s, v0.s[1] - mls v18.4s, v5.4s, v7.s[0] - ldr q9, [x0, #0x1a0] - sub v16.4s, v10.4s, v17.4s - add v15.4s, v10.4s, v17.4s - sqrdmulh v10.4s, v6.4s, v0.s[1] - str q16, [x0, #0x80] - str q15, [x0], #0x10 - sqrdmulh v19.4s, v13.4s, v0.s[1] - sub v15.4s, v31.4s, v18.4s - mul v8.4s, v13.4s, v0.s[0] - add v26.4s, v31.4s, v18.4s - str q15, [x0, #0x270] - sqrdmulh v13.4s, v29.4s, v1.s[1] - str q26, [x0, #0x1f0] - mls v8.4s, v19.4s, v7.s[0] - mls v11.4s, v20.4s, v7.s[0] - mls v23.4s, v13.4s, v7.s[0] - add v22.4s, v9.4s, v8.4s - ldr q6, [x0, #0x210] - sub v29.4s, v9.4s, v8.4s - mul v17.4s, v30.4s, v0.s[0] - ldr q9, [x0, #0x300] - sqrdmulh v13.4s, v22.4s, v0.s[3] - add v18.4s, v21.4s, v23.4s - mls v4.4s, v10.4s, v7.s[0] - sub v31.4s, v21.4s, v23.4s - sqrdmulh v16.4s, v31.4s, v3.s[1] - add v19.4s, v24.4s, v14.4s - mul v14.4s, v22.4s, v0.s[2] - sub v10.4s, v28.4s, v4.4s - mls v14.4s, 
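Lntt_layer123_start below iterates the first three merged radix-2 layers of the forward NTT over the polynomial in chunks (x4 is the chunk counter, pre-decremented by two because the first and last iterations are peeled). Each Cooley-Tukey butterfly is a mul/sqrdmulh/mls constant multiplication followed by an add/sub pair. A plain-C sketch of the same three layers follows, using a naive fqmul placeholder in place of the vector sequence and the usual reference-code twiddle indexing; it is a shape illustration, not a transcription of the instruction schedule above.

#include <stdint.h>

#define MLDSA_Q 8380417

/* Placeholder for the vectorized mul/sqrdmulh/mls multiplication; returns
 * a representative of a*zeta mod q in (-q, q). */
static int32_t fqmul(int32_t a, int32_t zeta)
{
    return (int32_t)((int64_t)a * zeta % MLDSA_Q);
}

/* Layers 1-3 of the forward NTT (len = 128, 64, 32). zetas[] is assumed to
 * be the usual bit-reversed twiddle table, consumed from index 1 upward. */
static void ntt_layers123(int32_t p[256], const int32_t zetas[256])
{
    int k = 1;
    for (int len = 128; len >= 32; len >>= 1) {
        for (int start = 0; start < 256; start += 2 * len) {
            int32_t zeta = zetas[k++];
            for (int j = start; j < start + len; j++) {
                int32_t t = fqmul(p[j + len], zeta); /* mul/sqrdmulh/mls */
                p[j + len] = p[j] - t;               /* sub butterfly leg */
                p[j] = p[j] + t;                     /* add butterfly leg */
            }
        }
    }
}

Merging three layers per pass keeps eight working vectors (one per 32-coefficient block) live in registers between layers, which is why the assembly loads q0..q from offsets 0x80 apart and only stores once per merged pass.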
v13.4s, v7.s[0] - ldr q13, [x0, #0x100] - sqrdmulh v22.4s, v9.4s, v0.s[1] - mul v8.4s, v9.4s, v0.s[0] - mul v23.4s, v31.4s, v3.s[0] - mls v8.4s, v22.4s, v7.s[0] - mls v23.4s, v16.4s, v7.s[0] - add v16.4s, v28.4s, v4.4s - ldr q22, [x0, #0x90] - mul v4.4s, v6.4s, v0.s[0] - mls v17.4s, v27.4s, v7.s[0] - add v21.4s, v13.4s, v8.4s - sub v27.4s, v13.4s, v8.4s - sqrdmulh v31.4s, v21.4s, v0.s[3] - str q25, [x0, #0xf0] - mul v8.4s, v21.4s, v0.s[2] - add v24.4s, v22.4s, v17.4s - sub v21.4s, v22.4s, v17.4s - sqrdmulh v5.4s, v18.4s, v2.s[3] - mls v8.4s, v31.4s, v7.s[0] - sub v9.4s, v24.4s, v14.4s - sqrdmulh v20.4s, v27.4s, v1.s[1] - mul v26.4s, v27.4s, v1.s[0] - sub v25.4s, v16.4s, v8.4s - mul v18.4s, v18.4s, v2.s[2] - sub v22.4s, v25.4s, v11.4s - mls v26.4s, v20.4s, v7.s[0] - sqrdmulh v20.4s, v9.4s, v2.s[1] - sqrdmulh v12.4s, v19.4s, v1.s[3] - sub v15.4s, v10.4s, v26.4s - subs x4, x4, #0x1 - cbnz x4, Lntt_layer123_start - add v13.4s, v10.4s, v26.4s - mls v18.4s, v5.4s, v7.s[0] - str q22, [x0, #0x180] - add v27.4s, v16.4s, v8.4s - mul v22.4s, v19.4s, v1.s[2] - add v26.4s, v24.4s, v14.4s - ldr q31, [x0, #0x110] - sub v14.4s, v15.4s, v23.4s - add v17.4s, v15.4s, v23.4s - mls v22.4s, v12.4s, v7.s[0] - add v28.4s, v13.4s, v18.4s - str q14, [x0, #0x380] - sqrdmulh v24.4s, v6.4s, v0.s[1] - add v5.4s, v25.4s, v11.4s - sub v19.4s, v13.4s, v18.4s - str q17, [x0, #0x300] - str q5, [x0, #0x100] - mul v16.4s, v9.4s, v2.s[0] - ldr q18, [x0, #0x310] - str q19, [x0, #0x280] - mls v16.4s, v20.4s, v7.s[0] - str q28, [x0, #0x200] - add v13.4s, v27.4s, v22.4s - ldr q15, [x0, #0x10] - sub v10.4s, v27.4s, v22.4s - mls v4.4s, v24.4s, v7.s[0] - str q13, [x0], #0x10 - str q10, [x0, #0x70] - sqrdmulh v12.4s, v29.4s, v1.s[1] - mul v23.4s, v29.4s, v1.s[0] - mul v8.4s, v26.4s, v1.s[2] - add v20.4s, v15.4s, v4.4s - sub v6.4s, v15.4s, v4.4s - mls v23.4s, v12.4s, v7.s[0] - sqrdmulh v22.4s, v18.4s, v0.s[1] - mul v5.4s, v18.4s, v0.s[0] - sub v28.4s, v21.4s, v23.4s - sqrdmulh v10.4s, v26.4s, v1.s[3] - mls v5.4s, v22.4s, v7.s[0] - sqrdmulh v30.4s, v28.4s, v3.s[1] - add v4.4s, v21.4s, v23.4s - mls v8.4s, v10.4s, v7.s[0] - add v12.4s, v31.4s, v5.4s - sub v9.4s, v31.4s, v5.4s - sqrdmulh v25.4s, v4.4s, v2.s[3] - sqrdmulh v15.4s, v9.4s, v1.s[1] - sqrdmulh v31.4s, v12.4s, v0.s[3] - mul v18.4s, v12.4s, v0.s[2] - mul v11.4s, v9.4s, v1.s[0] - mls v18.4s, v31.4s, v7.s[0] - mul v29.4s, v4.4s, v2.s[2] - mls v29.4s, v25.4s, v7.s[0] - add v23.4s, v20.4s, v18.4s - mls v11.4s, v15.4s, v7.s[0] - sub v31.4s, v20.4s, v18.4s - add v17.4s, v23.4s, v8.4s - add v5.4s, v31.4s, v16.4s - mul v24.4s, v28.4s, v3.s[0] - str q17, [x0], #0x10 - sub v19.4s, v31.4s, v16.4s - mls v24.4s, v30.4s, v7.s[0] - str q5, [x0, #0xf0] - add v31.4s, v6.4s, v11.4s - sub v26.4s, v23.4s, v8.4s - str q19, [x0, #0x170] - add v4.4s, v31.4s, v29.4s - sub v13.4s, v6.4s, v11.4s - str q26, [x0, #0x70] - sub v11.4s, v31.4s, v29.4s - sub v22.4s, v13.4s, v24.4s - add v23.4s, v13.4s, v24.4s - str q4, [x0, #0x1f0] - str q11, [x0, #0x270] - str q23, [x0, #0x2f0] - str q22, [x0, #0x370] - mov x0, x3 - mov x4, #0x8 // =8 - ldr q9, [x0, #0x40] - ldr q23, [x1], #0x40 - ldr q21, [x2, #0x60] - ldr q1, [x0, #0x20] - ldur q14, [x1, #-0x30] - ldr q13, [x0] - ldr q11, [x2, #0x50] - sqrdmulh v16.4s, v9.4s, v23.s[1] - ldr q17, [x0, #0x50] - mul v15.4s, v9.4s, v23.s[0] - ldr q30, [x0, #0x70] - ldr q27, [x0, #0x60] - ldr q8, [x2, #0x30] - sqrdmulh v12.4s, v17.4s, v23.s[1] - ldr q6, [x0, #0x30] - mls v15.4s, v16.4s, v7.s[0] - sqrdmulh v18.4s, v27.4s, v23.s[1] - sqrdmulh v19.4s, v30.4s, v23.s[1] - add v5.4s, 
v13.4s, v15.4s - mul v25.4s, v27.4s, v23.s[0] - sub v26.4s, v13.4s, v15.4s - mls v25.4s, v18.4s, v7.s[0] - mul v10.4s, v17.4s, v23.s[0] - mls v10.4s, v12.4s, v7.s[0] - mul v4.4s, v30.4s, v23.s[0] - sub v22.4s, v1.4s, v25.4s - mls v4.4s, v19.4s, v7.s[0] - add v28.4s, v1.4s, v25.4s - sqrdmulh v19.4s, v28.4s, v23.s[3] - sqrdmulh v9.4s, v22.4s, v14.s[1] - add v2.4s, v6.4s, v4.4s - mul v0.4s, v28.4s, v23.s[2] - sqrdmulh v27.4s, v2.4s, v23.s[3] - sub v17.4s, v6.4s, v4.4s - mul v3.4s, v2.4s, v23.s[2] - sqrdmulh v20.4s, v17.4s, v14.s[1] - ldr q1, [x0, #0x10] - mls v3.4s, v27.4s, v7.s[0] - mls v0.4s, v19.4s, v7.s[0] - ldur q16, [x1, #-0x20] - add v31.4s, v1.4s, v10.4s - mul v30.4s, v17.4s, v14.s[0] - mls v30.4s, v20.4s, v7.s[0] - add v27.4s, v31.4s, v3.4s - sub v23.4s, v1.4s, v10.4s - sub v24.4s, v31.4s, v3.4s - sqrdmulh v4.4s, v27.4s, v14.s[3] - sqrdmulh v10.4s, v24.4s, v16.s[1] - mul v18.4s, v24.4s, v16.s[0] - add v15.4s, v23.4s, v30.4s - sub v23.4s, v23.4s, v30.4s - mul v29.4s, v27.4s, v14.s[2] - sub v2.4s, v5.4s, v0.4s - add v12.4s, v5.4s, v0.4s - mls v18.4s, v10.4s, v7.s[0] - ldur q3, [x1, #-0x10] - mls v29.4s, v4.4s, v7.s[0] - mul v4.4s, v22.4s, v14.s[0] - add v1.4s, v2.4s, v18.4s - sub v24.4s, v2.4s, v18.4s - mls v4.4s, v9.4s, v7.s[0] - ldr q20, [x2, #0x10] - add v25.4s, v12.4s, v29.4s - mul v9.4s, v23.4s, v3.s[0] - sub v5.4s, v12.4s, v29.4s - sqrdmulh v31.4s, v23.4s, v3.s[1] - trn2 v6.4s, v1.4s, v24.4s - trn2 v10.4s, v25.4s, v5.4s - sqrdmulh v13.4s, v15.4s, v16.s[3] - trn2 v30.2d, v10.2d, v6.2d - ldr q3, [x2], #0xc0 - mul v12.4s, v15.4s, v16.s[2] - trn1 v27.2d, v10.2d, v6.2d - mls v9.4s, v31.4s, v7.s[0] - trn1 v22.4s, v25.4s, v5.4s - sub v6.4s, v26.4s, v4.4s - mls v12.4s, v13.4s, v7.s[0] - trn1 v1.4s, v1.4s, v24.4s - add v13.4s, v26.4s, v4.4s - trn2 v10.2d, v22.2d, v1.2d - mul v28.4s, v30.4s, v3.4s - sub v31.4s, v6.4s, v9.4s - sub x4, x4, #0x1 + add v31.4s, v10.4s, v26.4s + mul v17.4s, v19.4s, v1.s[2] + add v26.4s, v15.4s, v23.4s + ldr q30, [x0, #0x2a0] + sub v13.4s, v15.4s, v23.4s + mul v23.4s, v29.4s, v1.s[0] + add v25.4s, v25.4s, v11.4s + str q22, [x0, #0x180] + mul v11.4s, v9.4s, v2.s[0] + str q13, [x0, #0x380] + ldr q28, [x0, #0x10] + add v10.4s, v16.4s, v8.4s + mls v17.4s, v12.4s, v7.s[0] + ldr q13, [x0, #0x3a0] + str q26, [x0, #0x300] + sqrdmulh v27.4s, v30.4s, v0.s[1] + mls v18.4s, v5.4s, v7.s[0] + ldr q9, [x0, #0x1a0] + sub v16.4s, v10.4s, v17.4s + add v15.4s, v10.4s, v17.4s + sqrdmulh v10.4s, v6.4s, v0.s[1] + str q16, [x0, #0x80] + str q15, [x0], #0x10 + sqrdmulh v19.4s, v13.4s, v0.s[1] + sub v15.4s, v31.4s, v18.4s + mul v8.4s, v13.4s, v0.s[0] + add v26.4s, v31.4s, v18.4s + str q15, [x0, #0x270] + sqrdmulh v13.4s, v29.4s, v1.s[1] + str q26, [x0, #0x1f0] + mls v8.4s, v19.4s, v7.s[0] + mls v11.4s, v20.4s, v7.s[0] + mls v23.4s, v13.4s, v7.s[0] + add v22.4s, v9.4s, v8.4s + ldr q6, [x0, #0x210] + sub v29.4s, v9.4s, v8.4s + mul v17.4s, v30.4s, v0.s[0] + ldr q9, [x0, #0x300] + sqrdmulh v13.4s, v22.4s, v0.s[3] + add v18.4s, v21.4s, v23.4s + mls v4.4s, v10.4s, v7.s[0] + sub v31.4s, v21.4s, v23.4s + sqrdmulh v16.4s, v31.4s, v3.s[1] + add v19.4s, v24.4s, v14.4s + mul v14.4s, v22.4s, v0.s[2] + sub v10.4s, v28.4s, v4.4s + mls v14.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x100] + sqrdmulh v22.4s, v9.4s, v0.s[1] + mul v8.4s, v9.4s, v0.s[0] + mul v23.4s, v31.4s, v3.s[0] + mls v8.4s, v22.4s, v7.s[0] + mls v23.4s, v16.4s, v7.s[0] + add v16.4s, v28.4s, v4.4s + ldr q22, [x0, #0x90] + mul v4.4s, v6.4s, v0.s[0] + mls v17.4s, v27.4s, v7.s[0] + add v21.4s, v13.4s, v8.4s + sub v27.4s, v13.4s, v8.4s + sqrdmulh 
v31.4s, v21.4s, v0.s[3] + str q25, [x0, #0xf0] + mul v8.4s, v21.4s, v0.s[2] + add v24.4s, v22.4s, v17.4s + sub v21.4s, v22.4s, v17.4s + sqrdmulh v5.4s, v18.4s, v2.s[3] + mls v8.4s, v31.4s, v7.s[0] + sub v9.4s, v24.4s, v14.4s + sqrdmulh v20.4s, v27.4s, v1.s[1] + mul v26.4s, v27.4s, v1.s[0] + sub v25.4s, v16.4s, v8.4s + mul v18.4s, v18.4s, v2.s[2] + sub v22.4s, v25.4s, v11.4s + mls v26.4s, v20.4s, v7.s[0] + sqrdmulh v20.4s, v9.4s, v2.s[1] + sqrdmulh v12.4s, v19.4s, v1.s[3] + sub v15.4s, v10.4s, v26.4s + subs x4, x4, #0x1 + cbnz x4, Lntt_layer123_start + add v13.4s, v10.4s, v26.4s + mls v18.4s, v5.4s, v7.s[0] + str q22, [x0, #0x180] + add v27.4s, v16.4s, v8.4s + mul v22.4s, v19.4s, v1.s[2] + add v26.4s, v24.4s, v14.4s + ldr q31, [x0, #0x110] + sub v14.4s, v15.4s, v23.4s + add v17.4s, v15.4s, v23.4s + mls v22.4s, v12.4s, v7.s[0] + add v28.4s, v13.4s, v18.4s + str q14, [x0, #0x380] + sqrdmulh v24.4s, v6.4s, v0.s[1] + add v5.4s, v25.4s, v11.4s + sub v19.4s, v13.4s, v18.4s + str q17, [x0, #0x300] + str q5, [x0, #0x100] + mul v16.4s, v9.4s, v2.s[0] + ldr q18, [x0, #0x310] + str q19, [x0, #0x280] + mls v16.4s, v20.4s, v7.s[0] + str q28, [x0, #0x200] + add v13.4s, v27.4s, v22.4s + ldr q15, [x0, #0x10] + sub v10.4s, v27.4s, v22.4s + mls v4.4s, v24.4s, v7.s[0] + str q13, [x0], #0x10 + str q10, [x0, #0x70] + sqrdmulh v12.4s, v29.4s, v1.s[1] + mul v23.4s, v29.4s, v1.s[0] + mul v8.4s, v26.4s, v1.s[2] + add v20.4s, v15.4s, v4.4s + sub v6.4s, v15.4s, v4.4s + mls v23.4s, v12.4s, v7.s[0] + sqrdmulh v22.4s, v18.4s, v0.s[1] + mul v5.4s, v18.4s, v0.s[0] + sub v28.4s, v21.4s, v23.4s + sqrdmulh v10.4s, v26.4s, v1.s[3] + mls v5.4s, v22.4s, v7.s[0] + sqrdmulh v30.4s, v28.4s, v3.s[1] + add v4.4s, v21.4s, v23.4s + mls v8.4s, v10.4s, v7.s[0] + add v12.4s, v31.4s, v5.4s + sub v9.4s, v31.4s, v5.4s + sqrdmulh v25.4s, v4.4s, v2.s[3] + sqrdmulh v15.4s, v9.4s, v1.s[1] + sqrdmulh v31.4s, v12.4s, v0.s[3] + mul v18.4s, v12.4s, v0.s[2] + mul v11.4s, v9.4s, v1.s[0] + mls v18.4s, v31.4s, v7.s[0] + mul v29.4s, v4.4s, v2.s[2] + mls v29.4s, v25.4s, v7.s[0] + add v23.4s, v20.4s, v18.4s + mls v11.4s, v15.4s, v7.s[0] + sub v31.4s, v20.4s, v18.4s + add v17.4s, v23.4s, v8.4s + add v5.4s, v31.4s, v16.4s + mul v24.4s, v28.4s, v3.s[0] + str q17, [x0], #0x10 + sub v19.4s, v31.4s, v16.4s + mls v24.4s, v30.4s, v7.s[0] + str q5, [x0, #0xf0] + add v31.4s, v6.4s, v11.4s + sub v26.4s, v23.4s, v8.4s + str q19, [x0, #0x170] + add v4.4s, v31.4s, v29.4s + sub v13.4s, v6.4s, v11.4s + str q26, [x0, #0x70] + sub v11.4s, v31.4s, v29.4s + sub v22.4s, v13.4s, v24.4s + add v23.4s, v13.4s, v24.4s + str q4, [x0, #0x1f0] + str q11, [x0, #0x270] + str q23, [x0, #0x2f0] + str q22, [x0, #0x370] + mov x0, x3 + mov x4, #0x8 // =8 + ldr q9, [x0, #0x40] + ldr q23, [x1], #0x40 + ldr q21, [x2, #0x60] + ldr q1, [x0, #0x20] + ldur q14, [x1, #-0x30] + ldr q13, [x0] + ldr q11, [x2, #0x50] + sqrdmulh v16.4s, v9.4s, v23.s[1] + ldr q17, [x0, #0x50] + mul v15.4s, v9.4s, v23.s[0] + ldr q30, [x0, #0x70] + ldr q27, [x0, #0x60] + ldr q8, [x2, #0x30] + sqrdmulh v12.4s, v17.4s, v23.s[1] + ldr q6, [x0, #0x30] + mls v15.4s, v16.4s, v7.s[0] + sqrdmulh v18.4s, v27.4s, v23.s[1] + sqrdmulh v19.4s, v30.4s, v23.s[1] + add v5.4s, v13.4s, v15.4s + mul v25.4s, v27.4s, v23.s[0] + sub v26.4s, v13.4s, v15.4s + mls v25.4s, v18.4s, v7.s[0] + mul v10.4s, v17.4s, v23.s[0] + mls v10.4s, v12.4s, v7.s[0] + mul v4.4s, v30.4s, v23.s[0] + sub v22.4s, v1.4s, v25.4s + mls v4.4s, v19.4s, v7.s[0] + add v28.4s, v1.4s, v25.4s + sqrdmulh v19.4s, v28.4s, v23.s[3] + sqrdmulh v9.4s, v22.4s, v14.s[1] + add v2.4s, 
v6.4s, v4.4s + mul v0.4s, v28.4s, v23.s[2] + sqrdmulh v27.4s, v2.4s, v23.s[3] + sub v17.4s, v6.4s, v4.4s + mul v3.4s, v2.4s, v23.s[2] + sqrdmulh v20.4s, v17.4s, v14.s[1] + ldr q1, [x0, #0x10] + mls v3.4s, v27.4s, v7.s[0] + mls v0.4s, v19.4s, v7.s[0] + ldur q16, [x1, #-0x20] + add v31.4s, v1.4s, v10.4s + mul v30.4s, v17.4s, v14.s[0] + mls v30.4s, v20.4s, v7.s[0] + add v27.4s, v31.4s, v3.4s + sub v23.4s, v1.4s, v10.4s + sub v24.4s, v31.4s, v3.4s + sqrdmulh v4.4s, v27.4s, v14.s[3] + sqrdmulh v10.4s, v24.4s, v16.s[1] + mul v18.4s, v24.4s, v16.s[0] + add v15.4s, v23.4s, v30.4s + sub v23.4s, v23.4s, v30.4s + mul v29.4s, v27.4s, v14.s[2] + sub v2.4s, v5.4s, v0.4s + add v12.4s, v5.4s, v0.4s + mls v18.4s, v10.4s, v7.s[0] + ldur q3, [x1, #-0x10] + mls v29.4s, v4.4s, v7.s[0] + mul v4.4s, v22.4s, v14.s[0] + add v1.4s, v2.4s, v18.4s + sub v24.4s, v2.4s, v18.4s + mls v4.4s, v9.4s, v7.s[0] + ldr q20, [x2, #0x10] + add v25.4s, v12.4s, v29.4s + mul v9.4s, v23.4s, v3.s[0] + sub v5.4s, v12.4s, v29.4s + sqrdmulh v31.4s, v23.4s, v3.s[1] + trn2 v6.4s, v1.4s, v24.4s + trn2 v10.4s, v25.4s, v5.4s + sqrdmulh v13.4s, v15.4s, v16.s[3] + trn2 v30.2d, v10.2d, v6.2d + ldr q3, [x2], #0xc0 + mul v12.4s, v15.4s, v16.s[2] + trn1 v27.2d, v10.2d, v6.2d + mls v9.4s, v31.4s, v7.s[0] + trn1 v22.4s, v25.4s, v5.4s + sub v6.4s, v26.4s, v4.4s + mls v12.4s, v13.4s, v7.s[0] + trn1 v1.4s, v1.4s, v24.4s + add v13.4s, v26.4s, v4.4s + trn2 v10.2d, v22.2d, v1.2d + mul v28.4s, v30.4s, v3.4s + sub v31.4s, v6.4s, v9.4s + sub x4, x4, #0x1 Lntt_layer45678_start: - add v2.4s, v13.4s, v12.4s - sqrdmulh v5.4s, v30.4s, v20.4s - sub v25.4s, v13.4s, v12.4s - add v17.4s, v6.4s, v9.4s - mul v19.4s, v10.4s, v3.4s - trn2 v4.4s, v2.4s, v25.4s - ldur q24, [x2, #-0x50] - trn2 v29.4s, v17.4s, v31.4s - sqrdmulh v15.4s, v10.4s, v20.4s - mls v28.4s, v5.4s, v7.s[0] - trn2 v3.2d, v4.2d, v29.2d - sqrdmulh v12.4s, v3.4s, v24.4s - mul v16.4s, v3.4s, v21.4s - mls v19.4s, v15.4s, v7.s[0] - ldur q10, [x2, #-0xa0] - add v13.4s, v27.4s, v28.4s - mls v16.4s, v12.4s, v7.s[0] - sqrdmulh v9.4s, v13.4s, v8.4s - sub v30.4s, v27.4s, v28.4s - ldr q18, [x1], #0x40 - mul v8.4s, v13.4s, v10.4s - ldr q10, [x0, #0xd0] - sqrdmulh v14.4s, v30.4s, v11.4s - ldr q23, [x0, #0xe0] - sqrdmulh v13.4s, v10.4s, v18.s[1] - sqrdmulh v12.4s, v23.4s, v18.s[1] - ldur q6, [x2, #-0x80] - mul v3.4s, v10.4s, v18.s[0] - mls v3.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0xf0] - trn1 v27.4s, v2.4s, v25.4s - mul v2.4s, v30.4s, v6.4s - trn1 v20.4s, v17.4s, v31.4s - trn1 v25.2d, v4.2d, v29.2d - sqrdmulh v10.4s, v13.4s, v18.s[1] - trn2 v5.2d, v27.2d, v20.2d - ldur q6, [x2, #-0x10] - mls v8.4s, v9.4s, v7.s[0] - sub v15.4s, v25.4s, v16.4s - sqrdmulh v31.4s, v5.4s, v24.4s - sqrdmulh v30.4s, v15.4s, v6.4s - ldur q9, [x2, #-0x30] - mul v4.4s, v5.4s, v21.4s - ldur q21, [x2, #-0x40] - ldur q6, [x2, #-0x20] - add v5.4s, v25.4s, v16.4s - mls v4.4s, v31.4s, v7.s[0] - mul v0.4s, v5.4s, v21.4s - mul v17.4s, v13.4s, v18.s[0] - mls v17.4s, v10.4s, v7.s[0] - ldr q28, [x0, #0xb0] - sqrdmulh v26.4s, v5.4s, v9.4s - mul v9.4s, v15.4s, v6.4s - trn1 v6.2d, v22.2d, v1.2d - mls v9.4s, v30.4s, v7.s[0] - add v25.4s, v28.4s, v17.4s - mls v2.4s, v14.4s, v7.s[0] - trn1 v5.2d, v27.2d, v20.2d - ldr q20, [x2, #0x10] - mul v29.4s, v25.4s, v18.s[2] - add v15.4s, v6.4s, v19.4s - ldr q30, [x0, #0xc0] - sub v19.4s, v6.4s, v19.4s - add v31.4s, v15.4s, v8.4s - mls v0.4s, v26.4s, v7.s[0] - ldur q14, [x1, #-0x30] - add v21.4s, v19.4s, v2.4s - sub v24.4s, v19.4s, v2.4s - sqrdmulh v27.4s, v25.4s, v18.s[3] - sub v26.4s, v15.4s, v8.4s - ldr q2, [x0, #0x90] - 
mul v16.4s, v30.4s, v18.s[0] - sub v25.4s, v28.4s, v17.4s - trn1 v11.4s, v31.4s, v26.4s - ldr q1, [x0, #0xa0] - trn1 v6.4s, v21.4s, v24.4s - sqrdmulh v13.4s, v25.4s, v14.s[1] - add v8.4s, v2.4s, v3.4s - trn2 v28.2d, v11.2d, v6.2d - sqrdmulh v19.4s, v30.4s, v18.s[1] - sub v10.4s, v5.4s, v4.4s - ldur q22, [x1, #-0x20] - str q28, [x0, #0x20] - mls v29.4s, v27.4s, v7.s[0] - add v15.4s, v10.4s, v9.4s - mul v25.4s, v25.4s, v14.s[0] - ldur q27, [x1, #-0x10] - trn2 v17.4s, v31.4s, v26.4s - trn2 v21.4s, v21.4s, v24.4s - mls v16.4s, v19.4s, v7.s[0] - sub v24.4s, v8.4s, v29.4s - sub v10.4s, v10.4s, v9.4s - mls v25.4s, v13.4s, v7.s[0] - trn1 v13.2d, v11.2d, v6.2d - ldr q28, [x0, #0x80] - sqrdmulh v30.4s, v24.4s, v22.s[1] - trn2 v19.2d, v17.2d, v21.2d - trn1 v6.2d, v17.2d, v21.2d - mul v31.4s, v23.4s, v18.s[0] - str q13, [x0], #0x80 - stur q6, [x0, #-0x70] - stur q19, [x0, #-0x50] - ldr q11, [x2, #0x50] - mls v31.4s, v12.4s, v7.s[0] - ldr q21, [x2, #0x60] - trn1 v9.4s, v15.4s, v10.4s - trn2 v6.4s, v15.4s, v10.4s - mul v24.4s, v24.4s, v22.s[0] - sub v10.4s, v2.4s, v3.4s - ldr q3, [x2], #0xc0 - mls v24.4s, v30.4s, v7.s[0] - add v26.4s, v8.4s, v29.4s - ldur q8, [x2, #-0x90] - add v17.4s, v5.4s, v4.4s - sqrdmulh v2.4s, v26.4s, v14.s[3] - sub v13.4s, v1.4s, v31.4s - add v30.4s, v1.4s, v31.4s - add v15.4s, v10.4s, v25.4s - sqrdmulh v19.4s, v13.4s, v14.s[1] - sub v25.4s, v10.4s, v25.4s - mul v29.4s, v13.4s, v14.s[0] - sub v5.4s, v28.4s, v16.4s - sqrdmulh v4.4s, v30.4s, v18.s[3] - sub v23.4s, v17.4s, v0.4s - add v31.4s, v17.4s, v0.4s - mul v18.4s, v30.4s, v18.s[2] - add v1.4s, v28.4s, v16.4s - trn2 v12.4s, v31.4s, v23.4s - mls v29.4s, v19.4s, v7.s[0] - trn1 v13.4s, v31.4s, v23.4s - trn2 v30.2d, v12.2d, v6.2d - mls v18.4s, v4.4s, v7.s[0] - trn2 v10.2d, v13.2d, v9.2d - trn1 v31.2d, v13.2d, v9.2d - mul v19.4s, v26.4s, v14.s[2] - trn1 v12.2d, v12.2d, v6.2d - sub v6.4s, v5.4s, v29.4s - mls v19.4s, v2.4s, v7.s[0] - add v13.4s, v5.4s, v29.4s - stur q10, [x0, #-0x20] - sub v10.4s, v1.4s, v18.4s - add v28.4s, v1.4s, v18.4s - sqrdmulh v5.4s, v25.4s, v27.s[1] - stur q31, [x0, #-0x40] - add v26.4s, v10.4s, v24.4s - sub v31.4s, v10.4s, v24.4s - mul v9.4s, v25.4s, v27.s[0] - stur q12, [x0, #-0x30] - sub v24.4s, v28.4s, v19.4s - sqrdmulh v10.4s, v15.4s, v22.s[3] - trn1 v1.4s, v26.4s, v31.4s - stur q30, [x0, #-0x10] - add v30.4s, v28.4s, v19.4s - mls v9.4s, v5.4s, v7.s[0] - trn2 v25.4s, v26.4s, v31.4s - trn2 v14.4s, v30.4s, v24.4s - mul v12.4s, v15.4s, v22.s[2] - trn1 v22.4s, v30.4s, v24.4s - trn1 v27.2d, v14.2d, v25.2d - mls v12.4s, v10.4s, v7.s[0] - trn2 v30.2d, v14.2d, v25.2d - sub v31.4s, v6.4s, v9.4s - trn2 v10.2d, v22.2d, v1.2d - mul v28.4s, v30.4s, v3.4s - subs x4, x4, #0x1 - cbnz x4, Lntt_layer45678_start - add v9.4s, v6.4s, v9.4s - sqrdmulh v6.4s, v30.4s, v20.4s - ldur q24, [x2, #-0xa0] - add v25.4s, v13.4s, v12.4s - sub v15.4s, v13.4s, v12.4s - mul v19.4s, v10.4s, v3.4s - trn2 v5.4s, v9.4s, v31.4s - sqrdmulh v3.4s, v10.4s, v20.4s - trn2 v10.4s, v25.4s, v15.4s - mls v28.4s, v6.4s, v7.s[0] - trn2 v13.2d, v10.2d, v5.2d - ldur q30, [x2, #-0x50] - mul v12.4s, v13.4s, v21.4s - mls v19.4s, v3.4s, v7.s[0] - add v20.4s, v27.4s, v28.4s - sqrdmulh v13.4s, v13.4s, v30.4s - sub v3.4s, v27.4s, v28.4s - mul v24.4s, v20.4s, v24.4s - sqrdmulh v6.4s, v3.4s, v11.4s - ldur q27, [x2, #-0x80] - mls v12.4s, v13.4s, v7.s[0] - trn1 v25.4s, v25.4s, v15.4s - mul v27.4s, v3.4s, v27.4s - trn1 v31.4s, v9.4s, v31.4s - trn1 v3.2d, v10.2d, v5.2d - ldur q13, [x2, #-0x30] - ldur q15, [x2, #-0x40] - sqrdmulh v9.4s, v20.4s, v8.4s - trn2 v20.2d, v25.2d, 
v31.2d - ldur q10, [x2, #-0x10] - mls v27.4s, v6.4s, v7.s[0] - add v5.4s, v3.4s, v12.4s - sub v6.4s, v3.4s, v12.4s - sqrdmulh v3.4s, v20.4s, v30.4s - trn1 v12.2d, v22.2d, v1.2d - sqrdmulh v10.4s, v6.4s, v10.4s - mls v24.4s, v9.4s, v7.s[0] - sub v9.4s, v12.4s, v19.4s - trn1 v25.2d, v25.2d, v31.2d - sqrdmulh v31.4s, v5.4s, v13.4s - add v30.4s, v9.4s, v27.4s - add v13.4s, v12.4s, v19.4s - mul v1.4s, v20.4s, v21.4s - ldur q12, [x2, #-0x20] - add v21.4s, v13.4s, v24.4s - sub v13.4s, v13.4s, v24.4s - mls v1.4s, v3.4s, v7.s[0] - sub v3.4s, v9.4s, v27.4s - mul v9.4s, v6.4s, v12.4s - trn2 v12.4s, v21.4s, v13.4s - trn1 v6.4s, v30.4s, v3.4s - trn2 v30.4s, v30.4s, v3.4s - mls v9.4s, v10.4s, v7.s[0] - trn1 v13.4s, v21.4s, v13.4s - mul v15.4s, v5.4s, v15.4s - sub v3.4s, v25.4s, v1.4s - add v5.4s, v25.4s, v1.4s - mls v15.4s, v31.4s, v7.s[0] - trn1 v21.2d, v13.2d, v6.2d - trn2 v6.2d, v13.2d, v6.2d - add v10.4s, v3.4s, v9.4s - sub v13.4s, v3.4s, v9.4s - str q21, [x0], #0x80 - trn1 v3.2d, v12.2d, v30.2d - trn2 v31.2d, v12.2d, v30.2d - trn1 v21.4s, v10.4s, v13.4s - sub v30.4s, v5.4s, v15.4s - add v12.4s, v5.4s, v15.4s - stur q3, [x0, #-0x70] - trn2 v13.4s, v10.4s, v13.4s - trn1 v19.4s, v12.4s, v30.4s - trn2 v12.4s, v12.4s, v30.4s - stur q6, [x0, #-0x60] - stur q31, [x0, #-0x50] - trn1 v10.2d, v19.2d, v21.2d - trn2 v3.2d, v19.2d, v21.2d - trn1 v21.2d, v12.2d, v13.2d - trn2 v13.2d, v12.2d, v13.2d - stur q10, [x0, #-0x40] - stur q3, [x0, #-0x20] - stur q13, [x0, #-0x10] - stur q21, [x0, #-0x30] - ldp d8, d9, [sp] + add v2.4s, v13.4s, v12.4s + sqrdmulh v5.4s, v30.4s, v20.4s + sub v25.4s, v13.4s, v12.4s + add v17.4s, v6.4s, v9.4s + mul v19.4s, v10.4s, v3.4s + trn2 v4.4s, v2.4s, v25.4s + ldur q24, [x2, #-0x50] + trn2 v29.4s, v17.4s, v31.4s + sqrdmulh v15.4s, v10.4s, v20.4s + mls v28.4s, v5.4s, v7.s[0] + trn2 v3.2d, v4.2d, v29.2d + sqrdmulh v12.4s, v3.4s, v24.4s + mul v16.4s, v3.4s, v21.4s + mls v19.4s, v15.4s, v7.s[0] + ldur q10, [x2, #-0xa0] + add v13.4s, v27.4s, v28.4s + mls v16.4s, v12.4s, v7.s[0] + sqrdmulh v9.4s, v13.4s, v8.4s + sub v30.4s, v27.4s, v28.4s + ldr q18, [x1], #0x40 + mul v8.4s, v13.4s, v10.4s + ldr q10, [x0, #0xd0] + sqrdmulh v14.4s, v30.4s, v11.4s + ldr q23, [x0, #0xe0] + sqrdmulh v13.4s, v10.4s, v18.s[1] + sqrdmulh v12.4s, v23.4s, v18.s[1] + ldur q6, [x2, #-0x80] + mul v3.4s, v10.4s, v18.s[0] + mls v3.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0xf0] + trn1 v27.4s, v2.4s, v25.4s + mul v2.4s, v30.4s, v6.4s + trn1 v20.4s, v17.4s, v31.4s + trn1 v25.2d, v4.2d, v29.2d + sqrdmulh v10.4s, v13.4s, v18.s[1] + trn2 v5.2d, v27.2d, v20.2d + ldur q6, [x2, #-0x10] + mls v8.4s, v9.4s, v7.s[0] + sub v15.4s, v25.4s, v16.4s + sqrdmulh v31.4s, v5.4s, v24.4s + sqrdmulh v30.4s, v15.4s, v6.4s + ldur q9, [x2, #-0x30] + mul v4.4s, v5.4s, v21.4s + ldur q21, [x2, #-0x40] + ldur q6, [x2, #-0x20] + add v5.4s, v25.4s, v16.4s + mls v4.4s, v31.4s, v7.s[0] + mul v0.4s, v5.4s, v21.4s + mul v17.4s, v13.4s, v18.s[0] + mls v17.4s, v10.4s, v7.s[0] + ldr q28, [x0, #0xb0] + sqrdmulh v26.4s, v5.4s, v9.4s + mul v9.4s, v15.4s, v6.4s + trn1 v6.2d, v22.2d, v1.2d + mls v9.4s, v30.4s, v7.s[0] + add v25.4s, v28.4s, v17.4s + mls v2.4s, v14.4s, v7.s[0] + trn1 v5.2d, v27.2d, v20.2d + ldr q20, [x2, #0x10] + mul v29.4s, v25.4s, v18.s[2] + add v15.4s, v6.4s, v19.4s + ldr q30, [x0, #0xc0] + sub v19.4s, v6.4s, v19.4s + add v31.4s, v15.4s, v8.4s + mls v0.4s, v26.4s, v7.s[0] + ldur q14, [x1, #-0x30] + add v21.4s, v19.4s, v2.4s + sub v24.4s, v19.4s, v2.4s + sqrdmulh v27.4s, v25.4s, v18.s[3] + sub v26.4s, v15.4s, v8.4s + ldr q2, [x0, #0x90] + mul v16.4s, 
v30.4s, v18.s[0] + sub v25.4s, v28.4s, v17.4s + trn1 v11.4s, v31.4s, v26.4s + ldr q1, [x0, #0xa0] + trn1 v6.4s, v21.4s, v24.4s + sqrdmulh v13.4s, v25.4s, v14.s[1] + add v8.4s, v2.4s, v3.4s + trn2 v28.2d, v11.2d, v6.2d + sqrdmulh v19.4s, v30.4s, v18.s[1] + sub v10.4s, v5.4s, v4.4s + ldur q22, [x1, #-0x20] + str q28, [x0, #0x20] + mls v29.4s, v27.4s, v7.s[0] + add v15.4s, v10.4s, v9.4s + mul v25.4s, v25.4s, v14.s[0] + ldur q27, [x1, #-0x10] + trn2 v17.4s, v31.4s, v26.4s + trn2 v21.4s, v21.4s, v24.4s + mls v16.4s, v19.4s, v7.s[0] + sub v24.4s, v8.4s, v29.4s + sub v10.4s, v10.4s, v9.4s + mls v25.4s, v13.4s, v7.s[0] + trn1 v13.2d, v11.2d, v6.2d + ldr q28, [x0, #0x80] + sqrdmulh v30.4s, v24.4s, v22.s[1] + trn2 v19.2d, v17.2d, v21.2d + trn1 v6.2d, v17.2d, v21.2d + mul v31.4s, v23.4s, v18.s[0] + str q13, [x0], #0x80 + stur q6, [x0, #-0x70] + stur q19, [x0, #-0x50] + ldr q11, [x2, #0x50] + mls v31.4s, v12.4s, v7.s[0] + ldr q21, [x2, #0x60] + trn1 v9.4s, v15.4s, v10.4s + trn2 v6.4s, v15.4s, v10.4s + mul v24.4s, v24.4s, v22.s[0] + sub v10.4s, v2.4s, v3.4s + ldr q3, [x2], #0xc0 + mls v24.4s, v30.4s, v7.s[0] + add v26.4s, v8.4s, v29.4s + ldur q8, [x2, #-0x90] + add v17.4s, v5.4s, v4.4s + sqrdmulh v2.4s, v26.4s, v14.s[3] + sub v13.4s, v1.4s, v31.4s + add v30.4s, v1.4s, v31.4s + add v15.4s, v10.4s, v25.4s + sqrdmulh v19.4s, v13.4s, v14.s[1] + sub v25.4s, v10.4s, v25.4s + mul v29.4s, v13.4s, v14.s[0] + sub v5.4s, v28.4s, v16.4s + sqrdmulh v4.4s, v30.4s, v18.s[3] + sub v23.4s, v17.4s, v0.4s + add v31.4s, v17.4s, v0.4s + mul v18.4s, v30.4s, v18.s[2] + add v1.4s, v28.4s, v16.4s + trn2 v12.4s, v31.4s, v23.4s + mls v29.4s, v19.4s, v7.s[0] + trn1 v13.4s, v31.4s, v23.4s + trn2 v30.2d, v12.2d, v6.2d + mls v18.4s, v4.4s, v7.s[0] + trn2 v10.2d, v13.2d, v9.2d + trn1 v31.2d, v13.2d, v9.2d + mul v19.4s, v26.4s, v14.s[2] + trn1 v12.2d, v12.2d, v6.2d + sub v6.4s, v5.4s, v29.4s + mls v19.4s, v2.4s, v7.s[0] + add v13.4s, v5.4s, v29.4s + stur q10, [x0, #-0x20] + sub v10.4s, v1.4s, v18.4s + add v28.4s, v1.4s, v18.4s + sqrdmulh v5.4s, v25.4s, v27.s[1] + stur q31, [x0, #-0x40] + add v26.4s, v10.4s, v24.4s + sub v31.4s, v10.4s, v24.4s + mul v9.4s, v25.4s, v27.s[0] + stur q12, [x0, #-0x30] + sub v24.4s, v28.4s, v19.4s + sqrdmulh v10.4s, v15.4s, v22.s[3] + trn1 v1.4s, v26.4s, v31.4s + stur q30, [x0, #-0x10] + add v30.4s, v28.4s, v19.4s + mls v9.4s, v5.4s, v7.s[0] + trn2 v25.4s, v26.4s, v31.4s + trn2 v14.4s, v30.4s, v24.4s + mul v12.4s, v15.4s, v22.s[2] + trn1 v22.4s, v30.4s, v24.4s + trn1 v27.2d, v14.2d, v25.2d + mls v12.4s, v10.4s, v7.s[0] + trn2 v30.2d, v14.2d, v25.2d + sub v31.4s, v6.4s, v9.4s + trn2 v10.2d, v22.2d, v1.2d + mul v28.4s, v30.4s, v3.4s + subs x4, x4, #0x1 + cbnz x4, Lntt_layer45678_start + add v9.4s, v6.4s, v9.4s + sqrdmulh v6.4s, v30.4s, v20.4s + ldur q24, [x2, #-0xa0] + add v25.4s, v13.4s, v12.4s + sub v15.4s, v13.4s, v12.4s + mul v19.4s, v10.4s, v3.4s + trn2 v5.4s, v9.4s, v31.4s + sqrdmulh v3.4s, v10.4s, v20.4s + trn2 v10.4s, v25.4s, v15.4s + mls v28.4s, v6.4s, v7.s[0] + trn2 v13.2d, v10.2d, v5.2d + ldur q30, [x2, #-0x50] + mul v12.4s, v13.4s, v21.4s + mls v19.4s, v3.4s, v7.s[0] + add v20.4s, v27.4s, v28.4s + sqrdmulh v13.4s, v13.4s, v30.4s + sub v3.4s, v27.4s, v28.4s + mul v24.4s, v20.4s, v24.4s + sqrdmulh v6.4s, v3.4s, v11.4s + ldur q27, [x2, #-0x80] + mls v12.4s, v13.4s, v7.s[0] + trn1 v25.4s, v25.4s, v15.4s + mul v27.4s, v3.4s, v27.4s + trn1 v31.4s, v9.4s, v31.4s + trn1 v3.2d, v10.2d, v5.2d + ldur q13, [x2, #-0x30] + ldur q15, [x2, #-0x40] + sqrdmulh v9.4s, v20.4s, v8.4s + trn2 v20.2d, v25.2d, v31.2d + 
ldur q10, [x2, #-0x10] + mls v27.4s, v6.4s, v7.s[0] + add v5.4s, v3.4s, v12.4s + sub v6.4s, v3.4s, v12.4s + sqrdmulh v3.4s, v20.4s, v30.4s + trn1 v12.2d, v22.2d, v1.2d + sqrdmulh v10.4s, v6.4s, v10.4s + mls v24.4s, v9.4s, v7.s[0] + sub v9.4s, v12.4s, v19.4s + trn1 v25.2d, v25.2d, v31.2d + sqrdmulh v31.4s, v5.4s, v13.4s + add v30.4s, v9.4s, v27.4s + add v13.4s, v12.4s, v19.4s + mul v1.4s, v20.4s, v21.4s + ldur q12, [x2, #-0x20] + add v21.4s, v13.4s, v24.4s + sub v13.4s, v13.4s, v24.4s + mls v1.4s, v3.4s, v7.s[0] + sub v3.4s, v9.4s, v27.4s + mul v9.4s, v6.4s, v12.4s + trn2 v12.4s, v21.4s, v13.4s + trn1 v6.4s, v30.4s, v3.4s + trn2 v30.4s, v30.4s, v3.4s + mls v9.4s, v10.4s, v7.s[0] + trn1 v13.4s, v21.4s, v13.4s + mul v15.4s, v5.4s, v15.4s + sub v3.4s, v25.4s, v1.4s + add v5.4s, v25.4s, v1.4s + mls v15.4s, v31.4s, v7.s[0] + trn1 v21.2d, v13.2d, v6.2d + trn2 v6.2d, v13.2d, v6.2d + add v10.4s, v3.4s, v9.4s + sub v13.4s, v3.4s, v9.4s + str q21, [x0], #0x80 + trn1 v3.2d, v12.2d, v30.2d + trn2 v31.2d, v12.2d, v30.2d + trn1 v21.4s, v10.4s, v13.4s + sub v30.4s, v5.4s, v15.4s + add v12.4s, v5.4s, v15.4s + stur q3, [x0, #-0x70] + trn2 v13.4s, v10.4s, v13.4s + trn1 v19.4s, v12.4s, v30.4s + trn2 v12.4s, v12.4s, v30.4s + stur q6, [x0, #-0x60] + stur q31, [x0, #-0x50] + trn1 v10.2d, v19.2d, v21.2d + trn2 v3.2d, v19.2d, v21.2d + trn1 v21.2d, v12.2d, v13.2d + trn2 v13.2d, v12.2d, v13.2d + stur q10, [x0, #-0x40] + stur q3, [x0, #-0x20] + stur q13, [x0, #-0x10] + stur q21, [x0, #-0x30] + ldp d8, d9, [sp] .cfi_restore d8 .cfi_restore d9 - ldp d10, d11, [sp, #0x10] + ldp d10, d11, [sp, #0x10] .cfi_restore d10 .cfi_restore d11 - ldp d12, d13, [sp, #0x20] + ldp d12, d13, [sp, #0x20] .cfi_restore d12 .cfi_restore d13 - ldp d14, d15, [sp, #0x30] + ldp d14, d15, [sp, #0x30] .cfi_restore d14 .cfi_restore d15 - add sp, sp, #0x40 + add sp, sp, #0x40 .cfi_adjust_cfa_offset -0x40 ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/pointwise_montgomery.S b/mldsa/src/native/aarch64/src/pointwise_montgomery.S index c8dbffdd2..bd3a61f33 100644 --- a/mldsa/src/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/src/native/aarch64/src/pointwise_montgomery.S @@ -16,57 +16,57 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) .cfi_startproc - mov w3, #0xe001 // =57345 - movk w3, #0x7f, lsl #16 - dup v0.4s, w3 - mov w3, #0x2001 // =8193 - movk w3, #0x380, lsl #16 - dup v1.4s, w3 - mov x3, #0x40 // =64 + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0x2001 // =8193 + movk w3, #0x380, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 Lpoly_pointwise_montgomery_loop_start: - ldr q17, [x1, #0x10] - ldr q18, [x1, #0x20] - ldr q19, [x1, #0x30] - ldr q16, [x1], #0x40 - ldr q21, [x2, #0x10] - ldr q22, [x2, #0x20] - ldr q23, [x2, #0x30] - ldr q20, [x2], #0x40 - smull v24.2d, v16.2s, v20.2s - smull2 v25.2d, v16.4s, v20.4s - smull v26.2d, v17.2s, v21.2s - smull2 v27.2d, v17.4s, v21.4s - smull v28.2d, v18.2s, v22.2s - smull2 v29.2d, v18.4s, v22.4s - smull v30.2d, v19.2s, v23.2s - smull2 v31.2d, v19.4s, v23.4s - uzp1 v16.4s, v24.4s, v25.4s - mul v16.4s, v16.4s, v1.4s - smlsl v24.2d, v16.2s, v0.2s - smlsl2 v25.2d, v16.4s, v0.4s - uzp2 v16.4s, v24.4s, v25.4s - uzp1 v17.4s, v26.4s, v27.4s - mul v17.4s, v17.4s, v1.4s - smlsl v26.2d, v17.2s, v0.2s - smlsl2 v27.2d, v17.4s, v0.4s - uzp2 v17.4s, v26.4s, v27.4s - uzp1 v18.4s, v28.4s, v29.4s - mul v18.4s, v18.4s, v1.4s - smlsl v28.2d, v18.2s, v0.2s - smlsl2 v29.2d, v18.4s, v0.4s - uzp2 v18.4s, v28.4s, v29.4s - uzp1 v19.4s, v30.4s, v31.4s - mul v19.4s, v19.4s, v1.4s 
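The hunk continuing below finishes poly_pointwise_montgomery_asm: smull/smull2 form 64-bit coefficient products, and the uzp1/mul/smlsl/smlsl2/uzp2 tail is a vectorized Montgomery reduction. The prologue constants are visible in the diff: mov w3, #0xe001 plus movk w3, #0x7f, lsl #16 builds q = 0x7fe001 = 8380417 in v0, and mov w3, #0x2001 plus movk w3, #0x380, lsl #16 builds q^-1 mod 2^32 = 0x3802001 = 58728449 in v1. Per 64-bit lane the tail computes, in scalar C:

#include <stdint.h>

#define MLDSA_Q    8380417   /* dup v0.4s */
#define MLDSA_QINV 58728449  /* q^-1 mod 2^32, dup v1.4s */

/* Montgomery reduction of a 64-bit product: uzp1 extracts the low 32 bits,
 * mul multiplies by QINV, smlsl/smlsl2 subtract t*q from the accumulator,
 * and uzp2 keeps the now-exact high 32 bits. Valid for |a| < 2^31 * q;
 * returns a representative of a * 2^-32 mod q. */
static int32_t montgomery_reduce64(int64_t a)
{
    int32_t t = (int32_t)((uint32_t)a * (uint32_t)MLDSA_QINV);
    return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

The l4/l5/l7 accumulators earlier in this diff exploit the headroom: they smlal up to seven products per lane (the extra polynomials sit at 0x400-byte strides, hence the 0x3c0/0x7c0/0xbc0/... offsets after the post-increment) and reduce only once, staying far below the 2^31 * q bound.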
- smlsl v30.2d, v19.2s, v0.2s - smlsl2 v31.2d, v19.4s, v0.4s - uzp2 v19.4s, v30.4s, v31.4s - str q17, [x0, #0x10] - str q18, [x0, #0x20] - str q19, [x0, #0x30] - str q16, [x0], #0x40 - subs x3, x3, #0x4 - cbnz x3, Lpoly_pointwise_montgomery_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlsl v24.2d, v16.2s, v0.2s + smlsl2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlsl v26.2d, v17.2s, v0.2s + smlsl2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlsl v28.2d, v18.2s, v0.2s + smlsl2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlsl v30.2d, v19.2s, v0.2s + smlsl2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, Lpoly_pointwise_montgomery_loop_start ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_caddq_asm.S b/mldsa/src/native/aarch64/src/poly_caddq_asm.S index 2bb85d61c..92633dba2 100644 --- a/mldsa/src/native/aarch64/src/poly_caddq_asm.S +++ b/mldsa/src/native/aarch64/src/poly_caddq_asm.S @@ -17,30 +17,30 @@ MLD_ASM_FN_SYMBOL(poly_caddq_asm) .cfi_startproc - mov w9, #0xe001 // =57345 - movk w9, #0x7f, lsl #16 - dup v4.4s, w9 - mov x1, #0x10 // =16 + mov w9, #0xe001 // =57345 + movk w9, #0x7f, lsl #16 + dup v4.4s, w9 + mov x1, #0x10 // =16 Lpoly_caddq_loop: - ldr q0, [x0] - ldr q1, [x0, #0x10] - ldr q2, [x0, #0x20] - ldr q3, [x0, #0x30] - ushr v5.4s, v0.4s, #0x1f - mla v0.4s, v5.4s, v4.4s - ushr v5.4s, v1.4s, #0x1f - mla v1.4s, v5.4s, v4.4s - ushr v5.4s, v2.4s, #0x1f - mla v2.4s, v5.4s, v4.4s - ushr v5.4s, v3.4s, #0x1f - mla v3.4s, v5.4s, v4.4s - str q1, [x0, #0x10] - str q2, [x0, #0x20] - str q3, [x0, #0x30] - str q0, [x0], #0x40 - subs x1, x1, #0x1 - b.ne Lpoly_caddq_loop + ldr q0, [x0] + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ushr v5.4s, v0.4s, #0x1f + mla v0.4s, v5.4s, v4.4s + ushr v5.4s, v1.4s, #0x1f + mla v1.4s, v5.4s, v4.4s + ushr v5.4s, v2.4s, #0x1f + mla v2.4s, v5.4s, v4.4s + ushr v5.4s, v3.4s, #0x1f + mla v3.4s, v5.4s, v4.4s + str q1, [x0, #0x10] + str q2, [x0, #0x20] + str q3, [x0, #0x30] + str q0, [x0], #0x40 + subs x1, x1, #0x1 + b.ne Lpoly_caddq_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_chknorm_asm.S b/mldsa/src/native/aarch64/src/poly_chknorm_asm.S index 7049a6563..aa67ce0e8 100644 --- a/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +++ b/mldsa/src/native/aarch64/src/poly_chknorm_asm.S @@ -17,32 +17,32 @@ MLD_ASM_FN_SYMBOL(poly_chknorm_asm) .cfi_startproc - dup v20.4s, w1 - eor v21.16b, v21.16b, v21.16b - mov x2, #0x10 // =16 + dup v20.4s, w1 + eor v21.16b, v21.16b, v21.16b + mov x2, #0x10 // =16 Lpoly_chknorm_loop: - ldr q1, [x0, #0x10] - ldr q2, [x0, #0x20] - ldr q3, [x0, #0x30] - ldr q0, [x0], #0x40 - abs v1.4s, v1.4s - cmge v1.4s, v1.4s, v20.4s - orr v21.16b, v21.16b, v1.16b - abs v2.4s, v2.4s - cmge v2.4s, v2.4s, v20.4s - orr 
v21.16b, v21.16b, v2.16b - abs v3.4s, v3.4s - cmge v3.4s, v3.4s, v20.4s - orr v21.16b, v21.16b, v3.16b - abs v0.4s, v0.4s - cmge v0.4s, v0.4s, v20.4s - orr v21.16b, v21.16b, v0.16b - subs x2, x2, #0x1 - b.ne Lpoly_chknorm_loop - umaxv s21, v21.4s - fmov w0, s21 - and w0, w0, #0x1 + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ldr q0, [x0], #0x40 + abs v1.4s, v1.4s + cmge v1.4s, v1.4s, v20.4s + orr v21.16b, v21.16b, v1.16b + abs v2.4s, v2.4s + cmge v2.4s, v2.4s, v20.4s + orr v21.16b, v21.16b, v2.16b + abs v3.4s, v3.4s + cmge v3.4s, v3.4s, v20.4s + orr v21.16b, v21.16b, v3.16b + abs v0.4s, v0.4s + cmge v0.4s, v0.4s, v20.4s + orr v21.16b, v21.16b, v0.16b + subs x2, x2, #0x1 + b.ne Lpoly_chknorm_loop + umaxv s21, v21.4s + fmov w0, s21 + and w0, w0, #0x1 ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S index aa925880b..92066cb79 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S @@ -18,59 +18,59 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) .cfi_startproc - mov w4, #0xe001 // =57345 - movk w4, #0x7f, lsl #16 - dup v20.4s, w4 - mov w5, #0xe100 // =57600 - movk w5, #0x7b, lsl #16 - dup v21.4s, w5 - mov w7, #0xfe00 // =65024 - movk w7, #0x7, lsl #16 - dup v22.4s, w7 - mov w11, #0x401 // =1025 - movk w11, #0x4010, lsl #16 - dup v23.4s, w11 - mov x3, #0x10 // =16 + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0xe100 // =57600 + movk w5, #0x7b, lsl #16 + dup v21.4s, w5 + mov w7, #0xfe00 // =65024 + movk w7, #0x7, lsl #16 + dup v22.4s, w7 + mov w11, #0x401 // =1025 + movk w11, #0x4010, lsl #16 + dup v23.4s, w11 + mov x3, #0x10 // =16 Lpoly_decompose_32_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x12 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x12 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x12 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x12 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 - subs x3, x3, #0x1 - b.ne Lpoly_decompose_32_loop + ldr q0, [x1] + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + sqdmulh v5.4s, v1.4s, v23.4s + srshr v5.4s, v5.4s, #0x12 + cmgt v24.4s, v1.4s, v21.4s + mls v1.4s, v5.4s, v22.4s + bic v5.16b, v5.16b, v24.16b + add v1.4s, v1.4s, v24.4s + sqdmulh v6.4s, v2.4s, v23.4s + srshr v6.4s, v6.4s, #0x12 + cmgt v24.4s, v2.4s, v21.4s + mls v2.4s, v6.4s, v22.4s + bic v6.16b, v6.16b, v24.16b + add v2.4s, v2.4s, v24.4s + sqdmulh v7.4s, v3.4s, v23.4s + srshr v7.4s, v7.4s, #0x12 + cmgt v24.4s, v3.4s, v21.4s + mls v3.4s, v7.4s, v22.4s + bic v7.16b, v7.16b, v24.16b + add v3.4s, v3.4s, v24.4s + sqdmulh v4.4s, v0.4s, v23.4s + srshr v4.4s, v4.4s, #0x12 + cmgt v24.4s, v0.4s, v21.4s + mls v0.4s, v4.4s, v22.4s + bic v4.16b, v4.16b, v24.16b + add v0.4s, v0.4s, v24.4s 
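// The computation above is "decompose" for GAMMA2 = (q-1)/32: each
// coefficient a is split as a = a1*ALPHA + a0 with ALPHA = 2*GAMMA2 =
// 523776 (v22) and |a0| <= GAMMA2. The sqdmulh by 0x40100401 (v23)
// followed by srshr #18 approximates a1 = round(a/ALPHA), and the cmgt
// against q-1-GAMMA2 = 8118528 (v21) handles the wrap-around so the
// top range maps to a1 = 0, a0 = a - q. A rough scalar model of one
// lane (hedged sketch, not the exact reference code):
//
//   int32_t wrap = a > 8118528;                /* cmgt              */
//   int32_t a1 = (int32_t)(((int64_t)a * 1074791425 +
//                           ((int64_t)1 << 48)) >> 49);
//   int32_t a0 = a - a1 * 523776 - wrap;       /* mls + add         */
//   if (wrap) a1 = 0;                          /* bic               */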
+ str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + str q1, [x1, #0x10] + str q2, [x1, #0x20] + str q3, [x1, #0x30] + str q0, [x1], #0x40 + subs x3, x3, #0x1 + b.ne Lpoly_decompose_32_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S index 1127f1684..6e686c78f 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S @@ -18,59 +18,59 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) .cfi_startproc - mov w4, #0xe001 // =57345 - movk w4, #0x7f, lsl #16 - dup v20.4s, w4 - mov w5, #0x6c00 // =27648 - movk w5, #0x7e, lsl #16 - dup v21.4s, w5 - mov w7, #0xe800 // =59392 - movk w7, #0x2, lsl #16 - dup v22.4s, w7 - mov w11, #0x581 // =1409 - movk w11, #0x5816, lsl #16 - dup v23.4s, w11 - mov x3, #0x10 // =16 + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0x6c00 // =27648 + movk w5, #0x7e, lsl #16 + dup v21.4s, w5 + mov w7, #0xe800 // =59392 + movk w7, #0x2, lsl #16 + dup v22.4s, w7 + mov w11, #0x581 // =1409 + movk w11, #0x5816, lsl #16 + dup v23.4s, w11 + mov x3, #0x10 // =16 Lpoly_decompose_88_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x11 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x11 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x11 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x11 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 - subs x3, x3, #0x1 - b.ne Lpoly_decompose_88_loop + ldr q0, [x1] + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + sqdmulh v5.4s, v1.4s, v23.4s + srshr v5.4s, v5.4s, #0x11 + cmgt v24.4s, v1.4s, v21.4s + mls v1.4s, v5.4s, v22.4s + bic v5.16b, v5.16b, v24.16b + add v1.4s, v1.4s, v24.4s + sqdmulh v6.4s, v2.4s, v23.4s + srshr v6.4s, v6.4s, #0x11 + cmgt v24.4s, v2.4s, v21.4s + mls v2.4s, v6.4s, v22.4s + bic v6.16b, v6.16b, v24.16b + add v2.4s, v2.4s, v24.4s + sqdmulh v7.4s, v3.4s, v23.4s + srshr v7.4s, v7.4s, #0x11 + cmgt v24.4s, v3.4s, v21.4s + mls v3.4s, v7.4s, v22.4s + bic v7.16b, v7.16b, v24.16b + add v3.4s, v3.4s, v24.4s + sqdmulh v4.4s, v0.4s, v23.4s + srshr v4.4s, v4.4s, #0x11 + cmgt v24.4s, v0.4s, v21.4s + mls v0.4s, v4.4s, v22.4s + bic v4.16b, v4.16b, v24.16b + add v0.4s, v0.4s, v24.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + str q1, [x1, #0x10] + str q2, [x1, #0x20] + str q3, [x1, #0x30] + str q0, [x1], #0x40 + subs x3, x3, #0x1 + b.ne Lpoly_decompose_88_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S b/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S index ded2ffbc2..d4f7c9b8a 100644 --- a/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +++ b/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S @@ -18,76 +18,76 @@ 
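// use_hint for GAMMA2 = (q-1)/32: the hunk below recomputes decompose
// exactly as in poly_decompose_32_asm, then, wherever the hint bit is
// set, moves the high part one step in the direction of a0's sign,
// modulo 16 (the 0xf mask in v24). Per coefficient, roughly
// (illustrative C, not the project's API):
//
//   int32_t s = (a0 <= 0) ? -1 : 1;    /* cmle + orr #0x1 */
//   int32_t r = (a1 + s * hint) & 15;  /* mla + and       */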
MLD_ASM_FN_SYMBOL(poly_use_hint_32_asm) .cfi_startproc - mov w4, #0xe001 // =57345 - movk w4, #0x7f, lsl #16 - dup v20.4s, w4 - mov w5, #0xe100 // =57600 - movk w5, #0x7b, lsl #16 - dup v21.4s, w5 - mov w7, #0xfe00 // =65024 - movk w7, #0x7, lsl #16 - dup v22.4s, w7 - mov w11, #0x401 // =1025 - movk w11, #0x4010, lsl #16 - dup v23.4s, w11 - movi v24.4s, #0xf - mov x3, #0x10 // =16 + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0xe100 // =57600 + movk w5, #0x7b, lsl #16 + dup v21.4s, w5 + mov w7, #0xfe00 // =65024 + movk w7, #0x7, lsl #16 + dup v22.4s, w7 + mov w11, #0x401 // =1025 + movk w11, #0x4010, lsl #16 + dup v23.4s, w11 + movi v24.4s, #0xf + mov x3, #0x10 // =16 Lpoly_use_hint_32_loop: - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - ldr q0, [x1], #0x40 - ldr q5, [x2, #0x10] - ldr q6, [x2, #0x20] - ldr q7, [x2, #0x30] - ldr q4, [x2], #0x40 - sqdmulh v17.4s, v1.4s, v23.4s - srshr v17.4s, v17.4s, #0x12 - cmgt v25.4s, v1.4s, v21.4s - mls v1.4s, v17.4s, v22.4s - bic v17.16b, v17.16b, v25.16b - add v1.4s, v1.4s, v25.4s - cmle v1.4s, v1.4s, #0 - orr v1.4s, #0x1 - mla v17.4s, v1.4s, v5.4s - and v17.16b, v17.16b, v24.16b - sqdmulh v18.4s, v2.4s, v23.4s - srshr v18.4s, v18.4s, #0x12 - cmgt v25.4s, v2.4s, v21.4s - mls v2.4s, v18.4s, v22.4s - bic v18.16b, v18.16b, v25.16b - add v2.4s, v2.4s, v25.4s - cmle v2.4s, v2.4s, #0 - orr v2.4s, #0x1 - mla v18.4s, v2.4s, v6.4s - and v18.16b, v18.16b, v24.16b - sqdmulh v19.4s, v3.4s, v23.4s - srshr v19.4s, v19.4s, #0x12 - cmgt v25.4s, v3.4s, v21.4s - mls v3.4s, v19.4s, v22.4s - bic v19.16b, v19.16b, v25.16b - add v3.4s, v3.4s, v25.4s - cmle v3.4s, v3.4s, #0 - orr v3.4s, #0x1 - mla v19.4s, v3.4s, v7.4s - and v19.16b, v19.16b, v24.16b - sqdmulh v16.4s, v0.4s, v23.4s - srshr v16.4s, v16.4s, #0x12 - cmgt v25.4s, v0.4s, v21.4s - mls v0.4s, v16.4s, v22.4s - bic v16.16b, v16.16b, v25.16b - add v0.4s, v0.4s, v25.4s - cmle v0.4s, v0.4s, #0 - orr v0.4s, #0x1 - mla v16.4s, v0.4s, v4.4s - and v16.16b, v16.16b, v24.16b - str q17, [x0, #0x10] - str q18, [x0, #0x20] - str q19, [x0, #0x30] - str q16, [x0], #0x40 - subs x3, x3, #0x1 - b.ne Lpoly_use_hint_32_loop + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + ldr q0, [x1], #0x40 + ldr q5, [x2, #0x10] + ldr q6, [x2, #0x20] + ldr q7, [x2, #0x30] + ldr q4, [x2], #0x40 + sqdmulh v17.4s, v1.4s, v23.4s + srshr v17.4s, v17.4s, #0x12 + cmgt v25.4s, v1.4s, v21.4s + mls v1.4s, v17.4s, v22.4s + bic v17.16b, v17.16b, v25.16b + add v1.4s, v1.4s, v25.4s + cmle v1.4s, v1.4s, #0 + orr v1.4s, #0x1 + mla v17.4s, v1.4s, v5.4s + and v17.16b, v17.16b, v24.16b + sqdmulh v18.4s, v2.4s, v23.4s + srshr v18.4s, v18.4s, #0x12 + cmgt v25.4s, v2.4s, v21.4s + mls v2.4s, v18.4s, v22.4s + bic v18.16b, v18.16b, v25.16b + add v2.4s, v2.4s, v25.4s + cmle v2.4s, v2.4s, #0 + orr v2.4s, #0x1 + mla v18.4s, v2.4s, v6.4s + and v18.16b, v18.16b, v24.16b + sqdmulh v19.4s, v3.4s, v23.4s + srshr v19.4s, v19.4s, #0x12 + cmgt v25.4s, v3.4s, v21.4s + mls v3.4s, v19.4s, v22.4s + bic v19.16b, v19.16b, v25.16b + add v3.4s, v3.4s, v25.4s + cmle v3.4s, v3.4s, #0 + orr v3.4s, #0x1 + mla v19.4s, v3.4s, v7.4s + and v19.16b, v19.16b, v24.16b + sqdmulh v16.4s, v0.4s, v23.4s + srshr v16.4s, v16.4s, #0x12 + cmgt v25.4s, v0.4s, v21.4s + mls v0.4s, v16.4s, v22.4s + bic v16.16b, v16.16b, v25.16b + add v0.4s, v0.4s, v25.4s + cmle v0.4s, v0.4s, #0 + orr v0.4s, #0x1 + mla v16.4s, v0.4s, v4.4s + and v16.16b, v16.16b, v24.16b + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 
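// End of one iteration: 16 coefficients stored with the hint applied
// as a1' = (a1 + sign(a0)*hint) mod 16. The mod-44 variant in
// poly_use_hint_88_asm below cannot reduce with a plain mask, so it
// wraps with cmgt/bic (44 -> 0) and umin (-1, viewed unsigned, -> 43)
// instead.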
+ subs x3, x3, #0x1 + b.ne Lpoly_use_hint_32_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S b/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S index 0f7731344..315a978b0 100644 --- a/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +++ b/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S @@ -18,84 +18,84 @@ MLD_ASM_FN_SYMBOL(poly_use_hint_88_asm) .cfi_startproc - mov w4, #0xe001 // =57345 - movk w4, #0x7f, lsl #16 - dup v20.4s, w4 - mov w5, #0x6c00 // =27648 - movk w5, #0x7e, lsl #16 - dup v21.4s, w5 - mov w7, #0xe800 // =59392 - movk w7, #0x2, lsl #16 - dup v22.4s, w7 - mov w11, #0x581 // =1409 - movk w11, #0x5816, lsl #16 - dup v23.4s, w11 - movi v24.4s, #0x2b - mov x3, #0x10 // =16 + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0x6c00 // =27648 + movk w5, #0x7e, lsl #16 + dup v21.4s, w5 + mov w7, #0xe800 // =59392 + movk w7, #0x2, lsl #16 + dup v22.4s, w7 + mov w11, #0x581 // =1409 + movk w11, #0x5816, lsl #16 + dup v23.4s, w11 + movi v24.4s, #0x2b + mov x3, #0x10 // =16 Lpoly_use_hint_88_loop: - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - ldr q0, [x1], #0x40 - ldr q5, [x2, #0x10] - ldr q6, [x2, #0x20] - ldr q7, [x2, #0x30] - ldr q4, [x2], #0x40 - sqdmulh v17.4s, v1.4s, v23.4s - srshr v17.4s, v17.4s, #0x11 - cmgt v25.4s, v1.4s, v21.4s - mls v1.4s, v17.4s, v22.4s - bic v17.16b, v17.16b, v25.16b - add v1.4s, v1.4s, v25.4s - cmle v1.4s, v1.4s, #0 - orr v1.4s, #0x1 - mla v17.4s, v1.4s, v5.4s - cmgt v25.4s, v17.4s, v24.4s - bic v17.16b, v17.16b, v25.16b - umin v17.4s, v17.4s, v24.4s - sqdmulh v18.4s, v2.4s, v23.4s - srshr v18.4s, v18.4s, #0x11 - cmgt v25.4s, v2.4s, v21.4s - mls v2.4s, v18.4s, v22.4s - bic v18.16b, v18.16b, v25.16b - add v2.4s, v2.4s, v25.4s - cmle v2.4s, v2.4s, #0 - orr v2.4s, #0x1 - mla v18.4s, v2.4s, v6.4s - cmgt v25.4s, v18.4s, v24.4s - bic v18.16b, v18.16b, v25.16b - umin v18.4s, v18.4s, v24.4s - sqdmulh v19.4s, v3.4s, v23.4s - srshr v19.4s, v19.4s, #0x11 - cmgt v25.4s, v3.4s, v21.4s - mls v3.4s, v19.4s, v22.4s - bic v19.16b, v19.16b, v25.16b - add v3.4s, v3.4s, v25.4s - cmle v3.4s, v3.4s, #0 - orr v3.4s, #0x1 - mla v19.4s, v3.4s, v7.4s - cmgt v25.4s, v19.4s, v24.4s - bic v19.16b, v19.16b, v25.16b - umin v19.4s, v19.4s, v24.4s - sqdmulh v16.4s, v0.4s, v23.4s - srshr v16.4s, v16.4s, #0x11 - cmgt v25.4s, v0.4s, v21.4s - mls v0.4s, v16.4s, v22.4s - bic v16.16b, v16.16b, v25.16b - add v0.4s, v0.4s, v25.4s - cmle v0.4s, v0.4s, #0 - orr v0.4s, #0x1 - mla v16.4s, v0.4s, v4.4s - cmgt v25.4s, v16.4s, v24.4s - bic v16.16b, v16.16b, v25.16b - umin v16.4s, v16.4s, v24.4s - str q17, [x0, #0x10] - str q18, [x0, #0x20] - str q19, [x0, #0x30] - str q16, [x0], #0x40 - subs x3, x3, #0x1 - b.ne Lpoly_use_hint_88_loop + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + ldr q0, [x1], #0x40 + ldr q5, [x2, #0x10] + ldr q6, [x2, #0x20] + ldr q7, [x2, #0x30] + ldr q4, [x2], #0x40 + sqdmulh v17.4s, v1.4s, v23.4s + srshr v17.4s, v17.4s, #0x11 + cmgt v25.4s, v1.4s, v21.4s + mls v1.4s, v17.4s, v22.4s + bic v17.16b, v17.16b, v25.16b + add v1.4s, v1.4s, v25.4s + cmle v1.4s, v1.4s, #0 + orr v1.4s, #0x1 + mla v17.4s, v1.4s, v5.4s + cmgt v25.4s, v17.4s, v24.4s + bic v17.16b, v17.16b, v25.16b + umin v17.4s, v17.4s, v24.4s + sqdmulh v18.4s, v2.4s, v23.4s + srshr v18.4s, v18.4s, #0x11 + cmgt v25.4s, v2.4s, v21.4s + mls v2.4s, v18.4s, v22.4s + bic v18.16b, v18.16b, v25.16b + add v2.4s, v2.4s, v25.4s + cmle v2.4s, v2.4s, #0 + orr v2.4s, #0x1 + mla v18.4s, v2.4s, v6.4s + cmgt v25.4s, v18.4s, 
v24.4s + bic v18.16b, v18.16b, v25.16b + umin v18.4s, v18.4s, v24.4s + sqdmulh v19.4s, v3.4s, v23.4s + srshr v19.4s, v19.4s, #0x11 + cmgt v25.4s, v3.4s, v21.4s + mls v3.4s, v19.4s, v22.4s + bic v19.16b, v19.16b, v25.16b + add v3.4s, v3.4s, v25.4s + cmle v3.4s, v3.4s, #0 + orr v3.4s, #0x1 + mla v19.4s, v3.4s, v7.4s + cmgt v25.4s, v19.4s, v24.4s + bic v19.16b, v19.16b, v25.16b + umin v19.4s, v19.4s, v24.4s + sqdmulh v16.4s, v0.4s, v23.4s + srshr v16.4s, v16.4s, #0x11 + cmgt v25.4s, v0.4s, v21.4s + mls v0.4s, v16.4s, v22.4s + bic v16.16b, v16.16b, v25.16b + add v0.4s, v0.4s, v25.4s + cmle v0.4s, v0.4s, #0 + orr v0.4s, #0x1 + mla v16.4s, v0.4s, v4.4s + cmgt v25.4s, v16.4s, v24.4s + bic v16.16b, v16.16b, v25.16b + umin v16.4s, v16.4s, v24.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x1 + b.ne Lpoly_use_hint_88_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S b/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S index 6386305fa..34195a43b 100644 --- a/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +++ b/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S @@ -19,45 +19,45 @@ MLD_ASM_FN_SYMBOL(polyz_unpack_17_asm) .cfi_startproc - ldr q24, [x2] - ldr q25, [x2, #0x10] - ldr q26, [x2, #0x20] - ldr q27, [x2, #0x30] - mov x3, #0xfe00000000 // =1090921693184 - mov v28.d[0], x3 - mov x3, #0xfc // =252 - movk x3, #0xfa, lsl #32 - mov v28.d[1], x3 - movi v29.4s, #0x3, msl #16 - movi v30.4s, #0x2, lsl #16 - mov x9, #0x10 // =16 + ldr q24, [x2] + ldr q25, [x2, #0x10] + ldr q26, [x2, #0x20] + ldr q27, [x2, #0x30] + mov x3, #0xfe00000000 // =1090921693184 + mov v28.d[0], x3 + mov x3, #0xfc // =252 + movk x3, #0xfa, lsl #32 + mov v28.d[1], x3 + movi v29.4s, #0x3, msl #16 + movi v30.4s, #0x2, lsl #16 + mov x9, #0x10 // =16 Lpolyz_unpack_17_loop: - ld1 { v0.16b, v1.16b }, [x1] - add x1, x1, #0x14 - ld1 { v2.16b }, [x1], #16 - tbl v4.16b, { v0.16b }, v24.16b - tbl v5.16b, { v0.16b, v1.16b }, v25.16b - tbl v6.16b, { v1.16b }, v26.16b - tbl v7.16b, { v1.16b, v2.16b }, v27.16b - ushl v4.4s, v4.4s, v28.4s - and v4.16b, v4.16b, v29.16b - sub v4.4s, v30.4s, v4.4s - ushl v5.4s, v5.4s, v28.4s - and v5.16b, v5.16b, v29.16b - sub v5.4s, v30.4s, v5.4s - ushl v6.4s, v6.4s, v28.4s - and v6.16b, v6.16b, v29.16b - sub v6.4s, v30.4s, v6.4s - ushl v7.4s, v7.4s, v28.4s - and v7.16b, v7.16b, v29.16b - sub v7.4s, v30.4s, v7.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - subs x9, x9, #0x1 - b.ne Lpolyz_unpack_17_loop + ld1 { v0.16b, v1.16b }, [x1] + add x1, x1, #0x14 + ld1 { v2.16b }, [x1], #16 + tbl v4.16b, { v0.16b }, v24.16b + tbl v5.16b, { v0.16b, v1.16b }, v25.16b + tbl v6.16b, { v1.16b }, v26.16b + tbl v7.16b, { v1.16b, v2.16b }, v27.16b + ushl v4.4s, v4.4s, v28.4s + and v4.16b, v4.16b, v29.16b + sub v4.4s, v30.4s, v4.4s + ushl v5.4s, v5.4s, v28.4s + and v5.16b, v5.16b, v29.16b + sub v5.4s, v30.4s, v5.4s + ushl v6.4s, v6.4s, v28.4s + and v6.16b, v6.16b, v29.16b + sub v6.4s, v30.4s, v6.4s + ushl v7.4s, v7.4s, v28.4s + and v7.16b, v7.16b, v29.16b + sub v7.4s, v30.4s, v7.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + subs x9, x9, #0x1 + b.ne Lpolyz_unpack_17_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S b/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S index 6a458b926..95bc1d05c 100644 --- a/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +++ 
b/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S @@ -19,42 +19,42 @@ MLD_ASM_FN_SYMBOL(polyz_unpack_19_asm) .cfi_startproc - ldr q24, [x2] - ldr q25, [x2, #0x10] - ldr q26, [x2, #0x20] - ldr q27, [x2, #0x30] - mov x3, #0xfc00000000 // =1082331758592 - dup v28.2d, x3 - movi v29.4s, #0xf, msl #16 - movi v30.4s, #0x8, lsl #16 - mov x9, #0x10 // =16 + ldr q24, [x2] + ldr q25, [x2, #0x10] + ldr q26, [x2, #0x20] + ldr q27, [x2, #0x30] + mov x3, #0xfc00000000 // =1082331758592 + dup v28.2d, x3 + movi v29.4s, #0xf, msl #16 + movi v30.4s, #0x8, lsl #16 + mov x9, #0x10 // =16 Lpolyz_unpack_19_loop: - ld1 { v0.16b, v1.16b }, [x1] - add x1, x1, #0x18 - ld1 { v2.16b }, [x1], #16 - tbl v4.16b, { v0.16b }, v24.16b - tbl v5.16b, { v0.16b, v1.16b }, v25.16b - tbl v6.16b, { v1.16b }, v26.16b - tbl v7.16b, { v1.16b, v2.16b }, v27.16b - ushl v4.4s, v4.4s, v28.4s - and v4.16b, v4.16b, v29.16b - sub v4.4s, v30.4s, v4.4s - ushl v5.4s, v5.4s, v28.4s - and v5.16b, v5.16b, v29.16b - sub v5.4s, v30.4s, v5.4s - ushl v6.4s, v6.4s, v28.4s - and v6.16b, v6.16b, v29.16b - sub v6.4s, v30.4s, v6.4s - ushl v7.4s, v7.4s, v28.4s - and v7.16b, v7.16b, v29.16b - sub v7.4s, v30.4s, v7.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - subs x9, x9, #0x1 - b.ne Lpolyz_unpack_19_loop + ld1 { v0.16b, v1.16b }, [x1] + add x1, x1, #0x18 + ld1 { v2.16b }, [x1], #16 + tbl v4.16b, { v0.16b }, v24.16b + tbl v5.16b, { v0.16b, v1.16b }, v25.16b + tbl v6.16b, { v1.16b }, v26.16b + tbl v7.16b, { v1.16b, v2.16b }, v27.16b + ushl v4.4s, v4.4s, v28.4s + and v4.16b, v4.16b, v29.16b + sub v4.4s, v30.4s, v4.4s + ushl v5.4s, v5.4s, v28.4s + and v5.16b, v5.16b, v29.16b + sub v5.4s, v30.4s, v5.4s + ushl v6.4s, v6.4s, v28.4s + and v6.16b, v6.16b, v29.16b + sub v6.4s, v30.4s, v6.4s + ushl v7.4s, v7.4s, v28.4s + and v7.16b, v7.16b, v29.16b + sub v7.4s, v30.4s, v7.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + subs x9, x9, #0x1 + b.ne Lpolyz_unpack_19_loop ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/rej_uniform_asm.S b/mldsa/src/native/aarch64/src/rej_uniform_asm.S index 3ebd80030..6c3225e4a 100644 --- a/mldsa/src/native/aarch64/src/rej_uniform_asm.S +++ b/mldsa/src/native/aarch64/src/rej_uniform_asm.S @@ -19,163 +19,163 @@ MLD_ASM_FN_SYMBOL(rej_uniform_asm) .cfi_startproc - sub sp, sp, #0x440 + sub sp, sp, #0x440 .cfi_adjust_cfa_offset 0x440 - mov x7, #0x1 // =1 - movk x7, #0x2, lsl #32 - mov v31.d[0], x7 - mov x7, #0x4 // =4 - movk x7, #0x8, lsl #32 - mov v31.d[1], x7 - mov w7, #0xe001 // =57345 - movk w7, #0x7f, lsl #16 - dup v30.4s, w7 - mov x8, sp - mov x7, x8 - mov x11, #0x0 // =0 - eor v16.16b, v16.16b, v16.16b + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #32 + mov v31.d[0], x7 + mov x7, #0x4 // =4 + movk x7, #0x8, lsl #32 + mov v31.d[1], x7 + mov w7, #0xe001 // =57345 + movk w7, #0x7f, lsl #16 + dup v30.4s, w7 + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b Lrej_uniform_initial_zero: - str q16, [x7], #0x40 - stur q16, [x7, #-0x30] - stur q16, [x7, #-0x20] - stur q16, [x7, #-0x10] - add x11, x11, #0x10 - cmp x11, #0x100 - b.lt Lrej_uniform_initial_zero - mov x7, x8 - mov x9, #0x0 // =0 - mov x4, #0x100 // =256 - cmp x2, #0x30 - b.lo Lrej_uniform_loop48_end + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt Lrej_uniform_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 + cmp x2, #0x30 + b.lo 
Lrej_uniform_loop48_end Lrej_uniform_loop48: - cmp x9, x4 - b.hs Lrej_uniform_memory_copy - sub x2, x2, #0x30 - ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 - movi v4.16b, #0x80 - bic v2.16b, v2.16b, v4.16b - zip1 v4.16b, v0.16b, v1.16b - zip2 v5.16b, v0.16b, v1.16b - ushll v6.8h, v2.8b, #0x0 - ushll2 v7.8h, v2.16b, #0x0 - zip1 v16.8h, v4.8h, v6.8h - zip2 v17.8h, v4.8h, v6.8h - zip1 v18.8h, v5.8h, v7.8h - zip2 v19.8h, v5.8h, v7.8h - cmhi v4.4s, v30.4s, v16.4s - cmhi v5.4s, v30.4s, v17.4s - cmhi v6.4s, v30.4s, v18.4s - cmhi v7.4s, v30.4s, v19.4s - and v4.16b, v4.16b, v31.16b - and v5.16b, v5.16b, v31.16b - and v6.16b, v6.16b, v31.16b - and v7.16b, v7.16b, v31.16b - uaddlv d20, v4.4s - uaddlv d21, v5.4s - uaddlv d22, v6.4s - uaddlv d23, v7.4s - fmov x12, d20 - fmov x13, d21 - fmov x14, d22 - fmov x15, d23 - ldr q24, [x3, x12, lsl #4] - ldr q25, [x3, x13, lsl #4] - ldr q26, [x3, x14, lsl #4] - ldr q27, [x3, x15, lsl #4] - cnt v4.16b, v4.16b - cnt v5.16b, v5.16b - cnt v6.16b, v6.16b - cnt v7.16b, v7.16b - uaddlv d20, v4.4s - uaddlv d21, v5.4s - uaddlv d22, v6.4s - uaddlv d23, v7.4s - fmov x12, d20 - fmov x13, d21 - fmov x14, d22 - fmov x15, d23 - tbl v16.16b, { v16.16b }, v24.16b - tbl v17.16b, { v17.16b }, v25.16b - tbl v18.16b, { v18.16b }, v26.16b - tbl v19.16b, { v19.16b }, v27.16b - st1 { v16.4s }, [x7] - add x7, x7, x12, lsl #2 - st1 { v17.4s }, [x7] - add x7, x7, x13, lsl #2 - st1 { v18.4s }, [x7] - add x7, x7, x14, lsl #2 - st1 { v19.4s }, [x7] - add x7, x7, x15, lsl #2 - add x12, x12, x13 - add x14, x14, x15 - add x9, x9, x12 - add x9, x9, x14 - cmp x2, #0x30 - b.hs Lrej_uniform_loop48 + cmp x9, x4 + b.hs Lrej_uniform_memory_copy + sub x2, x2, #0x30 + ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 + movi v4.16b, #0x80 + bic v2.16b, v2.16b, v4.16b + zip1 v4.16b, v0.16b, v1.16b + zip2 v5.16b, v0.16b, v1.16b + ushll v6.8h, v2.8b, #0x0 + ushll2 v7.8h, v2.16b, #0x0 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + zip1 v18.8h, v5.8h, v7.8h + zip2 v19.8h, v5.8h, v7.8h + cmhi v4.4s, v30.4s, v16.4s + cmhi v5.4s, v30.4s, v17.4s + cmhi v6.4s, v30.4s, v18.4s + cmhi v7.4s, v30.4s, v19.4s + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + and v6.16b, v6.16b, v31.16b + and v7.16b, v7.16b, v31.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + uaddlv d22, v6.4s + uaddlv d23, v7.4s + fmov x12, d20 + fmov x13, d21 + fmov x14, d22 + fmov x15, d23 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + ldr q26, [x3, x14, lsl #4] + ldr q27, [x3, x15, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + cnt v6.16b, v6.16b + cnt v7.16b, v7.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + uaddlv d22, v6.4s + uaddlv d23, v7.4s + fmov x12, d20 + fmov x13, d21 + fmov x14, d22 + fmov x15, d23 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + tbl v18.16b, { v18.16b }, v26.16b + tbl v19.16b, { v19.16b }, v27.16b + st1 { v16.4s }, [x7] + add x7, x7, x12, lsl #2 + st1 { v17.4s }, [x7] + add x7, x7, x13, lsl #2 + st1 { v18.4s }, [x7] + add x7, x7, x14, lsl #2 + st1 { v19.4s }, [x7] + add x7, x7, x15, lsl #2 + add x12, x12, x13 + add x14, x14, x15 + add x9, x9, x12 + add x9, x9, x14 + cmp x2, #0x30 + b.hs Lrej_uniform_loop48 Lrej_uniform_loop48_end: - cmp x9, x4 - b.hs Lrej_uniform_memory_copy - cmp x2, #0x18 - b.lo Lrej_uniform_memory_copy - sub x2, x2, #0x18 - ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24 - movi v4.16b, #0x80 - bic v2.16b, v2.16b, v4.16b - zip1 v4.16b, v0.16b, v1.16b - ushll v6.8h, v2.8b, #0x0 - zip1 v16.8h, v4.8h, v6.8h - zip2 v17.8h, v4.8h, v6.8h - cmhi v4.4s, v30.4s, 
v16.4s - cmhi v5.4s, v30.4s, v17.4s - and v4.16b, v4.16b, v31.16b - and v5.16b, v5.16b, v31.16b - uaddlv d20, v4.4s - uaddlv d21, v5.4s - fmov x12, d20 - fmov x13, d21 - ldr q24, [x3, x12, lsl #4] - ldr q25, [x3, x13, lsl #4] - cnt v4.16b, v4.16b - cnt v5.16b, v5.16b - uaddlv d20, v4.4s - uaddlv d21, v5.4s - fmov x12, d20 - fmov x13, d21 - tbl v16.16b, { v16.16b }, v24.16b - tbl v17.16b, { v17.16b }, v25.16b - str q16, [x7] - add x7, x7, x12, lsl #2 - str q17, [x7] - add x7, x7, x13, lsl #2 - add x9, x9, x12 - add x9, x9, x13 + cmp x9, x4 + b.hs Lrej_uniform_memory_copy + cmp x2, #0x18 + b.lo Lrej_uniform_memory_copy + sub x2, x2, #0x18 + ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24 + movi v4.16b, #0x80 + bic v2.16b, v2.16b, v4.16b + zip1 v4.16b, v0.16b, v1.16b + ushll v6.8h, v2.8b, #0x0 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + cmhi v4.4s, v30.4s, v16.4s + cmhi v5.4s, v30.4s, v17.4s + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + fmov x12, d20 + fmov x13, d21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + fmov x12, d20 + fmov x13, d21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #2 + str q17, [x7] + add x7, x7, x13, lsl #2 + add x9, x9, x12 + add x9, x9, x13 Lrej_uniform_memory_copy: - cmp x9, x4 - csel x9, x9, x4, lo - mov x11, #0x0 // =0 - mov x7, x8 + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 // =0 + mov x7, x8 Lrej_uniform_final_copy: - ldr q16, [x7], #0x40 - ldur q17, [x7, #-0x30] - ldur q18, [x7, #-0x20] - ldur q19, [x7, #-0x10] - str q16, [x0], #0x40 - stur q17, [x0, #-0x30] - stur q18, [x0, #-0x20] - stur q19, [x0, #-0x10] - add x11, x11, #0x10 - cmp x11, #0x100 - b.lt Lrej_uniform_final_copy - mov x0, x9 - b Lrej_uniform_return + ldr q16, [x7], #0x40 + ldur q17, [x7, #-0x30] + ldur q18, [x7, #-0x20] + ldur q19, [x7, #-0x10] + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt Lrej_uniform_final_copy + mov x0, x9 + b Lrej_uniform_return Lrej_uniform_return: - add sp, sp, #0x440 + add sp, sp, #0x440 .cfi_adjust_cfa_offset -0x440 ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S b/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S index 1fe015b80..299042a99 100644 --- a/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +++ b/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S @@ -20,107 +20,107 @@ MLD_ASM_FN_SYMBOL(rej_uniform_eta2_asm) .cfi_startproc - sub sp, sp, #0x240 + sub sp, sp, #0x240 .cfi_adjust_cfa_offset 0x240 - mov x7, #0x1 // =1 - movk x7, #0x2, lsl #16 - movk x7, #0x4, lsl #32 - movk x7, #0x8, lsl #48 - mov v31.d[0], x7 - mov x7, #0x10 // =16 - movk x7, #0x20, lsl #16 - movk x7, #0x40, lsl #32 - movk x7, #0x80, lsl #48 - mov v31.d[1], x7 - movi v30.8h, #0xf - mov x8, sp - mov x7, x8 - mov x11, #0x0 // =0 - eor v16.16b, v16.16b, v16.16b + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 // =16 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + movi v30.8h, #0xf + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b Lrej_uniform_eta2_initial_zero: - str q16, [x7], #0x40 - stur q16, [x7, #-0x30] - stur q16, [x7, #-0x20] - stur q16, [x7, #-0x10] - add x11, x11, #0x20 - 
cmp x11, #0x100 - b.lt Lrej_uniform_eta2_initial_zero - mov x7, x8 - mov x9, #0x0 // =0 - mov x4, #0x100 // =256 + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt Lrej_uniform_eta2_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 Lrej_uniform_eta2_loop8: - cmp x9, x4 - b.hs Lrej_uniform_eta2_memory_copy - sub x2, x2, #0x8 - ld1 { v0.8b }, [x1], #8 - movi v26.8b, #0xf - and v27.8b, v0.8b, v26.8b - ushr v28.8b, v0.8b, #0x4 - zip1 v26.8b, v27.8b, v28.8b - zip2 v29.8b, v27.8b, v28.8b - ushll v16.8h, v26.8b, #0x0 - ushll v17.8h, v29.8b, #0x0 - cmhi v4.8h, v30.8h, v16.8h - cmhi v5.8h, v30.8h, v17.8h - and v4.16b, v4.16b, v31.16b - and v5.16b, v5.16b, v31.16b - uaddlv s20, v4.8h - uaddlv s21, v5.8h - fmov w12, s20 - fmov w13, s21 - ldr q24, [x3, x12, lsl #4] - ldr q25, [x3, x13, lsl #4] - cnt v4.16b, v4.16b - cnt v5.16b, v5.16b - uaddlv s20, v4.8h - uaddlv s21, v5.8h - fmov w12, s20 - fmov w13, s21 - tbl v16.16b, { v16.16b }, v24.16b - tbl v17.16b, { v17.16b }, v25.16b - st1 { v16.8h }, [x7] - add x7, x7, x12, lsl #1 - st1 { v17.8h }, [x7] - add x7, x7, x13, lsl #1 - add x12, x12, x13 - add x9, x9, x12 - cmp x2, #0x8 - b.hs Lrej_uniform_eta2_loop8 + cmp x9, x4 + b.hs Lrej_uniform_eta2_memory_copy + sub x2, x2, #0x8 + ld1 { v0.8b }, [x1], #8 + movi v26.8b, #0xf + and v27.8b, v0.8b, v26.8b + ushr v28.8b, v0.8b, #0x4 + zip1 v26.8b, v27.8b, v28.8b + zip2 v29.8b, v27.8b, v28.8b + ushll v16.8h, v26.8b, #0x0 + ushll v17.8h, v29.8b, #0x0 + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + st1 { v16.8h }, [x7] + add x7, x7, x12, lsl #1 + st1 { v17.8h }, [x7] + add x7, x7, x13, lsl #1 + add x12, x12, x13 + add x9, x9, x12 + cmp x2, #0x8 + b.hs Lrej_uniform_eta2_loop8 Lrej_uniform_eta2_memory_copy: - cmp x9, x4 - csel x9, x9, x4, lo - mov w7, #0x199a // =6554 - dup v26.8h, w7 - movi v27.8h, #0x5 - movi v7.8h, #0x2 - mov x11, #0x0 // =0 - mov x7, x8 + cmp x9, x4 + csel x9, x9, x4, lo + mov w7, #0x199a // =6554 + dup v26.8h, w7 + movi v27.8h, #0x5 + movi v7.8h, #0x2 + mov x11, #0x0 // =0 + mov x7, x8 Lrej_uniform_eta2_final_copy: - ldr q16, [x7], #0x20 - ldur q18, [x7, #-0x10] - sqdmulh v28.8h, v16.8h, v26.8h - mls v16.8h, v28.8h, v27.8h - sqdmulh v28.8h, v18.8h, v26.8h - mls v18.8h, v28.8h, v27.8h - sub v16.8h, v7.8h, v16.8h - sub v18.8h, v7.8h, v18.8h - sshll2 v17.4s, v16.8h, #0x0 - sshll v16.4s, v16.4h, #0x0 - sshll2 v19.4s, v18.8h, #0x0 - sshll v18.4s, v18.4h, #0x0 - str q16, [x0], #0x40 - stur q17, [x0, #-0x30] - stur q18, [x0, #-0x20] - stur q19, [x0, #-0x10] - add x11, x11, #0x10 - cmp x11, #0x100 - b.lt Lrej_uniform_eta2_final_copy - mov x0, x9 - add sp, sp, #0x240 + ldr q16, [x7], #0x20 + ldur q18, [x7, #-0x10] + sqdmulh v28.8h, v16.8h, v26.8h + mls v16.8h, v28.8h, v27.8h + sqdmulh v28.8h, v18.8h, v26.8h + mls v18.8h, v28.8h, v27.8h + sub v16.8h, v7.8h, v16.8h + sub v18.8h, v7.8h, v18.8h + sshll2 v17.4s, v16.8h, #0x0 + sshll v16.4s, v16.4h, #0x0 + sshll2 v19.4s, v18.8h, #0x0 + sshll v18.4s, v18.4h, #0x0 + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add 
x11, x11, #0x10 + cmp x11, #0x100 + b.lt Lrej_uniform_eta2_final_copy + mov x0, x9 + add sp, sp, #0x240 .cfi_adjust_cfa_offset -0x240 ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S b/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S index 2426bbd1b..e1aef466f 100644 --- a/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +++ b/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S @@ -20,100 +20,100 @@ MLD_ASM_FN_SYMBOL(rej_uniform_eta4_asm) .cfi_startproc - sub sp, sp, #0x240 + sub sp, sp, #0x240 .cfi_adjust_cfa_offset 0x240 - mov x7, #0x1 // =1 - movk x7, #0x2, lsl #16 - movk x7, #0x4, lsl #32 - movk x7, #0x8, lsl #48 - mov v31.d[0], x7 - mov x7, #0x10 // =16 - movk x7, #0x20, lsl #16 - movk x7, #0x40, lsl #32 - movk x7, #0x80, lsl #48 - mov v31.d[1], x7 - movi v30.8h, #0x9 - movi v7.8h, #0x4 - mov x8, sp - mov x7, x8 - mov x11, #0x0 // =0 - eor v16.16b, v16.16b, v16.16b + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 // =16 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + movi v30.8h, #0x9 + movi v7.8h, #0x4 + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b Lrej_uniform_eta4_initial_zero: - str q16, [x7], #0x40 - stur q16, [x7, #-0x30] - stur q16, [x7, #-0x20] - stur q16, [x7, #-0x10] - add x11, x11, #0x20 - cmp x11, #0x100 - b.lt Lrej_uniform_eta4_initial_zero - mov x7, x8 - mov x9, #0x0 // =0 - mov x4, #0x100 // =256 + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt Lrej_uniform_eta4_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 Lrej_uniform_eta4_loop8: - cmp x9, x4 - b.hs Lrej_uniform_eta4_memory_copy - sub x2, x2, #0x8 - ld1 { v0.8b }, [x1], #8 - movi v26.8b, #0xf - and v27.8b, v0.8b, v26.8b - ushr v28.8b, v0.8b, #0x4 - zip1 v26.8b, v27.8b, v28.8b - zip2 v29.8b, v27.8b, v28.8b - ushll v16.8h, v26.8b, #0x0 - ushll v17.8h, v29.8b, #0x0 - cmhi v4.8h, v30.8h, v16.8h - cmhi v5.8h, v30.8h, v17.8h - and v4.16b, v4.16b, v31.16b - and v5.16b, v5.16b, v31.16b - uaddlv s20, v4.8h - uaddlv s21, v5.8h - fmov w12, s20 - fmov w13, s21 - ldr q24, [x3, x12, lsl #4] - ldr q25, [x3, x13, lsl #4] - cnt v4.16b, v4.16b - cnt v5.16b, v5.16b - uaddlv s20, v4.8h - uaddlv s21, v5.8h - fmov w12, s20 - fmov w13, s21 - tbl v16.16b, { v16.16b }, v24.16b - tbl v17.16b, { v17.16b }, v25.16b - st1 { v16.8h }, [x7] - add x7, x7, x12, lsl #1 - st1 { v17.8h }, [x7] - add x7, x7, x13, lsl #1 - add x12, x12, x13 - add x9, x9, x12 - cmp x2, #0x8 - b.hs Lrej_uniform_eta4_loop8 + cmp x9, x4 + b.hs Lrej_uniform_eta4_memory_copy + sub x2, x2, #0x8 + ld1 { v0.8b }, [x1], #8 + movi v26.8b, #0xf + and v27.8b, v0.8b, v26.8b + ushr v28.8b, v0.8b, #0x4 + zip1 v26.8b, v27.8b, v28.8b + zip2 v29.8b, v27.8b, v28.8b + ushll v16.8h, v26.8b, #0x0 + ushll v17.8h, v29.8b, #0x0 + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + st1 { v16.8h }, [x7] + add x7, x7, x12, lsl #1 + st1 { v17.8h }, [x7] + add x7, x7, x13, lsl #1 + add x12, x12, x13 + add 
x9, x9, x12 + cmp x2, #0x8 + b.hs Lrej_uniform_eta4_loop8 Lrej_uniform_eta4_memory_copy: - cmp x9, x4 - csel x9, x9, x4, lo - mov x11, #0x0 // =0 - mov x7, x8 + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 // =0 + mov x7, x8 Lrej_uniform_eta4_final_copy: - ldr q16, [x7], #0x20 - ldur q18, [x7, #-0x10] - sub v16.8h, v7.8h, v16.8h - sub v18.8h, v7.8h, v18.8h - sshll2 v17.4s, v16.8h, #0x0 - sshll v16.4s, v16.4h, #0x0 - sshll2 v19.4s, v18.8h, #0x0 - sshll v18.4s, v18.4h, #0x0 - str q16, [x0], #0x40 - stur q17, [x0, #-0x30] - stur q18, [x0, #-0x20] - stur q19, [x0, #-0x10] - add x11, x11, #0x10 - cmp x11, #0x100 - b.lt Lrej_uniform_eta4_final_copy - mov x0, x9 - add sp, sp, #0x240 + ldr q16, [x7], #0x20 + ldur q18, [x7, #-0x10] + sub v16.8h, v7.8h, v16.8h + sub v18.8h, v7.8h, v18.8h + sshll2 v17.4s, v16.8h, #0x0 + sshll v16.4s, v16.4h, #0x0 + sshll2 v19.4s, v18.8h, #0x0 + sshll v18.4s, v18.4h, #0x0 + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt Lrej_uniform_eta4_final_copy + mov x0, x9 + add sp, sp, #0x240 .cfi_adjust_cfa_offset -0x240 ret .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/intt.S b/mldsa/src/native/x86_64/src/intt.S index 50b572819..0a0b52dbb 100644 --- a/mldsa/src/native/x86_64/src/intt.S +++ b/mldsa/src/native/x86_64/src/intt.S @@ -33,2271 +33,2271 @@ MLD_ASM_FN_SYMBOL(invntt_avx2) .cfi_startproc - vmovdqa (%rsi), %ymm0 - vmovdqa (%rdi), %ymm4 - vmovdqa 0x20(%rdi), %ymm5 - vmovdqa 0x40(%rdi), %ymm6 - vmovdqa 0x60(%rdi), %ymm7 - vmovdqa 0x80(%rdi), %ymm8 - vmovdqa 0xa0(%rdi), %ymm9 - vmovdqa 0xc0(%rdi), %ymm10 - vmovdqa 0xe0(%rdi), %ymm11 - vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm5, %ymm12 - vpaddd %ymm5, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm9, %ymm12 - vpaddd %ymm9, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 
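/*
 * Each vpsubd/vpaddd pair plus the four vpmuldq, two vpsubd and the
 * vmovshdup/vpblendd merge around it form one inverse-NTT butterfly
 * with a Montgomery multiplication by a twiddle factor zeta:
 *
 *   t = b - a;   a' = a + b;   b' = t * zeta * 2^-32 mod q
 *
 * Here ymm1/ymm3 hold zeta * q^-1 mod 2^32 (the ZETAS_QINV table),
 * ymm2/ymm15 hold zeta itself, and ymm0 (loaded from (%rsi) at
 * function entry) supplies q = 8380417. The reduction is the same
 * idiom sketched for the AArch64 pointwise multiply above,
 *   m = low32(t * (zeta * qinv));  b' = (t*zeta - (int64_t)m*q) >> 32,
 * applied to even and odd 32-bit lanes separately and re-merged with
 * vmovshdup/vpblendd.
 */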
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm10, %ymm11, %ymm12 - vpaddd %ymm11, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm6, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm6, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm6, %ymm6 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm6, %ymm6 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] - vpsubd %ymm5, %ymm7, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm10, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm9, %ymm11, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq 
%ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm6, %ymm10, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm7, %ymm11, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] - vpsrlq $0x20, %ymm4, %ymm4 - vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] - vpsrlq $0x20, %ymm6, %ymm6 - vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] - vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = 
ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] - vpsrlq $0x20, %ymm8, %ymm8 - vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] - vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] - vpsubd %ymm3, %ymm5, %ymm12 - vpaddd %ymm5, %ymm3, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpsubd %ymm4, %ymm7, %ymm12 - vpaddd %ymm7, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm6, %ymm9, %ymm12 - vpaddd %ymm6, %ymm9, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm8, %ymm11, %ymm12 - vpaddd %ymm11, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] - vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] - vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] - vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] - vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] - vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] - vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = 
mem[3,2,1,0] - vpsubd %ymm10, %ymm4, %ymm12 - vpaddd %ymm4, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm4, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm4, %ymm4 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] - vpsubd %ymm3, %ymm8, %ymm12 - vpaddd %ymm3, %ymm8, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm5, %ymm11, %ymm12 - vpaddd %ymm5, %ymm11, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] - vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] - vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] - vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] - vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpbroadcastd 0x9c(%rsi), %ymm1 - vpbroadcastd 0x53c(%rsi), %ymm2 - vpsubd %ymm9, %ymm3, %ymm12 - vpaddd %ymm3, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm3, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm3, %ymm3 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm3, %ymm3 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] - vpsubd %ymm10, %ymm5, %ymm12 - vpaddd %ymm5, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, 
[Flattened hunk body: the removed-line ("-") half of a large hunk over the
fully unrolled AVX2 inverse-NTT listing, carrying objdump-style lane
comments such as "# ymm12 = ymm12[1,1,3,3,5,5,7,7]". The full instruction
stream is not reproduced here; it repeats one fixed pattern per 256-byte
quarter of the coefficient array at (%rdi):

- eight vmovdqa loads into %ymm4..%ymm11;
- per-level zeta loads from the constant table at (%rsi), via
  vpermq $0x1b for vector zetas and vpbroadcastd for broadcast zetas;
- Gentleman-Sande butterflies: vpsubd/vpaddd, then a lane-wise signed
  Montgomery multiplication of the difference by the zeta (vpmuldq by the
  zeta*qinv and zeta table entries on the even and odd 32-bit lanes,
  vpmuldq by q in %ymm0, vpsubd, then vmovshdup + vpblendd $0xaa to
  recombine the high halves);
- shuffle2/shuffle4/shuffle8 permutations between levels (vmovsldup +
  vpsrlq $0x20 + vpblendd, vpunpcklqdq/vpunpckhqdq, and vperm2i128
  $0x20/$0x31);
- vmovdqa stores back to (%rdi);

and it closes with the remaining cross-quarter levels at 0x80-byte strides
using vpbroadcastd zetas, plus a final block that Montgomery-multiplies
the coefficients by the fixed constant pair loaded from 0x40(%rsi) and
0x60(%rsi).]
ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovdqa %ymm8, 0x220(%rdi) - vmovdqa %ymm9, 0x2a0(%rdi) - vmovdqa %ymm10, 0x320(%rdi) - vmovdqa %ymm11, 0x3a0(%rdi) - vmovdqa 0x40(%rsi), %ymm1 - vmovdqa 0x60(%rsi), %ymm2 - vpmuldq %ymm1, %ymm4, %ymm12 - vpmuldq %ymm1, %ymm5, %ymm13 - vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm4, %ymm4 - vpsubd %ymm13, %ymm5, %ymm5 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] - vpmuldq %ymm1, %ymm6, %ymm12 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm6, %ymm6 - vpsubd %ymm13, %ymm7, %ymm7 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vmovdqa %ymm4, 0x20(%rdi) - vmovdqa %ymm5, 0xa0(%rdi) - vmovdqa %ymm6, 0x120(%rdi) - vmovdqa %ymm7, 0x1a0(%rdi) - vmovdqa 0x40(%rdi), %ymm4 - vmovdqa 0xc0(%rdi), %ymm5 - vmovdqa 0x140(%rdi), %ymm6 - vmovdqa 0x1c0(%rdi), %ymm7 - vmovdqa 0x240(%rdi), %ymm8 - vmovdqa 0x2c0(%rdi), %ymm9 - vmovdqa 0x340(%rdi), %ymm10 - vmovdqa 0x3c0(%rdi), %ymm11 - vpbroadcastd 0x8c(%rsi), %ymm1 - vpbroadcastd 0x52c(%rsi), %ymm2 - vpsubd %ymm4, %ymm6, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm6, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm6, %ymm6 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] - vpsubd %ymm5, %ymm7, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = 
ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpbroadcastd 0x88(%rsi), %ymm1 - vpbroadcastd 0x528(%rsi), %ymm2 - vpsubd %ymm8, %ymm10, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm9, %ymm11, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpbroadcastd 0x80(%rsi), %ymm1 - vpbroadcastd 0x520(%rsi), %ymm2 - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm6, %ymm10, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm7, %ymm11, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovdqa %ymm8, 
0x240(%rdi) - vmovdqa %ymm9, 0x2c0(%rdi) - vmovdqa %ymm10, 0x340(%rdi) - vmovdqa %ymm11, 0x3c0(%rdi) - vmovdqa 0x40(%rsi), %ymm1 - vmovdqa 0x60(%rsi), %ymm2 - vpmuldq %ymm1, %ymm4, %ymm12 - vpmuldq %ymm1, %ymm5, %ymm13 - vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm4, %ymm4 - vpsubd %ymm13, %ymm5, %ymm5 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] - vpmuldq %ymm1, %ymm6, %ymm12 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm6, %ymm6 - vpsubd %ymm13, %ymm7, %ymm7 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vmovdqa %ymm4, 0x40(%rdi) - vmovdqa %ymm5, 0xc0(%rdi) - vmovdqa %ymm6, 0x140(%rdi) - vmovdqa %ymm7, 0x1c0(%rdi) - vmovdqa 0x60(%rdi), %ymm4 - vmovdqa 0xe0(%rdi), %ymm5 - vmovdqa 0x160(%rdi), %ymm6 - vmovdqa 0x1e0(%rdi), %ymm7 - vmovdqa 0x260(%rdi), %ymm8 - vmovdqa 0x2e0(%rdi), %ymm9 - vmovdqa 0x360(%rdi), %ymm10 - vmovdqa 0x3e0(%rdi), %ymm11 - vpbroadcastd 0x8c(%rsi), %ymm1 - vpbroadcastd 0x52c(%rsi), %ymm2 - vpsubd %ymm4, %ymm6, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm6, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm6, %ymm6 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] - vpsubd %ymm5, %ymm7, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpbroadcastd 
0x88(%rsi), %ymm1 - vpbroadcastd 0x528(%rsi), %ymm2 - vpsubd %ymm8, %ymm10, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm9, %ymm11, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpbroadcastd 0x80(%rsi), %ymm1 - vpbroadcastd 0x520(%rsi), %ymm2 - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm6, %ymm10, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm7, %ymm11, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovdqa %ymm8, 0x260(%rdi) - vmovdqa %ymm9, 0x2e0(%rdi) - vmovdqa %ymm10, 0x360(%rdi) - vmovdqa %ymm11, 
0x3e0(%rdi) - vmovdqa 0x40(%rsi), %ymm1 - vmovdqa 0x60(%rsi), %ymm2 - vpmuldq %ymm1, %ymm4, %ymm12 - vpmuldq %ymm1, %ymm5, %ymm13 - vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm4, %ymm4 - vpsubd %ymm13, %ymm5, %ymm5 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] - vpmuldq %ymm1, %ymm6, %ymm12 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm1, %ymm9, %ymm15 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpmuldq %ymm0, %ymm15, %ymm15 - vpsubd %ymm12, %ymm6, %ymm6 - vpsubd %ymm13, %ymm7, %ymm7 - vpsubd %ymm14, %ymm8, %ymm8 - vpsubd %ymm15, %ymm9, %ymm9 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] - vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vmovdqa %ymm4, 0x60(%rdi) - vmovdqa %ymm5, 0xe0(%rdi) - vmovdqa %ymm6, 0x160(%rdi) - vmovdqa %ymm7, 0x1e0(%rdi) + vmovdqa (%rsi), %ymm0 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = 
ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, 
%ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup 
%ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm10, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm3, 0x80(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm8, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpermq $0x1b, 0x4e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x980(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x460(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x900(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + 
vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x880(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x360(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x800(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x780(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x260(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x700(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + 
vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x680(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x160(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x600(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xe0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x580(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, 
%ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm10, 0x120(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm4, 0x160(%rdi) + vmovdqa %ymm3, 0x180(%rdi) + vmovdqa %ymm5, 0x1a0(%rdi) + vmovdqa %ymm8, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpermq $0x1b, 0x4c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x960(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = 
ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x440(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x860(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x340(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x760(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, 
%ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x240(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x660(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x140(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 
= ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xc0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x560(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = 
ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm10, 0x220(%rdi) + vmovdqa %ymm6, 0x240(%rdi) + vmovdqa %ymm4, 0x260(%rdi) + vmovdqa %ymm3, 0x280(%rdi) + vmovdqa %ymm5, 0x2a0(%rdi) + vmovdqa %ymm8, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpermq $0x1b, 0x4a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x940(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq 
%ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x420(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x840(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x320(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x740(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, 
%ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x220(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x640(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd 
$0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x120(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5c0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xa0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x540(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = 
ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm6, 0x340(%rdi) + vmovdqa %ymm4, 0x360(%rdi) + vmovdqa %ymm3, 0x380(%rdi) + vmovdqa %ymm5, 0x3a0(%rdi) + vmovdqa %ymm8, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq 
%ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + 
vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd 
%ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = 
ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = 
ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
+ vpsubd %ymm7, %ymm11, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vmovdqa %ymm8, 0x260(%rdi)
+ vmovdqa %ymm9, 0x2e0(%rdi)
+ vmovdqa %ymm10, 0x360(%rdi)
+ vmovdqa %ymm11, 0x3e0(%rdi)
+ vmovdqa 0x40(%rsi), %ymm1
+ vmovdqa 0x60(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm4, %ymm12
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm14
+ vpmuldq %ymm1, %ymm9, %ymm15
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpmuldq %ymm0, %ymm15, %ymm15
+ vpsubd %ymm12, %ymm4, %ymm4
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpsubd %ymm14, %ymm8, %ymm8
+ vpsubd %ymm15, %ymm9, %ymm9
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7]
+ vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7]
+ vpmuldq %ymm1, %ymm6, %ymm12
+ vpmuldq %ymm1, %ymm7, %ymm13
+ vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm14
+ vpmuldq %ymm1, %ymm9, %ymm15
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpmuldq %ymm0, %ymm15, %ymm15
+ vpsubd %ymm12, %ymm6, %ymm6
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpsubd %ymm14, %ymm8, %ymm8
+ vpsubd %ymm15, %ymm9, %ymm9
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vmovdqa %ymm4, 0x60(%rdi)
+ vmovdqa %ymm5, 0xe0(%rdi)
+ vmovdqa %ymm6, 0x160(%rdi)
+ vmovdqa %ymm7, 0x1e0(%rdi)
 retq
 .cfi_endproc
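The unrolled inverse-NTT code above and the forward-NTT code in `mldsa/src/native/x86_64/src/ntt.S` below are built from the same primitive: a signed Montgomery multiplication, evaluated with `vpmuldq` once on the even 32-bit lanes and once on the odd lanes (brought down by `vmovshdup`), then re-interleaved with `vpblendd $0xaa`. The following is a minimal scalar model of one lane, assuming the ML-DSA modulus and R = 2^32; the names `mont_mul`, `MLD_Q` and `MLD_QINV` are illustrative and are not identifiers from this repository.

```c
#include <stdint.h>

#define MLD_Q 8380417     /* ML-DSA modulus q                  */
#define MLD_QINV 58728449 /* q^-1 mod 2^32 (Montgomery factor) */

/*
 * One lane of the vpmuldq/vmovshdup/vpblendd ladder: given a and a
 * precomputed pair (zh, zl) = (zeta, zeta * MLD_QINV mod 2^32),
 * return a * zeta * 2^-32 mod q, with |result| < q for |a*zeta| < q * 2^31.
 */
static int32_t mont_mul(int32_t a, int32_t zh, int32_t zl)
{
  int64_t t = (int64_t)a * zh;            /* vpmuldq zh, a: full 64-bit product */
  int32_t m = (int32_t)((int64_t)a * zl); /* vpmuldq zl, a: only the low 32
                                             bits are kept (wrap-around)        */
  int64_t u = (int64_t)m * MLD_Q;         /* vpmuldq q, m                       */
  return (int32_t)((t - u) >> 32);        /* vpsubd: result sits in the high
                                             halves, collected by vmovshdup
                                             plus vpblendd in the vector code   */
}
```

Since t and u agree modulo 2^32 by construction, the subtraction cancels the low halves exactly, which is why the vector code only ever keeps the high 32 bits of each 64-bit lane.
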
diff --git a/mldsa/src/native/x86_64/src/ntt.S b/mldsa/src/native/x86_64/src/ntt.S
index 6d39340c7..f25c0195e 100644
--- a/mldsa/src/native/x86_64/src/ntt.S
+++ b/mldsa/src/native/x86_64/src/ntt.S
@@ -33,2343 +33,2343 @@ MLD_ASM_FN_SYMBOL(ntt_avx2)
 .cfi_startproc
- vmovdqa (%rsi), %ymm0
- vpbroadcastd 0x84(%rsi), %ymm1
- vpbroadcastd 0x524(%rsi), %ymm2
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x80(%rdi), %ymm5
- vmovdqa 0x100(%rdi), %ymm6
- vmovdqa 0x180(%rdi), %ymm7
- vmovdqa 0x200(%rdi), %ymm8
- vmovdqa 0x280(%rdi), %ymm9
- vmovdqa 0x300(%rdi), %ymm10
- vmovdqa 0x380(%rdi), %ymm11
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm4, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm4, %ymm4
[… roughly 2,000 further `-` lines, flattened by extraction, omitted here: the remainder of the old ntt_avx2 body repeating the same Montgomery-butterfly pattern; per the hunk's matching -2343/+2343 counts, the whole function is removed and re-added in the new formatting …]
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm5, %ymm12
- vpaddd %ymm4, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 #
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vmovdqa 0x4c0(%rsi), %ymm1
- vmovdqa 0x960(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm3, %ymm12
- vpaddd %ymm3, %ymm11, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm3, %ymm3
- vmovdqa %ymm9, 0x100(%rdi)
- vmovdqa %ymm8, 0x120(%rdi)
- vmovdqa %ymm7, 0x140(%rdi)
- vmovdqa %ymm6, 0x160(%rdi)
- vmovdqa %ymm5, 0x180(%rdi)
- vmovdqa %ymm4, 0x1a0(%rdi)
- vmovdqa %ymm3, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- vmovdqa 0x200(%rdi), %ymm4
- vmovdqa 0x220(%rdi), %ymm5
- vmovdqa 0x240(%rdi), %ymm6
- vmovdqa 0x260(%rdi), %ymm7
- vmovdqa 0x280(%rdi), %ymm8
- vmovdqa 0x2a0(%rdi), %ymm9
- vmovdqa 0x2c0(%rdi), %ymm10
- vmovdqa 0x2e0(%rdi), %ymm11
- vpbroadcastd 0x98(%rsi), %ymm1
- vpbroadcastd 0x538(%rsi), %ymm2
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm4, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm9, %ymm13
- vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
- vpsubd %ymm9, %ymm5, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm9
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm10, %ymm13
- vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
- vpsubd %ymm10, %ymm6, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm10
- vpsubd %ymm13, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm7, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm7, %ymm7
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vmovdqa 0xe0(%rsi), %ymm1
- vmovdqa 0x580(%rsi), %ymm2
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
- vpsubd %ymm5, %ymm3, %ymm12
- vpaddd %ymm5, %ymm3, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm5
- vpsubd %ymm13, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm10, %ymm13
- vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
- vpsubd %ymm10, %ymm8, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm10
- vpsubd %ymm13, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm4, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm9, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm9, %ymm9
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vmovdqa 0x160(%rsi), %ymm1
- vmovdqa 0x600(%rsi), %ymm2
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm7, %ymm12
- vpaddd %ymm7, %ymm8, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm7, %ymm7
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm5, %ymm12
- vpaddd %ymm6, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm3, %ymm12
- vpaddd %ymm4, %ymm3, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm10, %ymm12
- vpaddd %ymm11, %ymm10, %ymm10
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm10, %ymm10
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vmovdqa 0x1e0(%rsi), %ymm1
- vmovdqa 0x680(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm5
- vpsubd %ymm13, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm3, %ymm13
- vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
- vpsubd %ymm3, %ymm7, %ymm12
- vpaddd %ymm3, %ymm7, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm3
- vpsubd %ymm13, %ymm7, %ymm7
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm6, %ymm12
- vpaddd %ymm6, %ymm11, %ymm6
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm6, %ymm6
- vmovdqa 0x260(%rsi), %ymm1
- vmovdqa 0x700(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
- vpsubd %ymm7, %ymm9, %ymm12
- vpaddd %ymm7, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm7
- vpsubd %ymm13, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm8, %ymm12
- vpaddd %ymm6, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm8, %ymm8
- vmovdqa 0x2e0(%rsi), %ymm1
- vmovdqa 0x780(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm3, %ymm13
- vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
- vpsubd %ymm3, %ymm5, %ymm12
- vpaddd %ymm3, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm3
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm4, %ymm12
- vpaddd %ymm4, %ymm11, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm4, %ymm4
- vmovdqa 0x360(%rsi), %ymm1
- vmovdqa 0x800(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm9, %ymm12
- vpaddd %ymm8, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm9, %ymm9
- vmovdqa 0x3e0(%rsi), %ymm1
- vmovdqa 0x880(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm6, %ymm7, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vmovdqa 0x460(%rsi), %ymm1
- vmovdqa 0x900(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm5, %ymm12
- vpaddd %ymm4, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vmovdqa 0x4e0(%rsi), %ymm1
- vmovdqa 0x980(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm3, %ymm12
- vpaddd %ymm3, %ymm11, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm3, %ymm3
- vmovdqa %ymm9, 0x200(%rdi)
- vmovdqa %ymm8, 0x220(%rdi)
- vmovdqa %ymm7, 0x240(%rdi)
- vmovdqa %ymm6, 0x260(%rdi)
- vmovdqa %ymm5, 0x280(%rdi)
- vmovdqa %ymm4, 0x2a0(%rdi)
- vmovdqa %ymm3, 0x2c0(%rdi)
- vmovdqa %ymm11, 0x2e0(%rdi)
- vmovdqa 0x300(%rdi), %ymm4
- vmovdqa 0x320(%rdi), %ymm5
- vmovdqa 0x340(%rdi), %ymm6
- vmovdqa 0x360(%rdi), %ymm7
- vmovdqa 0x380(%rdi), %ymm8
- vmovdqa 0x3a0(%rdi), %ymm9
- vmovdqa 0x3c0(%rdi), %ymm10
- vmovdqa 0x3e0(%rdi), %ymm11
- vpbroadcastd 0x9c(%rsi), %ymm1
- vpbroadcastd 0x53c(%rsi), %ymm2
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm4, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm9, %ymm13
- vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
- vpsubd %ymm9, %ymm5, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm9
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm10, %ymm13
- vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
- vpsubd %ymm10, %ymm6, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm10
- vpsubd %ymm13, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm7, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm7, %ymm7
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vmovdqa 0x100(%rsi), %ymm1
- vmovdqa 0x5a0(%rsi), %ymm2
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
- vpsubd %ymm5, %ymm3, %ymm12
- vpaddd %ymm5, %ymm3, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm5
- vpsubd %ymm13, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm10, %ymm13
- vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
- vpsubd %ymm10, %ymm8, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm10
- vpsubd %ymm13, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm4, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm9, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm9, %ymm9
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vmovdqa 0x180(%rsi), %ymm1
- vmovdqa 0x620(%rsi), %ymm2
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm7, %ymm12
- vpaddd %ymm7, %ymm8, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm7, %ymm7
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm5, %ymm12
- vpaddd %ymm6, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm3, %ymm12
- vpaddd %ymm4, %ymm3, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm10, %ymm12
- vpaddd %ymm11, %ymm10, %ymm10
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm10, %ymm10
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vmovdqa 0x200(%rsi), %ymm1
- vmovdqa 0x6a0(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm5
- vpsubd %ymm13, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm3, %ymm13
- vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
- vpsubd %ymm3, %ymm7, %ymm12
- vpaddd %ymm3, %ymm7, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm3
- vpsubd %ymm13, %ymm7, %ymm7
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm6, %ymm12
- vpaddd %ymm6, %ymm11, %ymm6
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm6, %ymm6
- vmovdqa 0x280(%rsi), %ymm1
- vmovdqa 0x720(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
- vpsubd %ymm7, %ymm9, %ymm12
- vpaddd %ymm7, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm7
- vpsubd %ymm13, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm8, %ymm12
- vpaddd %ymm6, %ymm8, %ymm8
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm8, %ymm8
- vmovdqa 0x300(%rsi), %ymm1
- vmovdqa 0x7a0(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm3, %ymm13
- vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
- vpsubd %ymm3, %ymm5, %ymm12
- vpaddd %ymm3, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm3
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm4, %ymm12
- vpaddd %ymm4, %ymm11, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm4, %ymm4
- vmovdqa 0x380(%rsi), %ymm1
- vmovdqa 0x820(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm9, %ymm12
- vpaddd %ymm8, %ymm9, %ymm9
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm9, %ymm9
- vmovdqa 0x400(%rsi), %ymm1
- vmovdqa 0x8a0(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm13
- vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm6, %ymm7, %ymm7
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vmovdqa 0x480(%rsi), %ymm1
- vmovdqa 0x920(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm4, %ymm13
- vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
- vpsubd %ymm4, %ymm5, %ymm12
- vpaddd %ymm4, %ymm5, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vmovdqa 0x500(%rsi), %ymm1
- vmovdqa 0x9a0(%rsi), %ymm2
- vpsrlq $0x20, %ymm1, %ymm10
- vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm10, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm15, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
- vpsubd %ymm11, %ymm3, %ymm12
- vpaddd %ymm3, %ymm11, %ymm3
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm11
- vpsubd %ymm13, %ymm3, %ymm3
- vmovdqa %ymm9, 0x300(%rdi)
- vmovdqa %ymm8, 0x320(%rdi)
- vmovdqa %ymm7, 0x340(%rdi)
- vmovdqa %ymm6, 0x360(%rdi)
- vmovdqa %ymm5, 0x380(%rdi)
- vmovdqa %ymm4, 0x3a0(%rdi)
- vmovdqa %ymm3, 0x3c0(%rdi)
- vmovdqa %ymm11, 0x3e0(%rdi)
+ vmovdqa (%rsi), %ymm0
+ vpbroadcastd 0x84(%rsi), %ymm1
+ vpbroadcastd 0x524(%rsi), %ymm2
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x80(%rdi), %ymm5
+ vmovdqa 0x100(%rdi), %ymm6
+ vmovdqa 0x180(%rdi), %ymm7
+ vmovdqa 0x200(%rdi), %ymm8
+ vmovdqa 0x280(%rdi), %ymm9
+ vmovdqa 0x300(%rdi), %ymm10
+ vmovdqa 0x380(%rdi), %ymm11
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm9, %ymm13
+ vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
+ vpsubd %ymm9, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm9
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpbroadcastd 0x88(%rsi), %ymm1
+ vpbroadcastd 0x528(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm4, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm7, %ymm13
+ vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
+ vpsubd %ymm7, %ymm5, %ymm12
+ vpaddd %ymm7, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm7
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpbroadcastd 0x8c(%rsi), %ymm1
+ vpbroadcastd 0x52c(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm8, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm9, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm9, %ymm9
+ vmovdqa %ymm4, (%rdi)
+ vmovdqa %ymm5, 0x80(%rdi)
+ vmovdqa %ymm6, 0x100(%rdi)
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm8, 0x200(%rdi)
+ vmovdqa %ymm9, 0x280(%rdi)
+ vmovdqa %ymm10, 0x300(%rdi)
+ vmovdqa %ymm11, 0x380(%rdi)
+ vpbroadcastd 0x84(%rsi), %ymm1
+ vpbroadcastd 0x524(%rsi), %ymm2
+ vmovdqa 0x20(%rdi), %ymm4
+ vmovdqa 0xa0(%rdi), %ymm5
+ vmovdqa 0x120(%rdi), %ymm6
+ vmovdqa 0x1a0(%rdi), %ymm7
+ vmovdqa 0x220(%rdi), %ymm8
+ vmovdqa 0x2a0(%rdi), %ymm9
+ vmovdqa 0x320(%rdi), %ymm10
+ vmovdqa 0x3a0(%rdi), %ymm11
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm9, %ymm13
+ vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
+ vpsubd %ymm9, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm9
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpbroadcastd 0x88(%rsi), %ymm1
+ vpbroadcastd 0x528(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm4, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm7, %ymm13
+ vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
+ vpsubd %ymm7, %ymm5, %ymm12
+ vpaddd %ymm7, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm7
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpbroadcastd 0x8c(%rsi), %ymm1
+ vpbroadcastd 0x52c(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm8, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12,
%ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, 
%ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, 
%ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), 
%ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + 
vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xa0(%rsi), %ymm1 + vmovdqa 0x540(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + 
vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x120(%rsi), %ymm1 + vmovdqa 
0x5c0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + 
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1a0(%rsi), %ymm1 + vmovdqa 0x640(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # 
ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x220(%rsi), %ymm1 + vmovdqa 0x6c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2a0(%rsi), %ymm1 + vmovdqa 0x740(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x320(%rsi), %ymm1 + vmovdqa 0x7c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # 
ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3a0(%rsi), %ymm1 + vmovdqa 0x840(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x420(%rsi), %ymm1 + vmovdqa 0x8c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4a0(%rsi), %ymm1 + vmovdqa 0x940(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 
0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, 
%ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xc0(%rsi), %ymm1 + vmovdqa 0x560(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = 
ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x140(%rsi), %ymm1 + vmovdqa 0x5e0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup 
%ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1c0(%rsi), %ymm1 + vmovdqa 0x660(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = 
ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x240(%rsi), %ymm1 + vmovdqa 0x6e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2c0(%rsi), %ymm1 + vmovdqa 0x760(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x340(%rsi), %ymm1 + vmovdqa 0x7e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3c0(%rsi), %ymm1 + vmovdqa 0x860(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x440(%rsi), %ymm1 + vmovdqa 0x8e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4c0(%rsi), %ymm1 + vmovdqa 0x960(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm8, 0x120(%rdi) + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm5, 0x180(%rdi) + vmovdqa %ymm4, 0x1a0(%rdi) + vmovdqa %ymm3, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, 
%ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xe0(%rsi), %ymm1 + vmovdqa 0x580(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, 
%ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x160(%rsi), %ymm1 + vmovdqa 0x600(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1e0(%rsi), %ymm1 + vmovdqa 0x680(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, 
%ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x260(%rsi), %ymm1 + vmovdqa 0x700(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2e0(%rsi), %ymm1 + vmovdqa 0x780(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x360(%rsi), %ymm1 + vmovdqa 0x800(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3e0(%rsi), %ymm1 + vmovdqa 0x880(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x460(%rsi), %ymm1 + vmovdqa 0x900(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4e0(%rsi), %ymm1 + vmovdqa 0x980(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, 
%ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm7, 0x240(%rdi) + vmovdqa %ymm6, 0x260(%rdi) + vmovdqa %ymm5, 0x280(%rdi) + vmovdqa %ymm4, 0x2a0(%rdi) + vmovdqa %ymm3, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, 
%ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0x100(%rsi), %ymm1 + vmovdqa 0x5a0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, 
%ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x180(%rsi), %ymm1 + vmovdqa 0x620(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] 
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x200(%rsi), %ymm1 + vmovdqa 0x6a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # 
ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x280(%rsi), %ymm1 + vmovdqa 0x720(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x300(%rsi), %ymm1 + vmovdqa 0x7a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x380(%rsi), %ymm1 + vmovdqa 0x820(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x400(%rsi), %ymm1 + vmovdqa 0x8a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x480(%rsi), %ymm1 + vmovdqa 0x920(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x500(%rsi), %ymm1 + vmovdqa 0x9a0(%rsi), %ymm2 + vpsrlq 
$0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/nttunpack.S b/mldsa/src/native/x86_64/src/nttunpack.S index efae5c818..53fcc5b50 100644 --- a/mldsa/src/native/x86_64/src/nttunpack.S +++ b/mldsa/src/native/x86_64/src/nttunpack.S @@ -34,198 +34,198 @@ MLD_ASM_FN_SYMBOL(nttunpack_avx2) .cfi_startproc - vmovdqa (%rdi), %ymm4 - vmovdqa 0x20(%rdi), %ymm5 - vmovdqa 0x40(%rdi), %ymm6 - vmovdqa 0x60(%rdi), %ymm7 - vmovdqa 0x80(%rdi), %ymm8 - vmovdqa 0xa0(%rdi), %ymm9 - vmovdqa 0xc0(%rdi), %ymm10 - vmovdqa 0xe0(%rdi), %ymm11 - vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] - vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] - vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] - vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] - vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] - vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] - vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] - vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] - vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] - vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vpsrlq $0x20, %ymm7, %ymm7 - vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] - vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] - vpsrlq $0x20, %ymm5, %ymm5 - vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] - vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] - 
vpsrlq $0x20, %ymm3, %ymm3 - vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] - vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vmovdqa %ymm9, (%rdi) - vmovdqa %ymm8, 0x20(%rdi) - vmovdqa %ymm7, 0x40(%rdi) - vmovdqa %ymm6, 0x60(%rdi) - vmovdqa %ymm5, 0x80(%rdi) - vmovdqa %ymm4, 0xa0(%rdi) - vmovdqa %ymm3, 0xc0(%rdi) - vmovdqa %ymm11, 0xe0(%rdi) - vmovdqa 0x100(%rdi), %ymm4 - vmovdqa 0x120(%rdi), %ymm5 - vmovdqa 0x140(%rdi), %ymm6 - vmovdqa 0x160(%rdi), %ymm7 - vmovdqa 0x180(%rdi), %ymm8 - vmovdqa 0x1a0(%rdi), %ymm9 - vmovdqa 0x1c0(%rdi), %ymm10 - vmovdqa 0x1e0(%rdi), %ymm11 - vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] - vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] - vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] - vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] - vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] - vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] - vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] - vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] - vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] - vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vpsrlq $0x20, %ymm7, %ymm7 - vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] - vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] - vpsrlq $0x20, %ymm5, %ymm5 - vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] - vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] - vpsrlq $0x20, %ymm3, %ymm3 - vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] - vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vmovdqa %ymm9, 0x100(%rdi) - vmovdqa %ymm8, 0x120(%rdi) - vmovdqa %ymm7, 0x140(%rdi) - vmovdqa %ymm6, 0x160(%rdi) - vmovdqa %ymm5, 0x180(%rdi) - vmovdqa %ymm4, 0x1a0(%rdi) - vmovdqa %ymm3, 0x1c0(%rdi) - 
vmovdqa %ymm11, 0x1e0(%rdi) - vmovdqa 0x200(%rdi), %ymm4 - vmovdqa 0x220(%rdi), %ymm5 - vmovdqa 0x240(%rdi), %ymm6 - vmovdqa 0x260(%rdi), %ymm7 - vmovdqa 0x280(%rdi), %ymm8 - vmovdqa 0x2a0(%rdi), %ymm9 - vmovdqa 0x2c0(%rdi), %ymm10 - vmovdqa 0x2e0(%rdi), %ymm11 - vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] - vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] - vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] - vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] - vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] - vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] - vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] - vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] - vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] - vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vpsrlq $0x20, %ymm7, %ymm7 - vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] - vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] - vpsrlq $0x20, %ymm5, %ymm5 - vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] - vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] - vpsrlq $0x20, %ymm3, %ymm3 - vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] - vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vmovdqa %ymm9, 0x200(%rdi) - vmovdqa %ymm8, 0x220(%rdi) - vmovdqa %ymm7, 0x240(%rdi) - vmovdqa %ymm6, 0x260(%rdi) - vmovdqa %ymm5, 0x280(%rdi) - vmovdqa %ymm4, 0x2a0(%rdi) - vmovdqa %ymm3, 0x2c0(%rdi) - vmovdqa %ymm11, 0x2e0(%rdi) - vmovdqa 0x300(%rdi), %ymm4 - vmovdqa 0x320(%rdi), %ymm5 - vmovdqa 0x340(%rdi), %ymm6 - vmovdqa 0x360(%rdi), %ymm7 - vmovdqa 0x380(%rdi), %ymm8 - vmovdqa 0x3a0(%rdi), %ymm9 - vmovdqa 0x3c0(%rdi), %ymm10 - vmovdqa 0x3e0(%rdi), %ymm11 - vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] - vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] - vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] - vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 
= ymm6[2,3],ymm10[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] - vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] - vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] - vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] - vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] - vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] - vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vpsrlq $0x20, %ymm7, %ymm7 - vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] - vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] - vpsrlq $0x20, %ymm5, %ymm5 - vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] - vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] - vpsrlq $0x20, %ymm3, %ymm3 - vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] - vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vmovdqa %ymm9, 0x300(%rdi) - vmovdqa %ymm8, 0x320(%rdi) - vmovdqa %ymm7, 0x340(%rdi) - vmovdqa %ymm6, 0x360(%rdi) - vmovdqa %ymm5, 0x380(%rdi) - vmovdqa %ymm4, 0x3a0(%rdi) - vmovdqa %ymm3, 0x3c0(%rdi) - vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = 
ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = 
ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm8, 0x120(%rdi) + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm5, 0x180(%rdi) + vmovdqa %ymm4, 0x1a0(%rdi) + vmovdqa %ymm3, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = 
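Taken together, the three shuffle stages in this hunk (vperm2i128 on 128-bit halves, vpunpck{l,h}qdq on 64-bit pairs, vmovsldup/vpsrlq/vpblendd on 32-bit lanes) effectively transpose an 8x8 block of 32-bit coefficients, with the eight ymm registers as rows. A scalar model of the net permutation, under that reading (the function name is illustrative, not the repository's):

#include <stdint.h>

/* Net effect of one vperm2i128 -> vpunpck -> vpblendd cascade, modelled on
 * an 8x8 matrix of 32-bit coefficients: output row i, lane j comes from
 * input row j, lane i. The AVX2 code realises this with three interleave
 * stages at 128-, 64- and 32-bit granularity instead of 64 scalar moves. */
static void transpose_8x8_i32(int32_t m[8][8])
{
  for (unsigned i = 0; i < 8; i++)
  {
    for (unsigned j = i + 1; j < 8; j++)
    {
      int32_t t = m[i][j];
      m[i][j] = m[j][i];
      m[j][i] = t;
    }
  }
}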
ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm7, 0x240(%rdi) + vmovdqa %ymm6, 0x260(%rdi) + vmovdqa %ymm5, 0x280(%rdi) + vmovdqa %ymm4, 0x2a0(%rdi) + vmovdqa %ymm3, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/pointwise.S b/mldsa/src/native/x86_64/src/pointwise.S index cb662462b..718e7178b 100644 --- a/mldsa/src/native/x86_64/src/pointwise.S +++ b/mldsa/src/native/x86_64/src/pointwise.S @@ -32,92 +32,92 @@ MLD_ASM_FN_SYMBOL(pointwise_avx2) .cfi_startproc - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), 
%ymm1 - xorl %eax, %eax + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax Lpointwise_avx2_looptop1: - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa 0x40(%rsi), %ymm6 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vmovdqa 0x40(%rdx), %ymm14 - vpsrlq $0x20, %ymm2, %ymm3 - vpsrlq $0x20, %ymm4, %ymm5 - vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] - vpsrlq $0x20, %ymm10, %ymm11 - vpsrlq $0x20, %ymm12, %ymm13 - vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm2, %ymm2 - vpmuldq %ymm11, %ymm3, %ymm3 - vpmuldq %ymm12, %ymm4, %ymm4 - vpmuldq %ymm13, %ymm5, %ymm5 - vpmuldq %ymm14, %ymm6, %ymm6 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm0, %ymm10 - vpmuldq %ymm3, %ymm0, %ymm11 - vpmuldq %ymm4, %ymm0, %ymm12 - vpmuldq %ymm5, %ymm0, %ymm13 - vpmuldq %ymm6, %ymm0, %ymm14 - vpmuldq %ymm7, %ymm0, %ymm15 - vpmuldq %ymm10, %ymm1, %ymm10 - vpmuldq %ymm11, %ymm1, %ymm11 - vpmuldq %ymm12, %ymm1, %ymm12 - vpmuldq %ymm13, %ymm1, %ymm13 - vpmuldq %ymm14, %ymm1, %ymm14 - vpmuldq %ymm15, %ymm1, %ymm15 - vpsubq %ymm10, %ymm2, %ymm2 - vpsubq %ymm11, %ymm3, %ymm3 - vpsubq %ymm12, %ymm4, %ymm4 - vpsubq %ymm13, %ymm5, %ymm5 - vpsubq %ymm14, %ymm6, %ymm6 - vpsubq %ymm15, %ymm7, %ymm7 - vpsrlq $0x20, %ymm2, %ymm2 - vpsrlq $0x20, %ymm4, %ymm4 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] - vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] - vmovdqa %ymm2, (%rdi) - vmovdqa %ymm4, 0x20(%rdi) - vmovdqa %ymm6, 0x40(%rdi) - addq $0x60, %rdi - addq $0x60, %rsi - addq $0x60, %rdx - addl $0x1, %eax - cmpl $0xa, %eax - jb Lpointwise_avx2_looptop1 - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vpsrlq $0x20, %ymm2, %ymm3 - vpsrlq $0x20, %ymm4, %ymm5 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm2, %ymm2 - vpmuldq %ymm11, %ymm3, %ymm3 - vpmuldq %ymm12, %ymm4, %ymm4 - vpmuldq %ymm13, %ymm5, %ymm5 - vpmuldq %ymm2, %ymm0, %ymm10 - vpmuldq %ymm3, %ymm0, %ymm11 - vpmuldq %ymm4, %ymm0, %ymm12 - vpmuldq %ymm5, %ymm0, %ymm13 - vpmuldq %ymm10, %ymm1, %ymm10 - vpmuldq %ymm11, %ymm1, %ymm11 - vpmuldq %ymm12, %ymm1, %ymm12 - vpmuldq %ymm13, %ymm1, %ymm13 - vpsubq %ymm10, %ymm2, %ymm2 - vpsubq %ymm11, %ymm3, %ymm3 - vpsubq %ymm12, %ymm4, %ymm4 - vpsubq %ymm13, %ymm5, %ymm5 - vpsrlq $0x20, %ymm2, %ymm2 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] - vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovdqa %ymm2, (%rdi) - vmovdqa %ymm4, 0x20(%rdi) + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm4 + vmovdqa 0x40(%rsi), %ymm6 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vmovdqa 0x40(%rdx), %ymm14 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] + vpsrlq $0x20, %ymm10, %ymm11 + vpsrlq $0x20, %ymm12, %ymm13 + vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq 
%ymm13, %ymm5, %ymm5 + vpmuldq %ymm14, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm6, %ymm0, %ymm14 + vpmuldq %ymm7, %ymm0, %ymm15 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, %ymm13 + vpmuldq %ymm14, %ymm1, %ymm14 + vpmuldq %ymm15, %ymm1, %ymm15 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsubq %ymm14, %ymm6, %ymm6 + vpsubq %ymm15, %ymm7, %ymm7 + vpsrlq $0x20, %ymm2, %ymm2 + vpsrlq $0x20, %ymm4, %ymm4 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + addq $0x60, %rdi + addq $0x60, %rsi + addq $0x60, %rdx + addl $0x1, %eax + cmpl $0xa, %eax + jb Lpointwise_avx2_looptop1 + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm4 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq %ymm13, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, %ymm13 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/pointwise_acc_l4.S b/mldsa/src/native/x86_64/src/pointwise_acc_l4.S index a8f0fc597..df359b8f8 100644 --- a/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +++ b/mldsa/src/native/x86_64/src/pointwise_acc_l4.S @@ -33,99 +33,99 @@ MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2) .cfi_startproc - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), %ymm1 - xorl %eax, %eax + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax Lpointwise_acc_l4_avx2_looptop2: - vmovdqa (%rsi), %ymm6 - vmovdqa 0x20(%rsi), %ymm8 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vmovdqa %ymm6, %ymm2 - vmovdqa %ymm7, %ymm3 - vmovdqa %ymm8, %ymm4 - vmovdqa %ymm9, %ymm5 - vmovdqa 0x400(%rsi), %ymm6 - vmovdqa 0x420(%rsi), %ymm8 - vmovdqa 0x400(%rdx), %ymm10 - vmovdqa 
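The scalar operation that pointwise_avx2 vectorizes is a coefficient-wise product with one signed Montgomery reduction per lane; each vpmuldq (a*b), vpmuldq (*qinv), vpmuldq (*q), vpsubq, vpsrlq/vmovshdup group above is that reduction applied to four coefficients at once. A minimal scalar sketch with the standard ML-DSA constants (identifier names are illustrative):

#include <stdint.h>

#define MLDSA_Q 8380417     /* q = 2^23 - 2^13 + 1 */
#define MLDSA_QINV 58728449 /* q^-1 mod 2^32 */

/* Signed Montgomery reduction: for |a| < 2^31 * q, returns
 * r = a * 2^-32 mod q with |r| < q. */
static int32_t montgomery_reduce(int64_t a)
{
  int32_t t = (int32_t)((int64_t)(int32_t)a * MLDSA_QINV);
  return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

/* Scalar model of pointwise_avx2: c[i] = a[i]*b[i]*2^-32 mod q. */
static void poly_pointwise_montgomery(int32_t c[256], const int32_t a[256],
                                      const int32_t b[256])
{
  for (unsigned i = 0; i < 256; i++)
  {
    c[i] = montgomery_reduce((int64_t)a[i] * b[i]);
  }
}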
0x420(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x800(%rsi), %ymm6 - vmovdqa 0x820(%rsi), %ymm8 - vmovdqa 0x800(%rdx), %ymm10 - vmovdqa 0x820(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0xc00(%rsi), %ymm6 - vmovdqa 0xc20(%rsi), %ymm8 - vmovdqa 0xc00(%rdx), %ymm10 - vmovdqa 0xc20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vpmuldq %ymm2, %ymm0, %ymm6 - vpmuldq %ymm3, %ymm0, %ymm7 - vpmuldq %ymm4, %ymm0, %ymm8 - vpmuldq %ymm5, %ymm0, %ymm9 - vpmuldq %ymm6, %ymm1, %ymm6 - vpmuldq %ymm7, %ymm1, %ymm7 - vpmuldq %ymm8, %ymm1, %ymm8 - vpmuldq %ymm9, %ymm1, %ymm9 - vpsubq %ymm6, %ymm2, %ymm2 - vpsubq %ymm7, %ymm3, %ymm3 - vpsubq %ymm8, %ymm4, %ymm4 - vpsubq %ymm9, %ymm5, %ymm5 - vpsrlq $0x20, %ymm2, %ymm2 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] - vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovdqa %ymm2, (%rdi) - vmovdqa %ymm4, 0x20(%rdi) - addq $0x40, %rsi - addq $0x40, %rdx - addq $0x40, %rdi - addl $0x1, %eax - cmpl $0x10, %eax - jb Lpointwise_acc_l4_avx2_looptop2 + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, 
%ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb Lpointwise_acc_l4_avx2_looptop2 retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/pointwise_acc_l5.S b/mldsa/src/native/x86_64/src/pointwise_acc_l5.S index 44720c6e4..6244bd0db 100644 --- a/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +++ b/mldsa/src/native/x86_64/src/pointwise_acc_l5.S @@ -33,115 +33,115 @@ MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2) .cfi_startproc - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), %ymm1 - xorl %eax, %eax + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax Lpointwise_acc_l5_avx2_looptop2: - vmovdqa (%rsi), %ymm6 - vmovdqa 0x20(%rsi), %ymm8 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vmovdqa %ymm6, %ymm2 - vmovdqa %ymm7, %ymm3 - vmovdqa %ymm8, %ymm4 - vmovdqa %ymm9, %ymm5 - vmovdqa 0x400(%rsi), %ymm6 - vmovdqa 0x420(%rsi), %ymm8 - vmovdqa 0x400(%rdx), %ymm10 - vmovdqa 0x420(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x800(%rsi), %ymm6 - vmovdqa 0x820(%rsi), %ymm8 - vmovdqa 0x800(%rdx), %ymm10 - vmovdqa 0x820(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - 
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0xc00(%rsi), %ymm6 - vmovdqa 0xc20(%rsi), %ymm8 - vmovdqa 0xc00(%rdx), %ymm10 - vmovdqa 0xc20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x1000(%rsi), %ymm6 - vmovdqa 0x1020(%rsi), %ymm8 - vmovdqa 0x1000(%rdx), %ymm10 - vmovdqa 0x1020(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vpmuldq %ymm2, %ymm0, %ymm6 - vpmuldq %ymm3, %ymm0, %ymm7 - vpmuldq %ymm4, %ymm0, %ymm8 - vpmuldq %ymm5, %ymm0, %ymm9 - vpmuldq %ymm6, %ymm1, %ymm6 - vpmuldq %ymm7, %ymm1, %ymm7 - vpmuldq %ymm8, %ymm1, %ymm8 - vpmuldq %ymm9, %ymm1, %ymm9 - vpsubq %ymm6, %ymm2, %ymm2 - vpsubq %ymm7, %ymm3, %ymm3 - vpsubq %ymm8, %ymm4, %ymm4 - vpsubq %ymm9, %ymm5, %ymm5 - vpsrlq $0x20, %ymm2, %ymm2 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] - vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovdqa %ymm2, (%rdi) - vmovdqa %ymm4, 0x20(%rdi) - addq $0x40, %rsi - addq $0x40, %rdx - addq $0x40, %rdi - addl $0x1, %eax - cmpl $0x10, %eax - jb Lpointwise_acc_l5_avx2_looptop2 + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + 
vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb Lpointwise_acc_l5_avx2_looptop2 retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/pointwise_acc_l7.S b/mldsa/src/native/x86_64/src/pointwise_acc_l7.S index 9450494dc..285d6dae5 100644 --- a/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +++ b/mldsa/src/native/x86_64/src/pointwise_acc_l7.S @@ -33,147 +33,147 @@ MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2) .cfi_startproc - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), %ymm1 - xorl %eax, %eax + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax Lpointwise_acc_l7_avx2_looptop2: - vmovdqa (%rsi), %ymm6 - vmovdqa 0x20(%rsi), %ymm8 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vmovdqa %ymm6, %ymm2 - vmovdqa %ymm7, %ymm3 - vmovdqa %ymm8, %ymm4 - vmovdqa %ymm9, %ymm5 - vmovdqa 0x400(%rsi), %ymm6 - vmovdqa 0x420(%rsi), %ymm8 - vmovdqa 0x400(%rdx), %ymm10 - vmovdqa 0x420(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, 
%ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x800(%rsi), %ymm6 - vmovdqa 0x820(%rsi), %ymm8 - vmovdqa 0x800(%rdx), %ymm10 - vmovdqa 0x820(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0xc00(%rsi), %ymm6 - vmovdqa 0xc20(%rsi), %ymm8 - vmovdqa 0xc00(%rdx), %ymm10 - vmovdqa 0xc20(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x1000(%rsi), %ymm6 - vmovdqa 0x1020(%rsi), %ymm8 - vmovdqa 0x1000(%rdx), %ymm10 - vmovdqa 0x1020(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x1400(%rsi), %ymm6 - vmovdqa 0x1420(%rsi), %ymm8 - vmovdqa 0x1400(%rdx), %ymm10 - vmovdqa 0x1420(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vmovdqa 0x1800(%rsi), %ymm6 - vmovdqa 0x1820(%rsi), %ymm8 - vmovdqa 0x1800(%rdx), %ymm10 - vmovdqa 0x1820(%rdx), %ymm12 - vpsrlq $0x20, %ymm6, %ymm7 - vpsrlq $0x20, %ymm8, %ymm9 - vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] - vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm6, %ymm6 - vpmuldq %ymm11, %ymm7, %ymm7 - vpmuldq %ymm12, %ymm8, %ymm8 - vpmuldq %ymm13, %ymm9, %ymm9 - vpaddq %ymm2, %ymm6, %ymm2 - vpaddq %ymm3, %ymm7, %ymm3 - vpaddq %ymm4, %ymm8, %ymm4 - vpaddq %ymm5, %ymm9, %ymm5 - vpmuldq %ymm2, %ymm0, %ymm6 - vpmuldq %ymm3, %ymm0, %ymm7 - vpmuldq %ymm4, %ymm0, %ymm8 - vpmuldq %ymm5, %ymm0, %ymm9 - vpmuldq %ymm6, %ymm1, %ymm6 - vpmuldq %ymm7, %ymm1, %ymm7 - vpmuldq %ymm8, %ymm1, %ymm8 - vpmuldq %ymm9, %ymm1, %ymm9 - vpsubq %ymm6, %ymm2, %ymm2 - vpsubq %ymm7, %ymm3, %ymm3 - vpsubq %ymm8, %ymm4, %ymm4 - vpsubq %ymm9, %ymm5, %ymm5 - vpsrlq $0x20, %ymm2, %ymm2 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] - vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovdqa %ymm2, (%rdi) - vmovdqa 
%ymm4, 0x20(%rdi) - addq $0x40, %rsi - addq $0x40, %rdx - addq $0x40, %rdi - addl $0x1, %eax - cmpl $0x10, %eax - jb Lpointwise_acc_l7_avx2_looptop2 + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1400(%rsi), %ymm6 + vmovdqa 0x1420(%rsi), %ymm8 + vmovdqa 0x1400(%rdx), %ymm10 + vmovdqa 0x1420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1800(%rsi), %ymm6 + vmovdqa 0x1820(%rsi), %ymm8 + vmovdqa 0x1800(%rdx), %ymm10 + vmovdqa 0x1820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = 
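pointwise_acc_l4/l5/l7 follow the same pattern, but first sum the 64-bit products across the 4, 5 or 7 polynomials of a vector (the 0x400-byte strides step exactly one 256-coefficient polynomial) and reduce only once. A scalar model for l = 7, reusing montgomery_reduce and MLDSA_Q from the sketch above; the array shapes are illustrative:

/* Scalar model of pointwise_acc_l7_avx2:
 * w[i] = (sum_k a[k][i]*b[k][i]) * 2^-32 mod q.
 * Accumulating in 64 bits before a single reduction is safe here since
 * the seven products stay well below the 2^31*q input bound of
 * montgomery_reduce for the coefficient ranges the callers maintain. */
static void polyvecl_pointwise_acc(int32_t w[256], const int32_t a[7][256],
                                   const int32_t b[7][256])
{
  for (unsigned i = 0; i < 256; i++)
  {
    int64_t acc = 0;
    for (unsigned k = 0; k < 7; k++)
    {
      acc += (int64_t)a[k][i] * b[k][i];
    }
    w[i] = montgomery_reduce(acc);
  }
}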
ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb Lpointwise_acc_l7_avx2_looptop2 retq .cfi_endproc diff --git a/mldsa/src/native/x86_64/src/poly_caddq_avx2.S b/mldsa/src/native/x86_64/src/poly_caddq_avx2.S index 78a1136ef..a8a8c3f3e 100644 --- a/mldsa/src/native/x86_64/src/poly_caddq_avx2.S +++ b/mldsa/src/native/x86_64/src/poly_caddq_avx2.S @@ -44,32 +44,32 @@ MLD_ASM_FN_SYMBOL(poly_caddq_avx2) .cfi_startproc - movl $0x7fe001, %edx # imm = 0x7FE001 - leaq 0x400(%rdi), %rax - vpxor %xmm2, %xmm2, %xmm2 - vmovd %edx, %xmm1 - vpbroadcastd %xmm1, %ymm1 + movl $0x7fe001, %edx # imm = 0x7FE001 + leaq 0x400(%rdi), %rax + vpxor %xmm2, %xmm2, %xmm2 + vmovd %edx, %xmm1 + vpbroadcastd %xmm1, %ymm1 Lpoly_caddq_avx2_loop: - vpcmpgtd (%rdi), %ymm2, %ymm0 - vpand %ymm1, %ymm0, %ymm0 - vpaddd (%rdi), %ymm0, %ymm0 - vmovdqa %ymm0, (%rdi) - vpcmpgtd 0x20(%rdi), %ymm2, %ymm3 - vpand %ymm1, %ymm3, %ymm3 - vpaddd 0x20(%rdi), %ymm3, %ymm3 - vmovdqa %ymm3, 0x20(%rdi) - vpcmpgtd 0x40(%rdi), %ymm2, %ymm4 - vpand %ymm1, %ymm4, %ymm4 - vpaddd 0x40(%rdi), %ymm4, %ymm4 - vmovdqa %ymm4, 0x40(%rdi) - vpcmpgtd 0x60(%rdi), %ymm2, %ymm5 - vpand %ymm1, %ymm5, %ymm5 - vpaddd 0x60(%rdi), %ymm5, %ymm5 - vmovdqa %ymm5, 0x60(%rdi) - addq $0x80, %rdi - cmpq %rdi, %rax - jne Lpoly_caddq_avx2_loop + vpcmpgtd (%rdi), %ymm2, %ymm0 + vpand %ymm1, %ymm0, %ymm0 + vpaddd (%rdi), %ymm0, %ymm0 + vmovdqa %ymm0, (%rdi) + vpcmpgtd 0x20(%rdi), %ymm2, %ymm3 + vpand %ymm1, %ymm3, %ymm3 + vpaddd 0x20(%rdi), %ymm3, %ymm3 + vmovdqa %ymm3, 0x20(%rdi) + vpcmpgtd 0x40(%rdi), %ymm2, %ymm4 + vpand %ymm1, %ymm4, %ymm4 + vpaddd 0x40(%rdi), %ymm4, %ymm4 + vmovdqa %ymm4, 0x40(%rdi) + vpcmpgtd 0x60(%rdi), %ymm2, %ymm5 + vpand %ymm1, %ymm5, %ymm5 + vpaddd 0x60(%rdi), %ymm5, %ymm5 + vmovdqa %ymm5, 0x60(%rdi) + addq $0x80, %rdi + cmpq %rdi, %rax + jne Lpoly_caddq_avx2_loop retq .cfi_endproc diff --git a/mldsa/src/sign.c b/mldsa/src/sign.c index 159ea7fbf..95efb54de 100644 --- a/mldsa/src/sign.c +++ b/mldsa/src/sign.c @@ -65,8 +65,8 @@ __contract__( requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) ensures(return_value == 0 - || return_value == MLD_ERR_FAIL - || return_value == MLD_ERR_OUT_OF_MEMORY + || return_value == MLD_ERR_FAIL + || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL) ); diff --git a/proofs/hol_light/aarch64/mldsa/mldsa_ntt.S b/proofs/hol_light/aarch64/mldsa/mldsa_ntt.S index 
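poly_caddq_avx2 normalizes coefficients into [0, q): the broadcast 0x7fe001 is q itself, vpcmpgtd against zero builds an all-ones mask on negative lanes, and vpand/vpaddd add q exactly there. The scalar, branch-free equivalent:

#include <stdint.h>

#define MLDSA_Q 8380417 /* 0x7fe001, as broadcast into ymm1 above */

/* Map a coefficient from (-q, q) to [0, q) in constant time: the
 * arithmetic right shift yields all-ones exactly when a is negative,
 * mirroring the vpcmpgtd/vpand/vpaddd sequence. */
static int32_t caddq(int32_t a)
{
  a += (a >> 31) & MLDSA_Q;
  return a;
}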
393258843..18dfe017d 100644 --- a/proofs/hol_light/aarch64/mldsa/mldsa_ntt.S +++ b/proofs/hol_light/aarch64/mldsa/mldsa_ntt.S @@ -1,18 +1,33 @@ -/* +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mlkem-native project authors * Copyright (c) The mldsa-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + * + * - [SLOTHY_Paper] + * Fast and Clean: Auditable high-performance assembly via constraint solving + * Abdulrahman, Becker, Kannwischer, Klein + * https://eprint.iacr.org/2022/1303 + */ + +/* AArch64 ML-DSA forward NTT following @[NeonNTT] and @[SLOTHY_Paper] */ + /* * WARNING: This file is auto-derived from the mldsa-native source file * dev/aarch64_opt/src/ntt.S using scripts/simpasm. Do not modify it directly. */ -#if defined(__ELF__) -.section .note.GNU-stack,"",@progbits -#endif - .text .balign 4 #ifdef __APPLE__ @@ -24,592 +39,614 @@ PQCP_MLDSA_NATIVE_MLDSA44_ntt_asm: #endif .cfi_startproc - sub sp, sp, #0x40 - stp d8, d9, [sp] - stp d10, d11, [sp, #0x10] - stp d12, d13, [sp, #0x20] - stp d14, d15, [sp, #0x30] - mov w5, #0xe001 // =57345 - movk w5, #0x7f, lsl #16 - dup v7.4s, w5 - mov x3, x0 - mov x4, #0x8 // =8 - ldr q0, [x1], #0x40 - ldur q1, [x1, #-0x30] - ldur q2, [x1, #-0x20] - ldur q3, [x1, #-0x10] - ldr q23, [x0, #0x390] - ldr q13, [x0, #0x380] - ldr q22, [x0, #0x80] - ldr q26, [x0, #0x190] - ldr q8, [x0, #0x280] - ldr q6, [x0, #0x210] - mul v10.4s, v13.4s, v0.s[0] - sqrdmulh v13.4s, v13.4s, v0.s[1] - mul v12.4s, v8.4s, v0.s[0] - sqrdmulh v27.4s, v8.4s, v0.s[1] - mul v4.4s, v6.4s, v0.s[0] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x180] - sqrdmulh v14.4s, v23.4s, v0.s[1] - mls v12.4s, v27.4s, v7.s[0] - add v31.4s, v13.4s, v10.4s - sub v13.4s, v13.4s, v10.4s - mul v10.4s, v23.4s, v0.s[0] - sqrdmulh v8.4s, v13.4s, v1.s[1] - sub v18.4s, v22.4s, v12.4s - mls v10.4s, v14.4s, v7.s[0] - mul v13.4s, v13.4s, v1.s[0] - mls v13.4s, v8.4s, v7.s[0] - sub v29.4s, v26.4s, v10.4s - add v25.4s, v26.4s, v10.4s - mul v10.4s, v31.4s, v0.s[2] - mul v14.4s, v25.4s, v0.s[2] - add v17.4s, v18.4s, v13.4s - sub v15.4s, v18.4s, v13.4s - sqrdmulh v13.4s, v31.4s, v0.s[3] - sqrdmulh v20.4s, v15.4s, v3.s[1] - sqrdmulh v5.4s, v17.4s, v2.s[3] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x300] - mul v18.4s, v17.4s, v2.s[2] - add v31.4s, v22.4s, v12.4s - mul v23.4s, v15.4s, v3.s[0] - ldr q17, [x0, #0x90] - add v19.4s, v31.4s, v10.4s - sub v16.4s, v31.4s, v10.4s - mul v10.4s, v13.4s, v0.s[0] - sqrdmulh v13.4s, v13.4s, v0.s[1] - sqrdmulh v27.4s, v16.4s, v2.s[1] - mul v11.4s, v16.4s, v2.s[0] - mls v10.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x290] - ldr q22, [x0, #0x100] - mls v11.4s, v27.4s, v7.s[0] - sqrdmulh v15.4s, v13.4s, v0.s[1] - sub v12.4s, v22.4s, v10.4s - add v30.4s, v22.4s, v10.4s - mul v10.4s, v13.4s, v0.s[0] - ldr q28, [x0] - sqrdmulh v13.4s, v25.4s, v0.s[3] - sqrdmulh v27.4s, v30.4s, v0.s[3] - mls v10.4s, v15.4s, v7.s[0] - mls v14.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x200] - sqrdmulh v25.4s, v12.4s, v1.s[1] - add v24.4s, v17.4s, v10.4s - sub v21.4s, v17.4s, v10.4s - sqrdmulh v8.4s, v13.4s, v0.s[1] - sub v9.4s, v24.4s, v14.4s - mul v26.4s, v12.4s, v1.s[0] - mul v13.4s, v13.4s, v0.s[0] - mls v13.4s, v8.4s, v7.s[0] - mul v8.4s, v30.4s, v0.s[2] - mls v8.4s, 
v27.4s, v7.s[0] - add v16.4s, v28.4s, v13.4s - sub v10.4s, v28.4s, v13.4s - mls v26.4s, v25.4s, v7.s[0] - sqrdmulh v12.4s, v19.4s, v1.s[3] - sub v25.4s, v16.4s, v8.4s - mls v23.4s, v20.4s, v7.s[0] - sub v22.4s, v25.4s, v11.4s - sqrdmulh v20.4s, v9.4s, v2.s[1] - sub v15.4s, v10.4s, v26.4s - sub x4, x4, #0x2 + sub sp, sp, #0x40 + .cfi_adjust_cfa_offset 0x40 + stp d8, d9, [sp] + .cfi_rel_offset d8, 0x0 + .cfi_rel_offset d9, 0x8 + stp d10, d11, [sp, #0x10] + .cfi_rel_offset d10, 0x10 + .cfi_rel_offset d11, 0x18 + stp d12, d13, [sp, #0x20] + .cfi_rel_offset d12, 0x20 + .cfi_rel_offset d13, 0x28 + stp d14, d15, [sp, #0x30] + .cfi_rel_offset d14, 0x30 + .cfi_rel_offset d15, 0x38 + mov w5, #0xe001 // =57345 + movk w5, #0x7f, lsl #16 + dup v7.4s, w5 + mov x3, x0 + mov x4, #0x8 // =8 + ldr q0, [x1], #0x40 + ldur q1, [x1, #-0x30] + ldur q2, [x1, #-0x20] + ldur q3, [x1, #-0x10] + ldr q23, [x0, #0x390] + ldr q13, [x0, #0x380] + ldr q22, [x0, #0x80] + ldr q26, [x0, #0x190] + ldr q8, [x0, #0x280] + ldr q6, [x0, #0x210] + mul v10.4s, v13.4s, v0.s[0] + sqrdmulh v13.4s, v13.4s, v0.s[1] + mul v12.4s, v8.4s, v0.s[0] + sqrdmulh v27.4s, v8.4s, v0.s[1] + mul v4.4s, v6.4s, v0.s[0] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x180] + sqrdmulh v14.4s, v23.4s, v0.s[1] + mls v12.4s, v27.4s, v7.s[0] + add v31.4s, v13.4s, v10.4s + sub v13.4s, v13.4s, v10.4s + mul v10.4s, v23.4s, v0.s[0] + sqrdmulh v8.4s, v13.4s, v1.s[1] + sub v18.4s, v22.4s, v12.4s + mls v10.4s, v14.4s, v7.s[0] + mul v13.4s, v13.4s, v1.s[0] + mls v13.4s, v8.4s, v7.s[0] + sub v29.4s, v26.4s, v10.4s + add v25.4s, v26.4s, v10.4s + mul v10.4s, v31.4s, v0.s[2] + mul v14.4s, v25.4s, v0.s[2] + add v17.4s, v18.4s, v13.4s + sub v15.4s, v18.4s, v13.4s + sqrdmulh v13.4s, v31.4s, v0.s[3] + sqrdmulh v20.4s, v15.4s, v3.s[1] + sqrdmulh v5.4s, v17.4s, v2.s[3] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x300] + mul v18.4s, v17.4s, v2.s[2] + add v31.4s, v22.4s, v12.4s + mul v23.4s, v15.4s, v3.s[0] + ldr q17, [x0, #0x90] + add v19.4s, v31.4s, v10.4s + sub v16.4s, v31.4s, v10.4s + mul v10.4s, v13.4s, v0.s[0] + sqrdmulh v13.4s, v13.4s, v0.s[1] + sqrdmulh v27.4s, v16.4s, v2.s[1] + mul v11.4s, v16.4s, v2.s[0] + mls v10.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x290] + ldr q22, [x0, #0x100] + mls v11.4s, v27.4s, v7.s[0] + sqrdmulh v15.4s, v13.4s, v0.s[1] + sub v12.4s, v22.4s, v10.4s + add v30.4s, v22.4s, v10.4s + mul v10.4s, v13.4s, v0.s[0] + ldr q28, [x0] + sqrdmulh v13.4s, v25.4s, v0.s[3] + sqrdmulh v27.4s, v30.4s, v0.s[3] + mls v10.4s, v15.4s, v7.s[0] + mls v14.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x200] + sqrdmulh v25.4s, v12.4s, v1.s[1] + add v24.4s, v17.4s, v10.4s + sub v21.4s, v17.4s, v10.4s + sqrdmulh v8.4s, v13.4s, v0.s[1] + sub v9.4s, v24.4s, v14.4s + mul v26.4s, v12.4s, v1.s[0] + mul v13.4s, v13.4s, v0.s[0] + mls v13.4s, v8.4s, v7.s[0] + mul v8.4s, v30.4s, v0.s[2] + mls v8.4s, v27.4s, v7.s[0] + add v16.4s, v28.4s, v13.4s + sub v10.4s, v28.4s, v13.4s + mls v26.4s, v25.4s, v7.s[0] + sqrdmulh v12.4s, v19.4s, v1.s[3] + sub v25.4s, v16.4s, v8.4s + mls v23.4s, v20.4s, v7.s[0] + sub v22.4s, v25.4s, v11.4s + sqrdmulh v20.4s, v9.4s, v2.s[1] + sub v15.4s, v10.4s, v26.4s + sub x4, x4, #0x2 Lntt_layer123_start: - add v31.4s, v10.4s, v26.4s - mul v17.4s, v19.4s, v1.s[2] - add v26.4s, v15.4s, v23.4s - ldr q30, [x0, #0x2a0] - sub v13.4s, v15.4s, v23.4s - mul v23.4s, v29.4s, v1.s[0] - add v25.4s, v25.4s, v11.4s - str q22, [x0, #0x180] - mul v11.4s, v9.4s, v2.s[0] - str q13, [x0, #0x380] - ldr q28, [x0, #0x10] - add v10.4s, v16.4s, v8.4s - mls v17.4s, v12.4s, v7.s[0] - ldr q13, 
[x0, #0x3a0] - str q26, [x0, #0x300] - sqrdmulh v27.4s, v30.4s, v0.s[1] - mls v18.4s, v5.4s, v7.s[0] - ldr q9, [x0, #0x1a0] - sub v16.4s, v10.4s, v17.4s - add v15.4s, v10.4s, v17.4s - sqrdmulh v10.4s, v6.4s, v0.s[1] - str q16, [x0, #0x80] - str q15, [x0], #0x10 - sqrdmulh v19.4s, v13.4s, v0.s[1] - sub v15.4s, v31.4s, v18.4s - mul v8.4s, v13.4s, v0.s[0] - add v26.4s, v31.4s, v18.4s - str q15, [x0, #0x270] - sqrdmulh v13.4s, v29.4s, v1.s[1] - str q26, [x0, #0x1f0] - mls v8.4s, v19.4s, v7.s[0] - mls v11.4s, v20.4s, v7.s[0] - mls v23.4s, v13.4s, v7.s[0] - add v22.4s, v9.4s, v8.4s - ldr q6, [x0, #0x210] - sub v29.4s, v9.4s, v8.4s - mul v17.4s, v30.4s, v0.s[0] - ldr q9, [x0, #0x300] - sqrdmulh v13.4s, v22.4s, v0.s[3] - add v18.4s, v21.4s, v23.4s - mls v4.4s, v10.4s, v7.s[0] - sub v31.4s, v21.4s, v23.4s - sqrdmulh v16.4s, v31.4s, v3.s[1] - add v19.4s, v24.4s, v14.4s - mul v14.4s, v22.4s, v0.s[2] - sub v10.4s, v28.4s, v4.4s - mls v14.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0x100] - sqrdmulh v22.4s, v9.4s, v0.s[1] - mul v8.4s, v9.4s, v0.s[0] - mul v23.4s, v31.4s, v3.s[0] - mls v8.4s, v22.4s, v7.s[0] - mls v23.4s, v16.4s, v7.s[0] - add v16.4s, v28.4s, v4.4s - ldr q22, [x0, #0x90] - mul v4.4s, v6.4s, v0.s[0] - mls v17.4s, v27.4s, v7.s[0] - add v21.4s, v13.4s, v8.4s - sub v27.4s, v13.4s, v8.4s - sqrdmulh v31.4s, v21.4s, v0.s[3] - str q25, [x0, #0xf0] - mul v8.4s, v21.4s, v0.s[2] - add v24.4s, v22.4s, v17.4s - sub v21.4s, v22.4s, v17.4s - sqrdmulh v5.4s, v18.4s, v2.s[3] - mls v8.4s, v31.4s, v7.s[0] - sub v9.4s, v24.4s, v14.4s - sqrdmulh v20.4s, v27.4s, v1.s[1] - mul v26.4s, v27.4s, v1.s[0] - sub v25.4s, v16.4s, v8.4s - mul v18.4s, v18.4s, v2.s[2] - sub v22.4s, v25.4s, v11.4s - mls v26.4s, v20.4s, v7.s[0] - sqrdmulh v20.4s, v9.4s, v2.s[1] - sqrdmulh v12.4s, v19.4s, v1.s[3] - sub v15.4s, v10.4s, v26.4s - subs x4, x4, #0x1 - cbnz x4, Lntt_layer123_start - add v13.4s, v10.4s, v26.4s - mls v18.4s, v5.4s, v7.s[0] - str q22, [x0, #0x180] - add v27.4s, v16.4s, v8.4s - mul v22.4s, v19.4s, v1.s[2] - add v26.4s, v24.4s, v14.4s - ldr q31, [x0, #0x110] - sub v14.4s, v15.4s, v23.4s - add v17.4s, v15.4s, v23.4s - mls v22.4s, v12.4s, v7.s[0] - add v28.4s, v13.4s, v18.4s - str q14, [x0, #0x380] - sqrdmulh v24.4s, v6.4s, v0.s[1] - add v5.4s, v25.4s, v11.4s - sub v19.4s, v13.4s, v18.4s - str q17, [x0, #0x300] - str q5, [x0, #0x100] - mul v16.4s, v9.4s, v2.s[0] - ldr q18, [x0, #0x310] - str q19, [x0, #0x280] - mls v16.4s, v20.4s, v7.s[0] - str q28, [x0, #0x200] - add v13.4s, v27.4s, v22.4s - ldr q15, [x0, #0x10] - sub v10.4s, v27.4s, v22.4s - mls v4.4s, v24.4s, v7.s[0] - str q13, [x0], #0x10 - str q10, [x0, #0x70] - sqrdmulh v12.4s, v29.4s, v1.s[1] - mul v23.4s, v29.4s, v1.s[0] - mul v8.4s, v26.4s, v1.s[2] - add v20.4s, v15.4s, v4.4s - sub v6.4s, v15.4s, v4.4s - mls v23.4s, v12.4s, v7.s[0] - sqrdmulh v22.4s, v18.4s, v0.s[1] - mul v5.4s, v18.4s, v0.s[0] - sub v28.4s, v21.4s, v23.4s - sqrdmulh v10.4s, v26.4s, v1.s[3] - mls v5.4s, v22.4s, v7.s[0] - sqrdmulh v30.4s, v28.4s, v3.s[1] - add v4.4s, v21.4s, v23.4s - mls v8.4s, v10.4s, v7.s[0] - add v12.4s, v31.4s, v5.4s - sub v9.4s, v31.4s, v5.4s - sqrdmulh v25.4s, v4.4s, v2.s[3] - sqrdmulh v15.4s, v9.4s, v1.s[1] - sqrdmulh v31.4s, v12.4s, v0.s[3] - mul v18.4s, v12.4s, v0.s[2] - mul v11.4s, v9.4s, v1.s[0] - mls v18.4s, v31.4s, v7.s[0] - mul v29.4s, v4.4s, v2.s[2] - mls v29.4s, v25.4s, v7.s[0] - add v23.4s, v20.4s, v18.4s - mls v11.4s, v15.4s, v7.s[0] - sub v31.4s, v20.4s, v18.4s - add v17.4s, v23.4s, v8.4s - add v5.4s, v31.4s, v16.4s - mul v24.4s, v28.4s, v3.s[0] - str q17, [x0], 
#0x10 - sub v19.4s, v31.4s, v16.4s - mls v24.4s, v30.4s, v7.s[0] - str q5, [x0, #0xf0] - add v31.4s, v6.4s, v11.4s - sub v26.4s, v23.4s, v8.4s - str q19, [x0, #0x170] - add v4.4s, v31.4s, v29.4s - sub v13.4s, v6.4s, v11.4s - str q26, [x0, #0x70] - sub v11.4s, v31.4s, v29.4s - sub v22.4s, v13.4s, v24.4s - add v23.4s, v13.4s, v24.4s - str q4, [x0, #0x1f0] - str q11, [x0, #0x270] - str q23, [x0, #0x2f0] - str q22, [x0, #0x370] - mov x0, x3 - mov x4, #0x8 // =8 - ldr q9, [x0, #0x40] - ldr q23, [x1], #0x40 - ldr q21, [x2, #0x60] - ldr q1, [x0, #0x20] - ldur q14, [x1, #-0x30] - ldr q13, [x0] - ldr q11, [x2, #0x50] - sqrdmulh v16.4s, v9.4s, v23.s[1] - ldr q17, [x0, #0x50] - mul v15.4s, v9.4s, v23.s[0] - ldr q30, [x0, #0x70] - ldr q27, [x0, #0x60] - ldr q8, [x2, #0x30] - sqrdmulh v12.4s, v17.4s, v23.s[1] - ldr q6, [x0, #0x30] - mls v15.4s, v16.4s, v7.s[0] - sqrdmulh v18.4s, v27.4s, v23.s[1] - sqrdmulh v19.4s, v30.4s, v23.s[1] - add v5.4s, v13.4s, v15.4s - mul v25.4s, v27.4s, v23.s[0] - sub v26.4s, v13.4s, v15.4s - mls v25.4s, v18.4s, v7.s[0] - mul v10.4s, v17.4s, v23.s[0] - mls v10.4s, v12.4s, v7.s[0] - mul v4.4s, v30.4s, v23.s[0] - sub v22.4s, v1.4s, v25.4s - mls v4.4s, v19.4s, v7.s[0] - add v28.4s, v1.4s, v25.4s - sqrdmulh v19.4s, v28.4s, v23.s[3] - sqrdmulh v9.4s, v22.4s, v14.s[1] - add v2.4s, v6.4s, v4.4s - mul v0.4s, v28.4s, v23.s[2] - sqrdmulh v27.4s, v2.4s, v23.s[3] - sub v17.4s, v6.4s, v4.4s - mul v3.4s, v2.4s, v23.s[2] - sqrdmulh v20.4s, v17.4s, v14.s[1] - ldr q1, [x0, #0x10] - mls v3.4s, v27.4s, v7.s[0] - mls v0.4s, v19.4s, v7.s[0] - ldur q16, [x1, #-0x20] - add v31.4s, v1.4s, v10.4s - mul v30.4s, v17.4s, v14.s[0] - mls v30.4s, v20.4s, v7.s[0] - add v27.4s, v31.4s, v3.4s - sub v23.4s, v1.4s, v10.4s - sub v24.4s, v31.4s, v3.4s - sqrdmulh v4.4s, v27.4s, v14.s[3] - sqrdmulh v10.4s, v24.4s, v16.s[1] - mul v18.4s, v24.4s, v16.s[0] - add v15.4s, v23.4s, v30.4s - sub v23.4s, v23.4s, v30.4s - mul v29.4s, v27.4s, v14.s[2] - sub v2.4s, v5.4s, v0.4s - add v12.4s, v5.4s, v0.4s - mls v18.4s, v10.4s, v7.s[0] - ldur q3, [x1, #-0x10] - mls v29.4s, v4.4s, v7.s[0] - mul v4.4s, v22.4s, v14.s[0] - add v1.4s, v2.4s, v18.4s - sub v24.4s, v2.4s, v18.4s - mls v4.4s, v9.4s, v7.s[0] - ldr q20, [x2, #0x10] - add v25.4s, v12.4s, v29.4s - mul v9.4s, v23.4s, v3.s[0] - sub v5.4s, v12.4s, v29.4s - sqrdmulh v31.4s, v23.4s, v3.s[1] - trn2 v6.4s, v1.4s, v24.4s - trn2 v10.4s, v25.4s, v5.4s - sqrdmulh v13.4s, v15.4s, v16.s[3] - trn2 v30.2d, v10.2d, v6.2d - ldr q3, [x2], #0xc0 - mul v12.4s, v15.4s, v16.s[2] - trn1 v27.2d, v10.2d, v6.2d - mls v9.4s, v31.4s, v7.s[0] - trn1 v22.4s, v25.4s, v5.4s - sub v6.4s, v26.4s, v4.4s - mls v12.4s, v13.4s, v7.s[0] - trn1 v1.4s, v1.4s, v24.4s - add v13.4s, v26.4s, v4.4s - trn2 v10.2d, v22.2d, v1.2d - mul v28.4s, v30.4s, v3.4s - sub v31.4s, v6.4s, v9.4s - sub x4, x4, #0x1 + add v31.4s, v10.4s, v26.4s + mul v17.4s, v19.4s, v1.s[2] + add v26.4s, v15.4s, v23.4s + ldr q30, [x0, #0x2a0] + sub v13.4s, v15.4s, v23.4s + mul v23.4s, v29.4s, v1.s[0] + add v25.4s, v25.4s, v11.4s + str q22, [x0, #0x180] + mul v11.4s, v9.4s, v2.s[0] + str q13, [x0, #0x380] + ldr q28, [x0, #0x10] + add v10.4s, v16.4s, v8.4s + mls v17.4s, v12.4s, v7.s[0] + ldr q13, [x0, #0x3a0] + str q26, [x0, #0x300] + sqrdmulh v27.4s, v30.4s, v0.s[1] + mls v18.4s, v5.4s, v7.s[0] + ldr q9, [x0, #0x1a0] + sub v16.4s, v10.4s, v17.4s + add v15.4s, v10.4s, v17.4s + sqrdmulh v10.4s, v6.4s, v0.s[1] + str q16, [x0, #0x80] + str q15, [x0], #0x10 + sqrdmulh v19.4s, v13.4s, v0.s[1] + sub v15.4s, v31.4s, v18.4s + mul v8.4s, v13.4s, v0.s[0] + add 
v26.4s, v31.4s, v18.4s + str q15, [x0, #0x270] + sqrdmulh v13.4s, v29.4s, v1.s[1] + str q26, [x0, #0x1f0] + mls v8.4s, v19.4s, v7.s[0] + mls v11.4s, v20.4s, v7.s[0] + mls v23.4s, v13.4s, v7.s[0] + add v22.4s, v9.4s, v8.4s + ldr q6, [x0, #0x210] + sub v29.4s, v9.4s, v8.4s + mul v17.4s, v30.4s, v0.s[0] + ldr q9, [x0, #0x300] + sqrdmulh v13.4s, v22.4s, v0.s[3] + add v18.4s, v21.4s, v23.4s + mls v4.4s, v10.4s, v7.s[0] + sub v31.4s, v21.4s, v23.4s + sqrdmulh v16.4s, v31.4s, v3.s[1] + add v19.4s, v24.4s, v14.4s + mul v14.4s, v22.4s, v0.s[2] + sub v10.4s, v28.4s, v4.4s + mls v14.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0x100] + sqrdmulh v22.4s, v9.4s, v0.s[1] + mul v8.4s, v9.4s, v0.s[0] + mul v23.4s, v31.4s, v3.s[0] + mls v8.4s, v22.4s, v7.s[0] + mls v23.4s, v16.4s, v7.s[0] + add v16.4s, v28.4s, v4.4s + ldr q22, [x0, #0x90] + mul v4.4s, v6.4s, v0.s[0] + mls v17.4s, v27.4s, v7.s[0] + add v21.4s, v13.4s, v8.4s + sub v27.4s, v13.4s, v8.4s + sqrdmulh v31.4s, v21.4s, v0.s[3] + str q25, [x0, #0xf0] + mul v8.4s, v21.4s, v0.s[2] + add v24.4s, v22.4s, v17.4s + sub v21.4s, v22.4s, v17.4s + sqrdmulh v5.4s, v18.4s, v2.s[3] + mls v8.4s, v31.4s, v7.s[0] + sub v9.4s, v24.4s, v14.4s + sqrdmulh v20.4s, v27.4s, v1.s[1] + mul v26.4s, v27.4s, v1.s[0] + sub v25.4s, v16.4s, v8.4s + mul v18.4s, v18.4s, v2.s[2] + sub v22.4s, v25.4s, v11.4s + mls v26.4s, v20.4s, v7.s[0] + sqrdmulh v20.4s, v9.4s, v2.s[1] + sqrdmulh v12.4s, v19.4s, v1.s[3] + sub v15.4s, v10.4s, v26.4s + subs x4, x4, #0x1 + cbnz x4, Lntt_layer123_start + add v13.4s, v10.4s, v26.4s + mls v18.4s, v5.4s, v7.s[0] + str q22, [x0, #0x180] + add v27.4s, v16.4s, v8.4s + mul v22.4s, v19.4s, v1.s[2] + add v26.4s, v24.4s, v14.4s + ldr q31, [x0, #0x110] + sub v14.4s, v15.4s, v23.4s + add v17.4s, v15.4s, v23.4s + mls v22.4s, v12.4s, v7.s[0] + add v28.4s, v13.4s, v18.4s + str q14, [x0, #0x380] + sqrdmulh v24.4s, v6.4s, v0.s[1] + add v5.4s, v25.4s, v11.4s + sub v19.4s, v13.4s, v18.4s + str q17, [x0, #0x300] + str q5, [x0, #0x100] + mul v16.4s, v9.4s, v2.s[0] + ldr q18, [x0, #0x310] + str q19, [x0, #0x280] + mls v16.4s, v20.4s, v7.s[0] + str q28, [x0, #0x200] + add v13.4s, v27.4s, v22.4s + ldr q15, [x0, #0x10] + sub v10.4s, v27.4s, v22.4s + mls v4.4s, v24.4s, v7.s[0] + str q13, [x0], #0x10 + str q10, [x0, #0x70] + sqrdmulh v12.4s, v29.4s, v1.s[1] + mul v23.4s, v29.4s, v1.s[0] + mul v8.4s, v26.4s, v1.s[2] + add v20.4s, v15.4s, v4.4s + sub v6.4s, v15.4s, v4.4s + mls v23.4s, v12.4s, v7.s[0] + sqrdmulh v22.4s, v18.4s, v0.s[1] + mul v5.4s, v18.4s, v0.s[0] + sub v28.4s, v21.4s, v23.4s + sqrdmulh v10.4s, v26.4s, v1.s[3] + mls v5.4s, v22.4s, v7.s[0] + sqrdmulh v30.4s, v28.4s, v3.s[1] + add v4.4s, v21.4s, v23.4s + mls v8.4s, v10.4s, v7.s[0] + add v12.4s, v31.4s, v5.4s + sub v9.4s, v31.4s, v5.4s + sqrdmulh v25.4s, v4.4s, v2.s[3] + sqrdmulh v15.4s, v9.4s, v1.s[1] + sqrdmulh v31.4s, v12.4s, v0.s[3] + mul v18.4s, v12.4s, v0.s[2] + mul v11.4s, v9.4s, v1.s[0] + mls v18.4s, v31.4s, v7.s[0] + mul v29.4s, v4.4s, v2.s[2] + mls v29.4s, v25.4s, v7.s[0] + add v23.4s, v20.4s, v18.4s + mls v11.4s, v15.4s, v7.s[0] + sub v31.4s, v20.4s, v18.4s + add v17.4s, v23.4s, v8.4s + add v5.4s, v31.4s, v16.4s + mul v24.4s, v28.4s, v3.s[0] + str q17, [x0], #0x10 + sub v19.4s, v31.4s, v16.4s + mls v24.4s, v30.4s, v7.s[0] + str q5, [x0, #0xf0] + add v31.4s, v6.4s, v11.4s + sub v26.4s, v23.4s, v8.4s + str q19, [x0, #0x170] + add v4.4s, v31.4s, v29.4s + sub v13.4s, v6.4s, v11.4s + str q26, [x0, #0x70] + sub v11.4s, v31.4s, v29.4s + sub v22.4s, v13.4s, v24.4s + add v23.4s, v13.4s, v24.4s + str q4, [x0, #0x1f0] + str 
q11, [x0, #0x270] + str q23, [x0, #0x2f0] + str q22, [x0, #0x370] + mov x0, x3 + mov x4, #0x8 // =8 + ldr q9, [x0, #0x40] + ldr q23, [x1], #0x40 + ldr q21, [x2, #0x60] + ldr q1, [x0, #0x20] + ldur q14, [x1, #-0x30] + ldr q13, [x0] + ldr q11, [x2, #0x50] + sqrdmulh v16.4s, v9.4s, v23.s[1] + ldr q17, [x0, #0x50] + mul v15.4s, v9.4s, v23.s[0] + ldr q30, [x0, #0x70] + ldr q27, [x0, #0x60] + ldr q8, [x2, #0x30] + sqrdmulh v12.4s, v17.4s, v23.s[1] + ldr q6, [x0, #0x30] + mls v15.4s, v16.4s, v7.s[0] + sqrdmulh v18.4s, v27.4s, v23.s[1] + sqrdmulh v19.4s, v30.4s, v23.s[1] + add v5.4s, v13.4s, v15.4s + mul v25.4s, v27.4s, v23.s[0] + sub v26.4s, v13.4s, v15.4s + mls v25.4s, v18.4s, v7.s[0] + mul v10.4s, v17.4s, v23.s[0] + mls v10.4s, v12.4s, v7.s[0] + mul v4.4s, v30.4s, v23.s[0] + sub v22.4s, v1.4s, v25.4s + mls v4.4s, v19.4s, v7.s[0] + add v28.4s, v1.4s, v25.4s + sqrdmulh v19.4s, v28.4s, v23.s[3] + sqrdmulh v9.4s, v22.4s, v14.s[1] + add v2.4s, v6.4s, v4.4s + mul v0.4s, v28.4s, v23.s[2] + sqrdmulh v27.4s, v2.4s, v23.s[3] + sub v17.4s, v6.4s, v4.4s + mul v3.4s, v2.4s, v23.s[2] + sqrdmulh v20.4s, v17.4s, v14.s[1] + ldr q1, [x0, #0x10] + mls v3.4s, v27.4s, v7.s[0] + mls v0.4s, v19.4s, v7.s[0] + ldur q16, [x1, #-0x20] + add v31.4s, v1.4s, v10.4s + mul v30.4s, v17.4s, v14.s[0] + mls v30.4s, v20.4s, v7.s[0] + add v27.4s, v31.4s, v3.4s + sub v23.4s, v1.4s, v10.4s + sub v24.4s, v31.4s, v3.4s + sqrdmulh v4.4s, v27.4s, v14.s[3] + sqrdmulh v10.4s, v24.4s, v16.s[1] + mul v18.4s, v24.4s, v16.s[0] + add v15.4s, v23.4s, v30.4s + sub v23.4s, v23.4s, v30.4s + mul v29.4s, v27.4s, v14.s[2] + sub v2.4s, v5.4s, v0.4s + add v12.4s, v5.4s, v0.4s + mls v18.4s, v10.4s, v7.s[0] + ldur q3, [x1, #-0x10] + mls v29.4s, v4.4s, v7.s[0] + mul v4.4s, v22.4s, v14.s[0] + add v1.4s, v2.4s, v18.4s + sub v24.4s, v2.4s, v18.4s + mls v4.4s, v9.4s, v7.s[0] + ldr q20, [x2, #0x10] + add v25.4s, v12.4s, v29.4s + mul v9.4s, v23.4s, v3.s[0] + sub v5.4s, v12.4s, v29.4s + sqrdmulh v31.4s, v23.4s, v3.s[1] + trn2 v6.4s, v1.4s, v24.4s + trn2 v10.4s, v25.4s, v5.4s + sqrdmulh v13.4s, v15.4s, v16.s[3] + trn2 v30.2d, v10.2d, v6.2d + ldr q3, [x2], #0xc0 + mul v12.4s, v15.4s, v16.s[2] + trn1 v27.2d, v10.2d, v6.2d + mls v9.4s, v31.4s, v7.s[0] + trn1 v22.4s, v25.4s, v5.4s + sub v6.4s, v26.4s, v4.4s + mls v12.4s, v13.4s, v7.s[0] + trn1 v1.4s, v1.4s, v24.4s + add v13.4s, v26.4s, v4.4s + trn2 v10.2d, v22.2d, v1.2d + mul v28.4s, v30.4s, v3.4s + sub v31.4s, v6.4s, v9.4s + sub x4, x4, #0x1 Lntt_layer45678_start: - add v2.4s, v13.4s, v12.4s - sqrdmulh v5.4s, v30.4s, v20.4s - sub v25.4s, v13.4s, v12.4s - add v17.4s, v6.4s, v9.4s - mul v19.4s, v10.4s, v3.4s - trn2 v4.4s, v2.4s, v25.4s - ldur q24, [x2, #-0x50] - trn2 v29.4s, v17.4s, v31.4s - sqrdmulh v15.4s, v10.4s, v20.4s - mls v28.4s, v5.4s, v7.s[0] - trn2 v3.2d, v4.2d, v29.2d - sqrdmulh v12.4s, v3.4s, v24.4s - mul v16.4s, v3.4s, v21.4s - mls v19.4s, v15.4s, v7.s[0] - ldur q10, [x2, #-0xa0] - add v13.4s, v27.4s, v28.4s - mls v16.4s, v12.4s, v7.s[0] - sqrdmulh v9.4s, v13.4s, v8.4s - sub v30.4s, v27.4s, v28.4s - ldr q18, [x1], #0x40 - mul v8.4s, v13.4s, v10.4s - ldr q10, [x0, #0xd0] - sqrdmulh v14.4s, v30.4s, v11.4s - ldr q23, [x0, #0xe0] - sqrdmulh v13.4s, v10.4s, v18.s[1] - sqrdmulh v12.4s, v23.4s, v18.s[1] - ldur q6, [x2, #-0x80] - mul v3.4s, v10.4s, v18.s[0] - mls v3.4s, v13.4s, v7.s[0] - ldr q13, [x0, #0xf0] - trn1 v27.4s, v2.4s, v25.4s - mul v2.4s, v30.4s, v6.4s - trn1 v20.4s, v17.4s, v31.4s - trn1 v25.2d, v4.2d, v29.2d - sqrdmulh v10.4s, v13.4s, v18.s[1] - trn2 v5.2d, v27.2d, v20.2d - ldur q6, [x2, #-0x10] - mls 
v8.4s, v9.4s, v7.s[0] - sub v15.4s, v25.4s, v16.4s - sqrdmulh v31.4s, v5.4s, v24.4s - sqrdmulh v30.4s, v15.4s, v6.4s - ldur q9, [x2, #-0x30] - mul v4.4s, v5.4s, v21.4s - ldur q21, [x2, #-0x40] - ldur q6, [x2, #-0x20] - add v5.4s, v25.4s, v16.4s - mls v4.4s, v31.4s, v7.s[0] - mul v0.4s, v5.4s, v21.4s - mul v17.4s, v13.4s, v18.s[0] - mls v17.4s, v10.4s, v7.s[0] - ldr q28, [x0, #0xb0] - sqrdmulh v26.4s, v5.4s, v9.4s - mul v9.4s, v15.4s, v6.4s - trn1 v6.2d, v22.2d, v1.2d - mls v9.4s, v30.4s, v7.s[0] - add v25.4s, v28.4s, v17.4s - mls v2.4s, v14.4s, v7.s[0] - trn1 v5.2d, v27.2d, v20.2d - ldr q20, [x2, #0x10] - mul v29.4s, v25.4s, v18.s[2] - add v15.4s, v6.4s, v19.4s - ldr q30, [x0, #0xc0] - sub v19.4s, v6.4s, v19.4s - add v31.4s, v15.4s, v8.4s - mls v0.4s, v26.4s, v7.s[0] - ldur q14, [x1, #-0x30] - add v21.4s, v19.4s, v2.4s - sub v24.4s, v19.4s, v2.4s - sqrdmulh v27.4s, v25.4s, v18.s[3] - sub v26.4s, v15.4s, v8.4s - ldr q2, [x0, #0x90] - mul v16.4s, v30.4s, v18.s[0] - sub v25.4s, v28.4s, v17.4s - trn1 v11.4s, v31.4s, v26.4s - ldr q1, [x0, #0xa0] - trn1 v6.4s, v21.4s, v24.4s - sqrdmulh v13.4s, v25.4s, v14.s[1] - add v8.4s, v2.4s, v3.4s - trn2 v28.2d, v11.2d, v6.2d - sqrdmulh v19.4s, v30.4s, v18.s[1] - sub v10.4s, v5.4s, v4.4s - ldur q22, [x1, #-0x20] - str q28, [x0, #0x20] - mls v29.4s, v27.4s, v7.s[0] - add v15.4s, v10.4s, v9.4s - mul v25.4s, v25.4s, v14.s[0] - ldur q27, [x1, #-0x10] - trn2 v17.4s, v31.4s, v26.4s - trn2 v21.4s, v21.4s, v24.4s - mls v16.4s, v19.4s, v7.s[0] - sub v24.4s, v8.4s, v29.4s - sub v10.4s, v10.4s, v9.4s - mls v25.4s, v13.4s, v7.s[0] - trn1 v13.2d, v11.2d, v6.2d - ldr q28, [x0, #0x80] - sqrdmulh v30.4s, v24.4s, v22.s[1] - trn2 v19.2d, v17.2d, v21.2d - trn1 v6.2d, v17.2d, v21.2d - mul v31.4s, v23.4s, v18.s[0] - str q13, [x0], #0x80 - stur q6, [x0, #-0x70] - stur q19, [x0, #-0x50] - ldr q11, [x2, #0x50] - mls v31.4s, v12.4s, v7.s[0] - ldr q21, [x2, #0x60] - trn1 v9.4s, v15.4s, v10.4s - trn2 v6.4s, v15.4s, v10.4s - mul v24.4s, v24.4s, v22.s[0] - sub v10.4s, v2.4s, v3.4s - ldr q3, [x2], #0xc0 - mls v24.4s, v30.4s, v7.s[0] - add v26.4s, v8.4s, v29.4s - ldur q8, [x2, #-0x90] - add v17.4s, v5.4s, v4.4s - sqrdmulh v2.4s, v26.4s, v14.s[3] - sub v13.4s, v1.4s, v31.4s - add v30.4s, v1.4s, v31.4s - add v15.4s, v10.4s, v25.4s - sqrdmulh v19.4s, v13.4s, v14.s[1] - sub v25.4s, v10.4s, v25.4s - mul v29.4s, v13.4s, v14.s[0] - sub v5.4s, v28.4s, v16.4s - sqrdmulh v4.4s, v30.4s, v18.s[3] - sub v23.4s, v17.4s, v0.4s - add v31.4s, v17.4s, v0.4s - mul v18.4s, v30.4s, v18.s[2] - add v1.4s, v28.4s, v16.4s - trn2 v12.4s, v31.4s, v23.4s - mls v29.4s, v19.4s, v7.s[0] - trn1 v13.4s, v31.4s, v23.4s - trn2 v30.2d, v12.2d, v6.2d - mls v18.4s, v4.4s, v7.s[0] - trn2 v10.2d, v13.2d, v9.2d - trn1 v31.2d, v13.2d, v9.2d - mul v19.4s, v26.4s, v14.s[2] - trn1 v12.2d, v12.2d, v6.2d - sub v6.4s, v5.4s, v29.4s - mls v19.4s, v2.4s, v7.s[0] - add v13.4s, v5.4s, v29.4s - stur q10, [x0, #-0x20] - sub v10.4s, v1.4s, v18.4s - add v28.4s, v1.4s, v18.4s - sqrdmulh v5.4s, v25.4s, v27.s[1] - stur q31, [x0, #-0x40] - add v26.4s, v10.4s, v24.4s - sub v31.4s, v10.4s, v24.4s - mul v9.4s, v25.4s, v27.s[0] - stur q12, [x0, #-0x30] - sub v24.4s, v28.4s, v19.4s - sqrdmulh v10.4s, v15.4s, v22.s[3] - trn1 v1.4s, v26.4s, v31.4s - stur q30, [x0, #-0x10] - add v30.4s, v28.4s, v19.4s - mls v9.4s, v5.4s, v7.s[0] - trn2 v25.4s, v26.4s, v31.4s - trn2 v14.4s, v30.4s, v24.4s - mul v12.4s, v15.4s, v22.s[2] - trn1 v22.4s, v30.4s, v24.4s - trn1 v27.2d, v14.2d, v25.2d - mls v12.4s, v10.4s, v7.s[0] - trn2 v30.2d, v14.2d, v25.2d - sub v31.4s, 
v6.4s, v9.4s - trn2 v10.2d, v22.2d, v1.2d - mul v28.4s, v30.4s, v3.4s - subs x4, x4, #0x1 - cbnz x4, Lntt_layer45678_start - add v9.4s, v6.4s, v9.4s - sqrdmulh v6.4s, v30.4s, v20.4s - ldur q24, [x2, #-0xa0] - add v25.4s, v13.4s, v12.4s - sub v15.4s, v13.4s, v12.4s - mul v19.4s, v10.4s, v3.4s - trn2 v5.4s, v9.4s, v31.4s - sqrdmulh v3.4s, v10.4s, v20.4s - trn2 v10.4s, v25.4s, v15.4s - mls v28.4s, v6.4s, v7.s[0] - trn2 v13.2d, v10.2d, v5.2d - ldur q30, [x2, #-0x50] - mul v12.4s, v13.4s, v21.4s - mls v19.4s, v3.4s, v7.s[0] - add v20.4s, v27.4s, v28.4s - sqrdmulh v13.4s, v13.4s, v30.4s - sub v3.4s, v27.4s, v28.4s - mul v24.4s, v20.4s, v24.4s - sqrdmulh v6.4s, v3.4s, v11.4s - ldur q27, [x2, #-0x80] - mls v12.4s, v13.4s, v7.s[0] - trn1 v25.4s, v25.4s, v15.4s - mul v27.4s, v3.4s, v27.4s - trn1 v31.4s, v9.4s, v31.4s - trn1 v3.2d, v10.2d, v5.2d - ldur q13, [x2, #-0x30] - ldur q15, [x2, #-0x40] - sqrdmulh v9.4s, v20.4s, v8.4s - trn2 v20.2d, v25.2d, v31.2d - ldur q10, [x2, #-0x10] - mls v27.4s, v6.4s, v7.s[0] - add v5.4s, v3.4s, v12.4s - sub v6.4s, v3.4s, v12.4s - sqrdmulh v3.4s, v20.4s, v30.4s - trn1 v12.2d, v22.2d, v1.2d - sqrdmulh v10.4s, v6.4s, v10.4s - mls v24.4s, v9.4s, v7.s[0] - sub v9.4s, v12.4s, v19.4s - trn1 v25.2d, v25.2d, v31.2d - sqrdmulh v31.4s, v5.4s, v13.4s - add v30.4s, v9.4s, v27.4s - add v13.4s, v12.4s, v19.4s - mul v1.4s, v20.4s, v21.4s - ldur q12, [x2, #-0x20] - add v21.4s, v13.4s, v24.4s - sub v13.4s, v13.4s, v24.4s - mls v1.4s, v3.4s, v7.s[0] - sub v3.4s, v9.4s, v27.4s - mul v9.4s, v6.4s, v12.4s - trn2 v12.4s, v21.4s, v13.4s - trn1 v6.4s, v30.4s, v3.4s - trn2 v30.4s, v30.4s, v3.4s - mls v9.4s, v10.4s, v7.s[0] - trn1 v13.4s, v21.4s, v13.4s - mul v15.4s, v5.4s, v15.4s - sub v3.4s, v25.4s, v1.4s - add v5.4s, v25.4s, v1.4s - mls v15.4s, v31.4s, v7.s[0] - trn1 v21.2d, v13.2d, v6.2d - trn2 v6.2d, v13.2d, v6.2d - add v10.4s, v3.4s, v9.4s - sub v13.4s, v3.4s, v9.4s - str q21, [x0], #0x80 - trn1 v3.2d, v12.2d, v30.2d - trn2 v31.2d, v12.2d, v30.2d - trn1 v21.4s, v10.4s, v13.4s - sub v30.4s, v5.4s, v15.4s - add v12.4s, v5.4s, v15.4s - stur q3, [x0, #-0x70] - trn2 v13.4s, v10.4s, v13.4s - trn1 v19.4s, v12.4s, v30.4s - trn2 v12.4s, v12.4s, v30.4s - stur q6, [x0, #-0x60] - stur q31, [x0, #-0x50] - trn1 v10.2d, v19.2d, v21.2d - trn2 v3.2d, v19.2d, v21.2d - trn1 v21.2d, v12.2d, v13.2d - trn2 v13.2d, v12.2d, v13.2d - stur q10, [x0, #-0x40] - stur q3, [x0, #-0x20] - stur q13, [x0, #-0x10] - stur q21, [x0, #-0x30] - ldp d8, d9, [sp] - ldp d10, d11, [sp, #0x10] - ldp d12, d13, [sp, #0x20] - ldp d14, d15, [sp, #0x30] - add sp, sp, #0x40 + add v2.4s, v13.4s, v12.4s + sqrdmulh v5.4s, v30.4s, v20.4s + sub v25.4s, v13.4s, v12.4s + add v17.4s, v6.4s, v9.4s + mul v19.4s, v10.4s, v3.4s + trn2 v4.4s, v2.4s, v25.4s + ldur q24, [x2, #-0x50] + trn2 v29.4s, v17.4s, v31.4s + sqrdmulh v15.4s, v10.4s, v20.4s + mls v28.4s, v5.4s, v7.s[0] + trn2 v3.2d, v4.2d, v29.2d + sqrdmulh v12.4s, v3.4s, v24.4s + mul v16.4s, v3.4s, v21.4s + mls v19.4s, v15.4s, v7.s[0] + ldur q10, [x2, #-0xa0] + add v13.4s, v27.4s, v28.4s + mls v16.4s, v12.4s, v7.s[0] + sqrdmulh v9.4s, v13.4s, v8.4s + sub v30.4s, v27.4s, v28.4s + ldr q18, [x1], #0x40 + mul v8.4s, v13.4s, v10.4s + ldr q10, [x0, #0xd0] + sqrdmulh v14.4s, v30.4s, v11.4s + ldr q23, [x0, #0xe0] + sqrdmulh v13.4s, v10.4s, v18.s[1] + sqrdmulh v12.4s, v23.4s, v18.s[1] + ldur q6, [x2, #-0x80] + mul v3.4s, v10.4s, v18.s[0] + mls v3.4s, v13.4s, v7.s[0] + ldr q13, [x0, #0xf0] + trn1 v27.4s, v2.4s, v25.4s + mul v2.4s, v30.4s, v6.4s + trn1 v20.4s, v17.4s, v31.4s + trn1 v25.2d, v4.2d, v29.2d + 
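Throughout the Lntt_layer45678_start loop, each mul/sqrdmulh/mls triple against v7.s[0] (the modulus q, duplicated across lanes) multiplies a coefficient vector by a fixed twiddle and reduces it with a single correction subtraction. A minimal scalar sketch in C, assuming the twiddle table loaded via x1 stores pairs (z, z_twisted) with z_twisted on the order of round(z * 2^31 / q); the helper name mulmod_fixed is illustrative, not from the source:

    #include <stdint.h>

    #define MLDSA_Q 8380417

    /* Multiply a by a fixed twiddle z modulo q.  t approximates
     * round(a*z/q), so a*z - t*q is a small representative of
     * a*z mod q; the mul/mls pair recovers its low 32 bits exactly. */
    static int32_t mulmod_fixed(int32_t a, int32_t z, int32_t z_twisted)
    {
        /* sqrdmulh a, z_twisted: round(2*a*z_twisted / 2^32) */
        int32_t t = (int32_t)((2 * (int64_t)a * z_twisted + (1LL << 31)) >> 32);
        /* mul + mls with q: wraps to the low 32 bits of a*z - t*q */
        return (int32_t)((int64_t)a * z - (int64_t)t * MLDSA_Q);
    }

Keeping z and z_twisted adjacent in one q-register is what lets the loop address both halves of the pair with v23.s[0]/v23.s[1]-style lane indices.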
sqrdmulh v10.4s, v13.4s, v18.s[1] + trn2 v5.2d, v27.2d, v20.2d + ldur q6, [x2, #-0x10] + mls v8.4s, v9.4s, v7.s[0] + sub v15.4s, v25.4s, v16.4s + sqrdmulh v31.4s, v5.4s, v24.4s + sqrdmulh v30.4s, v15.4s, v6.4s + ldur q9, [x2, #-0x30] + mul v4.4s, v5.4s, v21.4s + ldur q21, [x2, #-0x40] + ldur q6, [x2, #-0x20] + add v5.4s, v25.4s, v16.4s + mls v4.4s, v31.4s, v7.s[0] + mul v0.4s, v5.4s, v21.4s + mul v17.4s, v13.4s, v18.s[0] + mls v17.4s, v10.4s, v7.s[0] + ldr q28, [x0, #0xb0] + sqrdmulh v26.4s, v5.4s, v9.4s + mul v9.4s, v15.4s, v6.4s + trn1 v6.2d, v22.2d, v1.2d + mls v9.4s, v30.4s, v7.s[0] + add v25.4s, v28.4s, v17.4s + mls v2.4s, v14.4s, v7.s[0] + trn1 v5.2d, v27.2d, v20.2d + ldr q20, [x2, #0x10] + mul v29.4s, v25.4s, v18.s[2] + add v15.4s, v6.4s, v19.4s + ldr q30, [x0, #0xc0] + sub v19.4s, v6.4s, v19.4s + add v31.4s, v15.4s, v8.4s + mls v0.4s, v26.4s, v7.s[0] + ldur q14, [x1, #-0x30] + add v21.4s, v19.4s, v2.4s + sub v24.4s, v19.4s, v2.4s + sqrdmulh v27.4s, v25.4s, v18.s[3] + sub v26.4s, v15.4s, v8.4s + ldr q2, [x0, #0x90] + mul v16.4s, v30.4s, v18.s[0] + sub v25.4s, v28.4s, v17.4s + trn1 v11.4s, v31.4s, v26.4s + ldr q1, [x0, #0xa0] + trn1 v6.4s, v21.4s, v24.4s + sqrdmulh v13.4s, v25.4s, v14.s[1] + add v8.4s, v2.4s, v3.4s + trn2 v28.2d, v11.2d, v6.2d + sqrdmulh v19.4s, v30.4s, v18.s[1] + sub v10.4s, v5.4s, v4.4s + ldur q22, [x1, #-0x20] + str q28, [x0, #0x20] + mls v29.4s, v27.4s, v7.s[0] + add v15.4s, v10.4s, v9.4s + mul v25.4s, v25.4s, v14.s[0] + ldur q27, [x1, #-0x10] + trn2 v17.4s, v31.4s, v26.4s + trn2 v21.4s, v21.4s, v24.4s + mls v16.4s, v19.4s, v7.s[0] + sub v24.4s, v8.4s, v29.4s + sub v10.4s, v10.4s, v9.4s + mls v25.4s, v13.4s, v7.s[0] + trn1 v13.2d, v11.2d, v6.2d + ldr q28, [x0, #0x80] + sqrdmulh v30.4s, v24.4s, v22.s[1] + trn2 v19.2d, v17.2d, v21.2d + trn1 v6.2d, v17.2d, v21.2d + mul v31.4s, v23.4s, v18.s[0] + str q13, [x0], #0x80 + stur q6, [x0, #-0x70] + stur q19, [x0, #-0x50] + ldr q11, [x2, #0x50] + mls v31.4s, v12.4s, v7.s[0] + ldr q21, [x2, #0x60] + trn1 v9.4s, v15.4s, v10.4s + trn2 v6.4s, v15.4s, v10.4s + mul v24.4s, v24.4s, v22.s[0] + sub v10.4s, v2.4s, v3.4s + ldr q3, [x2], #0xc0 + mls v24.4s, v30.4s, v7.s[0] + add v26.4s, v8.4s, v29.4s + ldur q8, [x2, #-0x90] + add v17.4s, v5.4s, v4.4s + sqrdmulh v2.4s, v26.4s, v14.s[3] + sub v13.4s, v1.4s, v31.4s + add v30.4s, v1.4s, v31.4s + add v15.4s, v10.4s, v25.4s + sqrdmulh v19.4s, v13.4s, v14.s[1] + sub v25.4s, v10.4s, v25.4s + mul v29.4s, v13.4s, v14.s[0] + sub v5.4s, v28.4s, v16.4s + sqrdmulh v4.4s, v30.4s, v18.s[3] + sub v23.4s, v17.4s, v0.4s + add v31.4s, v17.4s, v0.4s + mul v18.4s, v30.4s, v18.s[2] + add v1.4s, v28.4s, v16.4s + trn2 v12.4s, v31.4s, v23.4s + mls v29.4s, v19.4s, v7.s[0] + trn1 v13.4s, v31.4s, v23.4s + trn2 v30.2d, v12.2d, v6.2d + mls v18.4s, v4.4s, v7.s[0] + trn2 v10.2d, v13.2d, v9.2d + trn1 v31.2d, v13.2d, v9.2d + mul v19.4s, v26.4s, v14.s[2] + trn1 v12.2d, v12.2d, v6.2d + sub v6.4s, v5.4s, v29.4s + mls v19.4s, v2.4s, v7.s[0] + add v13.4s, v5.4s, v29.4s + stur q10, [x0, #-0x20] + sub v10.4s, v1.4s, v18.4s + add v28.4s, v1.4s, v18.4s + sqrdmulh v5.4s, v25.4s, v27.s[1] + stur q31, [x0, #-0x40] + add v26.4s, v10.4s, v24.4s + sub v31.4s, v10.4s, v24.4s + mul v9.4s, v25.4s, v27.s[0] + stur q12, [x0, #-0x30] + sub v24.4s, v28.4s, v19.4s + sqrdmulh v10.4s, v15.4s, v22.s[3] + trn1 v1.4s, v26.4s, v31.4s + stur q30, [x0, #-0x10] + add v30.4s, v28.4s, v19.4s + mls v9.4s, v5.4s, v7.s[0] + trn2 v25.4s, v26.4s, v31.4s + trn2 v14.4s, v30.4s, v24.4s + mul v12.4s, v15.4s, v22.s[2] + trn1 v22.4s, v30.4s, v24.4s + trn1 v27.2d, 
v14.2d, v25.2d + mls v12.4s, v10.4s, v7.s[0] + trn2 v30.2d, v14.2d, v25.2d + sub v31.4s, v6.4s, v9.4s + trn2 v10.2d, v22.2d, v1.2d + mul v28.4s, v30.4s, v3.4s + subs x4, x4, #0x1 + cbnz x4, Lntt_layer45678_start + add v9.4s, v6.4s, v9.4s + sqrdmulh v6.4s, v30.4s, v20.4s + ldur q24, [x2, #-0xa0] + add v25.4s, v13.4s, v12.4s + sub v15.4s, v13.4s, v12.4s + mul v19.4s, v10.4s, v3.4s + trn2 v5.4s, v9.4s, v31.4s + sqrdmulh v3.4s, v10.4s, v20.4s + trn2 v10.4s, v25.4s, v15.4s + mls v28.4s, v6.4s, v7.s[0] + trn2 v13.2d, v10.2d, v5.2d + ldur q30, [x2, #-0x50] + mul v12.4s, v13.4s, v21.4s + mls v19.4s, v3.4s, v7.s[0] + add v20.4s, v27.4s, v28.4s + sqrdmulh v13.4s, v13.4s, v30.4s + sub v3.4s, v27.4s, v28.4s + mul v24.4s, v20.4s, v24.4s + sqrdmulh v6.4s, v3.4s, v11.4s + ldur q27, [x2, #-0x80] + mls v12.4s, v13.4s, v7.s[0] + trn1 v25.4s, v25.4s, v15.4s + mul v27.4s, v3.4s, v27.4s + trn1 v31.4s, v9.4s, v31.4s + trn1 v3.2d, v10.2d, v5.2d + ldur q13, [x2, #-0x30] + ldur q15, [x2, #-0x40] + sqrdmulh v9.4s, v20.4s, v8.4s + trn2 v20.2d, v25.2d, v31.2d + ldur q10, [x2, #-0x10] + mls v27.4s, v6.4s, v7.s[0] + add v5.4s, v3.4s, v12.4s + sub v6.4s, v3.4s, v12.4s + sqrdmulh v3.4s, v20.4s, v30.4s + trn1 v12.2d, v22.2d, v1.2d + sqrdmulh v10.4s, v6.4s, v10.4s + mls v24.4s, v9.4s, v7.s[0] + sub v9.4s, v12.4s, v19.4s + trn1 v25.2d, v25.2d, v31.2d + sqrdmulh v31.4s, v5.4s, v13.4s + add v30.4s, v9.4s, v27.4s + add v13.4s, v12.4s, v19.4s + mul v1.4s, v20.4s, v21.4s + ldur q12, [x2, #-0x20] + add v21.4s, v13.4s, v24.4s + sub v13.4s, v13.4s, v24.4s + mls v1.4s, v3.4s, v7.s[0] + sub v3.4s, v9.4s, v27.4s + mul v9.4s, v6.4s, v12.4s + trn2 v12.4s, v21.4s, v13.4s + trn1 v6.4s, v30.4s, v3.4s + trn2 v30.4s, v30.4s, v3.4s + mls v9.4s, v10.4s, v7.s[0] + trn1 v13.4s, v21.4s, v13.4s + mul v15.4s, v5.4s, v15.4s + sub v3.4s, v25.4s, v1.4s + add v5.4s, v25.4s, v1.4s + mls v15.4s, v31.4s, v7.s[0] + trn1 v21.2d, v13.2d, v6.2d + trn2 v6.2d, v13.2d, v6.2d + add v10.4s, v3.4s, v9.4s + sub v13.4s, v3.4s, v9.4s + str q21, [x0], #0x80 + trn1 v3.2d, v12.2d, v30.2d + trn2 v31.2d, v12.2d, v30.2d + trn1 v21.4s, v10.4s, v13.4s + sub v30.4s, v5.4s, v15.4s + add v12.4s, v5.4s, v15.4s + stur q3, [x0, #-0x70] + trn2 v13.4s, v10.4s, v13.4s + trn1 v19.4s, v12.4s, v30.4s + trn2 v12.4s, v12.4s, v30.4s + stur q6, [x0, #-0x60] + stur q31, [x0, #-0x50] + trn1 v10.2d, v19.2d, v21.2d + trn2 v3.2d, v19.2d, v21.2d + trn1 v21.2d, v12.2d, v13.2d + trn2 v13.2d, v12.2d, v13.2d + stur q10, [x0, #-0x40] + stur q3, [x0, #-0x20] + stur q13, [x0, #-0x10] + stur q21, [x0, #-0x30] + ldp d8, d9, [sp] + .cfi_restore d8 + .cfi_restore d9 + ldp d10, d11, [sp, #0x10] + .cfi_restore d10 + .cfi_restore d11 + ldp d12, d13, [sp, #0x20] + .cfi_restore d12 + .cfi_restore d13 + ldp d14, d15, [sp, #0x30] + .cfi_restore d14 + .cfi_restore d15 + add sp, sp, #0x40 + .cfi_adjust_cfa_offset -0x40 ret .cfi_endproc + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/hol_light/aarch64/mldsa/mldsa_poly_caddq.S b/proofs/hol_light/aarch64/mldsa/mldsa_poly_caddq.S index 907955c82..c3610001b 100644 --- a/proofs/hol_light/aarch64/mldsa/mldsa_poly_caddq.S +++ b/proofs/hol_light/aarch64/mldsa/mldsa_poly_caddq.S @@ -20,30 +20,30 @@ PQCP_MLDSA_NATIVE_MLDSA44_poly_caddq_asm: #endif .cfi_startproc - mov w9, #0xe001 // =57345 - movk w9, #0x7f, lsl #16 - dup v4.4s, w9 - mov x1, #0x10 // =16 + mov w9, #0xe001 // =57345 + movk w9, #0x7f, lsl #16 + dup v4.4s, w9 + mov x1, #0x10 // =16 Lpoly_caddq_loop: - ldr q0, [x0] - ldr q1, [x0, #0x10] - ldr q2, [x0, #0x20] - ldr q3, [x0, 
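The poly_caddq loop being re-indented here normalizes coefficients with ushr #0x1f / mla pairs: the logical shift extracts the sign bit of each lane, and the multiply-accumulate against v4 (q in all lanes) adds q to exactly the negative coefficients, mapping (-q, q) into [0, q). A branch-free scalar sketch in C (the name caddq_model is illustrative):

    #include <stdint.h>

    /* Conditional add of q = 8380417, branch-free like the vector code. */
    static int32_t caddq_model(int32_t a)
    {
        uint32_t is_neg = (uint32_t)a >> 31;     /* ushr: 1 iff a < 0 */
        return a + (int32_t)(is_neg * 8380417u); /* mla with v4 = q   */
    }

The reference-style formulation a += (a >> 31) & Q computes the same thing with an arithmetic shift and mask; the 0/1-times-q form maps directly onto mla.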
#0x30] - ushr v5.4s, v0.4s, #0x1f - mla v0.4s, v5.4s, v4.4s - ushr v5.4s, v1.4s, #0x1f - mla v1.4s, v5.4s, v4.4s - ushr v5.4s, v2.4s, #0x1f - mla v2.4s, v5.4s, v4.4s - ushr v5.4s, v3.4s, #0x1f - mla v3.4s, v5.4s, v4.4s - str q1, [x0, #0x10] - str q2, [x0, #0x20] - str q3, [x0, #0x30] - str q0, [x0], #0x40 - subs x1, x1, #0x1 - b.ne Lpoly_caddq_loop + ldr q0, [x0] + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ushr v5.4s, v0.4s, #0x1f + mla v0.4s, v5.4s, v4.4s + ushr v5.4s, v1.4s, #0x1f + mla v1.4s, v5.4s, v4.4s + ushr v5.4s, v2.4s, #0x1f + mla v2.4s, v5.4s, v4.4s + ushr v5.4s, v3.4s, #0x1f + mla v3.4s, v5.4s, v4.4s + str q1, [x0, #0x10] + str q2, [x0, #0x20] + str q3, [x0, #0x30] + str q0, [x0], #0x40 + subs x1, x1, #0x1 + b.ne Lpoly_caddq_loop ret .cfi_endproc diff --git a/proofs/hol_light/aarch64/mldsa/mldsa_poly_chknorm.S b/proofs/hol_light/aarch64/mldsa/mldsa_poly_chknorm.S index b524c59c7..ba082f0a9 100644 --- a/proofs/hol_light/aarch64/mldsa/mldsa_poly_chknorm.S +++ b/proofs/hol_light/aarch64/mldsa/mldsa_poly_chknorm.S @@ -20,32 +20,32 @@ PQCP_MLDSA_NATIVE_MLDSA44_poly_chknorm_asm: #endif .cfi_startproc - dup v20.4s, w1 - eor v21.16b, v21.16b, v21.16b - mov x2, #0x10 // =16 + dup v20.4s, w1 + eor v21.16b, v21.16b, v21.16b + mov x2, #0x10 // =16 Lpoly_chknorm_loop: - ldr q1, [x0, #0x10] - ldr q2, [x0, #0x20] - ldr q3, [x0, #0x30] - ldr q0, [x0], #0x40 - abs v1.4s, v1.4s - cmge v1.4s, v1.4s, v20.4s - orr v21.16b, v21.16b, v1.16b - abs v2.4s, v2.4s - cmge v2.4s, v2.4s, v20.4s - orr v21.16b, v21.16b, v2.16b - abs v3.4s, v3.4s - cmge v3.4s, v3.4s, v20.4s - orr v21.16b, v21.16b, v3.16b - abs v0.4s, v0.4s - cmge v0.4s, v0.4s, v20.4s - orr v21.16b, v21.16b, v0.16b - subs x2, x2, #0x1 - b.ne Lpoly_chknorm_loop - umaxv s21, v21.4s - fmov w0, s21 - and w0, w0, #0x1 + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ldr q0, [x0], #0x40 + abs v1.4s, v1.4s + cmge v1.4s, v1.4s, v20.4s + orr v21.16b, v21.16b, v1.16b + abs v2.4s, v2.4s + cmge v2.4s, v2.4s, v20.4s + orr v21.16b, v21.16b, v2.16b + abs v3.4s, v3.4s + cmge v3.4s, v3.4s, v20.4s + orr v21.16b, v21.16b, v3.16b + abs v0.4s, v0.4s + cmge v0.4s, v0.4s, v20.4s + orr v21.16b, v21.16b, v0.16b + subs x2, x2, #0x1 + b.ne Lpoly_chknorm_loop + umaxv s21, v21.4s + fmov w0, s21 + and w0, w0, #0x1 ret .cfi_endproc diff --git a/proofs/hol_light/x86_64/mldsa/mldsa_intt.S b/proofs/hol_light/x86_64/mldsa/mldsa_intt.S index 21eb1ad60..f06c070f9 100644 --- a/proofs/hol_light/x86_64/mldsa/mldsa_intt.S +++ b/proofs/hol_light/x86_64/mldsa/mldsa_intt.S @@ -36,2271 +36,2271 @@ PQCP_MLDSA_NATIVE_MLDSA44_invntt_avx2: .cfi_startproc endbr64 - vmovdqa (%rsi), %ymm0 - vmovdqa (%rdi), %ymm4 - vmovdqa 0x20(%rdi), %ymm5 - vmovdqa 0x40(%rdi), %ymm6 - vmovdqa 0x60(%rdi), %ymm7 - vmovdqa 0x80(%rdi), %ymm8 - vmovdqa 0xa0(%rdi), %ymm9 - vmovdqa 0xc0(%rdi), %ymm10 - vmovdqa 0xe0(%rdi), %ymm11 - vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm5, %ymm12 - vpaddd %ymm5, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - 
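In the poly_chknorm hunk above, each abs/cmge/orr block sets a lane to all-ones when |a[i]| >= bound (taken from w1) and accumulates into v21; the final umaxv/fmov/and sequence collapses the accumulator to a 0/1 return value. A scalar model in C (the name poly_chknorm_model is illustrative):

    #include <stdint.h>

    /* Returns 1 iff some coefficient satisfies |a[i]| >= bound. */
    static int poly_chknorm_model(const int32_t a[256], int32_t bound)
    {
        int32_t acc = 0;
        for (int i = 0; i < 256; i++) {
            int32_t t = a[i] < 0 ? -a[i] : a[i]; /* abs        */
            acc |= -(int32_t)(t >= bound);       /* cmge + orr */
        }
        return acc & 1; /* umaxv + and w0, w0, #0x1 */
    }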
vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm9, %ymm12 - vpaddd %ymm9, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm10, %ymm11, %ymm12 - vpaddd %ymm11, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm6, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm6, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm6, %ymm6 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm6, %ymm6 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] - vpsubd %ymm5, %ymm7, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd 
%ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm10, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm9, %ymm11, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm6, %ymm10, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm7, %ymm11, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vpmuldq %ymm1, %ymm12, %ymm13 
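Each vpsubd/vpaddd pair in this unrolled inverse NTT is a Gentleman-Sande butterfly, and the vpmuldq/vmovshdup/vpblendd cluster that follows applies a signed Montgomery reduction to the difference, handling even and odd 32-bit lanes separately (vpmuldq reads only the even lane of each 64-bit group, vmovshdup exposes the odd lanes, vpblendd recombines the two results). A scalar sketch assuming the standard ML-DSA constants Q = 8380417 and QINV = q^-1 mod 2^32 = 58728449, with illustrative helper names:

    #include <stdint.h>

    #define Q 8380417
    #define QINV 58728449 /* q^-1 mod 2^32 */

    /* Signed Montgomery reduction: returns a * 2^-32 mod q with
     * |result| < q whenever |a| < 2^31 * q. */
    static int32_t montgomery_reduce(int64_t a)
    {
        int32_t m = (int32_t)a * (int32_t)QINV;       /* low product * q^-1 */
        return (int32_t)((a - (int64_t)m * Q) >> 32); /* high 32 bits       */
    }

    /* One Gentleman-Sande butterfly: (l, h) -> (l+h, (h-l)*z*2^-32 mod q),
     * with z drawn from the reversed (vpermq $0x1b) zeta tables. */
    static void gs_butterfly(int32_t *l, int32_t *h, int32_t zeta)
    {
        int32_t t = *h - *l;
        *l = *l + *h;
        *h = montgomery_reduce((int64_t)t * zeta);
    }

The separate ZETAS and ZETAS_QINV tables let the vector code skip the m = p * q^-1 step: vpmuldq against the zeta*q^-1 entries produces m directly from the butterfly difference.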
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] - vpsrlq $0x20, %ymm4, %ymm4 - vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] - vpsrlq $0x20, %ymm6, %ymm6 - vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] - vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] - vpsrlq $0x20, %ymm8, %ymm8 - vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] - vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] - vpsubd %ymm3, %ymm5, %ymm12 - vpaddd %ymm5, %ymm3, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpsubd %ymm4, %ymm7, %ymm12 - vpaddd %ymm7, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm6, %ymm9, %ymm12 - vpaddd %ymm6, %ymm9, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm8, %ymm11, %ymm12 - vpaddd %ymm11, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, 
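The vmovsldup/vpblendd/vpsrlq runs above (annotated with their lane patterns) are one stage of the 8x8 coefficient transpose between butterfly levels: they interleave the even 32-bit lanes of two vectors into one output and the odd lanes into the other. An illustrative scalar model in C:

    #include <stdint.h>

    /* out0 takes the interleaved even lanes of r0/r1, out1 the odd ones,
     * matching ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2],... in the comments. */
    static void shuffle2_model(const int32_t r0[8], const int32_t r1[8],
                               int32_t out0[8], int32_t out1[8])
    {
        for (int i = 0; i < 8; i += 2) {
            out0[i]     = r0[i];
            out0[i + 1] = r1[i];
            out1[i]     = r0[i + 1];
            out1[i + 1] = r1[i + 1];
        }
    }

The vpunpcklqdq/vpunpckhqdq and vperm2i128 groups further down do the same interleaving at 64-bit and 128-bit granularity, completing the transpose.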
%ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] - vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] - vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] - vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] - vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] - vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] - vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] - vpsubd %ymm10, %ymm4, %ymm12 - vpaddd %ymm4, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm4, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm4, %ymm4 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] - vpsubd %ymm3, %ymm8, %ymm12 - vpaddd %ymm3, %ymm8, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm5, %ymm11, %ymm12 - vpaddd %ymm5, %ymm11, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] - vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] - vperm2i128 $0x20, 
%ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] - vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] - vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpbroadcastd 0x9c(%rsi), %ymm1 - vpbroadcastd 0x53c(%rsi), %ymm2 - vpsubd %ymm9, %ymm3, %ymm12 - vpaddd %ymm3, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm3, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm3, %ymm3 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm3, %ymm3 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] - vpsubd %ymm10, %ymm5, %ymm12 - vpaddd %ymm5, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpsubd %ymm6, %ymm8, %ymm12 - vpaddd %ymm6, %ymm8, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm4, %ymm11, %ymm12 - vpaddd %ymm4, %ymm11, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovdqa %ymm9, (%rdi) - vmovdqa %ymm10, 0x20(%rdi) - vmovdqa %ymm6, 0x40(%rdi) - vmovdqa %ymm4, 0x60(%rdi) - vmovdqa %ymm3, 0x80(%rdi) - vmovdqa %ymm5, 0xa0(%rdi) - vmovdqa %ymm8, 0xc0(%rdi) - vmovdqa %ymm11, 0xe0(%rdi) - vmovdqa 0x100(%rdi), %ymm4 - vmovdqa 0x120(%rdi), %ymm5 - vmovdqa 0x140(%rdi), %ymm6 - vmovdqa 0x160(%rdi), %ymm7 - vmovdqa 0x180(%rdi), %ymm8 - vmovdqa 0x1a0(%rdi), %ymm9 - vmovdqa 0x1c0(%rdi), %ymm10 - vmovdqa 0x1e0(%rdi), %ymm11 - vpermq $0x1b, 0x4e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x980(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm5, %ymm12 - vpaddd %ymm5, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, 
%ymm12 - vpmuldq %ymm15, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpermq $0x1b, 0x460(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x900(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x3e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x880(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm9, %ymm12 - vpaddd %ymm9, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpermq $0x1b, 0x360(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x800(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm10, %ymm11, %ymm12 - vpaddd %ymm11, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x2e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x780(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm6, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm6, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm6, %ymm6 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm6, %ymm6 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] - vpsubd %ymm5, %ymm7, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - 
vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpermq $0x1b, 0x260(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x700(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm8, %ymm10, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm9, %ymm11, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpermq $0x1b, 0x1e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x680(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm6, %ymm10, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm10, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm10, %ymm10 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm10, %ymm10 - vmovshdup %ymm12, %ymm12 # ymm12 = 
ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] - vpsubd %ymm7, %ymm11, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm3, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm15, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] - vpsrlq $0x20, %ymm4, %ymm4 - vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] - vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] - vpsrlq $0x20, %ymm6, %ymm6 - vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] - vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] - vpsrlq $0x20, %ymm8, %ymm8 - vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] - vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vpermq $0x1b, 0x160(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x600(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] - vpsubd %ymm3, %ymm5, %ymm12 - vpaddd %ymm5, %ymm3, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpsubd %ymm4, %ymm7, %ymm12 - vpaddd %ymm7, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm6, %ymm9, %ymm12 - vpaddd %ymm6, %ymm9, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm9, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm9, %ymm9 - vmovshdup %ymm12, %ymm12 # 
ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] - vpsubd %ymm8, %ymm11, %ymm12 - vpaddd %ymm11, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] - vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] - vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] - vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] - vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] - vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] - vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vpermq $0x1b, 0xe0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] - vpermq $0x1b, 0x580(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] - vpsubd %ymm10, %ymm4, %ymm12 - vpaddd %ymm4, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm4, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm4, %ymm4 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] - vpsubd %ymm3, %ymm8, %ymm12 - vpaddd %ymm3, %ymm8, %ymm3 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm7, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm7, %ymm7 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] - vpsubd %ymm5, %ymm11, %ymm12 - vpaddd %ymm5, %ymm11, %ymm5 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = 
ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] - vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] - vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] - vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] - vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vpbroadcastd 0x98(%rsi), %ymm1 - vpbroadcastd 0x538(%rsi), %ymm2 - vpsubd %ymm9, %ymm3, %ymm12 - vpaddd %ymm3, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm3, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm3, %ymm3 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm3, %ymm3 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] - vpsubd %ymm10, %ymm5, %ymm12 - vpaddd %ymm5, %ymm10, %ymm10 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm5, %ymm5 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] - vpsubd %ymm6, %ymm8, %ymm12 - vpaddd %ymm6, %ymm8, %ymm6 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm8, %ymm8 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] - vpsubd %ymm4, %ymm11, %ymm12 - vpaddd %ymm4, %ymm11, %ymm4 - vpmuldq %ymm1, %ymm12, %ymm13 - vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm14 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vpsubd %ymm13, %ymm12, %ymm12 - vpsubd %ymm14, %ymm11, %ymm11 - vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] - vmovdqa %ymm9, 0x100(%rdi) - vmovdqa %ymm10, 0x120(%rdi) - vmovdqa %ymm6, 0x140(%rdi) - vmovdqa %ymm4, 0x160(%rdi) - vmovdqa %ymm3, 0x180(%rdi) - vmovdqa %ymm5, 0x1a0(%rdi) - vmovdqa %ymm8, 0x1c0(%rdi) - vmovdqa %ymm11, 0x1e0(%rdi) - vmovdqa 0x200(%rdi), %ymm4 - vmovdqa 0x220(%rdi), %ymm5 - vmovdqa 0x240(%rdi), %ymm6 - vmovdqa 0x260(%rdi), %ymm7 - vmovdqa 0x280(%rdi), %ymm8 - vmovdqa 0x2a0(%rdi), %ymm9 - vmovdqa 0x2c0(%rdi), %ymm10 - vmovdqa 0x2e0(%rdi), %ymm11 - vpermq $0x1b, 0x4c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] - vpermq $0x1b, 0x960(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] - vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] - vmovshdup 
%ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm5, %ymm12
- vpaddd %ymm5, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpermq $0x1b, 0x440(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x8e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm7, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpermq $0x1b, 0x3c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x860(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm8, %ymm9, %ymm12
- vpaddd %ymm9, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpermq $0x1b, 0x340(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x7e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm10, %ymm11, %ymm12
- vpaddd %ymm11, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpermq $0x1b, 0x2c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x760(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpermq $0x1b, 0x240(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x6e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpermq $0x1b, 0x1c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x660(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
- vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
- vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
- vpsrlq $0x20, %ymm8, %ymm8
- vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
- vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vpermq $0x1b, 0x140(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
- vpermq $0x1b, 0x5e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpsubd %ymm3, %ymm5, %ymm12
- vpaddd %ymm5, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpsubd %ymm4, %ymm7, %ymm12
- vpaddd %ymm7, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpsubd %ymm6, %ymm9, %ymm12
- vpaddd %ymm6, %ymm9, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm8, %ymm11, %ymm12
- vpaddd %ymm11, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
- vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
- vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
- vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vpermq $0x1b, 0xc0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
- vpermq $0x1b, 0x560(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpsubd %ymm10, %ymm4, %ymm12
- vpaddd %ymm4, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm4, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm4, %ymm4
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7]
- vpsubd %ymm3, %ymm8, %ymm12
- vpaddd %ymm3, %ymm8, %ymm3
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm7, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpsubd %ymm5, %ymm11, %ymm12
- vpaddd %ymm5, %ymm11, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1]
- vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3]
- vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vpbroadcastd 0x94(%rsi), %ymm1
- vpbroadcastd 0x534(%rsi), %ymm2
- vpsubd %ymm9, %ymm3, %ymm12
- vpaddd %ymm3, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm3, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm3, %ymm3
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7]
- vpsubd %ymm10, %ymm5, %ymm12
- vpaddd %ymm5, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpsubd %ymm6, %ymm8, %ymm12
- vpaddd %ymm6, %ymm8, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm4, %ymm11, %ymm12
- vpaddd %ymm4, %ymm11, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm9, 0x200(%rdi)
- vmovdqa %ymm10, 0x220(%rdi)
- vmovdqa %ymm6, 0x240(%rdi)
- vmovdqa %ymm4, 0x260(%rdi)
- vmovdqa %ymm3, 0x280(%rdi)
- vmovdqa %ymm5, 0x2a0(%rdi)
- vmovdqa %ymm8, 0x2c0(%rdi)
- vmovdqa %ymm11, 0x2e0(%rdi)
- vmovdqa 0x300(%rdi), %ymm4
- vmovdqa 0x320(%rdi), %ymm5
- vmovdqa 0x340(%rdi), %ymm6
- vmovdqa 0x360(%rdi), %ymm7
- vmovdqa 0x380(%rdi), %ymm8
- vmovdqa 0x3a0(%rdi), %ymm9
- vmovdqa 0x3c0(%rdi), %ymm10
- vmovdqa 0x3e0(%rdi), %ymm11
- vpermq $0x1b, 0x4a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x940(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm5, %ymm12
- vpaddd %ymm5, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpermq $0x1b, 0x420(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x8c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm7, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpermq $0x1b, 0x3a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x840(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm8, %ymm9, %ymm12
- vpaddd %ymm9, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpermq $0x1b, 0x320(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x7c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm10, %ymm11, %ymm12
- vpaddd %ymm11, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpermq $0x1b, 0x2a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x740(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpermq $0x1b, 0x220(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x6c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpermq $0x1b, 0x1a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
- vpermq $0x1b, 0x640(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
- vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
- vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm3, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm15, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
- vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
- vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
- vpsrlq $0x20, %ymm8, %ymm8
- vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
- vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vpermq $0x1b, 0x120(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
- vpermq $0x1b, 0x5c0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpsubd %ymm3, %ymm5, %ymm12
- vpaddd %ymm5, %ymm3, %ymm3
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpsubd %ymm4, %ymm7, %ymm12
- vpaddd %ymm7, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpsubd %ymm6, %ymm9, %ymm12
- vpaddd %ymm6, %ymm9, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm8, %ymm11, %ymm12
- vpaddd %ymm11, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
- vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
- vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
- vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vpermq $0x1b, 0xa0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
- vpermq $0x1b, 0x540(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpsubd %ymm10, %ymm4, %ymm12
- vpaddd %ymm4, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm4, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm4, %ymm4
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7]
- vpsubd %ymm3, %ymm8, %ymm12
- vpaddd %ymm3, %ymm8, %ymm3
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm6, %ymm7, %ymm12
- vpaddd %ymm7, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpsubd %ymm5, %ymm11, %ymm12
- vpaddd %ymm5, %ymm11, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1]
- vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3]
- vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vpbroadcastd 0x90(%rsi), %ymm1
- vpbroadcastd 0x530(%rsi), %ymm2
- vpsubd %ymm9, %ymm3, %ymm12
- vpaddd %ymm3, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm3, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm3, %ymm3
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm3, %ymm3
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7]
- vpsubd %ymm10, %ymm5, %ymm12
- vpaddd %ymm5, %ymm10, %ymm10
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm5, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm5, %ymm5
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
- vpsubd %ymm6, %ymm8, %ymm12
- vpaddd %ymm6, %ymm8, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm4, %ymm11, %ymm12
- vpaddd %ymm4, %ymm11, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm9, 0x300(%rdi)
- vmovdqa %ymm10, 0x320(%rdi)
- vmovdqa %ymm6, 0x340(%rdi)
- vmovdqa %ymm4, 0x360(%rdi)
- vmovdqa %ymm3, 0x380(%rdi)
- vmovdqa %ymm5, 0x3a0(%rdi)
- vmovdqa %ymm8, 0x3c0(%rdi)
- vmovdqa %ymm11, 0x3e0(%rdi)
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x80(%rdi), %ymm5
- vmovdqa 0x100(%rdi), %ymm6
- vmovdqa 0x180(%rdi), %ymm7
- vmovdqa 0x200(%rdi), %ymm8
- vmovdqa 0x280(%rdi), %ymm9
- vmovdqa 0x300(%rdi), %ymm10
- vmovdqa 0x380(%rdi), %ymm11
- vpbroadcastd 0x8c(%rsi), %ymm1
- vpbroadcastd 0x52c(%rsi), %ymm2
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpbroadcastd 0x88(%rsi), %ymm1
- vpbroadcastd 0x528(%rsi), %ymm2
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpbroadcastd 0x80(%rsi), %ymm1
- vpbroadcastd 0x520(%rsi), %ymm2
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm8, 0x200(%rdi)
- vmovdqa %ymm9, 0x280(%rdi)
- vmovdqa %ymm10, 0x300(%rdi)
- vmovdqa %ymm11, 0x380(%rdi)
- vmovdqa 0x40(%rsi), %ymm1
- vmovdqa 0x60(%rsi), %ymm2
- vpmuldq %ymm1, %ymm4, %ymm12
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm4, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7]
- vpmuldq %ymm1, %ymm6, %ymm12
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm6, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vmovdqa %ymm4, (%rdi)
- vmovdqa %ymm5, 0x80(%rdi)
- vmovdqa %ymm6, 0x100(%rdi)
- vmovdqa %ymm7, 0x180(%rdi)
- vmovdqa 0x20(%rdi), %ymm4
- vmovdqa 0xa0(%rdi), %ymm5
- vmovdqa 0x120(%rdi), %ymm6
- vmovdqa 0x1a0(%rdi), %ymm7
- vmovdqa 0x220(%rdi), %ymm8
- vmovdqa 0x2a0(%rdi), %ymm9
- vmovdqa 0x320(%rdi), %ymm10
- vmovdqa 0x3a0(%rdi), %ymm11
- vpbroadcastd 0x8c(%rsi), %ymm1
- vpbroadcastd 0x52c(%rsi), %ymm2
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpbroadcastd 0x88(%rsi), %ymm1
- vpbroadcastd 0x528(%rsi), %ymm2
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpbroadcastd 0x80(%rsi), %ymm1
- vpbroadcastd 0x520(%rsi), %ymm2
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm8, 0x220(%rdi)
- vmovdqa %ymm9, 0x2a0(%rdi)
- vmovdqa %ymm10, 0x320(%rdi)
- vmovdqa %ymm11, 0x3a0(%rdi)
- vmovdqa 0x40(%rsi), %ymm1
- vmovdqa 0x60(%rsi), %ymm2
- vpmuldq %ymm1, %ymm4, %ymm12
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm4, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7]
- vpmuldq %ymm1, %ymm6, %ymm12
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm6, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm5, 0xa0(%rdi)
- vmovdqa %ymm6, 0x120(%rdi)
- vmovdqa %ymm7, 0x1a0(%rdi)
- vmovdqa 0x40(%rdi), %ymm4
- vmovdqa 0xc0(%rdi), %ymm5
- vmovdqa 0x140(%rdi), %ymm6
- vmovdqa 0x1c0(%rdi), %ymm7
- vmovdqa 0x240(%rdi), %ymm8
- vmovdqa 0x2c0(%rdi), %ymm9
- vmovdqa 0x340(%rdi), %ymm10
- vmovdqa 0x3c0(%rdi), %ymm11
- vpbroadcastd 0x8c(%rsi), %ymm1
- vpbroadcastd 0x52c(%rsi), %ymm2
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpbroadcastd 0x88(%rsi), %ymm1
- vpbroadcastd 0x528(%rsi), %ymm2
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpbroadcastd 0x80(%rsi), %ymm1
- vpbroadcastd 0x520(%rsi), %ymm2
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm8, 0x240(%rdi)
- vmovdqa %ymm9, 0x2c0(%rdi)
- vmovdqa %ymm10, 0x340(%rdi)
- vmovdqa %ymm11, 0x3c0(%rdi)
- vmovdqa 0x40(%rsi), %ymm1
- vmovdqa 0x60(%rsi), %ymm2
- vpmuldq %ymm1, %ymm4, %ymm12
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm4, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7]
- vpmuldq %ymm1, %ymm6, %ymm12
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm6, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vmovdqa %ymm4, 0x40(%rdi)
- vmovdqa %ymm5, 0xc0(%rdi)
- vmovdqa %ymm6, 0x140(%rdi)
- vmovdqa %ymm7, 0x1c0(%rdi)
- vmovdqa 0x60(%rdi), %ymm4
- vmovdqa 0xe0(%rdi), %ymm5
- vmovdqa 0x160(%rdi), %ymm6
- vmovdqa 0x1e0(%rdi), %ymm7
- vmovdqa 0x260(%rdi), %ymm8
- vmovdqa 0x2e0(%rdi), %ymm9
- vmovdqa 0x360(%rdi), %ymm10
- vmovdqa 0x3e0(%rdi), %ymm11
- vpbroadcastd 0x8c(%rsi), %ymm1
- vpbroadcastd 0x52c(%rsi), %ymm2
- vpsubd %ymm4, %ymm6, %ymm12
- vpaddd %ymm6, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm6, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm6, %ymm6
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
- vpsubd %ymm5, %ymm7, %ymm12
- vpaddd %ymm7, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm7, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm7, %ymm7
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
- vpbroadcastd 0x88(%rsi), %ymm1
- vpbroadcastd 0x528(%rsi), %ymm2
- vpsubd %ymm8, %ymm10, %ymm12
- vpaddd %ymm10, %ymm8, %ymm8
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm9, %ymm11, %ymm12
- vpaddd %ymm11, %ymm9, %ymm9
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vpbroadcastd 0x80(%rsi), %ymm1
- vpbroadcastd 0x520(%rsi), %ymm2
- vpsubd %ymm4, %ymm8, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm8, %ymm8
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
- vpsubd %ymm5, %ymm9, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm9, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm9, %ymm9
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
- vpsubd %ymm6, %ymm10, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm10, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm10, %ymm10
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
- vpsubd %ymm7, %ymm11, %ymm12
- vpaddd %ymm7, %ymm11, %ymm7
- vpmuldq %ymm1, %ymm12, %ymm13
- vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm11, %ymm14
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpsubd %ymm13, %ymm12, %ymm12
- vpsubd %ymm14, %ymm11, %ymm11
- vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
- vmovdqa %ymm8, 0x260(%rdi)
- vmovdqa %ymm9, 0x2e0(%rdi)
- vmovdqa %ymm10, 0x360(%rdi)
- vmovdqa %ymm11, 0x3e0(%rdi)
- vmovdqa 0x40(%rsi), %ymm1
- vmovdqa 0x60(%rsi), %ymm2
- vpmuldq %ymm1, %ymm4, %ymm12
- vpmuldq %ymm1, %ymm5, %ymm13
- vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm4, %ymm4
- vpmuldq %ymm2, %ymm5, %ymm5
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm4, %ymm4
- vpsubd %ymm13, %ymm5, %ymm5
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
- vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7]
- vpmuldq %ymm1, %ymm6, %ymm12
- vpmuldq %ymm1, %ymm7, %ymm13
- vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm8, %ymm14
- vpmuldq %ymm1, %ymm9, %ymm15
- vpmuldq %ymm2, %ymm6, %ymm6
- vpmuldq %ymm2, %ymm7, %ymm7
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm0, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vpmuldq %ymm0, %ymm15, %ymm15
- vpsubd %ymm12, %ymm6, %ymm6
- vpsubd %ymm13, %ymm7, %ymm7
- vpsubd %ymm14, %ymm8, %ymm8
- vpsubd %ymm15, %ymm9, %ymm9
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
- vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vmovdqa %ymm4, 0x60(%rdi)
- vmovdqa %ymm5, 0xe0(%rdi)
- vmovdqa %ymm6, 0x160(%rdi)
- vmovdqa %ymm7, 0x1e0(%rdi)
+ vmovdqa (%rsi), %ymm0
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm11
+ vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm5, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm5, %ymm5
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm5, %ymm5
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
+ vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm6, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpermq $0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm8, %ymm9, %ymm12
+ vpaddd %ymm9, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm9, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm9, %ymm9
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+ vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm10, %ymm11, %ymm12
+ vpaddd %ymm11, %ymm10, %ymm10
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm6, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm6, %ymm6
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm6, %ymm6
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
+ vpsubd %ymm5, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm8, %ymm10, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm10, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm10, %ymm10
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm10, %ymm10
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
+ vpsubd %ymm9, %ymm11, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm8, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm8, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm8, %ymm8
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm8, %ymm8
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
+ vpsubd %ymm5, %ymm9, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm9, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm9, %ymm9
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+ vpsubd %ymm6, %ymm10, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm10, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm10, %ymm10
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm10, %ymm10
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
+ vpsubd %ymm7, %ymm11, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
+ vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
+ vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
+ vpermq $0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpsubd %ymm3, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm5, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm5, %ymm5
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
+ vpsubd %ymm4, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpsubd
%ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + 
vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm10, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm3, 0x80(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm8, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 
0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpermq $0x1b, 0x4e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x980(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x460(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x900(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x880(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x360(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x800(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x780(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd 
%ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x260(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x700(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x680(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + 
vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x160(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x600(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd 
%ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xe0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x580(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = 
ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) 
+ vmovdqa %ymm10, 0x120(%rdi)
+ vmovdqa %ymm6, 0x140(%rdi)
+ vmovdqa %ymm4, 0x160(%rdi)
+ vmovdqa %ymm3, 0x180(%rdi)
+ vmovdqa %ymm5, 0x1a0(%rdi)
+ vmovdqa %ymm8, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa 0x200(%rdi), %ymm4
+ vmovdqa 0x220(%rdi), %ymm5
+ vmovdqa 0x240(%rdi), %ymm6
+ vmovdqa 0x260(%rdi), %ymm7
+ vmovdqa 0x280(%rdi), %ymm8
+ vmovdqa 0x2a0(%rdi), %ymm9
+ vmovdqa 0x2c0(%rdi), %ymm10
+ vmovdqa 0x2e0(%rdi), %ymm11
+ vpermq $0x1b, 0x4c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x960(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm5, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm5, %ymm5
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm5, %ymm5
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
+ vpermq $0x1b, 0x440(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x8e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm6, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpermq $0x1b, 0x3c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x860(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm8, %ymm9, %ymm12
+ vpaddd %ymm9, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm9, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm9, %ymm9
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+ vpermq $0x1b, 0x340(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x7e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm10, %ymm11, %ymm12
+ vpaddd %ymm11, %ymm10, %ymm10
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vpermq $0x1b, 0x2c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x760(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm6, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm6, %ymm6
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm6, %ymm6
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7]
+ vpsubd %ymm5, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpermq $0x1b, 0x240(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x6e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm8, %ymm10, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm10, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm10, %ymm10
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm10, %ymm10
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
+ vpsubd %ymm9, %ymm11, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vpermq $0x1b, 0x1c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0]
+ vpermq $0x1b, 0x660(%rsi), %ymm15 # ymm15 = mem[3,2,1,0]
+ vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7]
+ vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7]
+ vpsubd %ymm4, %ymm8, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm8, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm8, %ymm8
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm8, %ymm8
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
+ vpsubd %ymm5, %ymm9, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm9, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm9, %ymm9
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+ vpsubd %ymm6, %ymm10, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm10, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm10, %ymm10
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm10, %ymm10
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
+ vpsubd %ymm7, %ymm11, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm3, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm15, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
+ vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
+ vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpermq $0x1b, 0x140(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
+ vpermq $0x1b, 0x5e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpsubd %ymm3, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm5, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm5, %ymm5
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
+ vpsubd %ymm4, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpsubd %ymm6, %ymm9, %ymm12
+ vpaddd %ymm6, %ymm9, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm9, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm9, %ymm9
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+ vpsubd %ymm8, %ymm11, %ymm12
+ vpaddd %ymm11, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+ vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+ vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
+ vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
+ vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vpermq $0x1b, 0xc0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0]
+ vpermq $0x1b, 0x560(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpsubd %ymm10, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm10, %ymm10
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm4, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm4, %ymm4
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7]
+ vpsubd %ymm3, %ymm8, %ymm12
+ vpaddd %ymm3, %ymm8, %ymm3
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm8, %ymm8
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
+ vpsubd %ymm6, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm7, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm7, %ymm7
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7]
+ vpsubd %ymm5, %ymm11, %ymm12
+ vpaddd %ymm5, %ymm11, %ymm5
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm11, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm11, %ymm11
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+ vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1]
+ vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3]
+ vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1]
+ vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3]
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vpbroadcastd 0x94(%rsi), %ymm1
+ vpbroadcastd 0x534(%rsi), %ymm2
+ vpsubd %ymm9, %ymm3, %ymm12
+ vpaddd %ymm3, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm3, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm3, %ymm3
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm3, %ymm3
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7]
+ vpsubd %ymm10, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm10, %ymm10
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm5, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm5, %ymm5
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7]
+ vpsubd %ymm6, %ymm8, %ymm12
+ vpaddd %ymm6, %ymm8, %ymm6
+ vpmuldq %ymm1, %ymm12, %ymm13
+ vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm14
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vpsubd %ymm13, %ymm12, %ymm12
+ vpsubd %ymm14, %ymm8, %ymm8
+ vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7]
+ vpsubd %ymm4, %ymm11, %ymm12
+ 
vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm6, 0x340(%rdi) + vmovdqa %ymm4, 0x360(%rdi) + vmovdqa %ymm3, 0x380(%rdi) + vmovdqa %ymm5, 0x3a0(%rdi) + vmovdqa %ymm8, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + 
vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, 
%ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, 
%ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 
+ vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, 
%ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + 
vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vmovdqa %ymm4, 0x60(%rdi)
+ vmovdqa %ymm5, 0xe0(%rdi)
+ vmovdqa %ymm6, 0x160(%rdi)
+ vmovdqa %ymm7, 0x1e0(%rdi)
retq
.cfi_endproc
diff --git a/proofs/hol_light/x86_64/mldsa/mldsa_ntt.S b/proofs/hol_light/x86_64/mldsa/mldsa_ntt.S
index 2ad059a18..ae2294998 100644
--- a/proofs/hol_light/x86_64/mldsa/mldsa_ntt.S
+++ b/proofs/hol_light/x86_64/mldsa/mldsa_ntt.S
@@ -36,2343 +36,2343 @@ PQCP_MLDSA_NATIVE_MLDSA44_ntt_avx2:
.cfi_startproc
endbr64
- vmovdqa (%rsi), %ymm0
- vpbroadcastd 0x84(%rsi), %ymm1
- vpbroadcastd 0x524(%rsi), %ymm2
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x80(%rdi), %ymm5
- vmovdqa 0x100(%rdi), %ymm6
- vmovdqa 0x180(%rdi), %ymm7
- vmovdqa 0x200(%rdi), %ymm8
- vmovdqa 0x280(%rdi), %ymm9
- vmovdqa 0x300(%rdi), %ymm10
- vmovdqa 0x380(%rdi), %ymm11
- vpmuldq %ymm1, %ymm8, %ymm13
- vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm8, %ymm8
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
- vpsubd %ymm8, %ymm4, %ymm12
- vpaddd %ymm4, %ymm8, %ymm4
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm8
- vpsubd %ymm13, %ymm4, %ymm4
- vpmuldq %ymm1, %ymm9, %ymm13
- vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
- vpsubd %ymm9, %ymm5, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm9
- vpsubd %ymm13, %ymm5, %ymm5
- vpmuldq %ymm1, %ymm10, %ymm13
- vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm10, %ymm10
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
- vpsubd %ymm10, %ymm6, %ymm12
- vpaddd %ymm6, %ymm10, %ymm6
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm10
- vpsubd %ymm13, %ymm6, %ymm6
- vpmuldq %ymm1, %ymm11, %ymm13
- vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm11, %ymm11
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
- vpblendd $0xaa,
%ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm7, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm7, %ymm7 - vpbroadcastd 0x88(%rsi), %ymm1 - vpbroadcastd 0x528(%rsi), %ymm2 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm4, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] - vpsubd %ymm7, %ymm5, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm7 - vpsubd %ymm13, %ymm5, %ymm5 - vpbroadcastd 0x8c(%rsi), %ymm1 - vpbroadcastd 0x52c(%rsi), %ymm2 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm8, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm9, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm9, %ymm9 - vmovdqa %ymm4, (%rdi) - vmovdqa %ymm5, 0x80(%rdi) - vmovdqa %ymm6, 0x100(%rdi) - 
vmovdqa %ymm7, 0x180(%rdi) - vmovdqa %ymm8, 0x200(%rdi) - vmovdqa %ymm9, 0x280(%rdi) - vmovdqa %ymm10, 0x300(%rdi) - vmovdqa %ymm11, 0x380(%rdi) - vpbroadcastd 0x84(%rsi), %ymm1 - vpbroadcastd 0x524(%rsi), %ymm2 - vmovdqa 0x20(%rdi), %ymm4 - vmovdqa 0xa0(%rdi), %ymm5 - vmovdqa 0x120(%rdi), %ymm6 - vmovdqa 0x1a0(%rdi), %ymm7 - vmovdqa 0x220(%rdi), %ymm8 - vmovdqa 0x2a0(%rdi), %ymm9 - vmovdqa 0x320(%rdi), %ymm10 - vmovdqa 0x3a0(%rdi), %ymm11 - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, %ymm4, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm9, %ymm13 - vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] - vpsubd %ymm9, %ymm5, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm9 - vpsubd %ymm13, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm6, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm7, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm7, %ymm7 - vpbroadcastd 0x88(%rsi), %ymm1 - vpbroadcastd 0x528(%rsi), %ymm2 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, 
%ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm4, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] - vpsubd %ymm7, %ymm5, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm7 - vpsubd %ymm13, %ymm5, %ymm5 - vpbroadcastd 0x8c(%rsi), %ymm1 - vpbroadcastd 0x52c(%rsi), %ymm2 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm8, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm9, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm9, %ymm9 - vmovdqa %ymm4, 0x20(%rdi) - vmovdqa %ymm5, 0xa0(%rdi) - vmovdqa %ymm6, 0x120(%rdi) - vmovdqa %ymm7, 0x1a0(%rdi) - vmovdqa %ymm8, 0x220(%rdi) - vmovdqa %ymm9, 0x2a0(%rdi) - vmovdqa %ymm10, 0x320(%rdi) - vmovdqa %ymm11, 0x3a0(%rdi) - vpbroadcastd 0x84(%rsi), %ymm1 - vpbroadcastd 0x524(%rsi), %ymm2 - vmovdqa 0x40(%rdi), %ymm4 - vmovdqa 0xc0(%rdi), %ymm5 - vmovdqa 0x140(%rdi), %ymm6 - vmovdqa 0x1c0(%rdi), %ymm7 - vmovdqa 0x240(%rdi), %ymm8 - vmovdqa 0x2c0(%rdi), %ymm9 - vmovdqa 0x340(%rdi), %ymm10 - vmovdqa 0x3c0(%rdi), %ymm11 - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, 
%ymm8 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, %ymm4, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm9, %ymm13 - vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] - vpsubd %ymm9, %ymm5, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm9 - vpsubd %ymm13, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm6, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm7, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm7, %ymm7 - vpbroadcastd 0x88(%rsi), %ymm1 - vpbroadcastd 0x528(%rsi), %ymm2 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm4, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd 
%ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] - vpsubd %ymm7, %ymm5, %ymm12 - vpaddd %ymm7, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm7 - vpsubd %ymm13, %ymm5, %ymm5 - vpbroadcastd 0x8c(%rsi), %ymm1 - vpbroadcastd 0x52c(%rsi), %ymm2 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm8, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm9, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm9, %ymm9 - vmovdqa %ymm4, 0x40(%rdi) - vmovdqa %ymm5, 0xc0(%rdi) - vmovdqa %ymm6, 0x140(%rdi) - vmovdqa %ymm7, 0x1c0(%rdi) - vmovdqa %ymm8, 0x240(%rdi) - vmovdqa %ymm9, 0x2c0(%rdi) - vmovdqa %ymm10, 0x340(%rdi) - vmovdqa %ymm11, 0x3c0(%rdi) - vpbroadcastd 0x84(%rsi), %ymm1 - vpbroadcastd 0x524(%rsi), %ymm2 - vmovdqa 0x60(%rdi), %ymm4 - vmovdqa 0xe0(%rdi), %ymm5 - vmovdqa 0x160(%rdi), %ymm6 - vmovdqa 0x1e0(%rdi), %ymm7 - vmovdqa 0x260(%rdi), %ymm8 - vmovdqa 0x2e0(%rdi), %ymm9 - vmovdqa 0x360(%rdi), %ymm10 - vmovdqa 0x3e0(%rdi), %ymm11 - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, %ymm4, %ymm12 - vpaddd %ymm4, %ymm8, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq 
%ymm1, %ymm9, %ymm13
- vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
- vpmuldq %ymm1, %ymm12, %ymm14
- vpmuldq %ymm2, %ymm9, %ymm9
- vpmuldq %ymm2, %ymm12, %ymm12
- vpmuldq %ymm0, %ymm13, %ymm13
- vpmuldq %ymm0, %ymm14, %ymm14
- vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
- vpsubd %ymm9, %ymm5, %ymm12
- vpaddd %ymm5, %ymm9, %ymm5
- vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
- vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
- vpaddd %ymm13, %ymm12, %ymm9
- vpsubd %ymm13, %ymm5, %ymm5
[remaining deleted lines of the fully unrolled NTT, one instruction per line as above: the same vpmuldq/vmovshdup/vpblendd Montgomery-multiply-and-butterfly group repeated over %ymm3-%ymm11 for every level, with zeta pairs loaded from 0x88-0x4c0(%rsi) and 0x528-0x960(%rsi), vperm2i128/vpunpcklqdq/vpunpckhqdq/vmovsldup lane shuffles between levels, and vmovdqa loads/stores of the coefficient blocks at 0x0-0x3e0(%rdi)]
- vmovdqa 0x3e0(%rsi), %ymm1
- vmovdqa 0x880(%rsi), %ymm2
-
vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm6, %ymm7, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm7, %ymm7 - vmovdqa 0x460(%rsi), %ymm1 - vmovdqa 0x900(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm4, %ymm13 - vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] - vpsubd %ymm4, %ymm5, %ymm12 - vpaddd %ymm4, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm4 - vpsubd %ymm13, %ymm5, %ymm5 - vmovdqa 0x4e0(%rsi), %ymm1 - vmovdqa 0x980(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm3, %ymm12 - vpaddd %ymm3, %ymm11, %ymm3 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm3, %ymm3 - vmovdqa %ymm9, 0x200(%rdi) - vmovdqa %ymm8, 0x220(%rdi) - vmovdqa %ymm7, 0x240(%rdi) - vmovdqa %ymm6, 0x260(%rdi) - vmovdqa %ymm5, 0x280(%rdi) - vmovdqa %ymm4, 0x2a0(%rdi) - vmovdqa %ymm3, 0x2c0(%rdi) - vmovdqa %ymm11, 0x2e0(%rdi) - vmovdqa 0x300(%rdi), %ymm4 - vmovdqa 0x320(%rdi), %ymm5 - vmovdqa 0x340(%rdi), %ymm6 - vmovdqa 0x360(%rdi), %ymm7 - vmovdqa 0x380(%rdi), %ymm8 - vmovdqa 0x3a0(%rdi), %ymm9 - vmovdqa 0x3c0(%rdi), %ymm10 - vmovdqa 0x3e0(%rdi), %ymm11 - vpbroadcastd 0x9c(%rsi), %ymm1 - vpbroadcastd 0x53c(%rsi), %ymm2 - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, %ymm4, %ymm12 - 
vpaddd %ymm4, %ymm8, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm9, %ymm13 - vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm9, %ymm9 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] - vpsubd %ymm9, %ymm5, %ymm12 - vpaddd %ymm5, %ymm9, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm9 - vpsubd %ymm13, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm6, %ymm12 - vpaddd %ymm6, %ymm10, %ymm6 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm6, %ymm6 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm7, %ymm12 - vpaddd %ymm7, %ymm11, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm7, %ymm7 - vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] - vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] - vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] - vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] - vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] - vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] - vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] - vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] - vmovdqa 0x100(%rsi), %ymm1 - vmovdqa 0x5a0(%rsi), %ymm2 - vpmuldq %ymm1, %ymm5, %ymm13 - vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] - vpsubd %ymm5, %ymm3, %ymm12 - vpaddd 
%ymm5, %ymm3, %ymm3 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm5 - vpsubd %ymm13, %ymm3, %ymm3 - vpmuldq %ymm1, %ymm10, %ymm13 - vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm10, %ymm10 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] - vpsubd %ymm10, %ymm8, %ymm12 - vpaddd %ymm10, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm10 - vpsubd %ymm13, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm4, %ymm12 - vpaddd %ymm6, %ymm4, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm4, %ymm4 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm9, %ymm12 - vpaddd %ymm11, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm9, %ymm9 - vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] - vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] - vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] - vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] - vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] - vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] - vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] - vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] - vmovdqa 0x180(%rsi), %ymm1 - vmovdqa 0x620(%rsi), %ymm2 - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = 
ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, %ymm7, %ymm12 - vpaddd %ymm7, %ymm8, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm7, %ymm7 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm5, %ymm12 - vpaddd %ymm6, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm4, %ymm13 - vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] - vpsubd %ymm4, %ymm3, %ymm12 - vpaddd %ymm4, %ymm3, %ymm3 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm4 - vpsubd %ymm13, %ymm3, %ymm3 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm2, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm10, %ymm12 - vpaddd %ymm11, %ymm10, %ymm10 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm10, %ymm10 - vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] - vpsrlq $0x20, %ymm7, %ymm7 - vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] - vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] - vpsrlq $0x20, %ymm5, %ymm5 - vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] - vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] - vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] - vpsrlq $0x20, %ymm3, %ymm3 - vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] - vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] - vpblendd 
$0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] - vpsrlq $0x20, %ymm10, %ymm10 - vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] - vmovdqa 0x200(%rsi), %ymm1 - vmovdqa 0x6a0(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm5, %ymm13 - vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm5, %ymm5 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] - vpsubd %ymm5, %ymm9, %ymm12 - vpaddd %ymm5, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm5 - vpsubd %ymm13, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm4, %ymm13 - vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] - vpsubd %ymm4, %ymm8, %ymm12 - vpaddd %ymm4, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm4 - vpsubd %ymm13, %ymm8, %ymm8 - vpmuldq %ymm1, %ymm3, %ymm13 - vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm3, %ymm3 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] - vpsubd %ymm3, %ymm7, %ymm12 - vpaddd %ymm3, %ymm7, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm3 - vpsubd %ymm13, %ymm7, %ymm7 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm6, %ymm12 - vpaddd %ymm6, %ymm11, %ymm6 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm6, %ymm6 - vmovdqa 0x280(%rsi), %ymm1 - vmovdqa 0x720(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm7, %ymm13 - vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] 
- vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm7, %ymm7 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] - vpsubd %ymm7, %ymm9, %ymm12 - vpaddd %ymm7, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm7 - vpsubd %ymm13, %ymm9, %ymm9 - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm8, %ymm12 - vpaddd %ymm6, %ymm8, %ymm8 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm8, %ymm8 - vmovdqa 0x300(%rsi), %ymm1 - vmovdqa 0x7a0(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm3, %ymm13 - vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm3, %ymm3 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] - vpsubd %ymm3, %ymm5, %ymm12 - vpaddd %ymm3, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm3 - vpsubd %ymm13, %ymm5, %ymm5 - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm4, %ymm12 - vpaddd %ymm4, %ymm11, %ymm4 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm4, %ymm4 - vmovdqa 0x380(%rsi), %ymm1 - vmovdqa 0x820(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm8, %ymm13 - vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm8, %ymm8 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] - vpsubd %ymm8, 
%ymm9, %ymm12 - vpaddd %ymm8, %ymm9, %ymm9 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm8 - vpsubd %ymm13, %ymm9, %ymm9 - vmovdqa 0x400(%rsi), %ymm1 - vmovdqa 0x8a0(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm6, %ymm13 - vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm6, %ymm6 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] - vpsubd %ymm6, %ymm7, %ymm12 - vpaddd %ymm6, %ymm7, %ymm7 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm6 - vpsubd %ymm13, %ymm7, %ymm7 - vmovdqa 0x480(%rsi), %ymm1 - vmovdqa 0x920(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm4, %ymm13 - vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm4, %ymm4 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] - vpsubd %ymm4, %ymm5, %ymm12 - vpaddd %ymm4, %ymm5, %ymm5 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm4 - vpsubd %ymm13, %ymm5, %ymm5 - vmovdqa 0x500(%rsi), %ymm1 - vmovdqa 0x9a0(%rsi), %ymm2 - vpsrlq $0x20, %ymm1, %ymm10 - vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] - vpmuldq %ymm1, %ymm11, %ymm13 - vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] - vpmuldq %ymm10, %ymm12, %ymm14 - vpmuldq %ymm2, %ymm11, %ymm11 - vpmuldq %ymm15, %ymm12, %ymm12 - vpmuldq %ymm0, %ymm13, %ymm13 - vpmuldq %ymm0, %ymm14, %ymm14 - vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] - vpsubd %ymm11, %ymm3, %ymm12 - vpaddd %ymm3, %ymm11, %ymm3 - vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] - vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] - vpaddd %ymm13, %ymm12, %ymm11 - vpsubd %ymm13, %ymm3, %ymm3 - vmovdqa %ymm9, 0x300(%rdi) - vmovdqa %ymm8, 0x320(%rdi) - vmovdqa %ymm7, 0x340(%rdi) - vmovdqa %ymm6, 0x360(%rdi) - vmovdqa %ymm5, 0x380(%rdi) - vmovdqa %ymm4, 0x3a0(%rdi) - vmovdqa %ymm3, 0x3c0(%rdi) - vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rsi), %ymm0 + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = 
ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd 
%ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd 
%ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq 
%ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd 
$0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = 
ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, 
%ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm7, %ymm7
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vmovdqa 0xa0(%rsi), %ymm1
+ vmovdqa 0x540(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
+ vpsubd %ymm5, %ymm3, %ymm12
+ vpaddd %ymm5, %ymm3, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm5
+ vpsubd %ymm13, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm8, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm4, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm9, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovdqa 0x120(%rsi), %ymm1
+ vmovdqa 0x5c0(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm8, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm5, %ymm12
+ vpaddd %ymm6, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm3, %ymm12
+ vpaddd %ymm4, %ymm3, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm10, %ymm12
+ vpaddd %ymm11, %ymm10, %ymm10
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm10, %ymm10
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vmovdqa 0x1a0(%rsi), %ymm1
+ vmovdqa 0x640(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
+ vpsubd %ymm5, %ymm9, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm5
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm8, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm3, %ymm13
+ vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm3, %ymm3
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
+ vpsubd %ymm3, %ymm7, %ymm12
+ vpaddd %ymm3, %ymm7, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm3
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm11, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm6, %ymm6
+ vmovdqa 0x220(%rsi), %ymm1
+ vmovdqa 0x6c0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm7, %ymm13
+ vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
+ vpsubd %ymm7, %ymm9, %ymm12
+ vpaddd %ymm7, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm7
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm8, %ymm12
+ vpaddd %ymm6, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm8, %ymm8
+ vmovdqa 0x2a0(%rsi), %ymm1
+ vmovdqa 0x740(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm3, %ymm13
+ vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm3, %ymm3
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
+ vpsubd %ymm3, %ymm5, %ymm12
+ vpaddd %ymm3, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm3
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm11, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm4, %ymm4
+ vmovdqa 0x320(%rsi), %ymm1
+ vmovdqa 0x7c0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm9, %ymm12
+ vpaddd %ymm8, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm9, %ymm9
+ vmovdqa 0x3a0(%rsi), %ymm1
+ vmovdqa 0x840(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm7, %ymm12
+ vpaddd %ymm6, %ymm7, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm7, %ymm7
+ vmovdqa 0x420(%rsi), %ymm1
+ vmovdqa 0x8c0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm5, %ymm12
+ vpaddd %ymm4, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm5, %ymm5
+ vmovdqa 0x4a0(%rsi), %ymm1
+ vmovdqa 0x940(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm3, %ymm12
+ vpaddd %ymm3, %ymm11, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm3, %ymm3
+ vmovdqa %ymm9, (%rdi)
+ vmovdqa %ymm8, 0x20(%rdi)
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm6, 0x60(%rdi)
+ vmovdqa %ymm5, 0x80(%rdi)
+ vmovdqa %ymm4, 0xa0(%rdi)
+ vmovdqa %ymm3, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x160(%rdi), %ymm7
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpbroadcastd 0x94(%rsi), %ymm1
+ vpbroadcastd 0x534(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm9, %ymm13
+ vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
+ vpsubd %ymm9, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm9
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm7, %ymm7
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vmovdqa 0xc0(%rsi), %ymm1
+ vmovdqa 0x560(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
+ vpsubd %ymm5, %ymm3, %ymm12
+ vpaddd %ymm5, %ymm3, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm5
+ vpsubd %ymm13, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm8, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm4, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm9, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovdqa 0x140(%rsi), %ymm1
+ vmovdqa 0x5e0(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm8, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm5, %ymm12
+ vpaddd %ymm6, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm3, %ymm12
+ vpaddd %ymm4, %ymm3, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm10, %ymm12
+ vpaddd %ymm11, %ymm10, %ymm10
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm10, %ymm10
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vmovdqa 0x1c0(%rsi), %ymm1
+ vmovdqa 0x660(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
+ vpsubd %ymm5, %ymm9, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm5
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm8, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm3, %ymm13
+ vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm3, %ymm3
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
+ vpsubd %ymm3, %ymm7, %ymm12
+ vpaddd %ymm3, %ymm7, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm3
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm11, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm6, %ymm6
+ vmovdqa 0x240(%rsi), %ymm1
+ vmovdqa 0x6e0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm7, %ymm13
+ vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm7, %ymm7
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7]
+ vpsubd %ymm7, %ymm9, %ymm12
+ vpaddd %ymm7, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm7
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm8, %ymm12
+ vpaddd %ymm6, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm8, %ymm8
+ vmovdqa 0x2c0(%rsi), %ymm1
+ vmovdqa 0x760(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm3, %ymm13
+ vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm3, %ymm3
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7]
+ vpsubd %ymm3, %ymm5, %ymm12
+ vpaddd %ymm3, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm3
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm11, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm4, %ymm4
+ vmovdqa 0x340(%rsi), %ymm1
+ vmovdqa 0x7e0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm9, %ymm12
+ vpaddd %ymm8, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm9, %ymm9
+ vmovdqa 0x3c0(%rsi), %ymm1
+ vmovdqa 0x860(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm7, %ymm12
+ vpaddd %ymm6, %ymm7, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm7, %ymm7
+ vmovdqa 0x440(%rsi), %ymm1
+ vmovdqa 0x8e0(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm4, %ymm13
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm4, %ymm4
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7]
+ vpsubd %ymm4, %ymm5, %ymm12
+ vpaddd %ymm4, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm4
+ vpsubd %ymm13, %ymm5, %ymm5
+ vmovdqa 0x4c0(%rsi), %ymm1
+ vmovdqa 0x960(%rsi), %ymm2
+ vpsrlq $0x20, %ymm1, %ymm10
+ vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm10, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm15, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm3, %ymm12
+ vpaddd %ymm3, %ymm11, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm3, %ymm3
+ vmovdqa %ymm9, 0x100(%rdi)
+ vmovdqa %ymm8, 0x120(%rdi)
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm6, 0x160(%rdi)
+ vmovdqa %ymm5, 0x180(%rdi)
+ vmovdqa %ymm4, 0x1a0(%rdi)
+ vmovdqa %ymm3, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa 0x200(%rdi), %ymm4
+ vmovdqa 0x220(%rdi), %ymm5
+ vmovdqa 0x240(%rdi), %ymm6
+ vmovdqa 0x260(%rdi), %ymm7
+ vmovdqa 0x280(%rdi), %ymm8
+ vmovdqa 0x2a0(%rdi), %ymm9
+ vmovdqa 0x2c0(%rdi), %ymm10
+ vmovdqa 0x2e0(%rdi), %ymm11
+ vpbroadcastd 0x98(%rsi), %ymm1
+ vpbroadcastd 0x538(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm4, %ymm12
+ vpaddd %ymm4, %ymm8, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm9, %ymm13
+ vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm9, %ymm9
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7]
+ vpsubd %ymm9, %ymm5, %ymm12
+ vpaddd %ymm5, %ymm9, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm9
+ vpsubd %ymm13, %ymm5, %ymm5
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm6, %ymm12
+ vpaddd %ymm6, %ymm10, %ymm6
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm6, %ymm6
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm11, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm7, %ymm7
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vmovdqa 0xe0(%rsi), %ymm1
+ vmovdqa 0x580(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm5, %ymm13
+ vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm5, %ymm5
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7]
+ vpsubd %ymm5, %ymm3, %ymm12
+ vpaddd %ymm5, %ymm3, %ymm3
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm5
+ vpsubd %ymm13, %ymm3, %ymm3
+ vpmuldq %ymm1, %ymm10, %ymm13
+ vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm10, %ymm10
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+ vpsubd %ymm10, %ymm8, %ymm12
+ vpaddd %ymm10, %ymm8, %ymm8
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm10
+ vpsubd %ymm13, %ymm8, %ymm8
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm4, %ymm12
+ vpaddd %ymm6, %ymm4, %ymm4
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm6
+ vpsubd %ymm13, %ymm4, %ymm4
+ vpmuldq %ymm1, %ymm11, %ymm13
+ vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm11, %ymm11
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7]
+ vpsubd %ymm11, %ymm9, %ymm12
+ vpaddd %ymm11, %ymm9, %ymm9
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm11
+ vpsubd %ymm13, %ymm9, %ymm9
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovdqa 0x160(%rsi), %ymm1
+ vmovdqa 0x600(%rsi), %ymm2
+ vpmuldq %ymm1, %ymm8, %ymm13
+ vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm8, %ymm8
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7]
+ vpsubd %ymm8, %ymm7, %ymm12
+ vpaddd %ymm7, %ymm8, %ymm7
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7]
+ vpaddd %ymm13, %ymm12, %ymm8
+ vpsubd %ymm13, %ymm7, %ymm7
+ vpmuldq %ymm1, %ymm6, %ymm13
+ vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7]
+ vpmuldq %ymm1, %ymm12, %ymm14
+ vpmuldq %ymm2, %ymm6, %ymm6
+ vpmuldq %ymm2, %ymm12, %ymm12
+ vpmuldq %ymm0, %ymm13, %ymm13
+ vpmuldq %ymm0, %ymm14, %ymm14
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
+ vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7]
+ vpsubd %ymm6, %ymm5, %ymm12
+ vpaddd %ymm6, %ymm5, %ymm5
+ vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7]
+
vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x200(%rsi), %ymm1 + vmovdqa 0x6a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # 
ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x280(%rsi), %ymm1 + vmovdqa 0x720(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # 
ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x300(%rsi), %ymm1 + vmovdqa 0x7a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x380(%rsi), %ymm1 + vmovdqa 0x820(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x400(%rsi), %ymm1 + vmovdqa 0x8a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, 
%ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x480(%rsi), %ymm1 + vmovdqa 0x920(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x500(%rsi), %ymm1 + vmovdqa 0x9a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) retq .cfi_endproc diff --git a/scripts/autogen b/scripts/autogen index 77e7cb3b8..9ede4f277 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2295,6 +2295,13 @@ def gen_hol_light_asm_file(job): def gen_hol_light_asm(): aarch64_flags = "-march=armv8.4-a+sha3" joblist_aarch64 = [ + ( + "ntt.S", + "mldsa_ntt.S", + "dev/aarch64_opt/src", + f"-Imldsa/src/native/aarch64/src {aarch64_flags}", + "aarch64", + ), ( "poly_caddq_asm.S", "mldsa_poly_caddq.S", diff --git a/scripts/format b/scripts/format index 6dbb0a3fe..096d0e147 100755 --- a/scripts/format +++ b/scripts/format @@ -1,5 +1,6 @@ #!/usr/bin/env bash # Copyright (c) The mldsa-native project authors +# Copyright (c) The mlkem-native project authors # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT set -o errexit @@ -65,6 +66,18 @@ git ls-files -- ":/*.c" ":/*.h" | xargs -P "$nproc" -I {} sh -c ' clang-format -i {} fi' +info "Expanding tabs" +expand-tabs() +{ + git ls-files -- ":/" ":/!:Makefile" ":/!:**/Makefile" ":/!:**/Makefile.*" ":/!:Makefile.*" ":/!:*.mk" ":/!:*.patch" ":/!:*.S" ":/!:*.inc" ":/!:nix/valgrind/*.txt" | xargs -P "$nproc" -I {} sh -c ' + if [ ! 
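The hunk above is fully unrolled, but it repeats a single building block: a radix-2 butterfly in which one operand is multiplied by a precomputed twiddle factor via signed Montgomery multiplication, eight 32-bit lanes at a time (vpmuldq covers the even lanes, the vmovshdup/vpblendd pairs handle the odd lanes; ymm0 holds eight copies of q, and the tables at (%rsi) hold each zeta both plainly and premultiplied by q^-1 mod 2^32, matching the ZETAS/ZETAS_QINV tables used by the macros earlier in this patch). A scalar model for orientation only; the names and structure below are illustrative and not part of this patch:

    # Scalar model of the vectorized butterfly above (illustrative only).
    Q = 8380417       # ML-DSA modulus
    QINV = 58728449   # pow(Q, -1, 2**32)

    def montmul(a, b):
        """Signed Montgomery multiplication: a * b * 2^-32 mod Q."""
        prod = a * b
        t = (prod * QINV) % 2**32    # low 32 bits of prod * q^-1
        if t >= 2**31:               # reinterpret as signed, as vpmuldq does
            t -= 2**32
        return (prod - t * Q) >> 32  # exact: prod - t*Q is 0 mod 2^32

    def butterfly(a, b, zeta):
        """(a, b) -> (a + zeta*b*R^-1, a - zeta*b*R^-1), with R = 2^32."""
        t = montmul(zeta, b)
        return a + t, a - t

Note that the listing folds the Montgomery correction term (the high half of t*q) into a second vpaddd/vpsubd pair after the first sum/difference, which is algebraically identical and explains why each butterfly shows two add/sub pairs.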
diff --git a/scripts/autogen b/scripts/autogen
index 77e7cb3b8..9ede4f277 100755
--- a/scripts/autogen
+++ b/scripts/autogen
@@ -2295,6 +2295,13 @@ def gen_hol_light_asm_file(job):
 def gen_hol_light_asm():
     aarch64_flags = "-march=armv8.4-a+sha3"
     joblist_aarch64 = [
+        (
+            "ntt.S",
+            "mldsa_ntt.S",
+            "dev/aarch64_opt/src",
+            f"-Imldsa/src/native/aarch64/src {aarch64_flags}",
+            "aarch64",
+        ),
         (
             "poly_caddq_asm.S",
             "mldsa_poly_caddq.S",
diff --git a/scripts/format b/scripts/format
index 6dbb0a3fe..096d0e147 100755
--- a/scripts/format
+++ b/scripts/format
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 # Copyright (c) The mldsa-native project authors
+# Copyright (c) The mlkem-native project authors
 # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 
 set -o errexit
@@ -65,6 +66,18 @@ git ls-files -- ":/*.c" ":/*.h" | xargs -P "$nproc" -I {} sh -c '
     clang-format -i {}
   fi'
 
+info "Expanding tabs"
+expand-tabs()
+{
+  git ls-files -- ":/" ":/!:Makefile" ":/!:**/Makefile" ":/!:**/Makefile.*" ":/!:Makefile.*" ":/!:*.mk" ":/!:*.patch" ":/!:*.S" ":/!:*.inc" ":/!:nix/valgrind/*.txt" | xargs -P "$nproc" -I {} sh -c '
+  if [ ! -L {} ] && grep -Pq '"'"'\t'"'"' "{}"; then
+    tmp=$(mktemp)
+    expand -t 4 "{}" > "$tmp" && mv "$tmp" "{}"
+    echo "{}"
+  fi'
+}
+expand-tabs
+
 info "Checking for eol"
 check-eol()
 {
diff --git a/scripts/simpasm b/scripts/simpasm
index 3a37b94c3..fda061fcc 100755
--- a/scripts/simpasm
+++ b/scripts/simpasm
@@ -66,7 +66,7 @@ def patchup_disasm(asm, cfify=False):
                 raise Exception(
                     f'The following does not seem to be an assembly line of the expected format `ADDRESS: BYTECODE INSTRUCTION`:\n"{l}"'
                 )
-            yield " " * indentation + d.group("inst")
+            yield " " * indentation + d.group("inst").expandtabs(1)
 
     return list(gen(asm))
 
@@ -231,7 +231,7 @@ def simplify(logger, args, asm_input, asm_output=None):
         )
         raise Exception("simpasm failed")
     sym_info = nm_output[0].split(" ")
-    sym_addr = int(sym_info[0])
+    sym_addr = int(sym_info[0], 16)
    if sym_addr != 0:
         logger.error(
             f"Global sym {sym} not at address 0 (instead at address {hex(sym_addr)}) -- please reorder the assembly to start with the global function symbol"
@@ -445,7 +445,6 @@ def _main():
         default="aarch64",
         help="Target architecture for CFI directives",
     )
-    parser.add_argument(
     parser.add_argument(
         "--syntax",
         type=str,
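On the scripts/simpasm change above: nm prints symbol addresses in hexadecimal, so the old base-10 int() call either misreads a numeric address (0x40 parses as 40) or raises ValueError once the address contains a hex letter; only the all-zero address of a correctly placed symbol happened to parse the same either way. A minimal illustration (the nm output line is invented for the example):

    # Models the scripts/simpasm fix above; the nm line is invented.
    nm_line = "0000000000000040 T mldsa_ntt"
    sym_info = nm_line.split(" ")

    assert int(sym_info[0]) == 40        # old parse: hex digits read as decimal
    assert int(sym_info[0], 16) == 0x40  # fixed parse: 64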