@@ -35,23 +35,20 @@ TEXT ·_decodeByteStreamSplitWidth4AVX2(SB), NOSPLIT, $0-32
3535 JMP suffix_check_avx2
3636
3737suffix_loop_avx2:
38- // Gather bytes: gathered_byte_data [ b ] = data [ b * stride + i ]
39- MOVBQZX ( R9 )( R14 * 1 ) , BX // byte from stream 0
40- MOVBQZX ( R10 )( R14 * 1 ) , R15 // byte from stream 1
41-
42- // Calculate output offset: i * 4
38+ MOVBQZX ( R9 )( R14 * 1 ) , BX // s0
39+ MOVBQZX ( R10 )( R14 * 1 ) , R15 // s1
40+ SHLQ $ 8 , R15
41+ ORQ R15 , BX
42+ MOVBQZX ( R11 )( R14 * 1 ) , R15 // s2
43+ SHLQ $ 16 , R15
44+ ORQ R15 , BX
45+ MOVBQZX ( R12 )( R14 * 1 ) , R15 // s3
46+ SHLQ $ 24 , R15
47+ ORQ R15 , BX
4348 MOVQ R14 , AX
44- SHLQ $ 2 , AX // AX = i * 4
45-
46- // Store gathered bytes
47- MOVB BX , ( DI )( AX * 1 )
48- MOVB R15 , 1 ( DI )( AX * 1 )
49-
50- MOVBQZX ( R11 )( R14 * 1 ) , BX // byte from stream 2
51- MOVBQZX ( R12 )( R14 * 1 ) , R15 // byte from stream 3
5249
53- MOVB BX , 2 ( DI )( AX * 1 )
54- MOVB R15 , 3 ( DI )( AX * 1 )
50+ SHLQ $ 2 , AX // AX = i * 4
51+ MOVL BX , (DI )( AX * 1 ) // ← single 32 - bit store
5552
5653 INCQ R14
5754
@@ -168,34 +165,47 @@ TEXT ·_decodeByteStreamSplitWidth8AVX2(SB), NOSPLIT, $0-32
168165 JMP suffix_check_w8_avx2
169166
170167suffix_loop_w8_avx2:
171- // Calculate output offset: i * 8
168+ // Load first byte (stream 0 ) and start accumulator
169+ MOVBQZX ( R9 )( SI * 1 ) , DX // DX = s0 [ i ] (lowest byte)
170+
171+ // stream 1 << 8
172+ MOVBQZX ( R10 )( SI * 1 ) , AX
173+ SHLQ $ 8 , AX
174+ ORQ AX , DX
175+
176+ // stream 2 << 16
177+ MOVBQZX ( R11 )( SI * 1 ) , AX
178+ SHLQ $ 16 , AX
179+ ORQ AX , DX
180+
181+ // stream 3 << 24
182+ MOVBQZX ( R12 )( SI * 1 ) , AX
183+ SHLQ $ 24 , AX
184+ ORQ AX , DX
185+
186+ // stream 4 << 32
187+ MOVBQZX ( R13 )( SI * 1 ) , AX
188+ SHLQ $ 32 , AX
189+ ORQ AX , DX
190+
191+ // stream 5 << 40
192+ MOVBQZX ( R14 )( SI * 1 ) , AX
193+ SHLQ $ 40 , AX
194+ ORQ AX , DX
195+
196+ // stream 6 << 48
197+ MOVBQZX ( R15 )( SI * 1 ) , AX
198+ SHLQ $ 48 , AX
199+ ORQ AX , DX
200+
201+ // stream 7 << 56
202+ MOVBQZX ( BX )( SI * 1 ) , AX
203+ SHLQ $ 56 , AX
204+ ORQ AX , DX
205+
172206 MOVQ SI , AX
173207 SHLQ $ 3 , AX // AX = i * 8
174-
175- // Load and store bytes from all 8 streams
176- MOVBQZX ( R9 )( SI * 1 ) , DX
177- MOVB DX , ( DI )( AX * 1 )
178-
179- MOVBQZX ( R10 )( SI * 1 ) , DX
180- MOVB DX , 1 ( DI )( AX * 1 )
181-
182- MOVBQZX ( R11 )( SI * 1 ) , DX
183- MOVB DX , 2 ( DI )( AX * 1 )
184-
185- MOVBQZX ( R12 )( SI * 1 ) , DX
186- MOVB DX , 3 ( DI )( AX * 1 )
187-
188- MOVBQZX ( R13 )( SI * 1 ) , DX
189- MOVB DX , 4 ( DI )( AX * 1 )
190-
191- MOVBQZX ( R14 )( SI * 1 ) , DX
192- MOVB DX , 5 ( DI )( AX * 1 )
193-
194- MOVBQZX ( R15 )( SI * 1 ) , DX
195- MOVB DX , 6 ( DI )( AX * 1 )
196-
197- MOVBQZX ( BX )( SI * 1 ) , DX
198- MOVB DX , 7 ( DI )( AX * 1 )
208+ MOVQ DX , ( DI )( AX * 1 )
199209
200210 INCQ SI
201211
0 commit comments