Skip to content

Commit 1f8cc24

Browse files
Update suffix loop in _decodeByteStreamSplitWidth4AVX2 and _decodeByteStreamSplitWidth8AVX2
1 parent 779a11b commit 1f8cc24

1 file changed

Lines changed: 51 additions & 41 deletions

File tree

parquet/internal/encoding/byte_stream_split_decode_avx2_amd64.s

Lines changed: 51 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,20 @@ TEXT ·_decodeByteStreamSplitWidth4AVX2(SB), NOSPLIT, $0-32
3535
JMP suffix_check_avx2
3636

3737
suffix_loop_avx2:
38-
// Gather bytes: gathered_byte_data[b] = data[b * stride + i]
39-
MOVBQZX (R9)(R14*1), BX // byte from stream 0
40-
MOVBQZX (R10)(R14*1), R15 // byte from stream 1
41-
42-
// Calculate output offset: i * 4
38+
MOVBQZX (R9)(R14*1), BX // s0
39+
MOVBQZX (R10)(R14*1), R15 // s1
40+
SHLQ $8, R15
41+
ORQ R15, BX
42+
MOVBQZX (R11)(R14*1), R15 // s2
43+
SHLQ $16, R15
44+
ORQ R15, BX
45+
MOVBQZX (R12)(R14*1), R15 // s3
46+
SHLQ $24, R15
47+
ORQ R15, BX
4348
MOVQ R14, AX
44-
SHLQ $2, AX // AX = i * 4
45-
46-
// Store gathered bytes
47-
MOVB BX, (DI)(AX*1)
48-
MOVB R15, 1(DI)(AX*1)
49-
50-
MOVBQZX (R11)(R14*1), BX // byte from stream 2
51-
MOVBQZX (R12)(R14*1), R15 // byte from stream 3
5249

53-
MOVB BX, 2(DI)(AX*1)
54-
MOVB R15, 3(DI)(AX*1)
50+
SHLQ $2, AX // AX = i*4
51+
MOVL BX, (DI)(AX*1) // single 32-bit store of s0..s3 (s0 in the low byte)
5552

5653
INCQ R14
5754

@@ -168,34 +165,47 @@ TEXT ·_decodeByteStreamSplitWidth8AVX2(SB), NOSPLIT, $0-32
168165
JMP suffix_check_w8_avx2
169166

170167
suffix_loop_w8_avx2:
171-
// Calculate output offset: i * 8
168+
// Load first byte (stream 0) and start accumulator
169+
MOVBQZX (R9)(SI*1), DX // DX = s0[i] (lowest byte)
170+
171+
// stream 1 << 8
172+
MOVBQZX (R10)(SI*1), AX
173+
SHLQ $8, AX
174+
ORQ AX, DX
175+
176+
// stream 2 << 16
177+
MOVBQZX (R11)(SI*1), AX
178+
SHLQ $16, AX
179+
ORQ AX, DX
180+
181+
// stream 3 << 24
182+
MOVBQZX (R12)(SI*1), AX
183+
SHLQ $24, AX
184+
ORQ AX, DX
185+
186+
// stream 4 << 32
187+
MOVBQZX (R13)(SI*1), AX
188+
SHLQ $32, AX
189+
ORQ AX, DX
190+
191+
// stream 5 << 40
192+
MOVBQZX (R14)(SI*1), AX
193+
SHLQ $40, AX
194+
ORQ AX, DX
195+
196+
// stream 6 << 48
197+
MOVBQZX (R15)(SI*1), AX
198+
SHLQ $48, AX
199+
ORQ AX, DX
200+
201+
// stream 7 << 56
202+
MOVBQZX (BX)(SI*1), AX
203+
SHLQ $56, AX
204+
ORQ AX, DX
205+
172206
MOVQ SI, AX
173207
SHLQ $3, AX // AX = i * 8
174-
175-
// Load and store bytes from all 8 streams
176-
MOVBQZX (R9)(SI*1), DX
177-
MOVB DX, (DI)(AX*1)
178-
179-
MOVBQZX (R10)(SI*1), DX
180-
MOVB DX, 1(DI)(AX*1)
181-
182-
MOVBQZX (R11)(SI*1), DX
183-
MOVB DX, 2(DI)(AX*1)
184-
185-
MOVBQZX (R12)(SI*1), DX
186-
MOVB DX, 3(DI)(AX*1)
187-
188-
MOVBQZX (R13)(SI*1), DX
189-
MOVB DX, 4(DI)(AX*1)
190-
191-
MOVBQZX (R14)(SI*1), DX
192-
MOVB DX, 5(DI)(AX*1)
193-
194-
MOVBQZX (R15)(SI*1), DX
195-
MOVB DX, 6(DI)(AX*1)
196-
197-
MOVBQZX (BX)(SI*1), DX
198-
MOVB DX, 7(DI)(AX*1)
208+
MOVQ DX, (DI)(AX*1)
199209

200210
INCQ SI
201211

0 commit comments

Comments
 (0)