@@ -8,8 +8,8 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
88
99 // Step 1: Unpack into shared memory
1010 #pragma unroll
11- for (int i = 0 ; i < 2 ; i++) {
12- _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * 2 + i);
11+ for (int i = 0 ; i < FL_LANES< uint16_t > / 32 ; i++) {
12+ _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * (FL_LANES< uint16_t > / 32 ) + i);
1313 }
1414 __syncwarp ();
1515
@@ -24,128 +24,128 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
2424
2525 // Step 3: Copy to global memory
2626 #pragma unroll
27- for (int i = 0 ; i < 32 ; i++) {
27+ for (int i = 0 ; i < FL_CHUNK / 32 ; i++) {
2828 auto idx = i * 32 + thread_idx;
2929 out[idx] = shared_out[idx];
3030 }
3131}
3232
3333extern " C" __global__ void bit_unpack_16_0bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
3434 int thread_idx = threadIdx .x ;
35- auto in = full_in + (blockIdx .x * (128 * 0 / sizeof ( uint16_t ) ));
36- auto out = full_out + (blockIdx .x * 1024 );
35+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 0 ));
36+ auto out = full_out + (blockIdx .x * FL_CHUNK );
3737 _bit_unpack_16_device<0 >(in, out, reference, thread_idx, patches);
3838}
3939
4040extern " C" __global__ void bit_unpack_16_1bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
4141 int thread_idx = threadIdx .x ;
42- auto in = full_in + (blockIdx .x * (128 * 1 / sizeof ( uint16_t ) ));
43- auto out = full_out + (blockIdx .x * 1024 );
42+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 1 ));
43+ auto out = full_out + (blockIdx .x * FL_CHUNK );
4444 _bit_unpack_16_device<1 >(in, out, reference, thread_idx, patches);
4545}
4646
4747extern " C" __global__ void bit_unpack_16_2bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
4848 int thread_idx = threadIdx .x ;
49- auto in = full_in + (blockIdx .x * (128 * 2 / sizeof ( uint16_t ) ));
50- auto out = full_out + (blockIdx .x * 1024 );
49+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 2 ));
50+ auto out = full_out + (blockIdx .x * FL_CHUNK );
5151 _bit_unpack_16_device<2 >(in, out, reference, thread_idx, patches);
5252}
5353
5454extern " C" __global__ void bit_unpack_16_3bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
5555 int thread_idx = threadIdx .x ;
56- auto in = full_in + (blockIdx .x * (128 * 3 / sizeof ( uint16_t ) ));
57- auto out = full_out + (blockIdx .x * 1024 );
56+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 3 ));
57+ auto out = full_out + (blockIdx .x * FL_CHUNK );
5858 _bit_unpack_16_device<3 >(in, out, reference, thread_idx, patches);
5959}
6060
6161extern " C" __global__ void bit_unpack_16_4bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
6262 int thread_idx = threadIdx .x ;
63- auto in = full_in + (blockIdx .x * (128 * 4 / sizeof ( uint16_t ) ));
64- auto out = full_out + (blockIdx .x * 1024 );
63+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 4 ));
64+ auto out = full_out + (blockIdx .x * FL_CHUNK );
6565 _bit_unpack_16_device<4 >(in, out, reference, thread_idx, patches);
6666}
6767
6868extern " C" __global__ void bit_unpack_16_5bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
6969 int thread_idx = threadIdx .x ;
70- auto in = full_in + (blockIdx .x * (128 * 5 / sizeof ( uint16_t ) ));
71- auto out = full_out + (blockIdx .x * 1024 );
70+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 5 ));
71+ auto out = full_out + (blockIdx .x * FL_CHUNK );
7272 _bit_unpack_16_device<5 >(in, out, reference, thread_idx, patches);
7373}
7474
7575extern " C" __global__ void bit_unpack_16_6bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
7676 int thread_idx = threadIdx .x ;
77- auto in = full_in + (blockIdx .x * (128 * 6 / sizeof ( uint16_t ) ));
78- auto out = full_out + (blockIdx .x * 1024 );
77+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 6 ));
78+ auto out = full_out + (blockIdx .x * FL_CHUNK );
7979 _bit_unpack_16_device<6 >(in, out, reference, thread_idx, patches);
8080}
8181
8282extern " C" __global__ void bit_unpack_16_7bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
8383 int thread_idx = threadIdx .x ;
84- auto in = full_in + (blockIdx .x * (128 * 7 / sizeof ( uint16_t ) ));
85- auto out = full_out + (blockIdx .x * 1024 );
84+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 7 ));
85+ auto out = full_out + (blockIdx .x * FL_CHUNK );
8686 _bit_unpack_16_device<7 >(in, out, reference, thread_idx, patches);
8787}
8888
8989extern " C" __global__ void bit_unpack_16_8bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
9090 int thread_idx = threadIdx .x ;
91- auto in = full_in + (blockIdx .x * (128 * 8 / sizeof ( uint16_t ) ));
92- auto out = full_out + (blockIdx .x * 1024 );
91+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 8 ));
92+ auto out = full_out + (blockIdx .x * FL_CHUNK );
9393 _bit_unpack_16_device<8 >(in, out, reference, thread_idx, patches);
9494}
9595
9696extern " C" __global__ void bit_unpack_16_9bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
9797 int thread_idx = threadIdx .x ;
98- auto in = full_in + (blockIdx .x * (128 * 9 / sizeof ( uint16_t ) ));
99- auto out = full_out + (blockIdx .x * 1024 );
98+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 9 ));
99+ auto out = full_out + (blockIdx .x * FL_CHUNK );
100100 _bit_unpack_16_device<9 >(in, out, reference, thread_idx, patches);
101101}
102102
103103extern " C" __global__ void bit_unpack_16_10bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
104104 int thread_idx = threadIdx .x ;
105- auto in = full_in + (blockIdx .x * (128 * 10 / sizeof ( uint16_t ) ));
106- auto out = full_out + (blockIdx .x * 1024 );
105+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 10 ));
106+ auto out = full_out + (blockIdx .x * FL_CHUNK );
107107 _bit_unpack_16_device<10 >(in, out, reference, thread_idx, patches);
108108}
109109
110110extern " C" __global__ void bit_unpack_16_11bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
111111 int thread_idx = threadIdx .x ;
112- auto in = full_in + (blockIdx .x * (128 * 11 / sizeof ( uint16_t ) ));
113- auto out = full_out + (blockIdx .x * 1024 );
112+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 11 ));
113+ auto out = full_out + (blockIdx .x * FL_CHUNK );
114114 _bit_unpack_16_device<11 >(in, out, reference, thread_idx, patches);
115115}
116116
117117extern " C" __global__ void bit_unpack_16_12bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
118118 int thread_idx = threadIdx .x ;
119- auto in = full_in + (blockIdx .x * (128 * 12 / sizeof ( uint16_t ) ));
120- auto out = full_out + (blockIdx .x * 1024 );
119+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 12 ));
120+ auto out = full_out + (blockIdx .x * FL_CHUNK );
121121 _bit_unpack_16_device<12 >(in, out, reference, thread_idx, patches);
122122}
123123
124124extern " C" __global__ void bit_unpack_16_13bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
125125 int thread_idx = threadIdx .x ;
126- auto in = full_in + (blockIdx .x * (128 * 13 / sizeof ( uint16_t ) ));
127- auto out = full_out + (blockIdx .x * 1024 );
126+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 13 ));
127+ auto out = full_out + (blockIdx .x * FL_CHUNK );
128128 _bit_unpack_16_device<13 >(in, out, reference, thread_idx, patches);
129129}
130130
131131extern " C" __global__ void bit_unpack_16_14bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
132132 int thread_idx = threadIdx .x ;
133- auto in = full_in + (blockIdx .x * (128 * 14 / sizeof ( uint16_t ) ));
134- auto out = full_out + (blockIdx .x * 1024 );
133+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 14 ));
134+ auto out = full_out + (blockIdx .x * FL_CHUNK );
135135 _bit_unpack_16_device<14 >(in, out, reference, thread_idx, patches);
136136}
137137
138138extern " C" __global__ void bit_unpack_16_15bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
139139 int thread_idx = threadIdx .x ;
140- auto in = full_in + (blockIdx .x * (128 * 15 / sizeof ( uint16_t ) ));
141- auto out = full_out + (blockIdx .x * 1024 );
140+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 15 ));
141+ auto out = full_out + (blockIdx .x * FL_CHUNK );
142142 _bit_unpack_16_device<15 >(in, out, reference, thread_idx, patches);
143143}
144144
145145extern " C" __global__ void bit_unpack_16_16bw_32t (const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
146146 int thread_idx = threadIdx .x ;
147- auto in = full_in + (blockIdx .x * (128 * 16 / sizeof ( uint16_t ) ));
148- auto out = full_out + (blockIdx .x * 1024 );
147+ auto in = full_in + (blockIdx .x * (FL_LANES< uint16_t > * 16 ));
148+ auto out = full_out + (blockIdx .x * FL_CHUNK );
149149 _bit_unpack_16_device<16 >(in, out, reference, thread_idx, patches);
150150}
151151
0 commit comments