|
4 | 4 |
|
// Unpacks one FL_CHUNK-element chunk of BW-bit-packed uint16 values into `out`
// and then overlays exception "patches" on top of the unpacked data.
//
// Launch contract (inferred from the `_32t` kernel names and the use of
// __syncwarp() as the only barrier): exactly one 32-thread warp per block —
// TODO confirm against the launch sites.
//
// in         - packed input words for this chunk
// out        - destination for FL_CHUNK unpacked values (coalesced writes)
// reference  - frame-of-reference base; presumably applied inside
//              _bit_unpack_16_lane (not visible here)
// thread_idx - lane id within the warp, [0, 32)
// patches    - per-block patch lists; PatchesCursor yields a patch with
//              index == FL_CHUNK as the end-of-stream sentinel
template <int BW>
__device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
    // Staging buffer: patches must land before the final copy to global memory.
    __shared__ uint16_t shared_out[FL_CHUNK];

    // Step 1: Unpack into shared memory. Each of the 32 threads handles
    // FL_LANES<uint16_t> / 32 consecutive lanes.
#pragma unroll
    for (int i = 0; i < FL_LANES<uint16_t> / 32; i++) {
        _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * (FL_LANES<uint16_t> / 32) + i);
    }
    __syncwarp();  // all lanes done writing shared_out before patching

    // Step 2: Apply patches to shared memory in parallel; the cursor strides
    // the block's patch list across the 32 lanes.
    PatchesCursor<uint16_t> cursor(patches, blockIdx.x, thread_idx, 32);
    auto patch = cursor.next();
    while (patch.index != FL_CHUNK) {  // FL_CHUNK index == "no more patches"
        shared_out[patch.index] = patch.value;
        patch = cursor.next();
    }
    __syncwarp();  // all patches applied before reading shared_out back

    // Step 3: Copy to global memory; adjacent lanes write adjacent indices,
    // so each iteration is one coalesced 32-element store.
#pragma unroll
    for (int i = 0; i < FL_CHUNK / 32; i++) {
        auto idx = i * 32 + thread_idx;
        out[idx] = shared_out[idx];
    }
}
32 | 32 |
|
// Entry points bit_unpack_16_{BW}bw_32t for every bit width BW in [0, 16].
//
// All 17 kernels are identical except for the BW constant, so they are
// generated from one macro instead of 17 copy-pasted bodies. The extern "C"
// symbol names and signatures are unchanged, so callers that launch these
// kernels by name keep working.
//
// Per-block strides:
//   input  : FL_LANES<uint16_t> * BW packed uint16 words per block
//   output : FL_CHUNK unpacked values per block
// Expected launch: 32 threads per block (one warp), one block per chunk —
// see the `_32t` suffix; confirm against the host-side launch code.
#define FL_DEFINE_BIT_UNPACK_16(BW)                                                               \
    extern "C" __global__ void bit_unpack_16_##BW##bw_32t(                                        \
        const uint16_t *__restrict full_in, uint16_t *__restrict full_out,                        \
        uint16_t reference, GPUPatches patches) {                                                 \
        int thread_idx = threadIdx.x;                                                             \
        auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * BW));                             \
        auto out = full_out + (blockIdx.x * FL_CHUNK);                                            \
        _bit_unpack_16_device<BW>(in, out, reference, thread_idx, patches);                       \
    }

FL_DEFINE_BIT_UNPACK_16(0)
FL_DEFINE_BIT_UNPACK_16(1)
FL_DEFINE_BIT_UNPACK_16(2)
FL_DEFINE_BIT_UNPACK_16(3)
FL_DEFINE_BIT_UNPACK_16(4)
FL_DEFINE_BIT_UNPACK_16(5)
FL_DEFINE_BIT_UNPACK_16(6)
FL_DEFINE_BIT_UNPACK_16(7)
FL_DEFINE_BIT_UNPACK_16(8)
FL_DEFINE_BIT_UNPACK_16(9)
FL_DEFINE_BIT_UNPACK_16(10)
FL_DEFINE_BIT_UNPACK_16(11)
FL_DEFINE_BIT_UNPACK_16(12)
FL_DEFINE_BIT_UNPACK_16(13)
FL_DEFINE_BIT_UNPACK_16(14)
FL_DEFINE_BIT_UNPACK_16(15)
FL_DEFINE_BIT_UNPACK_16(16)

#undef FL_DEFINE_BIT_UNPACK_16
151 | 151 |
|