vortex-data
diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu
Lines changed: 37 additions & 37 deletions
diff --git a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
Lines changed: 16 additions & 16 deletions
@@ -8,8 +8,8 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
 
     // Step 1: Unpack into shared memory
     #pragma unroll
-    for (int i = 0; i < 2; i++) {
-        _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * 2 + i);
+    for (int i = 0; i < FL_LANES<uint16_t> / 32; i++) {
+        _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * (FL_LANES<uint16_t> / 32) + i);
     }
     __syncwarp();
 
@@ -24,128 +24,128 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
 
     // Step 3: Copy to global memory
     #pragma unroll
-    for (int i = 0; i < 32; i++) {
+    for (int i = 0; i < FL_CHUNK / 32; i++) {
         auto idx = i * 32 + thread_idx;
         out[idx] = shared_out[idx];
     }
 }
 
 extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 0));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 1));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 2));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 3));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 4));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 5));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 6));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 7));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 8));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 9));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 10));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 11));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 12));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 13));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 14));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 15));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 16));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<16>(in, out, reference, thread_idx, patches);
 }
 
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_16_lane<0>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -59,7 +59,7 @@ __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -101,7 +101,7 @@ __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -145,7 +145,7 @@ __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -191,7 +191,7 @@ __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -239,7 +239,7 @@ __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -289,7 +289,7 @@ __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -341,7 +341,7 @@ __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -395,7 +395,7 @@ __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -451,7 +451,7 @@ __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -509,7 +509,7 @@ __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -569,7 +569,7 @@ __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -631,7 +631,7 @@ __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -695,7 +695,7 @@ __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -761,7 +761,7 @@ __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<16>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     #pragma unroll
     for (int row = 0; row < 16; row++) {
         out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;