Skip to content

Commit 8451d4a

Browse files
Enhance tensor management and CUDA utilities with benchmarks (#51)
2 parents 0517a06 + cd675d2 commit 8451d4a

7 files changed

Lines changed: 321 additions & 47 deletions

File tree

cuda/includes/memory.cuh

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#pragma once
2+
#include "common.h"
3+
#include "tensor.cuh"
4+
#include <cstring>
5+
static inline void *qx_host_alloc(size_t n)
6+
{
7+
void *p = malloc(n);
8+
if (!p && n)
9+
{
10+
perror("[QX] malloc");
11+
exit(1);
12+
}
13+
return p;
14+
}
15+
static inline void qx_host_free(void *p)
16+
{
17+
free(p);
18+
}
19+
20+
static inline void *qx_pinned_alloc(size_t n)
21+
{
22+
void *p = nullptr;
23+
CUDA_CHECK(cudaMallocHost(&p, n));
24+
return p;
25+
}
26+
static inline void qx_pinned_free(void *p)
27+
{
28+
if (p)
29+
CUDA_CHECK(cudaFreeHost(p));
30+
}
31+
32+
static inline void *qx_device_alloc(size_t n, int dev = 0)
33+
{
34+
CUDA_CHECK(cudaSetDevice(dev));
35+
void *p = nullptr;
36+
CUDA_CHECK(cudaMalloc(&p, ROUND_UP(n, QX_MEM_ALIGN)));
37+
return p;
38+
}
39+
static inline void qx_device_free(void *p)
40+
{
41+
if (p)
42+
CUDA_CHECK(cudaFree(p));
43+
}
44+
static inline void qx_device_zero(void *p, size_t n, cudaStream_t s = 0)
45+
{
46+
if (p && n)
47+
CUDA_CHECK(cudaMemsetAsync(p, 0, n, s));
48+
}
49+
// Tensor allocators
50+
static inline Tensor *tensor_alloc_device(const TensorShape &sh, DType dt,
51+
int dev = 0, cudaStream_t s = 0,
52+
const char *name = "")
53+
{
54+
Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
55+
t->shape = sh;
56+
t->dtype = dt;
57+
t->mem_loc = MEM_DEVICE;
58+
t->owns_data = true;
59+
t->device_id = dev;
60+
strncpy(t->name, name, 63);
61+
t->data = qx_device_alloc((size_t)sh.numel() * dtype_size(dt), dev);
62+
qx_device_zero(t->data, (size_t)sh.numel() * dtype_size(dt), s);
63+
return t;
64+
}
65+
66+
static inline Tensor *tensor_alloc_host(const TensorShape &sh, DType dt,
67+
bool pinned = false, const char *name = "")
68+
{
69+
Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
70+
t->shape = sh;
71+
t->dtype = dt;
72+
t->mem_loc = pinned ? MEM_HOST_PINNED : MEM_HOST;
73+
t->owns_data = true;
74+
t->device_id = -1;
75+
strncpy(t->name, name, 63);
76+
size_t nb = (size_t)sh.numel() * dtype_size(dt);
77+
t->data = pinned ? qx_pinned_alloc(nb) : calloc(1, nb);
78+
return t;
79+
}
80+
81+
static inline void tensor_free(Tensor *t)
82+
{
83+
if (!t)
84+
return;
85+
if (t->owns_data && t->data)
86+
{
87+
if (t->mem_loc == MEM_DEVICE)
88+
qx_device_free(t->data);
89+
else if (t->mem_loc == MEM_HOST_PINNED)
90+
qx_pinned_free(t->data);
91+
else
92+
free(t->data);
93+
}
94+
free(t);
95+
}
96+
static inline void tensor_h2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
97+
{
98+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyHostToDevice, s));
99+
}
100+
static inline void tensor_d2h(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
101+
{
102+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToHost, s));
103+
}
104+
static inline void tensor_d2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
105+
{
106+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToDevice, s));
107+
}

cuda/includes/reduce.cuh

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
#include "common.h"
3+
4+
#ifdef __CUDACC__
5+
6+
static constexpr unsigned FULL_MASK = 0xffffffff;
7+
// Warp reductions
8+
__device__ QX_INLINE float warpReduceSum(float v)
9+
{
10+
v += __shfl_xor_sync(FULL_MASK, v, 16);
11+
v += __shfl_xor_sync(FULL_MASK, v, 8);
12+
v += __shfl_xor_sync(FULL_MASK, v, 4);
13+
v += __shfl_xor_sync(FULL_MASK, v, 2);
14+
v += __shfl_xor_sync(FULL_MASK, v, 1);
15+
return v;
16+
}
17+
__device__ QX_INLINE float warpReduceMax(float v)
18+
{
19+
v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 16));
20+
v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 8));
21+
v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 4));
22+
v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 2));
23+
v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 1));
24+
return v;
25+
}
26+
__device__ QX_INLINE float warpReduceMin(float v)
27+
{
28+
v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 16));
29+
v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 8));
30+
v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 4));
31+
v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 2));
32+
v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 1));
33+
return v;
34+
}
35+
__device__ QX_INLINE float warpBroadcast(float v)
36+
{
37+
return __shfl_sync(FULL_MASK, v, 0);
38+
}
39+
__device__ QX_INLINE float blockReduceSum(float v, float *smem)
40+
{
41+
int lane = threadIdx.x % QX_WARP_SIZE;
42+
int wid = threadIdx.x / QX_WARP_SIZE;
43+
v = warpReduceSum(v);
44+
if (lane == 0)
45+
smem[wid] = v;
46+
__syncthreads();
47+
v = (threadIdx.x < blockDim.x / QX_WARP_SIZE) ? smem[lane] : 0.f;
48+
if (wid == 0)
49+
v = warpReduceSum(v);
50+
return v;
51+
}
52+
__device__ QX_INLINE float blockReduceMax(float v, float *smem)
53+
{
54+
int lane = threadIdx.x % QX_WARP_SIZE;
55+
int wid = threadIdx.x / QX_WARP_SIZE;
56+
v = warpReduceMax(v);
57+
if (lane == 0)
58+
smem[wid] = v;
59+
__syncthreads();
60+
v = (threadIdx.x < blockDim.x / QX_WARP_SIZE) ? smem[lane] : QX_NEG_INF_F32;
61+
if (wid == 0)
62+
v = warpReduceMax(v);
63+
return v;
64+
}
65+
66+
#endif // __CUDACC__

cuda/includes/tensor.cuh

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#pragma once
2+
#include "common.h"
3+
// TensorShape — dimensions + strides (row-major by default)
4+
struct QX_ALIGN_16 TensorShape
5+
{
6+
int dims[QX_MAX_DIMS];
7+
int strides[QX_MAX_DIMS];
8+
int ndim;
9+
int _pad;
10+
11+
QX_HOST_DEVICE QX_INLINE int64_t numel() const
12+
{
13+
int64_t n = 1;
14+
for (int i = 0; i < ndim; i++)
15+
n *= dims[i];
16+
return n;
17+
}
18+
19+
QX_HOST QX_INLINE void compute_strides()
20+
{
21+
strides[ndim - 1] = 1;
22+
for (int i = ndim - 2; i >= 0; i--)
23+
strides[i] = strides[i + 1] * dims[i + 1];
24+
}
25+
26+
QX_HOST QX_INLINE bool is_contiguous() const
27+
{
28+
int expected = 1;
29+
for (int i = ndim - 1; i >= 0; i--)
30+
{
31+
if (strides[i] != expected)
32+
return false;
33+
expected *= dims[i];
34+
}
35+
return true;
36+
}
37+
};
38+
39+
static inline TensorShape make_shape(const int *d, int ndim)
40+
{
41+
TensorShape s;
42+
s.ndim = ndim;
43+
s._pad = 0;
44+
for (int i = 0; i < ndim; i++)
45+
s.dims[i] = d[i];
46+
for (int i = ndim; i < QX_MAX_DIMS; i++)
47+
{
48+
s.dims[i] = 1;
49+
s.strides[i] = 1;
50+
}
51+
s.compute_strides();
52+
return s;
53+
}
54+
static inline TensorShape make_shape1d(int a)
55+
{
56+
int d[] = {a};
57+
return make_shape(d, 1);
58+
}
59+
static inline TensorShape make_shape2d(int a, int b)
60+
{
61+
int d[] = {a, b};
62+
return make_shape(d, 2);
63+
}
64+
static inline TensorShape make_shape3d(int a, int b, int c)
65+
{
66+
int d[] = {a, b, c};
67+
return make_shape(d, 3);
68+
}
69+
static inline TensorShape make_shape4d(int a, int b, int c, int e)
70+
{
71+
int d[] = {a, b, c, e};
72+
return make_shape(d, 4);
73+
}
74+
// Tensor — primary data carrier (host struct, kernels get raw pointers)
75+
struct Tensor
76+
{
77+
void *data;
78+
TensorShape shape;
79+
DType dtype;
80+
MemLocation mem_loc;
81+
bool owns_data;
82+
int device_id;
83+
char name[64];
84+
85+
template <typename T>
86+
QX_HOST_DEVICE QX_INLINE T *as()
87+
{
88+
return reinterpret_cast<T *>(data);
89+
}
90+
template <typename T>
91+
QX_HOST_DEVICE QX_INLINE const T *as() const
92+
{
93+
return reinterpret_cast<const T *>(data);
94+
}
95+
96+
QX_HOST QX_INLINE size_t nbytes() const { return (size_t)shape.numel() * dtype_size(dtype); }
97+
QX_HOST_DEVICE QX_INLINE int dim(int i) const { return shape.dims[i]; }
98+
QX_HOST_DEVICE QX_INLINE int ndim() const { return shape.ndim; }
99+
QX_HOST_DEVICE QX_INLINE int64_t numel() const { return shape.numel(); }
100+
};

cuda/includes/utils.cuh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#pragma once
2+
3+
// Aggregator — include this one header to get the full Day 1 runtime.
4+
// Each sub-header is small and independently loadable.
5+
6+
#include "common.h" // macros, enums, error checks, dtype helpers
7+
#include "tensor.cuh" // TensorShape, Tensor struct
8+
#include "memory.cuh" // allocators, tensor_alloc_*, tensor_free, transfers
9+
#include "reduce.cuh" // warpReduceSum/Max/Min, blockReduceSum/Max

0 commit comments

Comments
 (0)