Skip to content

Commit 98dce09

Browse files
committed
feat(core): add host, pinned, and device memory management utilities
- Implement memory wrappers for host (`malloc`), pinned host (`cudaMallocHost`), and aligned device allocations (`cudaMalloc`). - Enforce strict memory layout by rounding up device bytes to `QX_MEM_ALIGN`. - Add `tensor_alloc_device` and `tensor_alloc_host` factory allocators with automatic initialization. - Implement unified `tensor_free` handling safe deallocations across all memory spaces. - Add async Host-to-Device (`tensor_h2d`) copy routine.
1 parent d46ac16 commit 98dce09

1 file changed

Lines changed: 107 additions & 0 deletions

File tree

cuda/includes/memory.cuh

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#pragma once
2+
#include "common.h"
3+
#include "tensor.cuh"
4+
#include <cstring>
5+
static inline void *qx_host_alloc(size_t n)
6+
{
7+
void *p = malloc(n);
8+
if (!p && n)
9+
{
10+
perror("[QX] malloc");
11+
exit(1);
12+
}
13+
return p;
14+
}
15+
static inline void qx_host_free(void *p)
16+
{
17+
free(p);
18+
}
19+
20+
static inline void *qx_pinned_alloc(size_t n)
21+
{
22+
void *p = nullptr;
23+
CUDA_CHECK(cudaMallocHost(&p, n));
24+
return p;
25+
}
26+
static inline void qx_pinned_free(void *p)
27+
{
28+
if (p)
29+
CUDA_CHECK(cudaFreeHost(p));
30+
}
31+
32+
static inline void *qx_device_alloc(size_t n, int dev = 0)
33+
{
34+
CUDA_CHECK(cudaSetDevice(dev));
35+
void *p = nullptr;
36+
CUDA_CHECK(cudaMalloc(&p, ROUND_UP(n, QX_MEM_ALIGN)));
37+
return p;
38+
}
39+
static inline void qx_device_free(void *p)
40+
{
41+
if (p)
42+
CUDA_CHECK(cudaFree(p));
43+
}
44+
static inline void qx_device_zero(void *p, size_t n, cudaStream_t s = 0)
45+
{
46+
if (p && n)
47+
CUDA_CHECK(cudaMemsetAsync(p, 0, n, s));
48+
}
49+
// Tensor allocators
50+
static inline Tensor *tensor_alloc_device(const TensorShape &sh, DType dt,
51+
int dev = 0, cudaStream_t s = 0,
52+
const char *name = "")
53+
{
54+
Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
55+
t->shape = sh;
56+
t->dtype = dt;
57+
t->mem_loc = MEM_DEVICE;
58+
t->owns_data = true;
59+
t->device_id = dev;
60+
strncpy(t->name, name, 63);
61+
t->data = qx_device_alloc((size_t)sh.numel() * dtype_size(dt), dev);
62+
qx_device_zero(t->data, (size_t)sh.numel() * dtype_size(dt), s);
63+
return t;
64+
}
65+
66+
static inline Tensor *tensor_alloc_host(const TensorShape &sh, DType dt,
67+
bool pinned = false, const char *name = "")
68+
{
69+
Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
70+
t->shape = sh;
71+
t->dtype = dt;
72+
t->mem_loc = pinned ? MEM_HOST_PINNED : MEM_HOST;
73+
t->owns_data = true;
74+
t->device_id = -1;
75+
strncpy(t->name, name, 63);
76+
size_t nb = (size_t)sh.numel() * dtype_size(dt);
77+
t->data = pinned ? qx_pinned_alloc(nb) : calloc(1, nb);
78+
return t;
79+
}
80+
81+
static inline void tensor_free(Tensor *t)
82+
{
83+
if (!t)
84+
return;
85+
if (t->owns_data && t->data)
86+
{
87+
if (t->mem_loc == MEM_DEVICE)
88+
qx_device_free(t->data);
89+
else if (t->mem_loc == MEM_HOST_PINNED)
90+
qx_pinned_free(t->data);
91+
else
92+
free(t->data);
93+
}
94+
free(t);
95+
}
96+
static inline void tensor_h2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
97+
{
98+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyHostToDevice, s));
99+
}
100+
static inline void tensor_d2h(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
101+
{
102+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToHost, s));
103+
}
104+
static inline void tensor_d2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
105+
{
106+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToDevice, s));
107+
}

0 commit comments

Comments
 (0)