|
| 1 | +#pragma once |
| 2 | +#include "common.h" |
| 3 | +#include "tensor.cuh" |
| 4 | +#include <cstring> |
| 5 | +static inline void *qx_host_alloc(size_t n) |
| 6 | +{ |
| 7 | + void *p = malloc(n); |
| 8 | + if (!p && n) |
| 9 | + { |
| 10 | + perror("[QX] malloc"); |
| 11 | + exit(1); |
| 12 | + } |
| 13 | + return p; |
| 14 | +} |
| 15 | +static inline void qx_host_free(void *p) |
| 16 | +{ |
| 17 | + free(p); |
| 18 | +} |
| 19 | + |
| 20 | +static inline void *qx_pinned_alloc(size_t n) |
| 21 | +{ |
| 22 | + void *p = nullptr; |
| 23 | + CUDA_CHECK(cudaMallocHost(&p, n)); |
| 24 | + return p; |
| 25 | +} |
| 26 | +static inline void qx_pinned_free(void *p) |
| 27 | +{ |
| 28 | + if (p) |
| 29 | + CUDA_CHECK(cudaFreeHost(p)); |
| 30 | +} |
| 31 | + |
| 32 | +static inline void *qx_device_alloc(size_t n, int dev = 0) |
| 33 | +{ |
| 34 | + CUDA_CHECK(cudaSetDevice(dev)); |
| 35 | + void *p = nullptr; |
| 36 | + CUDA_CHECK(cudaMalloc(&p, ROUND_UP(n, QX_MEM_ALIGN))); |
| 37 | + return p; |
| 38 | +} |
| 39 | +static inline void qx_device_free(void *p) |
| 40 | +{ |
| 41 | + if (p) |
| 42 | + CUDA_CHECK(cudaFree(p)); |
| 43 | +} |
| 44 | +static inline void qx_device_zero(void *p, size_t n, cudaStream_t s = 0) |
| 45 | +{ |
| 46 | + if (p && n) |
| 47 | + CUDA_CHECK(cudaMemsetAsync(p, 0, n, s)); |
| 48 | +} |
| 49 | +// Tensor allocators |
| 50 | +static inline Tensor *tensor_alloc_device(const TensorShape &sh, DType dt, |
| 51 | + int dev = 0, cudaStream_t s = 0, |
| 52 | + const char *name = "") |
| 53 | +{ |
| 54 | + Tensor *t = (Tensor *)calloc(1, sizeof(Tensor)); |
| 55 | + t->shape = sh; |
| 56 | + t->dtype = dt; |
| 57 | + t->mem_loc = MEM_DEVICE; |
| 58 | + t->owns_data = true; |
| 59 | + t->device_id = dev; |
| 60 | + strncpy(t->name, name, 63); |
| 61 | + t->data = qx_device_alloc((size_t)sh.numel() * dtype_size(dt), dev); |
| 62 | + qx_device_zero(t->data, (size_t)sh.numel() * dtype_size(dt), s); |
| 63 | + return t; |
| 64 | +} |
| 65 | + |
| 66 | +static inline Tensor *tensor_alloc_host(const TensorShape &sh, DType dt, |
| 67 | + bool pinned = false, const char *name = "") |
| 68 | +{ |
| 69 | + Tensor *t = (Tensor *)calloc(1, sizeof(Tensor)); |
| 70 | + t->shape = sh; |
| 71 | + t->dtype = dt; |
| 72 | + t->mem_loc = pinned ? MEM_HOST_PINNED : MEM_HOST; |
| 73 | + t->owns_data = true; |
| 74 | + t->device_id = -1; |
| 75 | + strncpy(t->name, name, 63); |
| 76 | + size_t nb = (size_t)sh.numel() * dtype_size(dt); |
| 77 | + t->data = pinned ? qx_pinned_alloc(nb) : calloc(1, nb); |
| 78 | + return t; |
| 79 | +} |
| 80 | + |
| 81 | +static inline void tensor_free(Tensor *t) |
| 82 | +{ |
| 83 | + if (!t) |
| 84 | + return; |
| 85 | + if (t->owns_data && t->data) |
| 86 | + { |
| 87 | + if (t->mem_loc == MEM_DEVICE) |
| 88 | + qx_device_free(t->data); |
| 89 | + else if (t->mem_loc == MEM_HOST_PINNED) |
| 90 | + qx_pinned_free(t->data); |
| 91 | + else |
| 92 | + free(t->data); |
| 93 | + } |
| 94 | + free(t); |
| 95 | +} |
| 96 | +static inline void tensor_h2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0) |
| 97 | +{ |
| 98 | + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyHostToDevice, s)); |
| 99 | +} |
| 100 | +static inline void tensor_d2h(Tensor *dst, const Tensor *src, cudaStream_t s = 0) |
| 101 | +{ |
| 102 | + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToHost, s)); |
| 103 | +} |
| 104 | +static inline void tensor_d2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0) |
| 105 | +{ |
| 106 | + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToDevice, s)); |
| 107 | +} |
0 commit comments