Eamon2009
diff --git a/asstes/run_20260430_192930.png b/assets/run_20260430_192930.png
asstes/run_20260430_192930.png renamed to assets/run_20260430_192930.png
diff --git a/asstes/run_20260508_110726.png b/assets/run_20260508_110726.png
asstes/run_20260508_110726.png renamed to assets/run_20260508_110726.png
diff --git a/cuda/includes/memory.cuh b/cuda/includes/memory.cuh
Lines changed: 107 additions & 0 deletions
diff --git a/cuda/includes/reduce.cuh b/cuda/includes/reduce.cuh
Lines changed: 66 additions & 0 deletions
diff --git a/cuda/includes/tensor.cuh b/cuda/includes/tensor.cuh
Lines changed: 100 additions & 0 deletions
diff --git a/cuda/includes/utils.cuh b/cuda/includes/utils.cuh
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,107 @@
+#pragma once
+#include "common.h"
+#include "tensor.cuh"
+#include <cstring>
+// Allocate `n` bytes of ordinary host memory with malloc.
+// A null result for a non-zero request is fatal: the failure is reported
+// through perror and the process exits with status 1. A null result for
+// n == 0 is handed back to the caller unchanged.
+static inline void *qx_host_alloc(size_t n)
+{
+    void *block = malloc(n);
+    const bool failed = (block == nullptr) && (n != 0);
+    if (failed)
+    {
+        perror("[QX] malloc");
+        exit(1);
+    }
+    return block;
+}
+// Hand `p` to free(). A null pointer is accepted and ignored.
+static inline void qx_host_free(void *p)
+{
+    if (p != nullptr)
+        free(p);
+}
+
+// Allocate `n` bytes of host memory through cudaMallocHost and return the
+// pointer it produced. The call is wrapped in CUDA_CHECK, so failures are
+// handled however that macro handles them.
+static inline void *qx_pinned_alloc(size_t n)
+{
+    void *pinned = nullptr;
+    CUDA_CHECK(cudaMallocHost(&pinned, n));
+    return pinned;
+}
+// Release a buffer through cudaFreeHost. Null pointers are ignored.
+static inline void qx_pinned_free(void *p)
+{
+    if (p == nullptr)
+        return;
+    CUDA_CHECK(cudaFreeHost(p));
+}
+
+// Allocate device memory on device `dev` (default 0).
+// The requested size is passed through ROUND_UP(n, QX_MEM_ALIGN) before it
+// reaches cudaMalloc, so the real allocation may be larger than `n`.
+// NOTE: cudaSetDevice(dev) is called first and the previous device is not
+// restored, so the calling thread's current device is `dev` on return.
+static inline void *qx_device_alloc(size_t n, int dev = 0)
+{
+    CUDA_CHECK(cudaSetDevice(dev));
+    void *dptr = nullptr;
+    const size_t padded = ROUND_UP(n, QX_MEM_ALIGN);
+    CUDA_CHECK(cudaMalloc(&dptr, padded));
+    return dptr;
+}
+// Release a buffer through cudaFree. Null pointers are ignored.
+static inline void qx_device_free(void *p)
+{
+    if (p == nullptr)
+        return;
+    CUDA_CHECK(cudaFree(p));
+}
+// Enqueue a zero-fill of `n` bytes at `p` on stream `s` (default stream 0).
+// The fill is issued with cudaMemsetAsync, so it may not have completed when
+// this function returns. Nothing is enqueued when `p` is null or `n` is 0.
+static inline void qx_device_zero(void *p, size_t n, cudaStream_t s = 0)
+{
+    if (p == nullptr || n == 0)
+        return;
+    CUDA_CHECK(cudaMemsetAsync(p, 0, n, s));
+}
+// Tensor allocators
+// Create a heap-allocated Tensor descriptor whose buffer lives in device memory.
+//   sh   - shape; sh.numel() * dtype_size(dt) bytes are requested
+//   dt   - element type
+//   dev  - device index forwarded to qx_device_alloc
+//   s    - stream on which the zero-fill is enqueued (asynchronous)
+//   name - optional label, truncated to fit Tensor::name; nullptr is treated as ""
+// Returns a Tensor with owns_data == true; release it with tensor_free.
+// The process exits if the descriptor itself cannot be allocated.
+static inline Tensor *tensor_alloc_device(const TensorShape &sh, DType dt,
+                                          int dev = 0, cudaStream_t s = 0,
+                                          const char *name = "")
+{
+    Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
+    if (!t)
+    {
+        perror("[QX] calloc");
+        exit(1);
+    }
+    t->shape = sh;
+    t->dtype = dt;
+    t->mem_loc = MEM_DEVICE;
+    t->owns_data = true;
+    t->device_id = dev;
+    // calloc zeroed name[], so copying at most sizeof - 1 bytes always leaves
+    // a terminating NUL in place.
+    if (name)
+        strncpy(t->name, name, sizeof(t->name) - 1);
+    // Compute the byte count once so allocation and zero-fill cannot disagree.
+    const size_t nb = (size_t)sh.numel() * dtype_size(dt);
+    t->data = qx_device_alloc(nb, dev);
+    qx_device_zero(t->data, nb, s);
+    return t;
+}
+
+// Create a heap-allocated Tensor descriptor whose buffer lives in host memory.
+//   sh     - shape; sh.numel() * dtype_size(dt) bytes are requested
+//   dt     - element type
+//   pinned - true: allocate through qx_pinned_alloc (MEM_HOST_PINNED);
+//            false: allocate with calloc (MEM_HOST)
+//   name   - optional label, truncated to fit Tensor::name; nullptr is treated as ""
+// Returns a Tensor with owns_data == true and device_id == -1; release it with
+// tensor_free. The buffer is zero-filled on both paths.
+// The process exits if a host allocation fails.
+static inline Tensor *tensor_alloc_host(const TensorShape &sh, DType dt,
+                                        bool pinned = false, const char *name = "")
+{
+    Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
+    if (!t)
+    {
+        perror("[QX] calloc");
+        exit(1);
+    }
+    t->shape = sh;
+    t->dtype = dt;
+    t->mem_loc = pinned ? MEM_HOST_PINNED : MEM_HOST;
+    t->owns_data = true;
+    t->device_id = -1;
+    // calloc zeroed name[], so copying at most sizeof - 1 bytes always leaves
+    // a terminating NUL in place.
+    if (name)
+        strncpy(t->name, name, sizeof(t->name) - 1);
+    size_t nb = (size_t)sh.numel() * dtype_size(dt);
+    if (pinned)
+    {
+        t->data = qx_pinned_alloc(nb);
+        // cudaMallocHost does not clear the buffer; zero it here so pinned
+        // tensors start out zeroed like the calloc path below.
+        if (t->data && nb)
+            memset(t->data, 0, nb);
+    }
+    else
+    {
+        t->data = calloc(1, nb);
+        // calloc may legitimately return null for a zero-byte request.
+        if (!t->data && nb)
+        {
+            perror("[QX] calloc");
+            exit(1);
+        }
+    }
+    return t;
+}
+
+// Destroy a Tensor descriptor. When the tensor owns a non-null buffer, the
+// buffer is released first with the routine matching mem_loc: device free,
+// pinned free, or plain free() for anything else. The descriptor itself is
+// then released with free(). A null `t` is a no-op.
+static inline void tensor_free(Tensor *t)
+{
+    if (t == nullptr)
+        return;
+    void *buf = t->owns_data ? t->data : nullptr;
+    if (buf != nullptr)
+    {
+        switch (t->mem_loc)
+        {
+        case MEM_DEVICE:
+            qx_device_free(buf);
+            break;
+        case MEM_HOST_PINNED:
+            qx_pinned_free(buf);
+            break;
+        default:
+            free(buf);
+            break;
+        }
+    }
+    free(t);
+}
+// Enqueue an asynchronous host-to-device copy from src->data to dst->data on
+// stream `s`. The copy may not have completed when this function returns.
+// The byte count is the smaller of the two tensors' nbytes(), so a source
+// smaller than the destination is never read past its end. Nothing is
+// enqueued when that count is zero.
+static inline void tensor_h2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
+{
+    const size_t dst_nb = dst->nbytes();
+    const size_t src_nb = src->nbytes();
+    const size_t nb = dst_nb < src_nb ? dst_nb : src_nb;
+    if (nb == 0)
+        return;
+    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, nb, cudaMemcpyHostToDevice, s));
+}
+// Enqueue an asynchronous device-to-host copy from src->data to dst->data on
+// stream `s`. The copy may not have completed when this function returns.
+// The byte count is the smaller of the two tensors' nbytes(), so a source
+// smaller than the destination is never read past its end. Nothing is
+// enqueued when that count is zero.
+static inline void tensor_d2h(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
+{
+    const size_t dst_nb = dst->nbytes();
+    const size_t src_nb = src->nbytes();
+    const size_t nb = dst_nb < src_nb ? dst_nb : src_nb;
+    if (nb == 0)
+        return;
+    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, nb, cudaMemcpyDeviceToHost, s));
+}
+// Enqueue an asynchronous device-to-device copy from src->data to dst->data
+// on stream `s`. The copy may not have completed when this function returns.
+// The byte count is the smaller of the two tensors' nbytes(), so a source
+// smaller than the destination is never read past its end. Nothing is
+// enqueued when that count is zero.
+static inline void tensor_d2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
+{
+    const size_t dst_nb = dst->nbytes();
+    const size_t src_nb = src->nbytes();
+    const size_t nb = dst_nb < src_nb ? dst_nb : src_nb;
+    if (nb == 0)
+        return;
+    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, nb, cudaMemcpyDeviceToDevice, s));
+}
@@ -0,0 +1,66 @@
+#pragma once
+#include "common.h"
+
+#ifdef __CUDACC__
+
+static constexpr unsigned FULL_MASK = 0xffffffff; // all 32 bits set; passed as the participant mask to the *_sync shuffles below
+// Warp reductions
+// XOR-shuffle sum: each step adds the value held by the lane whose index
+// differs in one bit (offsets 16, 8, 4, 2, 1). The shuffles use FULL_MASK,
+// so the caller must have all 32 lanes of the warp executing this call.
+__device__ QX_INLINE float warpReduceSum(float v)
+{
+    for (int offset = 16; offset >= 1; offset >>= 1)
+        v += __shfl_xor_sync(FULL_MASK, v, offset);
+    return v;
+}
+// XOR-shuffle maximum: each step keeps fmaxf of the local value and the
+// value held by the partner lane (offsets 16, 8, 4, 2, 1). The shuffles use
+// FULL_MASK, so all 32 lanes of the warp must execute this call.
+__device__ QX_INLINE float warpReduceMax(float v)
+{
+    for (int offset = 16; offset >= 1; offset >>= 1)
+    {
+        const float other = __shfl_xor_sync(FULL_MASK, v, offset);
+        v = fmaxf(v, other);
+    }
+    return v;
+}
+// XOR-shuffle minimum: each step keeps fminf of the local value and the
+// value held by the partner lane (offsets 16, 8, 4, 2, 1). The shuffles use
+// FULL_MASK, so all 32 lanes of the warp must execute this call.
+__device__ QX_INLINE float warpReduceMin(float v)
+{
+    for (int offset = 16; offset >= 1; offset >>= 1)
+    {
+        const float other = __shfl_xor_sync(FULL_MASK, v, offset);
+        v = fminf(v, other);
+    }
+    return v;
+}
+// Return lane 0's copy of `v` to every lane in the warp. Uses FULL_MASK, so
+// all 32 lanes of the warp must execute this call.
+__device__ QX_INLINE float warpBroadcast(float v)
+{
+    const int src_lane = 0;
+    return __shfl_sync(FULL_MASK, v, src_lane);
+}
+// Two-stage block sum over threadIdx.x.
+//   v    - this thread's contribution
+//   smem - scratch array indexed by warp id; needs one float per warp
+//          (presumably shared memory - confirm at the call site)
+// Only warp 0 runs the second-stage reduction, so only its threads return the
+// combined value. Contains a __syncthreads(), so every thread of the block
+// must call it.
+// NOTE(review): the warp count is blockDim.x / QX_WARP_SIZE; if that is an
+// integer division, a trailing partial warp's slot is never read - confirm
+// callers launch with blockDim.x a multiple of the warp size.
+__device__ QX_INLINE float blockReduceSum(float v, float *smem)
+{
+    const int warp_id = threadIdx.x / QX_WARP_SIZE;
+    const int lane_id = threadIdx.x % QX_WARP_SIZE;
+
+    // Stage 1: reduce inside each warp; lane 0 publishes the warp's partial.
+    const float partial = warpReduceSum(v);
+    if (lane_id == 0)
+        smem[warp_id] = partial;
+    __syncthreads();
+
+    // Stage 2: the leading threads pick up one partial each, everyone else
+    // contributes 0, and warp 0 folds them together.
+    const bool has_partial = threadIdx.x < blockDim.x / QX_WARP_SIZE;
+    float total = 0.f;
+    if (has_partial)
+        total = smem[lane_id];
+    if (warp_id == 0)
+        total = warpReduceSum(total);
+    return total;
+}
+// Two-stage block maximum over threadIdx.x.
+//   v    - this thread's candidate
+//   smem - scratch array indexed by warp id; needs one float per warp
+//          (presumably shared memory - confirm at the call site)
+// Only warp 0 runs the second-stage reduction, so only its threads return the
+// combined value. Contains a __syncthreads(), so every thread of the block
+// must call it.
+// NOTE(review): the warp count is blockDim.x / QX_WARP_SIZE; if that is an
+// integer division, a trailing partial warp's slot is never read - confirm
+// callers launch with blockDim.x a multiple of the warp size.
+__device__ QX_INLINE float blockReduceMax(float v, float *smem)
+{
+    const int warp_id = threadIdx.x / QX_WARP_SIZE;
+    const int lane_id = threadIdx.x % QX_WARP_SIZE;
+
+    // Stage 1: reduce inside each warp; lane 0 publishes the warp's partial.
+    const float partial = warpReduceMax(v);
+    if (lane_id == 0)
+        smem[warp_id] = partial;
+    __syncthreads();
+
+    // Stage 2: the leading threads pick up one partial each, everyone else
+    // starts from QX_NEG_INF_F32, and warp 0 folds them together.
+    const bool has_partial = threadIdx.x < blockDim.x / QX_WARP_SIZE;
+    float best = QX_NEG_INF_F32;
+    if (has_partial)
+        best = smem[lane_id];
+    if (warp_id == 0)
+        best = warpReduceMax(best);
+    return best;
+}
+
+#endif // __CUDACC__
@@ -0,0 +1,100 @@
+#pragma once
+#include "common.h"
+// TensorShape — dimensions + strides (row-major by default)
+// TensorShape: per-dimension extents and strides for up to QX_MAX_DIMS
+// dimensions. Strides are in elements; compute_strides() fills them in
+// row-major order (last dimension has stride 1).
+struct QX_ALIGN_16 TensorShape
+{
+    int dims[QX_MAX_DIMS];    // extent of each dimension; only [0, ndim) is read by the methods
+    int strides[QX_MAX_DIMS]; // element stride of each dimension
+    int ndim;                 // number of dimensions in use
+    int _pad;
+
+    // Product of the first `ndim` extents, accumulated in 64 bits.
+    // Returns 1 when ndim is 0 (empty product).
+    QX_HOST_DEVICE QX_INLINE int64_t numel() const
+    {
+        int64_t n = 1;
+        for (int i = 0; i < ndim; i++)
+            n *= dims[i];
+        return n;
+    }
+
+    // Fill strides[0 .. ndim) for a densely packed row-major layout.
+    QX_HOST QX_INLINE void compute_strides()
+    {
+        // A rank-0 shape has no stride slots to fill; without this guard the
+        // first store below would land on strides[-1].
+        if (ndim <= 0)
+            return;
+        strides[ndim - 1] = 1;
+        for (int i = ndim - 2; i >= 0; i--)
+            strides[i] = strides[i + 1] * dims[i + 1];
+    }
+
+    // True when the stored strides equal the ones compute_strides() would
+    // produce for the current dims. A rank-0 shape is reported as contiguous.
+    QX_HOST QX_INLINE bool is_contiguous() const
+    {
+        int expected = 1;
+        for (int i = ndim - 1; i >= 0; i--)
+        {
+            if (strides[i] != expected)
+                return false;
+            expected *= dims[i];
+        }
+        return true;
+    }
+};
+
+// Build a row-major TensorShape from the first `ndim` entries of `d`.
+//   d    - array of extents; a null pointer is treated as rank 0
+//   ndim - number of extents; clamped to the range [0, QX_MAX_DIMS] so the
+//          fixed-size dims[] / strides[] arrays are never overrun
+// Unused trailing slots are set to extent 1 and stride 1.
+static inline TensorShape make_shape(const int *d, int ndim)
+{
+    // Clamp the rank before touching any array.
+    if (!d || ndim < 0)
+        ndim = 0;
+    if (ndim > QX_MAX_DIMS)
+        ndim = QX_MAX_DIMS;
+
+    TensorShape s;
+    s.ndim = ndim;
+    s._pad = 0;
+    for (int i = 0; i < ndim; i++)
+        s.dims[i] = d[i];
+    for (int i = ndim; i < QX_MAX_DIMS; i++)
+    {
+        s.dims[i] = 1;
+        s.strides[i] = 1;
+    }
+    // compute_strides() starts by writing strides[ndim - 1], so only call it
+    // when at least one dimension exists.
+    if (ndim > 0)
+        s.compute_strides();
+    return s;
+}
+// Convenience wrapper: one-dimensional shape with extent `a`.
+static inline TensorShape make_shape1d(int a)
+{
+    const int extents[1] = {a};
+    return make_shape(extents, 1);
+}
+// Convenience wrapper: two-dimensional shape with extents (a, b).
+static inline TensorShape make_shape2d(int a, int b)
+{
+    const int extents[2] = {a, b};
+    return make_shape(extents, 2);
+}
+// Convenience wrapper: three-dimensional shape with extents (a, b, c).
+static inline TensorShape make_shape3d(int a, int b, int c)
+{
+    const int extents[3] = {a, b, c};
+    return make_shape(extents, 3);
+}
+// Convenience wrapper: four-dimensional shape with extents (a, b, c, e).
+static inline TensorShape make_shape4d(int a, int b, int c, int e)
+{
+    const int extents[4] = {a, b, c, e};
+    return make_shape(extents, 4);
+}
+// Tensor — primary data carrier (host struct, kernels get raw pointers)
+// Tensor: descriptor pairing an untyped data pointer with its shape, element
+// type, and memory location.
+struct Tensor
+{
+    void *data;          // raw buffer; interpret through as<T>()
+    TensorShape shape;   // extents and strides
+    DType dtype;         // element type, used with dtype_size() in nbytes()
+    MemLocation mem_loc; // which allocator produced `data`
+    bool owns_data;      // when true, tensor_free releases `data`
+    int device_id;       // device index; the host allocator stores -1
+    char name[64];       // label, NUL-terminated by the allocators
+
+    // Typed view of the buffer. No check is made that T matches dtype.
+    template <typename T>
+    QX_HOST_DEVICE QX_INLINE T *as()
+    {
+        return static_cast<T *>(data);
+    }
+    template <typename T>
+    QX_HOST_DEVICE QX_INLINE const T *as() const
+    {
+        return static_cast<const T *>(data);
+    }
+
+    // Buffer size in bytes: element count times dtype_size(dtype).
+    QX_HOST QX_INLINE size_t nbytes() const
+    {
+        const size_t count = static_cast<size_t>(shape.numel());
+        return count * dtype_size(dtype);
+    }
+    // Extent of dimension `i`; `i` is not range-checked.
+    QX_HOST_DEVICE QX_INLINE int dim(int i) const
+    {
+        return shape.dims[i];
+    }
+    // Number of dimensions in use.
+    QX_HOST_DEVICE QX_INLINE int ndim() const
+    {
+        return shape.ndim;
+    }
+    // Total element count.
+    QX_HOST_DEVICE QX_INLINE int64_t numel() const
+    {
+        return shape.numel();
+    }
+};
@@ -0,0 +1,9 @@
+#pragma once
+
+// Aggregator — include this one header to get the full Day 1 runtime.
+// Each sub-header is small and independently loadable.
+
+#include "common.h"   // macros, enums, error checks, dtype helpers
+#include "tensor.cuh" // TensorShape, Tensor struct
+#include "memory.cuh" // allocators, tensor_alloc_*, tensor_free, transfers
+#include "reduce.cuh" // warpReduceSum/Max/Min, blockReduceSum/Max