NVIDIA
diff --git a/cext/CMakeLists.txt b/cext/CMakeLists.txt
Lines changed: 7 additions & 0 deletions
diff --git a/cext/arena.h b/cext/arena.h
Lines changed: 78 additions & 0 deletions
diff --git a/cext/cuda_loader.h b/cext/cuda_loader.h
Lines changed: 2 additions & 1 deletion
diff --git a/cext/memory.cpp b/cext/memory.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/cext/memory.h b/cext/memory.h
Lines changed: 4 additions & 0 deletions
diff --git a/cext/py.h b/cext/py.h
Lines changed: 18 additions & 0 deletions
diff --git a/cext/test/test_arena.cpp b/cext/test/test_arena.cpp
Lines changed: 91 additions & 0 deletions
@@ -104,3 +104,10 @@ target_include_directories(test_vec PRIVATE ${cext_include_dirs})
 target_compile_options(test_vec PUBLIC ${cext_compile_flags} ${test_coverage_options})
 target_link_options(test_vec PRIVATE ${test_coverage_options})
 target_link_libraries(test_vec PRIVATE ${Python_LIBRARIES})
+
+
+# Unit-test executable for the arena allocator (test/test_arena.cpp).
+# memory.cpp is compiled into the test binary alongside the test source.
+# The include dirs, compile/coverage flags and Python link libraries mirror
+# the test_vec target defined above.
+add_executable(test_arena test/test_arena.cpp memory.cpp)
+target_include_directories(test_arena PRIVATE ${cext_include_dirs})
+target_compile_options(test_arena PUBLIC ${cext_compile_flags} ${test_coverage_options})
+target_link_options(test_arena PRIVATE ${test_coverage_options})
+target_link_libraries(test_arena PRIVATE ${Python_LIBRARIES})
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec.h"
+#include <memory>
+
+namespace {
+
+// Bump allocator that hands out arrays of T carved out of heap chunks.
+//
+// Elements are taken from the tail of the current chunk towards its start:
+// cur_chunk_avail_ is both the number of free elements and the index of the
+// most recently returned element. When a request does not fit, the current
+// chunk is moved to old_chunks_ (so pointers into it stay owned) and a new
+// chunk at least twice as large is allocated. Elements are created once by
+// `new T[n]`; alloc() does not construct or reset them.
+//
+// T            element type.
+// InitialSize  number of elements in the first chunk.
+template <typename T, size_t InitialSize = 16>
+class Arena {
+public:
+    // Creates the arena with one chunk of InitialSize free elements.
+    Arena()
+        : cur_chunk_(new T[InitialSize]),
+          cur_chunk_avail_(InitialSize),
+          cur_chunk_size_(InitialSize) {}
+
+    // Returns a pointer to `count` consecutive elements, switching to a new
+    // chunk first if the current one has fewer than `count` elements free.
+    T* alloc(size_t count) {
+        if (cur_chunk_avail_ < count) {
+            allocate_chunk(count);
+        }
+        const size_t first = cur_chunk_avail_ - count;
+        cur_chunk_avail_ = first;
+        return cur_chunk_.get() + first;
+    }
+
+    // Same as alloc(), but the returned address is a multiple of
+    // AlignmentBytes. AlignmentBytes must be a power of two and a multiple
+    // of sizeof(T) (enforced at compile time).
+    template <size_t AlignmentBytes>
+    T* alloc_aligned(size_t count) {
+        static_assert(AlignmentBytes % sizeof(T) == 0);
+        static_assert((AlignmentBytes & (AlignmentBytes - 1)) == 0);
+
+        if (T* hit = try_alloc_aligned<AlignmentBytes>(count)) {
+            return hit;
+        }
+
+        // Over-allocate by one alignment unit minus one element, so that an
+        // aligned run of `count` elements is guaranteed to exist in the
+        // fresh chunk.
+        constexpr size_t slack = AlignmentBytes / sizeof(T) - 1;
+        allocate_chunk(count + slack);
+        T* const fresh = try_alloc_aligned<AlignmentBytes>(count);
+        CHECK(fresh);
+        return fresh;
+    }
+
+    // Marks the whole current chunk (the biggest one allocated so far) as
+    // free again and drops the retired chunks. Pointers handed out earlier
+    // must not be used afterwards.
+    void clear() {
+        cur_chunk_avail_ = cur_chunk_size_;
+        old_chunks_.clear();
+    }
+
+private:
+    // Tries to carve an aligned run of `count` elements out of the current
+    // chunk. Returns nullptr, leaving the arena untouched, when the chunk
+    // has too few free elements or when rounding the address down to the
+    // alignment would fall before the start of the chunk.
+    template <size_t AlignmentBytes>
+    T* try_alloc_aligned(size_t count) {
+        if (cur_chunk_avail_ < count) {
+            return nullptr;
+        }
+
+        const uintptr_t base = reinterpret_cast<uintptr_t>(cur_chunk_.get());
+        const uintptr_t unaligned = base + (cur_chunk_avail_ - count) * sizeof(T);
+        const uintptr_t aligned = unaligned & ~(AlignmentBytes - 1);
+        if (aligned < base) {
+            return nullptr;
+        }
+
+        // Elements between the aligned address and the old tail position
+        // are skipped (they stay unused until clear()).
+        const size_t index = (aligned - base) / sizeof(T);
+        cur_chunk_avail_ = index;
+        return cur_chunk_.get() + index;
+    }
+
+    // Retires the current chunk and replaces it with an empty one holding
+    // max(2 * current size, min_capacity) elements. Growing by at least a
+    // factor of two means that, after a clear(), one chunk eventually
+    // becomes big enough to satisfy all allocations.
+    void allocate_chunk(size_t min_capacity) {
+        const size_t doubled = cur_chunk_size_ * 2;
+        const size_t new_size = doubled < min_capacity ? min_capacity : doubled;
+        cur_chunk_size_ = new_size;
+        old_chunks_.push_back(std::move(cur_chunk_));
+        cur_chunk_.reset(new T[new_size]);
+        cur_chunk_avail_ = new_size;
+    }
+
+public:
+    // State is public only so that tests can inspect it.
+    std::unique_ptr<T[]> cur_chunk_;        // chunk currently allocated from
+    size_t cur_chunk_avail_;                // free elements left in cur_chunk_
+    size_t cur_chunk_size_;                 // total elements in cur_chunk_
+    Vec<std::unique_ptr<T[]>> old_chunks_;  // retired chunks, kept until clear()
+};
+
+}
+
@@ -54,7 +54,8 @@
     X(cuGraphAddMemFreeNode, 11040) \
     X(cuGraphInstantiateWithFlags, 11040) \
     X(cuGraphExecDestroy, 10000) \
-    X(cuGraphLaunch, 10000)
+    X(cuGraphLaunch, 10000) \
+    X(cuTensorMapEncodeTiled, 12000)
 
 
 #define DECLARE_CUDA_FUNC_EXTERN(name, _cuda_version) \
 
@@ -20,10 +20,20 @@ void* operator new (size_t len) {
     return ret;
 }
 
+// Array form of the allocation operator: takes `len` bytes from
+// PyMem_RawMalloc and trips CHECK if the allocation returned null.
+void* operator new[] (size_t len) {
+    void* const mem = PyMem_RawMalloc(len);
+    CHECK(mem);
+    return mem;
+}
+
 void operator delete (void* ptr, size_t) {
     PyMem_RawFree(ptr);
 }
 
+// Array form of the deallocation operator: hands the block back to
+// PyMem_RawFree, the counterpart of the PyMem_RawMalloc used by operator new[].
+void operator delete[] (void* ptr)
+{
+    PyMem_RawFree(ptr);
+}
+
 void mem_free(void* p) {
     PyMem_RawFree(p);
 }
 
@@ -11,8 +11,12 @@
 
 void* operator new (size_t len);
 
+void* operator new[] (size_t len);
+
 void operator delete (void* ptr, size_t);
 
+void operator delete[] (void* ptr);
+
 void* xcalloc(size_t nmemb, size_t size);
 
 
 
@@ -33,10 +33,23 @@ static inline int pylong_as_int(PyObject* obj) {
     return static_cast<int>(val);
 }
 
+// Converts a Python int to a C `unsigned`.
+//
+// On failure a Python exception is left set and static_cast<unsigned>(-1)
+// (i.e. UINT_MAX) is returned. UINT_MAX is also a valid result, so callers
+// need PyErr_Occurred() to tell the two apart. Values that fit in
+// `unsigned long` but not in `unsigned` raise OverflowError here.
+static inline unsigned pylong_as_uint(PyObject* obj) {
+    constexpr unsigned error_result = static_cast<unsigned>(-1);
+
+    const unsigned long wide = PyLong_AsUnsignedLong(obj);
+    if (PyErr_Occurred())
+        return error_result;
+
+    if (wide <= UINT_MAX)
+        return static_cast<unsigned>(wide);
+
+    PyErr_SetString(PyExc_OverflowError,
+        "Python int too large to convert to C unsigned int");
+    return error_result;
+}
+
 template <typename T>
 T pylong_as(PyObject* obj) {
     if constexpr (std::is_same_v<T, int>) {
         return pylong_as_int(obj);
+    } else if constexpr (std::is_same_v<T, unsigned>) {
+        return pylong_as_uint(obj);
     } else if constexpr (std::is_same_v<T, long>) {
         return PyLong_AsLong(obj);
     } else if constexpr (std::is_same_v<T, long long>) {
@@ -50,6 +63,11 @@ T pylong_as(PyObject* obj) {
     }
 }
 
+// Convenience overload: unwraps the pointer held by `ptr` via get() and
+// forwards it to pylong_as<T>.
+template <typename T>
+T pylong_as(const PyPtr& ptr) {
+    auto* const raw = ptr.get();
+    return pylong_as<T>(raw);
+}
+
 template <typename T>
 T pylong_as_overflow_and(PyObject* obj, int* overflow) {
     if constexpr (std::is_same_v<T, int>) {
 
@@ -0,0 +1,91 @@
+// SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../arena.h"
+#include "../check.h"
+
+
+// Test helper: true when `p` points at an element of the arena's current
+// chunk, i.e. lies in [chunk start, chunk start + cur_chunk_size_).
+template <typename T, size_t InitialSize>
+static inline bool in_cur_chunk(T* p, const Arena<T, InitialSize>& arena) {
+    T* const first = arena.cur_chunk_.get();
+    T* const past_end = first + arena.cur_chunk_size_;
+    if (p < first)
+        return false;
+    return p < past_end;
+}
+
+// Exercises Arena<int64_t, 2>: chunk growth on exhaustion, aligned
+// allocation, survival of data across chunk switches, and reuse of the
+// biggest chunk after clear(). Returns 0 when every CHECK holds.
+int main() {
+    int64_t* p[8];
+
+    Arena<int64_t, 2> arena;
+    CHECK(arena.cur_chunk_size_ == 2);
+
+    // The two initial elements are served from the first chunk.
+    p[0] = arena.alloc(1);
+    *p[0] = 100;
+    CHECK(arena.old_chunks_.empty());
+    CHECK(in_cur_chunk(p[0], arena));
+
+    p[1] = arena.alloc(1);
+    *p[1] = 101;
+    CHECK(arena.old_chunks_.empty());
+    CHECK(in_cur_chunk(p[1], arena));
+
+    // The first chunk is exhausted: the next three single-element
+    // allocations all come from one new chunk of twice the size.
+    for (int i = 0; i < 3; ++i) {
+        p[2 + i] = arena.alloc(1);
+        *p[2 + i] = 102 + i;
+        CHECK(arena.old_chunks_.size() == 1);
+        CHECK(in_cur_chunk(p[2 + i], arena));
+        CHECK(arena.cur_chunk_size_ == 4);
+    }
+
+    // Only one element is left, so a request for two doubles the chunk again.
+    p[5] = arena.alloc(2);
+    *p[5] = 105;
+    *(p[5] + 1) = 205;
+    CHECK(arena.old_chunks_.size() == 2);
+    CHECK(in_cur_chunk(p[5], arena));
+    CHECK(arena.cur_chunk_size_ == 8);
+    CHECK(arena.cur_chunk_avail_ == 6);
+
+    // Aligned allocations fit in the current chunk. One element may be
+    // skipped depending on whether the chunk itself starts on a 16-byte
+    // boundary, hence the two accepted values of cur_chunk_avail_.
+    p[6] = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
+    CHECK(reinterpret_cast<uintptr_t>(p[6]) % (sizeof(int64_t) * 2) == 0);
+    *p[6] = 106;
+    *(p[6] + 1) = 206;
+    CHECK(arena.old_chunks_.size() == 2);
+    CHECK(in_cur_chunk(p[6], arena));
+    CHECK(arena.cur_chunk_avail_ == 3 || arena.cur_chunk_avail_ == 4);
+
+    p[7] = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
+    // Check the pointer that was just returned (p[7], not p[6] again).
+    CHECK(reinterpret_cast<uintptr_t>(p[7]) % (sizeof(int64_t) * 2) == 0);
+    *p[7] = 107;
+    *(p[7] + 1) = 207;
+    CHECK(arena.old_chunks_.size() == 2);
+    CHECK(in_cur_chunk(p[7], arena));
+    CHECK(arena.cur_chunk_avail_ == 1 || arena.cur_chunk_avail_ == 2);
+
+    // Every value written so far must still be intact, including those that
+    // live in retired chunks.
+    for (int i = 0; i <= 7; ++i) {
+        CHECK(*p[i] == 100 + i);
+        if (i == 5 || i == 6 || i == 7)
+            CHECK(*(p[i] + 1) == 200 + i);
+    }
+
+    // clear() drops the retired chunks and frees the whole current chunk.
+    arena.clear();
+    CHECK(arena.old_chunks_.size() == 0);
+    CHECK(arena.cur_chunk_size_ == 8);
+    CHECK(arena.cur_chunk_avail_ == 8);
+
+    // Two aligned pairs interleaved with single elements still fit into the
+    // reused 8-element chunk.
+    for (int i = 0; i < 2; ++i) {
+        int64_t* q = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
+        CHECK(reinterpret_cast<uintptr_t>(q) % (sizeof(int64_t) * 2) == 0);
+        CHECK(arena.old_chunks_.size() == 0);
+        CHECK(in_cur_chunk(q, arena));
+        arena.alloc(1);
+    }
+
+    // Four aligned elements no longer fit, so the arena moves to a new chunk.
+    int64_t* q = arena.alloc_aligned<sizeof(int64_t) * 2>(4);
+    CHECK(reinterpret_cast<uintptr_t>(q) % (sizeof(int64_t) * 2) == 0);
+    CHECK(arena.old_chunks_.size() == 1);
+    CHECK(in_cur_chunk(q, arena));
+
+    // A request larger than twice the current chunk sizes the new chunk to
+    // exactly the requested count.
+    arena.alloc(777);
+    CHECK(arena.cur_chunk_size_ == 777);
+
+    return 0;
+}
Original file line number	Diff line number	Diff line change
`@@ -20,10 +20,20 @@ void* operator new (size_t len) {`
`20`	`20`	`return ret;`
`21`	`21`	`}`
`22`	`22`
	`23`	`+void* operator new[] (size_t len) {`
	`24`	`+ void* ret = PyMem_RawMalloc(len);`
	`25`	`+ CHECK(ret);`
	`26`	`+ return ret;`
	`27`	`+}`
	`28`	`+`
`23`	`29`	`void operator delete (void* ptr, size_t) {`
`24`	`30`	`PyMem_RawFree(ptr);`
`25`	`31`	`}`
`26`	`32`
	`33`	`+void operator delete[] (void* ptr) {`
	`34`	`+ PyMem_RawFree(ptr);`
	`35`	`+}`
	`36`	`+`
`27`	`37`	`void mem_free(void* p) {`
`28`	`38`	`PyMem_RawFree(p);`
`29`	`39`	`}`