Skip to content

Commit 91a80ad

Browse files
committed
[lang] Add cuda.lang.tensor_map_tiled()
Signed-off-by: Greg Bonik <gbonik@nvidia.com>
1 parent f1631a6 commit 91a80ad

38 files changed

Lines changed: 4205 additions & 3301 deletions

cext/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,10 @@ target_include_directories(test_vec PRIVATE ${cext_include_dirs})
104104
target_compile_options(test_vec PUBLIC ${cext_compile_flags} ${test_coverage_options})
105105
target_link_options(test_vec PRIVATE ${test_coverage_options})
106106
target_link_libraries(test_vec PRIVATE ${Python_LIBRARIES})
107+
108+
109+
# Standalone test executable for the Arena allocator (test/test_arena.cpp).
# memory.cpp is part of the target; it supplies the operator new[] / delete[]
# replacements, which call PyMem_RawMalloc / PyMem_RawFree, so the Python
# libraries are linked below.
add_executable(test_arena test/test_arena.cpp memory.cpp)
110+
target_include_directories(test_arena PRIVATE ${cext_include_dirs})
111+
target_compile_options(test_arena PUBLIC ${cext_compile_flags} ${test_coverage_options})
112+
target_link_options(test_arena PRIVATE ${test_coverage_options})
113+
target_link_libraries(test_arena PRIVATE ${Python_LIBRARIES})

cext/arena.h

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "vec.h"
8+
#include <memory>
9+
10+
namespace {
11+
template <typename T, size_t InitialSize = 16>
12+
class Arena {
13+
public:
14+
Arena()
15+
: cur_chunk_(new T[InitialSize])
16+
, cur_chunk_avail_(InitialSize)
17+
, cur_chunk_size_(InitialSize)
18+
{ }
19+
20+
T* alloc(size_t count) {
21+
if (count > cur_chunk_avail_)
22+
allocate_chunk(count);
23+
cur_chunk_avail_ -= count;
24+
return &cur_chunk_[cur_chunk_avail_];
25+
}
26+
27+
template <size_t AlignmentBytes>
28+
T* alloc_aligned(size_t count) {
29+
static_assert(AlignmentBytes % sizeof(T) == 0);
30+
static_assert((AlignmentBytes & (AlignmentBytes - 1)) == 0);
31+
T* ret = try_alloc_aligned<AlignmentBytes>(count);
32+
if (ret) return ret;
33+
34+
// Request enough items so that we can always find an aligned segment
35+
allocate_chunk(count + AlignmentBytes / sizeof(T) - 1);
36+
ret = try_alloc_aligned<AlignmentBytes>(count);
37+
CHECK(ret);
38+
return ret;
39+
}
40+
41+
void clear() {
42+
// Preserve the current (i.e. the biggest chunk) for future reuse
43+
old_chunks_.clear();
44+
cur_chunk_avail_ = cur_chunk_size_;
45+
}
46+
47+
private:
48+
template <size_t AlignmentBytes>
49+
T* try_alloc_aligned(size_t count) {
50+
if (count > cur_chunk_avail_) return nullptr;
51+
uintptr_t cur_chunk_start = reinterpret_cast<uintptr_t>(cur_chunk_.get());
52+
uintptr_t addr = (cur_chunk_start + (cur_chunk_avail_ - count) * sizeof(T))
53+
& ~(AlignmentBytes - 1);
54+
if (addr < cur_chunk_start) return nullptr;
55+
cur_chunk_avail_ = (addr - cur_chunk_start) / sizeof(T);
56+
return &cur_chunk_[cur_chunk_avail_];
57+
}
58+
59+
void allocate_chunk(size_t min_capacity) {
60+
// Always grow the chunk at least by a factor of two, so that eventually,
61+
// after clearing the arena, we have one big enough chunk to satisfy all allocations.
62+
cur_chunk_size_ *= 2;
63+
if (cur_chunk_size_ < min_capacity)
64+
cur_chunk_size_ = min_capacity;
65+
old_chunks_.push_back(std::move(cur_chunk_));
66+
cur_chunk_.reset(new T[cur_chunk_size_]);
67+
cur_chunk_avail_ = cur_chunk_size_;
68+
}
69+
70+
public:
71+
// Keep public for ease of testing
72+
std::unique_ptr<T[]> cur_chunk_;
73+
size_t cur_chunk_avail_;
74+
size_t cur_chunk_size_;
75+
Vec<std::unique_ptr<T[]>> old_chunks_;
76+
};
77+
}
78+

cext/cuda_loader.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
X(cuGraphAddMemFreeNode, 11040) \
5555
X(cuGraphInstantiateWithFlags, 11040) \
5656
X(cuGraphExecDestroy, 10000) \
57-
X(cuGraphLaunch, 10000)
57+
X(cuGraphLaunch, 10000) \
58+
X(cuTensorMapEncodeTiled, 12000)
5859

5960

6061
#define DECLARE_CUDA_FUNC_EXTERN(name, _cuda_version) \

cext/memory.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,20 @@ void* operator new (size_t len) {
2020
return ret;
2121
}
2222

23+
void* operator new[] (size_t len) {
24+
void* ret = PyMem_RawMalloc(len);
25+
CHECK(ret);
26+
return ret;
27+
}
28+
2329
void operator delete (void* ptr, size_t) {
2430
PyMem_RawFree(ptr);
2531
}
2632

33+
// Array form of the global deallocation function: the block is handed to
// PyMem_RawFree, the counterpart of PyMem_RawMalloc.
void operator delete[] (void* block) {
    PyMem_RawFree(block);
}
36+
2737
void mem_free(void* p) {
2838
PyMem_RawFree(p);
2939
}

cext/memory.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,12 @@
1111

1212
void* operator new (size_t len);
1313

14+
// Replacement for the global array allocation function (used by `new T[n]`).
void* operator new[] (size_t len);
15+
1416
void operator delete (void* ptr, size_t);
1517

18+
// Replacement for the global array deallocation function (used by `delete[] p`).
void operator delete[] (void* ptr);
19+
1620
void* xcalloc(size_t nmemb, size_t size);
1721

1822

cext/py.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,23 @@ static inline int pylong_as_int(PyObject* obj) {
3333
return static_cast<int>(val);
3434
}
3535

36+
// Converts a Python int to a C `unsigned`.
//
// Returns static_cast<unsigned>(-1) with a Python error set when the
// conversion fails or the value does not fit in `unsigned`. That sentinel is
// equal to UINT_MAX, which is also a legitimate result, so callers must
// consult PyErr_Occurred() to tell the two apart.
static inline unsigned pylong_as_uint(PyObject* obj) {
    const unsigned error_result = static_cast<unsigned>(-1);

    const unsigned long wide = PyLong_AsUnsignedLong(obj);
    if (PyErr_Occurred()) {
        return error_result;
    }

    if (wide <= UINT_MAX) {
        return static_cast<unsigned>(wide);
    }

    // Fits in unsigned long but not in unsigned.
    PyErr_SetString(PyExc_OverflowError,
                    "Python int too large to convert to C unsigned int");
    return error_result;
}
46+
3647
template <typename T>
3748
T pylong_as(PyObject* obj) {
3849
if constexpr (std::is_same_v<T, int>) {
3950
return pylong_as_int(obj);
51+
} else if constexpr (std::is_same_v<T, unsigned>) {
52+
return pylong_as_uint(obj);
4053
} else if constexpr (std::is_same_v<T, long>) {
4154
return PyLong_AsLong(obj);
4255
} else if constexpr (std::is_same_v<T, long long>) {
@@ -50,6 +63,11 @@ T pylong_as(PyObject* obj) {
5063
}
5164
}
5265

66+
template <typename T>
67+
T pylong_as(const PyPtr& ptr) {
68+
return pylong_as<T>(ptr.get());
69+
}
70+
5371
template <typename T>
5472
T pylong_as_overflow_and(PyObject* obj, int* overflow) {
5573
if constexpr (std::is_same_v<T, int>) {

cext/test/test_arena.cpp

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "../arena.h"
6+
#include "../check.h"
7+
8+
9+
template <typename T, size_t InitialSize>
10+
static inline bool in_cur_chunk(T* p, const Arena<T, InitialSize>& arena) {
11+
T* chunk = arena.cur_chunk_.get();
12+
return p >= chunk && p < chunk + arena.cur_chunk_size_;
13+
}
14+
15+
// Exercises Arena<int64_t, 2>: plain and aligned allocation, chunk growth,
// stability of previously returned pointers, and clear().
int main() {
    int64_t* p[8];

    Arena<int64_t, 2> arena;
    CHECK(arena.cur_chunk_size_ == 2);

    // The first two single-item allocations fit into the initial chunk.
    p[0] = arena.alloc(1);
    *p[0] = 100;
    CHECK(arena.old_chunks_.empty());
    CHECK(in_cur_chunk(p[0], arena));

    p[1] = arena.alloc(1);
    *p[1] = 101;
    CHECK(arena.old_chunks_.empty());
    CHECK(in_cur_chunk(p[1], arena));

    // The initial chunk is now full: the next allocation retires it and
    // creates a chunk of 4 items, which serves all three requests.
    for (int i = 0; i < 3; ++i) {
        p[2 + i] = arena.alloc(1);
        *p[2 + i] = 102 + i;
        CHECK(arena.old_chunks_.size() == 1);
        CHECK(in_cur_chunk(p[2 + i], arena));
        CHECK(arena.cur_chunk_size_ == 4);
    }

    // Only one item is left, so two items force another doubling (to 8).
    p[5] = arena.alloc(2);
    *p[5] = 105;
    *(p[5] + 1) = 205;
    CHECK(arena.old_chunks_.size() == 2);
    CHECK(in_cur_chunk(p[5], arena));
    CHECK(arena.cur_chunk_size_ == 8);
    CHECK(arena.cur_chunk_avail_ == 6);

    // Aligned allocations: depending on the alignment of the chunk itself,
    // at most one item is skipped, hence the two accepted values.
    p[6] = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
    CHECK(reinterpret_cast<uintptr_t>(p[6]) % (sizeof(int64_t) * 2) == 0);
    *p[6] = 106;
    *(p[6] + 1) = 206;
    CHECK(arena.old_chunks_.size() == 2);
    CHECK(in_cur_chunk(p[6], arena));
    CHECK(arena.cur_chunk_avail_ == 3 || arena.cur_chunk_avail_ == 4);

    p[7] = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
    // Check the pointer that was just returned (p[7], not p[6] again).
    CHECK(reinterpret_cast<uintptr_t>(p[7]) % (sizeof(int64_t) * 2) == 0);
    *p[7] = 107;
    *(p[7] + 1) = 207;
    CHECK(arena.old_chunks_.size() == 2);
    CHECK(in_cur_chunk(p[7], arena));
    CHECK(arena.cur_chunk_avail_ == 1 || arena.cur_chunk_avail_ == 2);

    // Every value written so far must still be intact, including the ones
    // that live in retired chunks.
    for (int i = 0; i <= 7; ++i) {
        CHECK(*p[i] == 100 + i);
        if (i == 5 || i == 6 || i == 7) {
            CHECK(*(p[i] + 1) == 200 + i);
        }
    }

    // clear() drops the retired chunks and keeps the current (largest) one.
    arena.clear();
    CHECK(arena.old_chunks_.size() == 0);
    CHECK(arena.cur_chunk_size_ == 8);
    CHECK(arena.cur_chunk_avail_ == 8);

    // Interleaved aligned and unaligned allocations still fit in the chunk.
    for (int i = 0; i < 2; ++i) {
        int64_t* q = arena.alloc_aligned<sizeof(int64_t) * 2>(2);
        CHECK(reinterpret_cast<uintptr_t>(q) % (sizeof(int64_t) * 2) == 0);
        CHECK(arena.old_chunks_.size() == 0);
        CHECK(in_cur_chunk(q, arena));
        arena.alloc(1);
    }

    // Not enough room is left for four aligned items: a new chunk is needed.
    int64_t* q = arena.alloc_aligned<sizeof(int64_t) * 2>(4);
    CHECK(reinterpret_cast<uintptr_t>(q) % (sizeof(int64_t) * 2) == 0);
    CHECK(arena.old_chunks_.size() == 1);
    CHECK(in_cur_chunk(q, arena));

    // A request larger than twice the current size dictates the chunk size.
    arena.alloc(777);
    CHECK(arena.cur_chunk_size_ == 777);

    return 0;
}

0 commit comments

Comments
 (0)