Skip to content

Commit 1b779df

Browse files
committed
Merge remote-tracking branch 'upstream/main' into 2025-autumn-PPPoint-t-T1-1-11
2 parents ac79266 + 3c8fb3c commit 1b779df

18 files changed

Lines changed: 436 additions & 26 deletions

File tree

include/infinicore/graph/graph.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ class GraphManager;
1212
class GraphTensor : public Tensor {
1313
public:
1414
GraphTensor(const Tensor &);
15-
void resume() const;
1615
};
1716

1817
class GraphOperator {
@@ -31,17 +30,21 @@ class GraphOperator {
3130

3231
class Graph {
3332
public:
34-
Graph() = default;
35-
~Graph() = default;
33+
Graph();
34+
~Graph();
3635

3736
void run() const;
3837

3938
protected:
4039
void add_operator(std::shared_ptr<GraphOperator> op);
41-
40+
void instantiate();
4241
std::vector<std::shared_ptr<GraphOperator>> op_list_;
4342

4443
friend class GraphManager;
44+
45+
private:
46+
struct DeviceGraph;
47+
std::unique_ptr<DeviceGraph> device_graph_;
4548
};
4649
} // namespace infinicore::graph
4750

include/infinicore/tensor.hpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,6 @@ class Tensor {
9090
Tensor(std::shared_ptr<TensorImpl> impl) : impl_(std::move(impl)) {}
9191
std::shared_ptr<TensorImpl> impl_;
9292
friend class TensorImpl;
93-
94-
void resume_from_blob_() const;
9593
};
9694

9795
class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
@@ -135,7 +133,18 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
135133

136134
void debug() const;
137135

138-
Tensor to_blob() const;
136+
/**
137+
* Unsafe API that returns a new tensor with the same raw memory untracked by allocator
138+
* This API is used for loosely tracking a piece of memory while allowing it to be reused,
139+
* typically in a compute graph scenario.
140+
*/
141+
Tensor to_blob_() const;
142+
143+
/**
144+
* Unsafe API that returns a new tensor with the same memory and lets the allocator re-track that memory.
145+
* Should only be used on the tensor returned by to_blob_().
146+
*/
147+
Tensor resume_from_blob_() const;
139148

140149
///
141150
/// Data Transfer APIs
@@ -301,6 +310,10 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
301310
protected:
302311
TensorMetaData meta_;
303312
TensorData data_;
313+
314+
private:
315+
// Mark to indicate if the tensor is created from to_blob_()
316+
bool to_blob_mark_ = false;
304317
};
305318

306319
} // namespace infinicore

include/infinirt.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
typedef void *infinirtStream_t;
88
typedef void *infinirtEvent_t;
9+
typedef void *infinirtGraph_t;
10+
typedef void *infinirtGraphNode_t;
11+
typedef void *infinirtGraphExec_t;
912

1013
__C __export infiniStatus_t infinirtInit();
1114

@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
6366
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
6467
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);
6568

69+
// Graph
70+
typedef enum {
71+
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
72+
INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
73+
INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
74+
75+
} infinirtStreamCaptureMode_t;
76+
77+
__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
78+
__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
79+
__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
80+
__C __export infiniStatus_t infinirtGraphInstantiate(
81+
infinirtGraphExec_t *graph_exec_ptr,
82+
infinirtGraph_t graph,
83+
infinirtGraphNode_t *node_ptr,
84+
char *log_buffer,
85+
size_t buffer_size);
86+
__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
87+
__C __export infiniStatus_t infinirtGraphLuanch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);
88+
6689
#endif // __INFINIRT_API_H__

src/infinicore/context/allocators/pinnable_block_allocator.cc

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,19 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
5252
if (size <= cls.block_size) {
5353
if (!cls.free_blocks.empty()) {
5454
block = cls.free_blocks.back();
55-
cls.free_blocks.pop_back();
56-
block->in_use = true;
57-
return reinterpret_cast<std::byte *>(block->ptr);
55+
while (block != nullptr && block->in_use) {
56+
cls.free_blocks.pop_back();
57+
if (cls.free_blocks.empty()) {
58+
block = nullptr;
59+
break;
60+
}
61+
block = cls.free_blocks.back();
62+
}
63+
if (block != nullptr) {
64+
cls.free_blocks.pop_back();
65+
block->in_use = true;
66+
return reinterpret_cast<std::byte *>(block->ptr);
67+
}
5868
}
5969
// Allocate a new block for this class
6070
block = std::make_shared<Block>();

src/infinicore/graph/graph.cc

Lines changed: 89 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
#include "graph_manager.hpp"
22

33
#include "../utils.hpp"
4+
#include "infinicore/context/context.hpp"
5+
#include <infinirt.h>
46

57
namespace infinicore::graph {
68

79
/* =========================
810
* GraphTensor
911
* ========================= */
1012

11-
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob()) {
12-
}
13-
14-
void GraphTensor::resume() const {
15-
resume_from_blob_();
13+
// Builds the graph tensor on top of the blob tensor returned by to_blob_(),
// not on top of `tensor`'s own implementation object.
GraphTensor::GraphTensor(const Tensor &tensor)
    : Tensor(tensor->to_blob_()) {}
1715

1816
/* =========================
@@ -33,16 +31,91 @@ GraphOperator::~GraphOperator() {
3331
* Graph
3432
* ========================= */
3533

34+
/**
 * Owner of the runtime graph handles that back a Graph.
 *
 * All handles start out null. The destructor and Graph::run() decide what to
 * do by testing them against null, so they must never be left indeterminate.
 */
struct Graph::DeviceGraph {
    infinirtGraph_t graph = nullptr;
    infinirtGraphExec_t exec = nullptr;
    infinirtGraphNode_t node = nullptr;
    // Receives diagnostics from infinirtGraphInstantiate; zero-filled on construction.
    std::vector<char> log_buffer;

    DeviceGraph() : log_buffer(4 * 1024) {}

    // The handles are released in the destructor, so a copy would release them twice.
    DeviceGraph(const DeviceGraph &) = delete;
    DeviceGraph &operator=(const DeviceGraph &) = delete;

    ~DeviceGraph() {
        // Return codes are ignored on purpose: a destructor has no way to report them.
        if (exec != nullptr) {
            infinirtGraphExecDestroy(exec);
        }
        if (graph != nullptr) {
            infinirtGraphDestroy(graph);
        }
    }

    /// Launch the instantiated graph on the current context stream.
    void launch() {
        INFINICORE_CHECK_ERROR(infinirtGraphLuanch(exec, context::getStream()));
    }
};
57+
58+
// Nothing to set up: the operator list starts empty and no device graph exists yet.
Graph::Graph() = default;
60+
3661
void Graph::run() const {
37-
for (auto &op : op_list_) {
38-
op->run();
62+
if (device_graph_ != nullptr && device_graph_.get()->exec != nullptr) {
63+
device_graph_.get()->launch();
64+
} else {
65+
for (auto &op : op_list_) {
66+
op->run();
67+
}
3968
}
4069
}
4170

4271
void Graph::add_operator(std::shared_ptr<GraphOperator> op) {
4372
op_list_.push_back(op);
4473
}
4574

75+
void Graph::instantiate() {
76+
// Reset device graph
77+
device_graph_ = std::make_unique<DeviceGraph>();
78+
79+
// warmup
80+
for (size_t iter = 0; iter < 5; ++iter) {
81+
this->run();
82+
}
83+
infinicore::context::syncStream();
84+
85+
if (infinirtStreamBeginCapture(
86+
context::getStream(),
87+
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL)
88+
!= INFINI_STATUS_SUCCESS) {
89+
return;
90+
}
91+
92+
// Run and record
93+
this->run();
94+
95+
if (infinirtStreamEndCapture(
96+
context::getStream(),
97+
&device_graph_.get()->graph)
98+
!= INFINI_STATUS_SUCCESS) {
99+
return;
100+
}
101+
102+
if (infinirtGraphInstantiate(
103+
&device_graph_.get()->exec,
104+
device_graph_.get()->graph,
105+
&device_graph_.get()->node,
106+
device_graph_.get()->log_buffer.data(),
107+
device_graph_.get()->log_buffer.size())
108+
!= INFINI_STATUS_SUCCESS) {
109+
static bool warned_once = false;
110+
if (!warned_once) {
111+
warned_once = true;
112+
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
113+
}
114+
}
115+
}
116+
117+
// Defaulted here rather than in the header: DeviceGraph is only forward-declared
// there, and std::unique_ptr<DeviceGraph> needs the complete type to delete it.
Graph::~Graph() = default;
118+
46119
/* =========================
47120
* GraphManager
48121
* ========================= */
@@ -52,19 +125,26 @@ bool GraphManager::is_recording() const {
52125
}
53126

54127
void GraphManager::start_recording() {
128+
if (is_recording()) {
129+
spdlog::warn("Graph is already recording. Previous recording will be dropped.");
130+
}
55131
recording_ = true;
56132
graph_ = std::make_shared<Graph>();
57133
}
58134

59135
void GraphManager::add_operator(std::shared_ptr<GraphOperator> op) {
60-
INFINICORE_ASSERT(recording_);
136+
INFINICORE_ASSERT(is_recording());
61137

62138
graph_->add_operator(op);
63139
}
64140

65141
std::shared_ptr<Graph> GraphManager::stop_recording() {
66-
142+
if (!is_recording()) {
143+
spdlog::warn("Graph is not recording. Please start recording first.");
144+
return nullptr;
145+
}
67146
recording_ = false;
147+
graph_->instantiate();
68148
return std::exchange(graph_, nullptr);
69149
}
70150

src/infinicore/tensor/copy.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ void TensorImpl::copy_from(Tensor src) {
3838
} else {
3939
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
4040
context::setDevice(src->device());
41-
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
41+
context::memcpyD2H(local_src->data(), src->data(), copy_size);
4242
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
4343
}
4444
} else if (src->device().getType() == Device::Type::CPU) {

src/infinicore/tensor/tensor.cc

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ Tensor::operator bool() const {
6565
return impl_ != nullptr;
6666
}
6767

68-
void Tensor::resume_from_blob_() const {
69-
context::reinstantiateBlob(impl_->data_.memory);
70-
}
71-
7268
TensorMetaData::TensorMetaData(const Shape &_shape, const Strides &_strides, const DataType &_dtype)
7369
: shape(_shape), strides(_strides), dtype(_dtype) {
7470
INFINICORE_CHECK_ERROR(infiniopCreateTensorDescriptor(&desc, shape.size(), shape.data(), strides.data(), (infiniDtype_t)dtype));
@@ -280,10 +276,22 @@ std::shared_ptr<TensorImpl> TensorImpl::strided_from_blob(
280276
return t;
281277
}
282278

283-
Tensor TensorImpl::to_blob() const {
279+
/// Create a tensor with this tensor's shape, strides, dtype and offset whose
/// Memory object wraps the same data pointer, size and device, and mark it as
/// created by to_blob_().
Tensor TensorImpl::to_blob_() const {
    const auto &source = this->data_.memory;

    auto blob = std::shared_ptr<TensorImpl>(new TensorImpl(shape(), strides(), dtype()));
    blob->data_.offset = this->data_.offset;
    // NOTE(review): the trailing nullptr is presumably the deleter, i.e. the new
    // Memory does not free the buffer — confirm against Memory's constructor.
    blob->data_.memory = std::make_shared<Memory>(source->data(), source->size(), source->device(), nullptr);
    blob->to_blob_mark_ = true;

    return Tensor{blob};
}
286+
287+
/// Create a tensor with this tensor's shape, strides, dtype and offset.
/// If this tensor came from to_blob_(), its memory is passed through
/// context::reinstantiateBlob(); otherwise the memory object is shared as is.
/// The returned tensor is not marked as a blob.
Tensor TensorImpl::resume_from_blob_() const {
    auto resumed = std::shared_ptr<TensorImpl>(new TensorImpl(shape(), strides(), dtype()));
    resumed->data_.offset = this->data_.offset;

    if (!to_blob_mark_) {
        // Not a blob: there is nothing to re-track.
        resumed->data_.memory = this->data_.memory;
    } else {
        resumed->data_.memory = context::reinstantiateBlob(this->data_.memory);
    }

    return Tensor{resumed};
}

src/infiniop/devices/nvidia/nvidia_common.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ Handle::Internal::Internal(int device_id) {
2323
_grid_size[0] = prop.maxGridSize[0];
2424
_grid_size[1] = prop.maxGridSize[1];
2525
_grid_size[2] = prop.maxGridSize[2];
26+
this->useCublas(nullptr, [](cublasHandle_t handle) { return INFINI_STATUS_SUCCESS; });
27+
#ifdef ENABLE_CUDNN_API
28+
this->useCudnn(nullptr, [](cudnnHandle_t handle) { return INFINI_STATUS_SUCCESS; });
29+
#endif
2630
}
2731

2832
infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {

src/infinirt/ascend/infinirt_ascend.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,5 +150,35 @@ infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
150150
// The stream argument is not used: the call is forwarded to freeDevice().
infiniStatus_t freeAsync(void *ptr, infinirtStream_t /*stream*/) {
    return freeDevice(ptr);
}
153+
154+
// Stub: always reports that stream capture is not supported by this backend.
infiniStatus_t streamBeginCapture(infinirtStream_t /*stream*/, infinirtStreamCaptureMode_t /*mode*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
157+
158+
// Stub: always reports "not supported"; *graph_ptr is left untouched.
infiniStatus_t streamEndCapture(infinirtStream_t /*stream*/, infinirtGraph_t * /*graph_ptr*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
161+
162+
// Stub: always reports "not supported"; the handle is not touched.
infiniStatus_t graphDestroy(infinirtGraph_t /*graph*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
165+
166+
// Stub: always reports "not supported"; none of the output arguments
// (graph_exec_ptr, node_ptr, log_buffer) is written.
infiniStatus_t graphInstantiate(
    infinirtGraphExec_t * /*graph_exec_ptr*/,
    infinirtGraph_t /*graph*/,
    infinirtGraphNode_t * /*node_ptr*/,
    char * /*log_buffer*/,
    size_t /*buffer_size*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
174+
175+
// Stub: always reports "not supported"; the handle is not touched.
infiniStatus_t graphExecDestroy(infinirtGraphExec_t /*graph_exec*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
178+
179+
// Stub: always reports "not supported"; nothing is launched.
// (The name keeps the spelling used by the public infinirtGraphLuanch API.)
infiniStatus_t graphLuanch(infinirtGraphExec_t /*graph_exec*/, infinirtStream_t /*stream*/) {
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
182+
153183
} // namespace infinirt::ascend
154184
#undef CHECK_ACLRT

0 commit comments

Comments
 (0)