Merge pull request #934 from InfiniTensor/issue/811

PanZezhong1725 · web-flow · commit 97da99332294 · 2026-01-15T19:32:35.000+08:00
issue/811 remove shortcut for cpu runtime
diff --git a/src/infinicore/context/context_impl.cc b/src/infinicore/context/context_impl.cc
@@ -29,18 +29,16 @@ Runtime *ContextImpl::getCurrentRuntime() {
     return current_runtime_;
 }
 
-Runtime *ContextImpl::getCpuRuntime() {
-    return runtime_table_[int(Device::Type::CPU)][0].get();
-}
-
 void ContextImpl::setDevice(Device device) {
     if (device == getCurrentRuntime()->device()) {
         // Do nothing if the device is already set.
         return;
     }
 
-    if (getCurrentRuntime()->isGraphRecording()) {
+    thread_local bool warn_switch_runtime = false;
+    if (getCurrentRuntime()->isGraphRecording() && !warn_switch_runtime) {
         spdlog::warn("Switching device runtime during graph recording may break the graph!");
+        warn_switch_runtime = true;
     }
 
     if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) {
@@ -104,11 +102,8 @@ infinirtStream_t getStream() {
 }
 
 infiniopHandle_t getInfiniopHandle(Device device) {
-    if (device.getType() == Device::Type::CPU) {
-        return ContextImpl::singleton().getCpuRuntime()->infiniopHandle();
-    }
     if (device != getDevice()) {
-        throw std::runtime_error("Requested device doesn't match current runtime.");
+        setDevice(device);
     }
     return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle();
 }
@@ -127,7 +122,7 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
 
 std::shared_ptr<Memory> allocateHostMemory(size_t size) {
     setDevice(Device::cpu());
-    return ContextImpl::singleton().getCpuRuntime()->allocateMemory(size);
+    return allocateMemory(size);
 }
 
 std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) {
@@ -147,7 +142,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
 }
 
 void memcpyH2H(void *dst, const void *src, size_t size) {
-    return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size);
+    setDevice(Device::cpu());
+    return ContextImpl::singleton().getCurrentRuntime()->memcpyD2D(dst, src, size);
 }
 
 // Timing API implementations
diff --git a/src/infinicore/context/context_impl.hpp b/src/infinicore/context/context_impl.hpp
@@ -19,8 +19,6 @@ class ContextImpl {
 public:
     Runtime *getCurrentRuntime();
 
-    Runtime *getCpuRuntime();
-
     void setDevice(Device);
 
     size_t getDeviceCount(Device::Type type);
diff --git a/src/infinicore/tensor/copy.cc b/src/infinicore/tensor/copy.cc
@@ -19,7 +19,8 @@ Tensor TensorImpl::to(Device device) const {
 
 void TensorImpl::copy_from(Tensor src) {
     if (src->shape() != this->shape()) {
-        throw std::runtime_error("Cannot copy from tensor with different shape");
+        throw std::runtime_error(
+            "Cannot copy from tensor with different shape. Src: " + src->info() + " Dst: " + this->info());
     }
     if (this->device() == src->device()) {
         op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), src);
@@ -31,11 +32,12 @@ void TensorImpl::copy_from(Tensor src) {
         // Use nbytes() to get the actual tensor size, not the full memory size
         size_t copy_size = std::min(this->nbytes(), src->nbytes());
         if (this->device().getType() == Device::Type::CPU) {
-            context::setDevice(src->device());
             if (this->is_contiguous()) {
+                context::setDevice(src->device());
                 context::memcpyD2H(this->data(), src->data(), copy_size);
             } else {
                 auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
+                context::setDevice(src->device());
                 context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
                 op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
             }
diff --git a/src/infinicore/utils.hpp b/src/infinicore/utils.hpp
@@ -23,14 +23,17 @@ inline struct SpdlogInitializer {
 #define STRINGIZE_(x) #x
 #define STRINGIZE(x) STRINGIZE_(x)
 
-#define INFINICORE_CHECK_ERROR(call)                                                                         \
-    do {                                                                                                     \
-        SPDLOG_DEBUG("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`.");                     \
-        infiniStatus_t ret = (call);                                                                         \
-        SPDLOG_DEBUG("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`.");                      \
-        if (ret != INFINI_STATUS_SUCCESS) {                                                                  \
-            throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
-        }                                                                                                    \
+#define INFINICORE_CHECK_ERROR(call)                                                                            \
+    do {                                                                                                        \
+        SPDLOG_DEBUG("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`.");                        \
+        infiniStatus_t ret = (call);                                                                            \
+        SPDLOG_DEBUG("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`.");                         \
+        if (ret != INFINI_STATUS_SUCCESS) {                                                                     \
+            throw std::runtime_error("`" #call "` failed with error: " + std::string(infini_status_string(ret)) \
+                                     + " from " + std::string(__func__)                                         \
+                                     + " at " + std::string(__FILE__)                                           \
+                                     + ":" + std::to_string(__LINE__) + ".");                                   \
+        }                                                                                                       \
     } while (false)
 
 #define INFINICORE_ASSERT_TENSORS_SAME_DEVICE(FIRST___, ...)                      \
diff --git a/src/infinirt/cuda/infinirt_cuda.cu b/src/infinirt/cuda/infinirt_cuda.cu
@@ -4,6 +4,14 @@
 
 #define CHECK_CUDART(RT_API) CHECK_INTERNAL(RT_API, cudaSuccess)
 
+#define RUN_CUDART(RT_API)                           \
+    do {                                             \
+        auto api_result_ = (RT_API);                 \
+        if (api_result_ != (cudaSuccess)) {          \
+            { return INFINI_STATUS_INTERNAL_ERROR; } \
+        }                                            \
+    } while (0)
+
 // 根据宏定义选择命名空间并实现
 #if defined(ENABLE_NVIDIA_API)
 namespace infinirt::cuda {
@@ -40,7 +48,7 @@ infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
 }
 
 infiniStatus_t streamDestroy(infinirtStream_t stream) {
-    CHECK_CUDART(cudaStreamDestroy((cudaStream_t)stream));
+    RUN_CUDART(cudaStreamDestroy((cudaStream_t)stream));
     return INFINI_STATUS_SUCCESS;
 }
 
@@ -105,7 +113,7 @@ infiniStatus_t eventSynchronize(infinirtEvent_t event) {
 }
 
 infiniStatus_t eventDestroy(infinirtEvent_t event) {
-    CHECK_CUDART(cudaEventDestroy((cudaEvent_t)event));
+    RUN_CUDART(cudaEventDestroy((cudaEvent_t)event));
     return INFINI_STATUS_SUCCESS;
 }
 
@@ -125,12 +133,12 @@ infiniStatus_t mallocHost(void **p_ptr, size_t size) {
 }
 
 infiniStatus_t freeDevice(void *ptr) {
-    CHECK_CUDART(cudaFree(ptr));
+    RUN_CUDART(cudaFree(ptr));
     return INFINI_STATUS_SUCCESS;
 }
 
 infiniStatus_t freeHost(void *ptr) {
-    CHECK_CUDART(cudaFreeHost(ptr));
+    RUN_CUDART(cudaFreeHost(ptr));
     return INFINI_STATUS_SUCCESS;
 }
 
@@ -165,7 +173,7 @@ infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
 }
 
 infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
-    CHECK_CUDART(cudaFreeAsync(ptr, (cudaStream_t)stream));
+    RUN_CUDART(cudaFreeAsync(ptr, (cudaStream_t)stream));
     return INFINI_STATUS_SUCCESS;
 }
 }

Original file line number	Diff line number	Diff line change
`@@ -29,18 +29,16 @@ Runtime *ContextImpl::getCurrentRuntime() {`
`29`	`29`	`return current_runtime_;`
`30`	`30`	`}`
`31`	`31`
`32`		`-Runtime *ContextImpl::getCpuRuntime() {`
`33`		`- return runtime_table_[int(Device::Type::CPU)][0].get();`
`34`		`-}`
`35`		`-`
`36`	`32`	`void ContextImpl::setDevice(Device device) {`
`37`	`33`	`if (device == getCurrentRuntime()->device()) {`
`38`	`34`	`// Do nothing if the device is already set.`
`39`	`35`	`return;`
`40`	`36`	`}`
`41`	`37`
`42`		`- if (getCurrentRuntime()->isGraphRecording()) {`
	`38`	`+ thread_local bool warn_switch_runtime = false;`
	`39`	`+ if (getCurrentRuntime()->isGraphRecording() && !warn_switch_runtime) {`
`43`	`40`	`spdlog::warn("Switching device runtime during graph recording may break the graph!");`
	`41`	`+ warn_switch_runtime = true;`
`44`	`42`	`}`
`45`	`43`
`46`	`44`	`if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) {`
`@@ -104,11 +102,8 @@ infinirtStream_t getStream() {`
`104`	`102`	`}`
`105`	`103`
`106`	`104`	`infiniopHandle_t getInfiniopHandle(Device device) {`
`107`		`- if (device.getType() == Device::Type::CPU) {`
`108`		`- return ContextImpl::singleton().getCpuRuntime()->infiniopHandle();`
`109`		`- }`
`110`	`105`	`if (device != getDevice()) {`
`111`		`- throw std::runtime_error("Requested device doesn't match current runtime.");`
	`106`	`+ setDevice(device);`
`112`	`107`	`}`
`113`	`108`	`return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle();`
`114`	`109`	`}`
`@@ -127,7 +122,7 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {`
`127`	`122`
`128`	`123`	`std::shared_ptr<Memory> allocateHostMemory(size_t size) {`
`129`	`124`	`setDevice(Device::cpu());`
`130`		`- return ContextImpl::singleton().getCpuRuntime()->allocateMemory(size);`
	`125`	`+ return allocateMemory(size);`
`131`	`126`	`}`
`132`	`127`
`133`	`128`	`std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) {`
`@@ -147,7 +142,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {`
`147`	`142`	`}`
`148`	`143`
`149`	`144`	`void memcpyH2H(void *dst, const void *src, size_t size) {`
`150`		`- return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size);`
	`145`	`+ setDevice(Device::cpu());`
	`146`	`+ return ContextImpl::singleton().getCurrentRuntime()->memcpyD2D(dst, src, size);`
`151`	`147`	`}`
`152`	`148`
`153`	`149`	`// Timing API implementations`
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,14 @@`
`4`	`4`
`5`	`5`	`#define CHECK_CUDART(RT_API) CHECK_INTERNAL(RT_API, cudaSuccess)`
`6`	`6`
	`7`	`+#define RUN_CUDART(RT_API) \`
	`8`	`+ do { \`
	`9`	`+ auto api_result_ = (RT_API); \`
	`10`	`+ if (api_result_ != (cudaSuccess)) { \`
	`11`	`+ { return INFINI_STATUS_INTERNAL_ERROR; } \`
	`12`	`+ } \`
	`13`	`+ } while (0)`
	`14`	`+`
`7`	`15`	`// 根据宏定义选择命名空间并实现`
`8`	`16`	`#if defined(ENABLE_NVIDIA_API)`
`9`	`17`	`namespace infinirt::cuda {`
`@@ -40,7 +48,7 @@ infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {`
`40`	`48`	`}`
`41`	`49`
`42`	`50`	`infiniStatus_t streamDestroy(infinirtStream_t stream) {`
`43`		`- CHECK_CUDART(cudaStreamDestroy((cudaStream_t)stream));`
	`51`	`+ RUN_CUDART(cudaStreamDestroy((cudaStream_t)stream));`
`44`	`52`	`return INFINI_STATUS_SUCCESS;`
`45`	`53`	`}`
`46`	`54`
`@@ -105,7 +113,7 @@ infiniStatus_t eventSynchronize(infinirtEvent_t event) {`
`105`	`113`	`}`
`106`	`114`
`107`	`115`	`infiniStatus_t eventDestroy(infinirtEvent_t event) {`
`108`		`- CHECK_CUDART(cudaEventDestroy((cudaEvent_t)event));`
	`116`	`+ RUN_CUDART(cudaEventDestroy((cudaEvent_t)event));`
`109`	`117`	`return INFINI_STATUS_SUCCESS;`
`110`	`118`	`}`
`111`	`119`
`@@ -125,12 +133,12 @@ infiniStatus_t mallocHost(void **p_ptr, size_t size) {`
`125`	`133`	`}`
`126`	`134`
`127`	`135`	`infiniStatus_t freeDevice(void *ptr) {`
`128`		`- CHECK_CUDART(cudaFree(ptr));`
	`136`	`+ RUN_CUDART(cudaFree(ptr));`
`129`	`137`	`return INFINI_STATUS_SUCCESS;`
`130`	`138`	`}`
`131`	`139`
`132`	`140`	`infiniStatus_t freeHost(void *ptr) {`
`133`		`- CHECK_CUDART(cudaFreeHost(ptr));`
	`141`	`+ RUN_CUDART(cudaFreeHost(ptr));`
`134`	`142`	`return INFINI_STATUS_SUCCESS;`
`135`	`143`	`}`
`136`	`144`
`@@ -165,7 +173,7 @@ infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {`
`165`	`173`	`}`
`166`	`174`
`167`	`175`	`infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {`
`168`		`- CHECK_CUDART(cudaFreeAsync(ptr, (cudaStream_t)stream));`
	`176`	`+ RUN_CUDART(cudaFreeAsync(ptr, (cudaStream_t)stream));`
`169`	`177`	`return INFINI_STATUS_SUCCESS;`
`170`	`178`	`}`
`171`	`179`	`}`