 #include "infini_train/include/common/cuda/common_cuda.h"
 #include "infini_train/include/common/cuda/gemm.cuh"
 #include "infini_train/include/common/cuda/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 #include "infini_train/src/core/runtime/cuda/cuda_dispatch.h"
+#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"
 
 namespace infini_train::kernels::cuda {
 
@@ -58,7 +60,9 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
     auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
 
     auto device = input->GetDevice();
-    const auto cuda_stream = GetCudaStream(device);
+    const auto cuda_stream = dynamic_cast<infini_train::core::cuda::CudaStream *>(
+                                 infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                 ->cuda_stream();
 
     if (bias) {
         CHECK_EQ(bias->Dims().size(), 1);
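Note: the added stream lookup above (repeated again in LinearBackwardBias below) can be read as a small convenience wrapper. The sketch below is illustrative only and not part of the patch; the wrapper name GetCurrentCudaStream and the Device parameter type are assumptions based on the identifiers used in this diff.

// Hypothetical helper (not in this patch) showing what the new expression does:
// fetch the device-guard implementation for the CUDA backend, ask it for the
// stream bound to `device`, and unwrap the raw cudaStream_t.
inline cudaStream_t GetCurrentCudaStream(const Device &device) {
    auto *stream = dynamic_cast<infini_train::core::cuda::CudaStream *>(
        infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device));
    return stream->cuda_stream();
}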
@@ -80,18 +84,17 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
     // When bs==1 and fp32, use cublasSgemv (more efficient than GEMM for matrix-vector).
     // cublasSgemv does not support bf16, so bf16 falls through to GemmCuda.
     if (bs == 1 && dtype == DataType::kFLOAT32) {
-        SgemvCuda(SgemvParams{
-            .trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
-            .m = static_cast<int>(transpose ? in_features : out_features),
-            .n = static_cast<int>(transpose ? out_features : in_features),
-            .A = static_cast<const float *>(weight->DataPtr()),
-            .lda = static_cast<int>(transpose ? in_features : out_features),
-            .x = static_cast<const float *>(input->DataPtr()),
-            .y = static_cast<float *>(output->DataPtr()),
-            .alpha = 1.0f,
-            .beta = 1.0f, // output already initialized with bias or zero above
-            .blas_handle = GetCublasHandle(device),
-        });
+        SgemvCuda(device, SgemvParams{
+                              .trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
+                              .m = static_cast<int>(transpose ? in_features : out_features),
+                              .n = static_cast<int>(transpose ? out_features : in_features),
+                              .A = static_cast<const float *>(weight->DataPtr()),
+                              .lda = static_cast<int>(transpose ? in_features : out_features),
+                              .x = static_cast<const float *>(input->DataPtr()),
+                              .y = static_cast<float *>(output->DataPtr()),
+                              .alpha = 1.0f,
+                              .beta = 1.0f, // output already initialized with bias or zero above
+                          });
     } else {
         // cuBLAS is column-major
         // - if a is transposed:
@@ -106,25 +109,24 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
         //       C = output.T[out_features, bs]
         //       A = weight.T[out_features, in_features]
         //       B = input.T[in_features, bs]
-        GemmCuda(GemmParams{
-            .trans_a = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
-            .trans_b = CUBLAS_OP_N,
-            .m = static_cast<int>(out_features),
-            .n = static_cast<int>(bs),
-            .k = static_cast<int>(in_features),
-            .A = weight->DataPtr(),
-            .lda = static_cast<int>(transpose ? in_features : out_features),
-            .B = input->DataPtr(),
-            .ldb = static_cast<int>(in_features),
-            .C = output->DataPtr(),
-            .ldc = static_cast<int>(out_features),
-            .alpha = 1.0f,
-            .beta = 1.0f, // bias already written into output; beta=1 accumulates
-            .batch_count = 1,
-            .input_dtype = dtype,
-            .output_dtype = dtype,
-            .blas_handle = GetCublasHandle(device),
-        });
+        GemmCuda(device, GemmParams{
+                             .trans_a = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
+                             .trans_b = CUBLAS_OP_N,
+                             .m = static_cast<int>(out_features),
+                             .n = static_cast<int>(bs),
+                             .k = static_cast<int>(in_features),
+                             .A = weight->DataPtr(),
+                             .lda = static_cast<int>(transpose ? in_features : out_features),
+                             .B = input->DataPtr(),
+                             .ldb = static_cast<int>(in_features),
+                             .C = output->DataPtr(),
+                             .ldc = static_cast<int>(out_features),
+                             .alpha = 1.0f,
+                             .beta = 1.0f, // bias already written into output; beta=1 accumulates
+                             .batch_count = 1,
+                             .input_dtype = dtype,
+                             .output_dtype = dtype,
+                         });
     }
 
     return output;
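Note on the layout mapping in the comments above: cuBLAS assumes column-major storage, so a row-major tensor reinterpreted in column-major order is its own transpose. Computing output^T = op(weight^T) * input^T therefore writes the row-major output directly. A minimal fp32 sketch of the non-transposed case follows; it is illustrative only (the real call goes through GemmCuda with a cublasGemmEx-style path), and `handle` stands for whatever per-device cuBLAS handle the project provides.

// Assumption: fp32, weight stored row-major as [in_features, out_features].
// Column-major views: weight -> [out_features, in_features], input -> [in_features, bs],
// output -> [out_features, bs], so a plain NN GEMM produces output^T in place.
const float alpha = 1.0f;
const float beta = 1.0f; // beta = 1: bias was already copied into output
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            /*m=*/static_cast<int>(out_features), /*n=*/static_cast<int>(bs),
            /*k=*/static_cast<int>(in_features), &alpha,
            static_cast<const float *>(weight->DataPtr()), /*lda=*/static_cast<int>(out_features),
            static_cast<const float *>(input->DataPtr()), /*ldb=*/static_cast<int>(in_features),
            &beta,
            static_cast<float *>(output->DataPtr()), /*ldc=*/static_cast<int>(out_features));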
@@ -171,18 +173,17 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
     // When bs==1 and fp32, use cublasSgemv (more efficient than GEMM for matrix-vector).
     // cublasSgemv does not support bf16, so bf16 falls through to GemmCuda.
     if (bs == 1 && compute_dtype == DataType::kFLOAT32) {
-        SgemvCuda(SgemvParams{
-            .trans = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
-            .m = static_cast<int>(transpose ? in_features : out_features),
-            .n = static_cast<int>(transpose ? out_features : in_features),
-            .A = static_cast<const float *>(weight->DataPtr()),
-            .lda = static_cast<int>(transpose ? in_features : out_features),
-            .x = static_cast<const float *>(grad_output_promoted->DataPtr()),
-            .y = static_cast<float *>(grad_input->DataPtr()),
-            .alpha = 1.0f,
-            .beta = 0.0f,
-            .blas_handle = GetCublasHandle(grad_output->GetDevice()),
-        });
+        SgemvCuda(grad_output->GetDevice(), SgemvParams{
+                                                .trans = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
+                                                .m = static_cast<int>(transpose ? in_features : out_features),
+                                                .n = static_cast<int>(transpose ? out_features : in_features),
+                                                .A = static_cast<const float *>(weight->DataPtr()),
+                                                .lda = static_cast<int>(transpose ? in_features : out_features),
+                                                .x = static_cast<const float *>(grad_output_promoted->DataPtr()),
+                                                .y = static_cast<float *>(grad_input->DataPtr()),
+                                                .alpha = 1.0f,
+                                                .beta = 0.0f,
+                                            });
     } else {
         // - if transpose:
         //       weight is [out_features, in_features] here
@@ -197,25 +198,24 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
         //       C = d_input.T[in_features, bs]
         //       A = weight.T[out_features, in_features]
         //       B = d_output.T[out_features, bs]
-        GemmCuda(GemmParams{
-            .trans_a = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
-            .trans_b = CUBLAS_OP_N,
-            .m = static_cast<int>(in_features),
-            .n = static_cast<int>(bs),
-            .k = static_cast<int>(out_features),
-            .A = weight->DataPtr(),
-            .lda = static_cast<int>(transpose ? in_features : out_features),
-            .B = grad_output_promoted->DataPtr(),
-            .ldb = static_cast<int>(out_features),
-            .C = grad_input->DataPtr(),
-            .ldc = static_cast<int>(in_features),
-            .alpha = 1.0f,
-            .beta = 0.0f,
-            .batch_count = 1,
-            .input_dtype = compute_dtype,
-            .output_dtype = output_dtype,
-            .blas_handle = GetCublasHandle(grad_output->GetDevice()),
-        });
+        GemmCuda(grad_output->GetDevice(), GemmParams{
+                                               .trans_a = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
+                                               .trans_b = CUBLAS_OP_N,
+                                               .m = static_cast<int>(in_features),
+                                               .n = static_cast<int>(bs),
+                                               .k = static_cast<int>(out_features),
+                                               .A = weight->DataPtr(),
+                                               .lda = static_cast<int>(transpose ? in_features : out_features),
+                                               .B = grad_output_promoted->DataPtr(),
+                                               .ldb = static_cast<int>(out_features),
+                                               .C = grad_input->DataPtr(),
+                                               .ldc = static_cast<int>(in_features),
+                                               .alpha = 1.0f,
+                                               .beta = 0.0f,
+                                               .batch_count = 1,
+                                               .input_dtype = compute_dtype,
+                                               .output_dtype = output_dtype,
+                                           });
     }
 
     return grad_input;
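As a cross-check of the bs==1 fast path above, here is the same backward-input computation written as a raw cublasSgemv call for the non-transposed fp32 case. This is only a sketch; `handle` stands for the per-device cuBLAS handle, obtained however the project does it.

// Assumption: fp32, bs == 1, non-transposed weight [in_features, out_features].
// Its column-major view A is [out_features, in_features] with lda = out_features, so
// CUBLAS_OP_T computes grad_input[in_features] = A^T * grad_output[out_features].
const float alpha = 1.0f;
const float beta = 0.0f;
cublasSgemv(handle, CUBLAS_OP_T,
            /*m=*/static_cast<int>(out_features), /*n=*/static_cast<int>(in_features),
            &alpha,
            static_cast<const float *>(weight->DataPtr()), /*lda=*/static_cast<int>(out_features),
            static_cast<const float *>(grad_output_promoted->DataPtr()), /*incx=*/1,
            &beta,
            static_cast<float *>(grad_input->DataPtr()), /*incy=*/1);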
@@ -257,25 +257,24 @@ std::shared_ptr<Tensor> LinearBackwardWeight(const std::shared_ptr<Tensor> &inpu
     const int lda = static_cast<int>(transpose ? in_features : out_features);
     const int ldb = static_cast<int>(transpose ? out_features : in_features);
 
-    GemmCuda(GemmParams{
-        .trans_a = CUBLAS_OP_N,
-        .trans_b = CUBLAS_OP_T,
-        .m = static_cast<int>(transpose ? in_features : out_features),
-        .n = static_cast<int>(transpose ? out_features : in_features),
-        .k = static_cast<int>(bs),
-        .A = a,
-        .lda = lda,
-        .B = b,
-        .ldb = ldb,
-        .C = grad_weight->DataPtr(),
-        .ldc = static_cast<int>(transpose ? in_features : out_features),
-        .alpha = 1.0f,
-        .beta = 0.0f,
-        .batch_count = 1,
-        .input_dtype = compute_dtype,
-        .output_dtype = output_dtype,
-        .blas_handle = GetCublasHandle(grad_output->GetDevice()),
-    });
+    GemmCuda(grad_output->GetDevice(), GemmParams{
+                                           .trans_a = CUBLAS_OP_N,
+                                           .trans_b = CUBLAS_OP_T,
+                                           .m = static_cast<int>(transpose ? in_features : out_features),
+                                           .n = static_cast<int>(transpose ? out_features : in_features),
+                                           .k = static_cast<int>(bs),
+                                           .A = a,
+                                           .lda = lda,
+                                           .B = b,
+                                           .ldb = ldb,
+                                           .C = grad_weight->DataPtr(),
+                                           .ldc = static_cast<int>(transpose ? in_features : out_features),
+                                           .alpha = 1.0f,
+                                           .beta = 0.0f,
+                                           .batch_count = 1,
+                                           .input_dtype = compute_dtype,
+                                           .output_dtype = output_dtype,
+                                       });
 
     return grad_weight;
 }
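For orientation (not part of the patch): in the non-transposed layout, where weight is stored row-major as [in_features, out_features], the weight gradient is grad_weight = input^T * grad_output. Assuming `a` and `b` above are bound to grad_output and input for that case (their assignment sits just above this hunk), the column-major views are op(A) = grad_output^T of shape [out_features, bs] and, via trans_b = CUBLAS_OP_T, op(B) = input of shape [bs, in_features]; their product is grad_weight^T in column-major storage, which read back row-major is exactly grad_weight.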
@@ -292,7 +291,9 @@ std::shared_ptr<Tensor> LinearBackwardBias(const std::shared_ptr<Tensor> &grad_o
         = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, output_dtype, grad_output->GetDevice());
 
     auto device = grad_output->GetDevice();
-    const auto cuda_stream = GetCudaStream(device);
+    const auto cuda_stream = dynamic_cast<infini_train::core::cuda::CudaStream *>(
+                                 infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                 ->cuda_stream();
 
     // d_bias = \sum_{i=0}^{bs-1} d_output[i]
     // TODO(dcj): use thrust::fill or a reduce kernel to do this
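The TODO above leaves the reduction to a kernel elsewhere in this file. As a rough sketch of the shape of that computation (one thread per output feature; the actual kernel, dtype handling, and launch configuration may differ):

// Illustrative only: sums d_output over the batch dimension to produce d_bias.
// Assumes row-major d_output[bs, out_features] in fp32; launched on cuda_stream with,
// e.g., <<<(out_features + 255) / 256, 256, 0, cuda_stream>>>.
__global__ void BiasGradSketchKernel(const float *d_output, float *d_bias,
                                     int64_t bs, int64_t out_features) {
    const int64_t j = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
    if (j >= out_features) {
        return;
    }
    float sum = 0.0f;
    for (int64_t i = 0; i < bs; ++i) {
        sum += d_output[i * out_features + j];
    }
    d_bias[j] = sum;
}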