Skip to content

Commit 283d083

Browse files
committed
refactor(gemm): extract shared GemmCuda primitive; split matmul kernels; rename MatmulBackwardInput1/2
- Add gemm.cuh / gemm.cu: GemmParams struct + GemmCuda() dispatch (cublasGemmEx or cublasGemmStridedBatchedEx based on batch_count), GetCublasHandle(), GetCudaStream() shared across all GEMM-using kernels - Split matmul kernels (CPU + CUDA) out of linear.cc / linear.cu into dedicated matmul.cc / matmul.cu; linear.* now only contains the four Linear kernels - Rename MatmulBackwardInput1 → MatmulBackwardInput, MatmulBackwardInput2 → MatmulBackwardOther for semantic clarity matching MatmulForward(input, other) parameter names - Rewrite outer.cu to use GemmCuda() (OuterForward + bf16 backward paths); keep cublasSgemv for the fp32 backward path (more efficient, bf16 unsupported)
1 parent be6eed3 commit 283d083

File tree

8 files changed

+694
-510
lines changed

8 files changed

+694
-510
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma once

#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "infini_train/include/datatype.h"
#include "infini_train/include/device.h"

namespace infini_train::kernels::cuda {

/**
 * Return the cuBLAS handle associated with the given device.
 * Shared by linear.cu, matmul.cu, and any future GEMM-using kernels.
 */
cublasHandle_t GetCublasHandle(const Device &device);

/**
 * Return the CUDA stream associated with the given device.
 * Shared by kernels that need to launch device-side code directly.
 */
cudaStream_t GetCudaStream(const Device &device);

/**
 * Parameter bundle for a single GEMM call:
 *     C = alpha * op(A) * op(B) + beta * C
 *
 * batch_count == 1 -> non-batched path (cublasGemmEx)
 * batch_count >  1 -> strided-batched path (cublasGemmStridedBatchedEx)
 *
 * When batch_count == 1, stride_a/b/c are unused and must be left at 0.
 * NOTE(review): m/n/k and lda/ldb/ldc follow cuBLAS (column-major)
 * conventions — confirm against the GemmCuda() implementation when mapping
 * row-major tensors onto these fields.
 */
struct GemmParams {
    cublasOperation_t trans_a = CUBLAS_OP_N;
    cublasOperation_t trans_b = CUBLAS_OP_N;

    int m = 0; // rows of op(A) and C
    int n = 0; // cols of op(B) and C
    int k = 0; // cols of op(A) == rows of op(B)

    const void *A = nullptr;
    int lda = 0;
    const void *B = nullptr;
    int ldb = 0;
    void *C = nullptr;
    int ldc = 0;

    float alpha = 1.0f;
    float beta = 0.0f;

    // batch_count=1: non-batched (Linear path); stride_a/b/c must be 0
    // batch_count>1: strided-batched (Matmul path)
    int batch_count = 1;
    long long stride_a = 0;
    long long stride_b = 0;
    long long stride_c = 0;

    // Defaulted so a default-constructed GemmParams never carries
    // indeterminate dtype values; previously these were the only members of
    // this aggregate without an initializer. Callers that set them explicitly
    // are unaffected.
    DataType input_dtype = DataType::kFLOAT32;  // dtype of A and B
    DataType output_dtype = DataType::kFLOAT32; // dtype of C (may differ, e.g. bf16 in -> fp32 out)

    cublasHandle_t blas_handle = nullptr;
};

/**
 * Execute the GEMM described by `p` via cuBLAS.
 * Dispatches to cublasGemmEx (batch_count==1) or
 * cublasGemmStridedBatchedEx (batch_count>1).
 * Uses CUBLAS_COMPUTE_32F for all input dtypes to ensure precision.
 * Aborts on cuBLAS error (via CUBLAS_CHECK / LOG(FATAL)).
 */
void GemmCuda(const GemmParams &p);

} // namespace infini_train::kernels::cuda

infini_train/src/autograd/matmul.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,18 @@ std::vector<std::shared_ptr<Tensor>> Matmul::Backward(const std::vector<std::sha
5858

5959
auto device = input1->GetDevice().type();
6060

61-
std::shared_ptr<Tensor> grad_input1 = nullptr;
62-
std::shared_ptr<Tensor> grad_input2 = nullptr;
61+
std::shared_ptr<Tensor> grad_input = nullptr;
62+
std::shared_ptr<Tensor> grad_other = nullptr;
6363

6464
if (need_grad_input1) {
65-
grad_input1 = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput1"}, input2,
66-
grad_output, input1->Dims());
65+
grad_input = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput"}, input2,
66+
grad_output, input1->Dims());
6767
}
6868
if (need_grad_input2) {
69-
grad_input2 = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput2"}, input1,
70-
grad_output, input2->Dims());
69+
grad_other = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardOther"}, input1,
70+
grad_output, input2->Dims());
7171
}
7272

73-
return {grad_input1, grad_input2};
73+
return {grad_input, grad_other};
7474
}
7575
} // namespace infini_train::autograd

infini_train/src/kernels/cpu/linear.cc

Lines changed: 1 addition & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -9,128 +9,6 @@
99
#include "infini_train/include/tensor.h"
1010

1111
namespace infini_train::kernels::cpu {
12-
std::shared_ptr<Tensor> MatmulForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other) {
13-
/*
14-
output[*, m, n] = input[*, m, k] * other[*, k, n]
15-
*/
16-
// TODO(dcj): support broadcast later
17-
const auto &input_dims = input->Dims();
18-
const auto &other_dims = other->Dims();
19-
20-
CHECK_GE(input_dims.size(), 2);
21-
CHECK_GE(other_dims.size(), 2);
22-
CHECK_EQ(input_dims.size(), other_dims.size());
23-
24-
const int64_t m = input_dims[input_dims.size() - 2];
25-
const int64_t k = input_dims[input_dims.size() - 1];
26-
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
27-
const int64_t n = other_dims[other_dims.size() - 1];
28-
29-
const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
30-
for (int64_t i = 0; i < input_dims.size() - 2; ++i) {
31-
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
32-
}
33-
34-
std::vector<int64_t> output_dims = input_dims;
35-
output_dims[output_dims.size() - 1] = n;
36-
auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);
37-
38-
for (int64_t b = 0; b < bs; ++b) {
39-
for (int64_t i = 0; i < m; ++i) {
40-
for (int64_t j = 0; j < n; ++j) {
41-
float acc = 0.0f;
42-
for (int64_t p = 0; p < k; ++p) {
43-
acc += static_cast<const float *>(input->DataPtr())[b * m * k + i * k + p]
44-
* static_cast<const float *>(other->DataPtr())[b * k * n + p * n + j];
45-
}
46-
static_cast<float *>(output->DataPtr())[b * m * n + i * n + j] = acc;
47-
}
48-
}
49-
}
50-
return {output};
51-
}
52-
53-
std::shared_ptr<Tensor> MatmulBackwardInput1(const std::shared_ptr<Tensor> &other,
54-
const std::shared_ptr<Tensor> &grad_output,
55-
const std::vector<int64_t> &input_dims) {
56-
/*
57-
grad_input[*, m, k] = grad_output[*, m, n] * other[*, k, n]^T
58-
*/
59-
const auto &other_dims = other->Dims();
60-
const auto &grad_output_dims = grad_output->Dims();
61-
62-
CHECK_GE(other_dims.size(), 2);
63-
CHECK_EQ(other_dims.size(), grad_output_dims.size());
64-
65-
const int64_t m = grad_output_dims[grad_output_dims.size() - 2];
66-
const int64_t k = other_dims[other_dims.size() - 2];
67-
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
68-
69-
const int64_t bs
70-
= std::accumulate(grad_output_dims.rbegin() + 2, grad_output_dims.rend(), 1, std::multiplies<int64_t>{});
71-
for (int64_t i = 0; i < grad_output_dims.size() - 2; ++i) {
72-
CHECK_EQ(grad_output_dims[i], other_dims[i]) << "Batch dims must match";
73-
}
74-
75-
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
76-
grad_input->Fill<float>(0.0f);
77-
78-
for (int64_t b = 0; b < bs; ++b) {
79-
for (int64_t i = 0; i < m; ++i) {
80-
for (int64_t j = 0; j < n; ++j) {
81-
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
82-
for (int64_t p = 0; p < k; ++p) {
83-
const auto other_idx = b * k * n + p * n + j;
84-
static_cast<float *>(grad_input->DataPtr())[b * m * k + i * k + p]
85-
+= grad * static_cast<const float *>(other->DataPtr())[other_idx];
86-
}
87-
}
88-
}
89-
}
90-
return grad_input;
91-
}
92-
93-
std::shared_ptr<Tensor> MatmulBackwardInput2(const std::shared_ptr<Tensor> &input1,
94-
const std::shared_ptr<Tensor> &grad_output,
95-
const std::vector<int64_t> &other_dims) {
96-
/*
97-
grad_other[*, k, n] = input[*, m, k]^T * grad_output[*, m, n]
98-
*/
99-
const auto &input_dims = input1->Dims();
100-
const auto &grad_output_dims = grad_output->Dims();
101-
102-
CHECK_GE(input_dims.size(), 2);
103-
CHECK_EQ(input_dims.size(), grad_output_dims.size());
104-
105-
const int64_t m = input_dims[input_dims.size() - 2];
106-
const int64_t k = input_dims[input_dims.size() - 1];
107-
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
108-
CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
109-
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
110-
111-
const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
112-
for (int64_t i = 0; i < input_dims.size() - 2; ++i) {
113-
CHECK_EQ(input_dims[i], grad_output_dims[i]) << "Batch dims must match";
114-
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
115-
}
116-
117-
auto grad_other = std::make_shared<Tensor>(other_dims, DataType::kFLOAT32);
118-
grad_other->Fill<float>(0.0f);
119-
120-
for (int64_t b = 0; b < bs; ++b) {
121-
for (int64_t i = 0; i < m; ++i) {
122-
for (int64_t j = 0; j < n; ++j) {
123-
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
124-
for (int64_t p = 0; p < k; ++p) {
125-
const auto input_idx = b * m * k + i * k + p;
126-
static_cast<float *>(grad_other->DataPtr())[b * k * n + p * n + j]
127-
+= grad * static_cast<const float *>(input1->DataPtr())[input_idx];
128-
}
129-
}
130-
}
131-
}
132-
return grad_other;
133-
}
13412

13513
std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
13614
bool transpose, const std::shared_ptr<Tensor> &bias) {
@@ -225,14 +103,12 @@ std::shared_ptr<Tensor> LinearBackwardBias(const std::shared_ptr<Tensor> &grad_o
225103
grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
226104
return grad_bias;
227105
}
106+
228107
} // namespace infini_train::kernels::cpu
229108

230109
#define REGISTER_CPU_LINEAR_KERNEL(kernel_name) \
231110
REGISTER_KERNEL(infini_train::Device::DeviceType::kCPU, kernel_name, infini_train::kernels::cpu::kernel_name)
232111

233-
REGISTER_CPU_LINEAR_KERNEL(MatmulForward)
234-
REGISTER_CPU_LINEAR_KERNEL(MatmulBackwardInput1)
235-
REGISTER_CPU_LINEAR_KERNEL(MatmulBackwardInput2)
236112
REGISTER_CPU_LINEAR_KERNEL(LinearForward)
237113
REGISTER_CPU_LINEAR_KERNEL(LinearBackwardInput)
238114
REGISTER_CPU_LINEAR_KERNEL(LinearBackwardWeight)
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#include <cstdint>
2+
#include <memory>
3+
#include <numeric>
4+
#include <vector>
5+
6+
#include "glog/logging.h"
7+
8+
#include "infini_train/include/dispatcher.h"
9+
#include "infini_train/include/tensor.h"
10+
11+
namespace infini_train::kernels::cpu {
12+
13+
std::shared_ptr<Tensor> MatmulForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other) {
14+
/*
15+
output[*, m, n] = input[*, m, k] * other[*, k, n]
16+
*/
17+
// TODO(dcj): support broadcast later
18+
const auto &input_dims = input->Dims();
19+
const auto &other_dims = other->Dims();
20+
21+
CHECK_GE(input_dims.size(), 2);
22+
CHECK_GE(other_dims.size(), 2);
23+
CHECK_EQ(input_dims.size(), other_dims.size());
24+
25+
const int64_t m = input_dims[input_dims.size() - 2];
26+
const int64_t k = input_dims[input_dims.size() - 1];
27+
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
28+
const int64_t n = other_dims[other_dims.size() - 1];
29+
30+
const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
31+
for (int64_t i = 0; i < static_cast<int64_t>(input_dims.size()) - 2; ++i) {
32+
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
33+
}
34+
35+
std::vector<int64_t> output_dims = input_dims;
36+
output_dims[output_dims.size() - 1] = n;
37+
auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);
38+
39+
for (int64_t b = 0; b < bs; ++b) {
40+
for (int64_t i = 0; i < m; ++i) {
41+
for (int64_t j = 0; j < n; ++j) {
42+
float acc = 0.0f;
43+
for (int64_t p = 0; p < k; ++p) {
44+
acc += static_cast<const float *>(input->DataPtr())[b * m * k + i * k + p]
45+
* static_cast<const float *>(other->DataPtr())[b * k * n + p * n + j];
46+
}
47+
static_cast<float *>(output->DataPtr())[b * m * n + i * n + j] = acc;
48+
}
49+
}
50+
}
51+
return {output};
52+
}
53+
54+
std::shared_ptr<Tensor> MatmulBackwardInput(const std::shared_ptr<Tensor> &other,
55+
const std::shared_ptr<Tensor> &grad_output,
56+
const std::vector<int64_t> &input_dims) {
57+
/*
58+
grad_input[*, m, k] = grad_output[*, m, n] * other[*, k, n]^T
59+
*/
60+
const auto &other_dims = other->Dims();
61+
const auto &grad_output_dims = grad_output->Dims();
62+
63+
CHECK_GE(other_dims.size(), 2);
64+
CHECK_EQ(other_dims.size(), grad_output_dims.size());
65+
66+
const int64_t m = grad_output_dims[grad_output_dims.size() - 2];
67+
const int64_t k = other_dims[other_dims.size() - 2];
68+
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
69+
70+
const int64_t bs
71+
= std::accumulate(grad_output_dims.rbegin() + 2, grad_output_dims.rend(), 1, std::multiplies<int64_t>{});
72+
for (int64_t i = 0; i < static_cast<int64_t>(grad_output_dims.size()) - 2; ++i) {
73+
CHECK_EQ(grad_output_dims[i], other_dims[i]) << "Batch dims must match";
74+
}
75+
76+
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
77+
grad_input->Fill<float>(0.0f);
78+
79+
for (int64_t b = 0; b < bs; ++b) {
80+
for (int64_t i = 0; i < m; ++i) {
81+
for (int64_t j = 0; j < n; ++j) {
82+
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
83+
for (int64_t p = 0; p < k; ++p) {
84+
const auto other_idx = b * k * n + p * n + j;
85+
static_cast<float *>(grad_input->DataPtr())[b * m * k + i * k + p]
86+
+= grad * static_cast<const float *>(other->DataPtr())[other_idx];
87+
}
88+
}
89+
}
90+
}
91+
return grad_input;
92+
}
93+
94+
std::shared_ptr<Tensor> MatmulBackwardOther(const std::shared_ptr<Tensor> &input1,
95+
const std::shared_ptr<Tensor> &grad_output,
96+
const std::vector<int64_t> &other_dims) {
97+
/*
98+
grad_other[*, k, n] = input[*, m, k]^T * grad_output[*, m, n]
99+
*/
100+
const auto &input_dims = input1->Dims();
101+
const auto &grad_output_dims = grad_output->Dims();
102+
103+
CHECK_GE(input_dims.size(), 2);
104+
CHECK_EQ(input_dims.size(), grad_output_dims.size());
105+
106+
const int64_t m = input_dims[input_dims.size() - 2];
107+
const int64_t k = input_dims[input_dims.size() - 1];
108+
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
109+
CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
110+
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
111+
112+
const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
113+
for (int64_t i = 0; i < static_cast<int64_t>(input_dims.size()) - 2; ++i) {
114+
CHECK_EQ(input_dims[i], grad_output_dims[i]) << "Batch dims must match";
115+
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
116+
}
117+
118+
auto grad_other = std::make_shared<Tensor>(other_dims, DataType::kFLOAT32);
119+
grad_other->Fill<float>(0.0f);
120+
121+
for (int64_t b = 0; b < bs; ++b) {
122+
for (int64_t i = 0; i < m; ++i) {
123+
for (int64_t j = 0; j < n; ++j) {
124+
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
125+
for (int64_t p = 0; p < k; ++p) {
126+
const auto input_idx = b * m * k + i * k + p;
127+
static_cast<float *>(grad_other->DataPtr())[b * k * n + p * n + j]
128+
+= grad * static_cast<const float *>(input1->DataPtr())[input_idx];
129+
}
130+
}
131+
}
132+
}
133+
return grad_other;
134+
}
135+
136+
} // namespace infini_train::kernels::cpu
137+
138+
// Registration glue: expose each CPU matmul kernel to the dispatcher under its
// function name; the autograd layer resolves these by string key
// ("MatmulForward", "MatmulBackwardInput", "MatmulBackwardOther").
#define REGISTER_CPU_MATMUL_KERNEL(kernel_name)                                                                        \
    REGISTER_KERNEL(infini_train::Device::DeviceType::kCPU, kernel_name, infini_train::kernels::cpu::kernel_name)

REGISTER_CPU_MATMUL_KERNEL(MatmulForward)
REGISTER_CPU_MATMUL_KERNEL(MatmulBackwardInput)
REGISTER_CPU_MATMUL_KERNEL(MatmulBackwardOther)

// Keep the helper macro local to this translation unit.
#undef REGISTER_CPU_MATMUL_KERNEL

0 commit comments

Comments
 (0)