Skip to content

Commit 66e45dc

Browse files
committed
refactor(matmul): split MatmulBackward kernel into 2 independent kernels
Move the needs_input_grad logic from the kernel into the autograd layer: the monolithic MatmulBackward kernel is replaced by MatmulBackwardInput1 and MatmulBackwardInput2, which are now dispatched only when the corresponding gradient is requested. SetupContext likewise saves (and casts) only the tensors actually needed — input2 only when grad_input1 is required, and input1 only when grad_input2 is required.
1 parent 4534154 commit 66e45dc

File tree

3 files changed

+180
-98
lines changed

3 files changed

+180
-98
lines changed

infini_train/src/autograd/matmul.cc

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,17 @@ void Matmul::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tens
3131
// FIXME: compute_dtype is not necessarily the dtype of output_tensor; it should be
3232
// determined by autocast, not derived from output->Dtype().
3333
auto compute_dtype = output->Dtype();
34-
saved_tensors_ = {
35-
input1->Dtype() == compute_dtype ? input1 : std::make_shared<Tensor>(input1->To(compute_dtype)),
36-
input2->Dtype() == compute_dtype ? input2 : std::make_shared<Tensor>(input2->To(compute_dtype)),
34+
35+
// grad_input1 = grad_output @ input2^T, so input2 is needed
36+
// grad_input2 = grad_output^T @ input1, so input1 is needed
37+
bool need_grad_input1 = needs_input_grad_.size() > 0 && needs_input_grad_[0];
38+
bool need_grad_input2 = needs_input_grad_.size() > 1 && needs_input_grad_[1];
39+
40+
auto cast = [&](const std::shared_ptr<Tensor> &t) {
41+
return t->Dtype() == compute_dtype ? t : std::make_shared<Tensor>(t->To(compute_dtype));
3742
};
43+
44+
saved_tensors_ = {need_grad_input2 ? cast(input1) : nullptr, need_grad_input1 ? cast(input2) : nullptr};
3845
out_features_ = output->Dims()[0];
3946
}
4047

@@ -45,10 +52,24 @@ std::vector<std::shared_ptr<Tensor>> Matmul::Backward(const std::vector<std::sha
4552
CHECK_EQ(grad_outputs.size(), 1);
4653
const auto &grad_output = grad_outputs[0];
4754

55+
CHECK(!needs_input_grad_.empty()) << "needs_input_grad_ not populated in Matmul::Backward";
56+
bool need_grad_input1 = needs_input_grad_.size() > 0 && needs_input_grad_[0];
57+
bool need_grad_input2 = needs_input_grad_.size() > 1 && needs_input_grad_[1];
58+
4859
auto device = input1->GetDevice().type();
49-
auto [grad_input1, grad_input2]
50-
= Dispatcher::Instance().Call<std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
51-
{device, "MatmulBackward"}, input1, input2, grad_output);
60+
61+
std::shared_ptr<Tensor> grad_input1 = nullptr;
62+
std::shared_ptr<Tensor> grad_input2 = nullptr;
63+
64+
if (need_grad_input1) {
65+
grad_input1 = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput1"}, input2,
66+
grad_output, input1->Dims());
67+
}
68+
if (need_grad_input2) {
69+
grad_input2 = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput2"}, input1,
70+
grad_output, input2->Dims());
71+
}
72+
5273
return {grad_input1, grad_input2};
5374
}
5475
} // namespace infini_train::autograd

infini_train/src/kernels/cpu/linear.cc

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,38 +50,71 @@ std::shared_ptr<Tensor> MatmulForward(const std::shared_ptr<Tensor> &input, cons
5050
return {output};
5151
}
5252

53-
std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
54-
MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
55-
const std::shared_ptr<Tensor> &grad_output) {
53+
std::shared_ptr<Tensor> MatmulBackwardInput1(const std::shared_ptr<Tensor> &other,
54+
const std::shared_ptr<Tensor> &grad_output,
55+
const std::vector<int64_t> &input_dims) {
5656
/*
5757
grad_input[*, m, k] = grad_output[*, m, n] * other[*, k, n]^T
58-
grad_other[*, k, n] = input[*, m, k]^T * grad_output[*, m, n]
5958
*/
60-
const auto &input_dims = input->Dims();
6159
const auto &other_dims = other->Dims();
6260
const auto &grad_output_dims = grad_output->Dims();
6361

62+
CHECK_GE(other_dims.size(), 2);
63+
CHECK_EQ(other_dims.size(), grad_output_dims.size());
64+
65+
const int64_t m = grad_output_dims[grad_output_dims.size() - 2];
66+
const int64_t k = other_dims[other_dims.size() - 2];
67+
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
68+
69+
const int64_t bs
70+
= std::accumulate(grad_output_dims.rbegin() + 2, grad_output_dims.rend(), 1, std::multiplies<int64_t>{});
71+
for (int64_t i = 0; i < grad_output_dims.size() - 2; ++i) {
72+
CHECK_EQ(grad_output_dims[i], other_dims[i]) << "Batch dims must match";
73+
}
74+
75+
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
76+
grad_input->Fill<float>(0.0f);
77+
78+
for (int64_t b = 0; b < bs; ++b) {
79+
for (int64_t i = 0; i < m; ++i) {
80+
for (int64_t j = 0; j < n; ++j) {
81+
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
82+
for (int64_t p = 0; p < k; ++p) {
83+
const auto other_idx = b * k * n + p * n + j;
84+
static_cast<float *>(grad_input->DataPtr())[b * m * k + i * k + p]
85+
+= grad * static_cast<const float *>(other->DataPtr())[other_idx];
86+
}
87+
}
88+
}
89+
}
90+
return grad_input;
91+
}
92+
93+
std::shared_ptr<Tensor> MatmulBackwardInput2(const std::shared_ptr<Tensor> &input1,
94+
const std::shared_ptr<Tensor> &grad_output,
95+
const std::vector<int64_t> &other_dims) {
96+
/*
97+
grad_other[*, k, n] = input[*, m, k]^T * grad_output[*, m, n]
98+
*/
99+
const auto &input_dims = input1->Dims();
100+
const auto &grad_output_dims = grad_output->Dims();
101+
64102
CHECK_GE(input_dims.size(), 2);
65-
CHECK_EQ(input_dims.size(), other_dims.size());
66103
CHECK_EQ(input_dims.size(), grad_output_dims.size());
67104

68105
const int64_t m = input_dims[input_dims.size() - 2];
69106
const int64_t k = input_dims[input_dims.size() - 1];
70-
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
71-
const int64_t n = other_dims[other_dims.size() - 1];
72-
107+
const int64_t n = grad_output_dims[grad_output_dims.size() - 1];
73108
CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
74-
CHECK_EQ(n, grad_output_dims[grad_output_dims.size() - 1]);
109+
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
75110

76111
const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
77112
for (int64_t i = 0; i < input_dims.size() - 2; ++i) {
78-
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
79113
CHECK_EQ(input_dims[i], grad_output_dims[i]) << "Batch dims must match";
114+
CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
80115
}
81116

82-
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
83117
auto grad_other = std::make_shared<Tensor>(other_dims, DataType::kFLOAT32);
84-
grad_input->Fill<float>(0.0f);
85118
grad_other->Fill<float>(0.0f);
86119

87120
for (int64_t b = 0; b < bs; ++b) {
@@ -90,16 +123,13 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
90123
const float grad = static_cast<float *>(grad_output->DataPtr())[b * m * n + i * n + j];
91124
for (int64_t p = 0; p < k; ++p) {
92125
const auto input_idx = b * m * k + i * k + p;
93-
const auto other_idx = b * k * n + p * n + j;
94-
static_cast<float *>(grad_input->DataPtr())[input_idx]
95-
+= grad * static_cast<const float *>(other->DataPtr())[other_idx];
96-
static_cast<float *>(grad_other->DataPtr())[other_idx]
97-
+= grad * static_cast<const float *>(input->DataPtr())[input_idx];
126+
static_cast<float *>(grad_other->DataPtr())[b * k * n + p * n + j]
127+
+= grad * static_cast<const float *>(input1->DataPtr())[input_idx];
98128
}
99129
}
100130
}
101131
}
102-
return {grad_input, grad_other};
132+
return grad_other;
103133
}
104134

105135
std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
@@ -201,7 +231,8 @@ std::shared_ptr<Tensor> LinearBackwardBias(const std::shared_ptr<Tensor> &grad_o
201231
REGISTER_KERNEL(infini_train::Device::DeviceType::kCPU, kernel_name, infini_train::kernels::cpu::kernel_name)
202232

203233
REGISTER_CPU_LINEAR_KERNEL(MatmulForward)
204-
REGISTER_CPU_LINEAR_KERNEL(MatmulBackward)
234+
REGISTER_CPU_LINEAR_KERNEL(MatmulBackwardInput1)
235+
REGISTER_CPU_LINEAR_KERNEL(MatmulBackwardInput2)
205236
REGISTER_CPU_LINEAR_KERNEL(LinearForward)
206237
REGISTER_CPU_LINEAR_KERNEL(LinearBackwardInput)
207238
REGISTER_CPU_LINEAR_KERNEL(LinearBackwardWeight)

0 commit comments

Comments (0)