@@ -330,6 +330,7 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
330330
331331 // For bf16 compute, accumulate in fp32 to preserve precision (matches PyTorch behavior).
332332 auto output_dtype = (compute_dtype == DataType::kBFLOAT16 ) ? DataType::kFLOAT32 : compute_dtype;
333+ // No Fill(0) needed: cuBLAS beta=0.0f fully overwrites output.
333334 auto grad_input = std::make_shared<Tensor>(input_dims, output_dtype, grad_output->GetDevice ());
334335
335336 auto device = grad_output->GetDevice ();
@@ -339,6 +340,7 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
339340 infini_train::core::GetDeviceGuardImpl (device.type ())->GetBlasHandle (device))
340341 ->cublas_handle ();
341342
343+ // TODO(zbl): use cublasSgemv if possible
342344 // - if transpose:
343345 // weight is [out_features, in_features] here
344346 // d_input = d_output * weight --> d_input.T = weight.T * d_output.T
@@ -393,6 +395,7 @@ std::shared_ptr<Tensor> LinearBackwardWeight(const std::shared_ptr<Tensor> &inpu
393395 auto output_dtype = (compute_dtype == DataType::kBFLOAT16 ) ? DataType::kFLOAT32 : compute_dtype;
394396 const std::vector<int64_t > weight_dims
395397 = transpose ? std::vector<int64_t >{out_features, in_features} : std::vector<int64_t >{in_features, out_features};
398+ // No Fill(0) needed: cuBLAS beta=0.0f fully overwrites output.
396399 auto grad_weight = std::make_shared<Tensor>(weight_dims, output_dtype, grad_output->GetDevice ());
397400
398401 auto device = grad_output->GetDevice ();
@@ -460,6 +463,7 @@ std::shared_ptr<Tensor> LinearBackwardBias(const std::shared_ptr<Tensor> &grad_o
460463 ->cuda_stream ();
461464
462465 // d_bias = \sum_i(i=0, bs-1) d_output[i]
466+ // TODO(dcj): use thrust::fill or a reduce kernel to do this
463467 constexpr int BLOCK_SIZE = 256 ;
464468 switch (compute_dtype) {
465469 DISPATCH_CASE (WRAP ({
0 commit comments