refactor: fix Matmul nullptr safety and convert GemmParams/SgemvParams to designated initializers

chen2021673 · chen2021673 · commit a7e1b99e431c · 2026-04-30T07:09:55.000Z
- Save input1_dims_/input2_dims_ in Matmul::SetupContext to avoid Dims()
  calls on potentially-null saved tensors in Backward
- Get device from grad_output instead of input1 in Matmul::Backward, since
  input1 is saved as nullptr when input2 does not need a gradient
- Add CHECK guards before dereferencing nullable saved tensors
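- Convert all GemmParams/SgemvParams construction in linear.cu, matmul.cu,
  outer.cu to C++20 designated initializer form; in matmul.cu the
  stride_a/stride_b/stride_c fields are now listed after batch_count
  (designators must follow the members' declaration order)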
diff --git a/infini_train/include/autograd/matmul.h b/infini_train/include/autograd/matmul.h
@@ -23,5 +23,7 @@ class Matmul : public Function {
 
 private:
     int64_t out_features_ = 0;
+    std::vector<int64_t> input1_dims_;
+    std::vector<int64_t> input2_dims_;
 };
 } // namespace infini_train::autograd
diff --git a/infini_train/src/autograd/matmul.cc b/infini_train/src/autograd/matmul.cc
@@ -42,6 +42,8 @@ void Matmul::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tens
     };
 
     saved_tensors_ = {need_grad_input2 ? cast(input1) : nullptr, need_grad_input1 ? cast(input2) : nullptr};
+    input1_dims_ = input1->Dims();
+    input2_dims_ = input2->Dims();
     out_features_ = output->Dims()[0];
 }
 
@@ -56,18 +58,20 @@ std::vector<std::shared_ptr<Tensor>> Matmul::Backward(const std::vector<std::sha
     bool need_grad_input1 = needs_input_grad_.size() > 0 && needs_input_grad_[0];
     bool need_grad_input2 = needs_input_grad_.size() > 1 && needs_input_grad_[1];
 
-    auto device = input1->GetDevice().type();
+    auto device = grad_output->GetDevice().type();
 
     std::shared_ptr<Tensor> grad_input = nullptr;
     std::shared_ptr<Tensor> grad_other = nullptr;
 
     if (need_grad_input1) {
+        CHECK(input2 != nullptr) << "input2 not saved but need_grad_input1 is true";
         grad_input = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardInput"}, input2,
-                                                                          grad_output, input1->Dims());
+                                                                          grad_output, input1_dims_);
     }
     if (need_grad_input2) {
+        CHECK(input1 != nullptr) << "input1 not saved but need_grad_input2 is true";
         grad_other = Dispatcher::Instance().Call<std::shared_ptr<Tensor>>({device, "MatmulBackwardOther"}, input1,
-                                                                          grad_output, input2->Dims());
+                                                                          grad_output, input2_dims_);
     }
 
     return {grad_input, grad_other};
diff --git a/infini_train/src/kernels/cuda/linear.cu b/infini_train/src/kernels/cuda/linear.cu
@@ -80,18 +80,18 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
     // When bs==1 and fp32, use cublasSgemv (more efficient than GEMM for matrix-vector).
     // cublasSgemv does not support bf16, so bf16 falls through to GemmCuda.
     if (bs == 1 && dtype == DataType::kFLOAT32) {
-        SgemvParams p;
-        p.trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
-        p.m = static_cast<int>(transpose ? in_features : out_features);
-        p.n = static_cast<int>(transpose ? out_features : in_features);
-        p.A = static_cast<const float *>(weight->DataPtr());
-        p.lda = static_cast<int>(transpose ? in_features : out_features);
-        p.x = static_cast<const float *>(input->DataPtr());
-        p.y = static_cast<float *>(output->DataPtr());
-        p.alpha = 1.0f;
-        p.beta = 1.0f; // output already initialized with bias or zero above
-        p.blas_handle = GetCublasHandle(device);
-        SgemvCuda(p);
+        SgemvCuda(SgemvParams{
+            .trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
+            .m = static_cast<int>(transpose ? in_features : out_features),
+            .n = static_cast<int>(transpose ? out_features : in_features),
+            .A = static_cast<const float *>(weight->DataPtr()),
+            .lda = static_cast<int>(transpose ? in_features : out_features),
+            .x = static_cast<const float *>(input->DataPtr()),
+            .y = static_cast<float *>(output->DataPtr()),
+            .alpha = 1.0f,
+            .beta = 1.0f, // output already initialized with bias or zero above
+            .blas_handle = GetCublasHandle(device),
+        });
     } else {
         // cuBLAS is colmun-major
         // - if a is transposed:
@@ -106,26 +106,25 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
         // C = output.T[out_features, bs]
         // A = weight.T[out_features, in_features]
         // B = input.T[in_features, bs]
-        GemmParams p;
-        p.trans_a = transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
-        p.trans_b = CUBLAS_OP_N;
-        p.m = static_cast<int>(out_features);
-        p.n = static_cast<int>(bs);
-        p.k = static_cast<int>(in_features);
-        p.A = weight->DataPtr();
-        p.lda = static_cast<int>(transpose ? in_features : out_features);
-        p.B = input->DataPtr();
-        p.ldb = static_cast<int>(in_features);
-        p.C = output->DataPtr();
-        p.ldc = static_cast<int>(out_features);
-        p.alpha = 1.0f;
-        p.beta = 1.0f; // bias already written into output; beta=1 accumulates
-        p.batch_count = 1;
-        p.input_dtype = dtype;
-        p.output_dtype = dtype;
-        p.blas_handle = GetCublasHandle(device);
-
-        GemmCuda(p);
+        GemmCuda(GemmParams{
+            .trans_a = transpose ? CUBLAS_OP_T : CUBLAS_OP_N,
+            .trans_b = CUBLAS_OP_N,
+            .m = static_cast<int>(out_features),
+            .n = static_cast<int>(bs),
+            .k = static_cast<int>(in_features),
+            .A = weight->DataPtr(),
+            .lda = static_cast<int>(transpose ? in_features : out_features),
+            .B = input->DataPtr(),
+            .ldb = static_cast<int>(in_features),
+            .C = output->DataPtr(),
+            .ldc = static_cast<int>(out_features),
+            .alpha = 1.0f,
+            .beta = 1.0f, // bias already written into output; beta=1 accumulates
+            .batch_count = 1,
+            .input_dtype = dtype,
+            .output_dtype = dtype,
+            .blas_handle = GetCublasHandle(device),
+        });
     }
 
     return output;
@@ -172,18 +171,18 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
     // When bs==1 and fp32, use cublasSgemv (more efficient than GEMM for matrix-vector).
     // cublasSgemv does not support bf16, so bf16 falls through to GemmCuda.
     if (bs == 1 && compute_dtype == DataType::kFLOAT32) {
-        SgemvParams p;
-        p.trans = transpose ? CUBLAS_OP_N : CUBLAS_OP_T;
-        p.m = static_cast<int>(transpose ? in_features : out_features);
-        p.n = static_cast<int>(transpose ? out_features : in_features);
-        p.A = static_cast<const float *>(weight->DataPtr());
-        p.lda = static_cast<int>(transpose ? in_features : out_features);
-        p.x = static_cast<const float *>(grad_output_promoted->DataPtr());
-        p.y = static_cast<float *>(grad_input->DataPtr());
-        p.alpha = 1.0f;
-        p.beta = 0.0f;
-        p.blas_handle = GetCublasHandle(grad_output->GetDevice());
-        SgemvCuda(p);
+        SgemvCuda(SgemvParams{
+            .trans = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
+            .m = static_cast<int>(transpose ? in_features : out_features),
+            .n = static_cast<int>(transpose ? out_features : in_features),
+            .A = static_cast<const float *>(weight->DataPtr()),
+            .lda = static_cast<int>(transpose ? in_features : out_features),
+            .x = static_cast<const float *>(grad_output_promoted->DataPtr()),
+            .y = static_cast<float *>(grad_input->DataPtr()),
+            .alpha = 1.0f,
+            .beta = 0.0f,
+            .blas_handle = GetCublasHandle(grad_output->GetDevice()),
+        });
     } else {
         // - if transpose:
         // weight is [out_features, in_features] here
@@ -198,26 +197,25 @@ std::shared_ptr<Tensor> LinearBackwardInput(const std::shared_ptr<Tensor> &weigh
         // C = d_input.T[in_features, bs]
         // A = weight.T[out_features, in_features]
         // B = d_output.T[out_features, bs]
-        GemmParams p;
-        p.trans_a = transpose ? CUBLAS_OP_N : CUBLAS_OP_T;
-        p.trans_b = CUBLAS_OP_N;
-        p.m = static_cast<int>(in_features);
-        p.n = static_cast<int>(bs);
-        p.k = static_cast<int>(out_features);
-        p.A = weight->DataPtr();
-        p.lda = static_cast<int>(transpose ? in_features : out_features);
-        p.B = grad_output_promoted->DataPtr();
-        p.ldb = static_cast<int>(out_features);
-        p.C = grad_input->DataPtr();
-        p.ldc = static_cast<int>(in_features);
-        p.alpha = 1.0f;
-        p.beta = 0.0f;
-        p.batch_count = 1;
-        p.input_dtype = compute_dtype;
-        p.output_dtype = output_dtype;
-        p.blas_handle = GetCublasHandle(grad_output->GetDevice());
-
-        GemmCuda(p);
+        GemmCuda(GemmParams{
+            .trans_a = transpose ? CUBLAS_OP_N : CUBLAS_OP_T,
+            .trans_b = CUBLAS_OP_N,
+            .m = static_cast<int>(in_features),
+            .n = static_cast<int>(bs),
+            .k = static_cast<int>(out_features),
+            .A = weight->DataPtr(),
+            .lda = static_cast<int>(transpose ? in_features : out_features),
+            .B = grad_output_promoted->DataPtr(),
+            .ldb = static_cast<int>(out_features),
+            .C = grad_input->DataPtr(),
+            .ldc = static_cast<int>(in_features),
+            .alpha = 1.0f,
+            .beta = 0.0f,
+            .batch_count = 1,
+            .input_dtype = compute_dtype,
+            .output_dtype = output_dtype,
+            .blas_handle = GetCublasHandle(grad_output->GetDevice()),
+        });
     }
 
     return grad_input;
@@ -259,26 +257,25 @@ std::shared_ptr<Tensor> LinearBackwardWeight(const std::shared_ptr<Tensor> &inpu
     const int lda = static_cast<int>(transpose ? in_features : out_features);
     const int ldb = static_cast<int>(transpose ? out_features : in_features);
 
-    GemmParams p;
-    p.trans_a = CUBLAS_OP_N;
-    p.trans_b = CUBLAS_OP_T;
-    p.m = static_cast<int>(transpose ? in_features : out_features);
-    p.n = static_cast<int>(transpose ? out_features : in_features);
-    p.k = static_cast<int>(bs);
-    p.A = a;
-    p.lda = lda;
-    p.B = b;
-    p.ldb = ldb;
-    p.C = grad_weight->DataPtr();
-    p.ldc = static_cast<int>(transpose ? in_features : out_features);
-    p.alpha = 1.0f;
-    p.beta = 0.0f;
-    p.batch_count = 1;
-    p.input_dtype = compute_dtype;
-    p.output_dtype = output_dtype;
-    p.blas_handle = GetCublasHandle(grad_output->GetDevice());
-
-    GemmCuda(p);
+    GemmCuda(GemmParams{
+        .trans_a = CUBLAS_OP_N,
+        .trans_b = CUBLAS_OP_T,
+        .m = static_cast<int>(transpose ? in_features : out_features),
+        .n = static_cast<int>(transpose ? out_features : in_features),
+        .k = static_cast<int>(bs),
+        .A = a,
+        .lda = lda,
+        .B = b,
+        .ldb = ldb,
+        .C = grad_weight->DataPtr(),
+        .ldc = static_cast<int>(transpose ? in_features : out_features),
+        .alpha = 1.0f,
+        .beta = 0.0f,
+        .batch_count = 1,
+        .input_dtype = compute_dtype,
+        .output_dtype = output_dtype,
+        .blas_handle = GetCublasHandle(grad_output->GetDevice()),
+    });
 
     return grad_weight;
 }
diff --git a/infini_train/src/kernels/cuda/matmul.cu b/infini_train/src/kernels/cuda/matmul.cu
@@ -47,29 +47,28 @@ std::shared_ptr<Tensor> MatmulForward(const std::shared_ptr<Tensor> &input, cons
     // A = other.T[*, n, k]
     // B = input.T[*, k, m]
     // NOTE(zbl): the last cublasGemmAlgo_t param has no effect on GPU arch >= sm_80(Ampere)
-    GemmParams p;
-    p.trans_a = CUBLAS_OP_N;
-    p.trans_b = CUBLAS_OP_N;
-    p.m = static_cast<int>(n);
-    p.n = static_cast<int>(m);
-    p.k = static_cast<int>(k);
-    p.A = other->DataPtr();
-    p.lda = static_cast<int>(n);
-    p.stride_a = n * k;
-    p.B = input->DataPtr();
-    p.ldb = static_cast<int>(k);
-    p.stride_b = k * m;
-    p.C = output->DataPtr();
-    p.ldc = static_cast<int>(n);
-    p.stride_c = m * n;
-    p.alpha = 1.0f;
-    p.beta = 0.0f;
-    p.batch_count = static_cast<int>(bs);
-    p.input_dtype = dtype;
-    p.output_dtype = dtype;
-    p.blas_handle = GetCublasHandle(device);
-
-    GemmCuda(p);
+    GemmCuda(GemmParams{
+        .trans_a = CUBLAS_OP_N,
+        .trans_b = CUBLAS_OP_N,
+        .m = static_cast<int>(n),
+        .n = static_cast<int>(m),
+        .k = static_cast<int>(k),
+        .A = other->DataPtr(),
+        .lda = static_cast<int>(n),
+        .B = input->DataPtr(),
+        .ldb = static_cast<int>(k),
+        .C = output->DataPtr(),
+        .ldc = static_cast<int>(n),
+        .alpha = 1.0f,
+        .beta = 0.0f,
+        .batch_count = static_cast<int>(bs),
+        .stride_a = n * k,
+        .stride_b = k * m,
+        .stride_c = m * n,
+        .input_dtype = dtype,
+        .output_dtype = dtype,
+        .blas_handle = GetCublasHandle(device),
+    });
 
     return output;
 }
@@ -119,29 +118,28 @@ std::shared_ptr<Tensor> MatmulBackwardInput(const std::shared_ptr<Tensor> &other
     // C = grad_input.T[*, k, m]
     // A = other.T[*, n, k]
     // B = grad_output.T[*, n, m]
-    GemmParams p;
-    p.trans_a = CUBLAS_OP_T;
-    p.trans_b = CUBLAS_OP_N;
-    p.m = static_cast<int>(k);
-    p.n = static_cast<int>(m);
-    p.k = static_cast<int>(n);
-    p.A = other->DataPtr();
-    p.lda = static_cast<int>(n);
-    p.stride_a = k * n;
-    p.B = grad_output_promoted->DataPtr();
-    p.ldb = static_cast<int>(n);
-    p.stride_b = n * m;
-    p.C = grad_input->DataPtr();
-    p.ldc = static_cast<int>(k);
-    p.stride_c = m * k;
-    p.alpha = 1.0f;
-    p.beta = 0.0f;
-    p.batch_count = static_cast<int>(bs);
-    p.input_dtype = compute_dtype;
-    p.output_dtype = output_dtype;
-    p.blas_handle = GetCublasHandle(device);
-
-    GemmCuda(p);
+    GemmCuda(GemmParams{
+        .trans_a = CUBLAS_OP_T,
+        .trans_b = CUBLAS_OP_N,
+        .m = static_cast<int>(k),
+        .n = static_cast<int>(m),
+        .k = static_cast<int>(n),
+        .A = other->DataPtr(),
+        .lda = static_cast<int>(n),
+        .B = grad_output_promoted->DataPtr(),
+        .ldb = static_cast<int>(n),
+        .C = grad_input->DataPtr(),
+        .ldc = static_cast<int>(k),
+        .alpha = 1.0f,
+        .beta = 0.0f,
+        .batch_count = static_cast<int>(bs),
+        .stride_a = k * n,
+        .stride_b = n * m,
+        .stride_c = m * k,
+        .input_dtype = compute_dtype,
+        .output_dtype = output_dtype,
+        .blas_handle = GetCublasHandle(device),
+    });
 
     return grad_input;
 }
@@ -190,29 +188,28 @@ std::shared_ptr<Tensor> MatmulBackwardOther(const std::shared_ptr<Tensor> &input
     // C = grad_other.T[*, n, k]
     // A = grad_output.T[*, n, m]
     // B = input.T[*, k, m]
-    GemmParams p;
-    p.trans_a = CUBLAS_OP_N;
-    p.trans_b = CUBLAS_OP_T;
-    p.m = static_cast<int>(n);
-    p.n = static_cast<int>(k);
-    p.k = static_cast<int>(m);
-    p.A = grad_output_promoted->DataPtr();
-    p.lda = static_cast<int>(n);
-    p.stride_a = n * m;
-    p.B = input1->DataPtr();
-    p.ldb = static_cast<int>(k);
-    p.stride_b = k * m;
-    p.C = grad_other->DataPtr();
-    p.ldc = static_cast<int>(n);
-    p.stride_c = n * k;
-    p.alpha = 1.0f;
-    p.beta = 0.0f;
-    p.batch_count = static_cast<int>(bs);
-    p.input_dtype = compute_dtype;
-    p.output_dtype = output_dtype;
-    p.blas_handle = GetCublasHandle(device);
-
-    GemmCuda(p);
+    GemmCuda(GemmParams{
+        .trans_a = CUBLAS_OP_N,
+        .trans_b = CUBLAS_OP_T,
+        .m = static_cast<int>(n),
+        .n = static_cast<int>(k),
+        .k = static_cast<int>(m),
+        .A = grad_output_promoted->DataPtr(),
+        .lda = static_cast<int>(n),
+        .B = input1->DataPtr(),
+        .ldb = static_cast<int>(k),
+        .C = grad_other->DataPtr(),
+        .ldc = static_cast<int>(n),
+        .alpha = 1.0f,
+        .beta = 0.0f,
+        .batch_count = static_cast<int>(bs),
+        .stride_a = n * m,
+        .stride_b = k * m,
+        .stride_c = n * k,
+        .input_dtype = compute_dtype,
+        .output_dtype = output_dtype,
+        .blas_handle = GetCublasHandle(device),
+    });
 
     return grad_other;
 }
diff --git a/infini_train/src/kernels/cuda/outer.cu b/infini_train/src/kernels/cuda/outer.cu