Skip to content
Closed

CTest #143

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/glog"]
path = third_party/glog
url = git@github.com:google/glog.git
url = https://github.com/google/glog.git
[submodule "third_party/gflags"]
path = third_party/gflags
url = git@github.com:gflags/gflags.git
url = https://github.com/gflags/gflags.git
[submodule "third_party/eigen"]
path = third_party/eigen
url = git@github.com:InfiniTensor/eigen-mirror.git
url = https://github.com/eigenteam/eigen-git-mirror.git
35 changes: 23 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
option(USE_OMP "Use OpenMP as backend for Eigen" ON)
option(USE_NCCL "Build project for distributed running" ON)
option(BUILD_TEST "Build InfiniTrain tests" OFF)

project(infini_train VERSION 0.5.0 LANGUAGES CXX)

Expand All @@ -14,6 +15,19 @@ set(CMAKE_CXX_EXTENSIONS OFF)
# Generate compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# ------------------------------------------------------------------------------
# GoogleTest (submodule)
# ------------------------------------------------------------------------------
if(BUILD_TEST)
# Fail fast with an actionable message when the googletest submodule has not
# been initialized, instead of a confusing add_subdirectory error later.
if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/CMakeLists.txt)
message(FATAL_ERROR "googletest submodule not found at third_party/googletest. "
"Run: git submodule update --init third_party/googletest")
endif()
# On MSVC, make gtest link against the shared CRT so it matches the rest of
# the project; FORCE overrides any cached value.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(third_party/googletest)
# Enables CTest targets; NOTE(review): called here rather than at top level,
# so it only takes effect when BUILD_TEST=ON — confirm this is intentional.
enable_testing()
endif()

# ------------------------------------------------------------------------------
# Third-party deps
# ------------------------------------------------------------------------------
Expand All @@ -26,7 +40,9 @@ include_directories(${gflags_SOURCE_DIR}/include)
set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
add_subdirectory(third_party/glog)
# add_compile_definitions(GLOG_USE_GLOG_EXPORT=1)
include_directories(${glog_SOURCE_DIR}/src)
# include_directories(${glog_BINARY_DIR}/glog)

# eigen
if(USE_OMP)
Expand All @@ -48,6 +64,10 @@ endif()
# Framework core sources (*.cc), excluding cpu kernels (they are built separately)
file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
if(NOT USE_CUDA)
list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
endif()
if(NOT USE_NCCL)
list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
endif()
Expand Down Expand Up @@ -190,17 +210,8 @@ add_executable(llama3
)
link_infini_train_exe(llama3)

# Tools
add_subdirectory(tools/infini_run)
set_target_properties(infini_run PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

# Tests
add_executable(test_hook test/hook/test_hook.cc)
link_infini_train_exe(test_hook)

add_executable(test_precision_check test/hook/test_precision_check.cc)
link_infini_train_exe(test_precision_check)

add_executable(test_lora test/lora/test_lora.cc)
link_infini_train_exe(test_lora)

if(BUILD_TEST)
add_subdirectory(tests)
endif()
1 change: 1 addition & 0 deletions infini_train/include/autograd/function.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Function : public std::enable_shared_from_this<Function> {

protected:
std::vector<std::shared_ptr<Tensor>> saved_tensors_;
std::vector<bool> needs_input_grad_;

private:
std::vector<std::pair<std::shared_ptr<Function>, int>> next_functions_;
Expand Down
13 changes: 12 additions & 1 deletion infini_train/include/autograd/linear.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

Expand All @@ -10,6 +11,13 @@ class Tensor;
}

namespace infini_train::autograd {

// Per-input gradient requirement flags for the Linear op's backward pass.
// Each flag is true when the corresponding gradient (grad_input, grad_weight,
// grad_bias) must actually be computed; kernels use these to skip work for
// tensors that do not require grad (selective backward).
struct LinearGradFlags {
bool input = false;
bool weight = false;
bool bias = false;
};

class Linear : public Function {
public:
static constexpr char kType[] = "LinearFunction";
Expand All @@ -22,7 +30,10 @@ class Linear : public Function {
std::vector<std::shared_ptr<Tensor>> Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) override;

private:
bool transpose_ = false;
bool bias_ = false;
int64_t in_features_ = 0;
int64_t out_features_ = 0;
bool bias_ = true;
std::vector<int64_t> input_dims_;
};
} // namespace infini_train::autograd
4 changes: 4 additions & 0 deletions infini_train/include/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ class Tensor : public std::enable_shared_from_this<Tensor> {

std::shared_ptr<Tensor> View(const std::vector<int64_t> &dims);
std::shared_ptr<Tensor> Contiguous();
// FIXME: Currently returns true unconditionally. Requires stride tracking in the Tensor
// class before this can be implemented correctly. The guard in elementwise.cu ensures
// non-contiguous tensors fall back to the broadcast path until this is resolved.
bool IsContiguous() const;
std::shared_ptr<Tensor> Flatten(int64_t start = 0, int64_t end = -1);
std::shared_ptr<Tensor> Squeeze(int64_t dim);
std::shared_ptr<Tensor> Unsqueeze(int64_t dim);
Expand Down
7 changes: 7 additions & 0 deletions infini_train/src/autograd/accumulate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
core::DeviceGuard guard(device);

if (grad_output) {
if (grad_output->Dtype() != tensor_->Dtype()) {
LOG(WARNING) << "AccumulateGrad: grad dtype (" << kDataTypeToDesc.at(grad_output->Dtype())
<< ") does not match parameter dtype (" << kDataTypeToDesc.at(tensor_->Dtype())
<< "). This indicates a dtype mismatch in the autograd graph (e.g. autocast "
"running before autograd). The grad is not cast and will be used as-is.";
}

if (grad) {
if (tensor_->ConsumeGradOverwriteFlag()) {
// If the tensor is marked to overwrite its current grad on next grad update
Expand Down
5 changes: 5 additions & 0 deletions infini_train/src/autograd/elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,11 @@ std::vector<std::shared_ptr<Tensor>> Add::Backward(const std::vector<std::shared
CHECK_EQ(grad_outputs.size(), 1);
const auto &grad_output = grad_outputs[0];

// Fast path: no broadcast — grad_a and grad_b are both just grad_output
if (a_dims_ == b_dims_) {
return {grad_output, grad_output};
}

auto device = grad_output->GetDevice().type();
auto [grad_a, grad_b] = Dispatcher::Instance().Call<std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
{device, "AddBackward"}, grad_output, a_dims_, b_dims_);
Expand Down
11 changes: 11 additions & 0 deletions infini_train/src/autograd/function.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ std::vector<std::shared_ptr<Tensor>> Function::Apply(const std::vector<std::shar
}
}

// Populate needs_input_grad_ before Forward/SetupContext so that
// SetupContext can use it for saved-tensor pruning.
// Must be done before NoGradGuard since it checks GradMode.
if (autograd::GradMode::IsEnabled()) {
needs_input_grad_.resize(input_tensors.size());
for (size_t idx = 0; idx < input_tensors.size(); ++idx) {
needs_input_grad_[idx] = input_tensors[idx]->requires_grad();
}
}

std::vector<std::shared_ptr<Tensor>> output_tensors;
{
autograd::NoGradGuard no_grad;
Expand Down Expand Up @@ -129,6 +139,7 @@ void Function::BackwardPartial(const std::shared_ptr<Tensor> &grad_output, int g

saved_tensors_.clear();
grad_outputs_.clear();
needs_input_grad_.clear();
grad_outputs_reached_ = 0;
dependencies_reached_ = 0;

Expand Down
46 changes: 39 additions & 7 deletions infini_train/src/autograd/linear.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,35 @@ std::vector<std::shared_ptr<Tensor>> Linear::Forward(const std::vector<std::shar
}

void Linear::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
const std::vector<std::shared_ptr<Tensor>> &) {
const std::vector<std::shared_ptr<Tensor>> &output_tensors) {
const auto &input = input_tensors[0];
const auto &weight = input_tensors[1];
saved_tensors_ = {input, weight};
// Cast saved tensors to forward compute dtype (output dtype) so backward
// computes in the same precision as forward, matching PyTorch's behavior.

// FIXME: An extra cast (input/weight -> compute_dtype) is performed here because
// autocast runs before autograd. The correct approach is to adjust the ordering or
// integration of autocast and autograd so that autograd receives already-cast tensors,
// avoiding the redundant cast.

// FIXME: compute_dtype is not necessarily the dtype of output_tensor; it should be
// determined by autocast, not derived from output_tensors[0]->Dtype().
auto compute_dtype = output_tensors[0]->Dtype();
bool need_input = needs_input_grad_.size() > 0 && needs_input_grad_[0];
bool need_weight = needs_input_grad_.size() > 1 && needs_input_grad_[1];

auto cast = [&](const std::shared_ptr<Tensor> &t) {
return t->Dtype() == compute_dtype ? t : std::make_shared<Tensor>(t->To(compute_dtype));
};

// grad_input needs weight, grad_weight needs input
saved_tensors_ = {need_weight ? cast(input) : nullptr, need_input ? cast(weight) : nullptr};

transpose_ = true;
bias_ = input_tensors.size() == 3;
in_features_ = weight->Dims()[1];
out_features_ = weight->Dims()[0];
input_dims_ = input->Dims();
}

std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
Expand All @@ -32,13 +55,22 @@ std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::sha
CHECK_EQ(grad_outputs.size(), 1);
const auto &grad_output = grad_outputs[0];

auto device = input->GetDevice().type();
CHECK(!needs_input_grad_.empty()) << "needs_input_grad_ not populated in Linear::Backward";
LinearGradFlags grad_flags = {.input = needs_input_grad_[0],
.weight = needs_input_grad_.size() > 1 && needs_input_grad_[1],
.bias = bias_ && needs_input_grad_.size() > 2 && needs_input_grad_[2]};

auto device = grad_output->GetDevice().type();
// TODO: skip autograd graph construction entirely when no input requires grad
auto [grad_input, grad_weight, grad_bias]
= Dispatcher::Instance()
.Call<std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
{device, "LinearBackward"}, input, weight, true, out_features_, grad_output, bias_);
return bias_ ? std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight, grad_bias}
: std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight};
;
{device, "LinearBackward"}, input, weight, transpose_, in_features_, out_features_, input_dims_,
grad_output, bias_, grad_flags);
if (bias_) {
return {grad_input, grad_weight, grad_bias};
} else {
return {grad_input, grad_weight};
}
}
} // namespace infini_train::autograd
16 changes: 15 additions & 1 deletion infini_train/src/autograd/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,21 @@ void Matmul::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tens
const auto &input1 = input_tensors[0];
const auto &input2 = input_tensors[1];
const auto &output = output_tensors[0];
saved_tensors_ = {input1, input2};
// Cast saved tensors to forward compute dtype (output dtype) so backward
// computes in the same precision as forward, matching PyTorch's behavior.

// FIXME: An extra cast (input1/input2 -> compute_dtype) is performed here because
// autocast runs before autograd. The correct approach is to adjust the ordering or
// integration of autocast and autograd so that autograd receives already-cast tensors,
// avoiding the redundant cast.

// FIXME: compute_dtype is not necessarily the dtype of output_tensor; it should be
// determined by autocast, not derived from output->Dtype().
auto compute_dtype = output->Dtype();
saved_tensors_ = {
input1->Dtype() == compute_dtype ? input1 : std::make_shared<Tensor>(input1->To(compute_dtype)),
input2->Dtype() == compute_dtype ? input2 : std::make_shared<Tensor>(input2->To(compute_dtype)),
};
out_features_ = output->Dims()[0];
}

Expand Down
52 changes: 32 additions & 20 deletions infini_train/src/kernels/cpu/linear.cc
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#include <cstdint>
#include <fcntl.h>
#include <memory>
#include <numeric>
#include <tuple>

#include "glog/logging.h"

#include "infini_train/include/autograd/linear.h"
#include "infini_train/include/dispatcher.h"
#include "infini_train/include/tensor.h"

Expand Down Expand Up @@ -70,6 +70,7 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
const int64_t k = input_dims[input_dims.size() - 1];
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
const int64_t n = other_dims[other_dims.size() - 1];

CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
CHECK_EQ(n, grad_output_dims[grad_output_dims.size() - 1]);

Expand Down Expand Up @@ -148,7 +149,9 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
// TODO(dcj): support linear without bias later
std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
int64_t out_features, const std::shared_ptr<Tensor> &grad_output, const bool bias) {
int64_t in_features, int64_t out_features, const std::vector<int64_t> &input_dims,
const std::shared_ptr<Tensor> &grad_output, bool bias,
infini_train::autograd::LinearGradFlags grad_flags) {
/*
transpose: grad_input = grad_output * weight
grad_input[*, in_features] = grad_output[*, out_features] * weight[out_features, in_features]
Expand All @@ -160,32 +163,41 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
grad_weight[in_features, out_features] = input[*, in_features]^T * grad_output[*, out_features]
grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
*/
const auto compute_grad_input = grad_flags.input;
const auto compute_grad_weight = grad_flags.weight;
const auto compute_grad_bias = grad_flags.bias;

const auto &input_dims = input->Dims();
CHECK_GE(input_dims.size(), 2);
const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
const int64_t in_features = *input_dims.rbegin();

const auto &weight_dims = weight->Dims();
CHECK_EQ(weight_dims.size(), 2);
CHECK_EQ(in_features, weight_dims[transpose ? 1 : 0]);
CHECK_EQ(out_features, weight_dims[transpose ? 0 : 1]);
std::vector<int64_t> weight_dims
= transpose ? std::vector<int64_t>{out_features, in_features} : std::vector<int64_t>{in_features, out_features};

auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
std::shared_ptr<Tensor> grad_input = nullptr;
std::shared_ptr<Tensor> grad_weight = nullptr;
std::shared_ptr<Tensor> grad_bias = nullptr;
if (bias) {
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);

if (compute_grad_input) {
CHECK(weight != nullptr) << "compute_grad_input=true but weight is nullptr (selective save mismatch)";
grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
if (transpose) {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
} else {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
}
}

if (transpose) {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
} else {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
if (compute_grad_weight) {
CHECK(input != nullptr) << "compute_grad_weight=true but input is nullptr (selective save mismatch)";
grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
if (transpose) {
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
} else {
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
}
}
if (bias) {

if (compute_grad_bias && bias) {
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
}

Expand Down
Loading
Loading