Skip to content
Closed

CTest #143

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/glog"]
path = third_party/glog
url = git@github.com:google/glog.git
url = https://github.com/google/glog.git
[submodule "third_party/gflags"]
path = third_party/gflags
url = git@github.com:gflags/gflags.git
url = https://github.com/gflags/gflags.git
[submodule "third_party/eigen"]
path = third_party/eigen
url = git@github.com:InfiniTensor/eigen-mirror.git
url = https://github.com/eigenteam/eigen-git-mirror.git
35 changes: 23 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
option(USE_OMP "Use OpenMP as backend for Eigen" ON)
option(USE_NCCL "Build project for distributed running" ON)
option(BUILD_TEST "Build InfiniTrain tests" OFF)

project(infini_train VERSION 0.5.0 LANGUAGES CXX)

Expand All @@ -14,6 +15,19 @@ set(CMAKE_CXX_EXTENSIONS OFF)
# Generate compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# ------------------------------------------------------------------------------
# GoogleTest (submodule)
# ------------------------------------------------------------------------------
if(BUILD_TEST)
# Fail fast with an actionable message when the googletest submodule has not
# been initialized, instead of a confusing add_subdirectory error later.
if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/CMakeLists.txt)
message(FATAL_ERROR "googletest submodule not found at third_party/googletest. "
"Run: git submodule update --init third_party/googletest")
endif()
# On MSVC, make gtest link against the shared CRT so it matches the rest of
# the project; FORCE overrides any cached value.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(third_party/googletest)
# Enables CTest targets; NOTE(review): called here rather than at top level,
# so it only takes effect when BUILD_TEST=ON — confirm this is intentional.
enable_testing()
endif()

# ------------------------------------------------------------------------------
# Third-party deps
# ------------------------------------------------------------------------------
Expand All @@ -26,7 +40,9 @@ include_directories(${gflags_SOURCE_DIR}/include)
set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
add_subdirectory(third_party/glog)
# add_compile_definitions(GLOG_USE_GLOG_EXPORT=1)
include_directories(${glog_SOURCE_DIR}/src)
# include_directories(${glog_BINARY_DIR}/glog)

# eigen
if(USE_OMP)
Expand All @@ -48,6 +64,10 @@ endif()
# Framework core sources (*.cc), excluding cpu kernels (they are built separately)
file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
if(NOT USE_CUDA)
list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
endif()
if(NOT USE_NCCL)
list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
endif()
Expand Down Expand Up @@ -190,17 +210,8 @@ add_executable(llama3
)
link_infini_train_exe(llama3)

# Tools
add_subdirectory(tools/infini_run)
set_target_properties(infini_run PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

# Tests
add_executable(test_hook test/hook/test_hook.cc)
link_infini_train_exe(test_hook)

add_executable(test_precision_check test/hook/test_precision_check.cc)
link_infini_train_exe(test_precision_check)

add_executable(test_lora test/lora/test_lora.cc)
link_infini_train_exe(test_lora)

if(BUILD_TEST)
add_subdirectory(tests)
endif()
1 change: 1 addition & 0 deletions infini_train/include/autograd/function.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Function : public std::enable_shared_from_this<Function> {

protected:
std::vector<std::shared_ptr<Tensor>> saved_tensors_;
std::vector<bool> needs_input_grad_;

private:
std::vector<std::pair<std::shared_ptr<Function>, int>> next_functions_;
Expand Down
13 changes: 12 additions & 1 deletion infini_train/include/autograd/linear.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

Expand All @@ -10,6 +11,13 @@ class Tensor;
}

namespace infini_train::autograd {

// Per-input gradient requirement flags for the Linear op's backward pass.
// Each flag is true when the corresponding gradient (grad_input, grad_weight,
// grad_bias) must actually be computed; kernels use these to skip work for
// tensors that do not require grad (selective backward).
struct LinearGradFlags {
bool input = false;
bool weight = false;
bool bias = false;
};

class Linear : public Function {
public:
static constexpr char kType[] = "LinearFunction";
Expand All @@ -22,7 +30,10 @@ class Linear : public Function {
std::vector<std::shared_ptr<Tensor>> Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) override;

private:
bool transpose_ = false;
bool bias_ = false;
int64_t in_features_ = 0;
int64_t out_features_ = 0;
bool bias_ = true;
std::vector<int64_t> input_dims_;
};
} // namespace infini_train::autograd
4 changes: 4 additions & 0 deletions infini_train/include/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ class Tensor : public std::enable_shared_from_this<Tensor> {

std::shared_ptr<Tensor> View(const std::vector<int64_t> &dims);
std::shared_ptr<Tensor> Contiguous();
// FIXME: Currently returns true unconditionally. Requires stride tracking in the Tensor
// class before this can be implemented correctly. The guard in elementwise.cu ensures
// non-contiguous tensors fall back to the broadcast path until this is resolved.
bool IsContiguous() const;
std::shared_ptr<Tensor> Flatten(int64_t start = 0, int64_t end = -1);
std::shared_ptr<Tensor> Squeeze(int64_t dim);
std::shared_ptr<Tensor> Unsqueeze(int64_t dim);
Expand Down
7 changes: 7 additions & 0 deletions infini_train/src/autograd/accumulate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
core::DeviceGuard guard(device);

if (grad_output) {
if (grad_output->Dtype() != tensor_->Dtype()) {
LOG(WARNING) << "AccumulateGrad: grad dtype (" << kDataTypeToDesc.at(grad_output->Dtype())
<< ") does not match parameter dtype (" << kDataTypeToDesc.at(tensor_->Dtype())
<< "). This indicates a dtype mismatch in the autograd graph (e.g. autocast "
"running before autograd). The grad is not cast and will be used as-is.";
}

if (grad) {
if (tensor_->ConsumeGradOverwriteFlag()) {
// If the tensor is marked to overwrite its current grad on next grad update
Expand Down
5 changes: 5 additions & 0 deletions infini_train/src/autograd/elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,11 @@ std::vector<std::shared_ptr<Tensor>> Add::Backward(const std::vector<std::shared
CHECK_EQ(grad_outputs.size(), 1);
const auto &grad_output = grad_outputs[0];

// Fast path: no broadcast — grad_a and grad_b are both just grad_output
if (a_dims_ == b_dims_) {
return {grad_output, grad_output};
}

auto device = grad_output->GetDevice().type();
auto [grad_a, grad_b] = Dispatcher::Instance().Call<std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
{device, "AddBackward"}, grad_output, a_dims_, b_dims_);
Expand Down
11 changes: 11 additions & 0 deletions infini_train/src/autograd/function.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ std::vector<std::shared_ptr<Tensor>> Function::Apply(const std::vector<std::shar
}
}

// Populate needs_input_grad_ before Forward/SetupContext so that
// SetupContext can use it for saved-tensor pruning.
// Must be done before NoGradGuard since it checks GradMode.
if (autograd::GradMode::IsEnabled()) {
needs_input_grad_.resize(input_tensors.size());
for (size_t idx = 0; idx < input_tensors.size(); ++idx) {
needs_input_grad_[idx] = input_tensors[idx]->requires_grad();
}
}

std::vector<std::shared_ptr<Tensor>> output_tensors;
{
autograd::NoGradGuard no_grad;
Expand Down Expand Up @@ -129,6 +139,7 @@ void Function::BackwardPartial(const std::shared_ptr<Tensor> &grad_output, int g

saved_tensors_.clear();
grad_outputs_.clear();
needs_input_grad_.clear();
grad_outputs_reached_ = 0;
dependencies_reached_ = 0;

Expand Down
46 changes: 39 additions & 7 deletions infini_train/src/autograd/linear.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,35 @@ std::vector<std::shared_ptr<Tensor>> Linear::Forward(const std::vector<std::shar
}

void Linear::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
const std::vector<std::shared_ptr<Tensor>> &) {
const std::vector<std::shared_ptr<Tensor>> &output_tensors) {
const auto &input = input_tensors[0];
const auto &weight = input_tensors[1];
saved_tensors_ = {input, weight};
// Cast saved tensors to forward compute dtype (output dtype) so backward
// computes in the same precision as forward, matching PyTorch's behavior.

// FIXME: An extra cast (input/weight -> compute_dtype) is performed here because
// autocast runs before autograd. The correct approach is to adjust the ordering or
// integration of autocast and autograd so that autograd receives already-cast tensors,
// avoiding the redundant cast.

// FIXME: compute_dtype is not necessarily the dtype of output_tensor; it should be
// determined by autocast, not derived from output_tensors[0]->Dtype().
auto compute_dtype = output_tensors[0]->Dtype();
bool need_input = needs_input_grad_.size() > 0 && needs_input_grad_[0];
bool need_weight = needs_input_grad_.size() > 1 && needs_input_grad_[1];

auto cast = [&](const std::shared_ptr<Tensor> &t) {
return t->Dtype() == compute_dtype ? t : std::make_shared<Tensor>(t->To(compute_dtype));
};

// grad_input needs weight, grad_weight needs input
saved_tensors_ = {need_weight ? cast(input) : nullptr, need_input ? cast(weight) : nullptr};

transpose_ = true;
bias_ = input_tensors.size() == 3;
in_features_ = weight->Dims()[1];
out_features_ = weight->Dims()[0];
input_dims_ = input->Dims();
}

std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
Expand All @@ -32,13 +55,22 @@ std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::sha
CHECK_EQ(grad_outputs.size(), 1);
const auto &grad_output = grad_outputs[0];

auto device = input->GetDevice().type();
CHECK(!needs_input_grad_.empty()) << "needs_input_grad_ not populated in Linear::Backward";
LinearGradFlags grad_flags = {.input = needs_input_grad_[0],
.weight = needs_input_grad_.size() > 1 && needs_input_grad_[1],
.bias = bias_ && needs_input_grad_.size() > 2 && needs_input_grad_[2]};

auto device = grad_output->GetDevice().type();
// TODO: skip autograd graph construction entirely when no input requires grad
auto [grad_input, grad_weight, grad_bias]
= Dispatcher::Instance()
.Call<std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
{device, "LinearBackward"}, input, weight, true, out_features_, grad_output, bias_);
return bias_ ? std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight, grad_bias}
: std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight};
;
{device, "LinearBackward"}, input, weight, transpose_, in_features_, out_features_, input_dims_,
grad_output, bias_, grad_flags);
if (bias_) {
return {grad_input, grad_weight, grad_bias};
} else {
return {grad_input, grad_weight};
}
}
} // namespace infini_train::autograd
16 changes: 15 additions & 1 deletion infini_train/src/autograd/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,21 @@ void Matmul::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tens
const auto &input1 = input_tensors[0];
const auto &input2 = input_tensors[1];
const auto &output = output_tensors[0];
saved_tensors_ = {input1, input2};
// Cast saved tensors to forward compute dtype (output dtype) so backward
// computes in the same precision as forward, matching PyTorch's behavior.

// FIXME: An extra cast (input1/input2 -> compute_dtype) is performed here because
// autocast runs before autograd. The correct approach is to adjust the ordering or
// integration of autocast and autograd so that autograd receives already-cast tensors,
// avoiding the redundant cast.

// FIXME: compute_dtype is not necessarily the dtype of output_tensor; it should be
// determined by autocast, not derived from output->Dtype().
auto compute_dtype = output->Dtype();
saved_tensors_ = {
input1->Dtype() == compute_dtype ? input1 : std::make_shared<Tensor>(input1->To(compute_dtype)),
input2->Dtype() == compute_dtype ? input2 : std::make_shared<Tensor>(input2->To(compute_dtype)),
};
out_features_ = output->Dims()[0];
}

Expand Down
52 changes: 32 additions & 20 deletions infini_train/src/kernels/cpu/linear.cc
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#include <cstdint>
#include <fcntl.h>
#include <memory>
#include <numeric>
#include <tuple>

#include "glog/logging.h"

#include "infini_train/include/autograd/linear.h"
#include "infini_train/include/dispatcher.h"
#include "infini_train/include/tensor.h"

Expand Down Expand Up @@ -70,6 +70,7 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
const int64_t k = input_dims[input_dims.size() - 1];
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
const int64_t n = other_dims[other_dims.size() - 1];

CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
CHECK_EQ(n, grad_output_dims[grad_output_dims.size() - 1]);

Expand Down Expand Up @@ -148,7 +149,9 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
// TODO(dcj): support linear without bias later
std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
int64_t out_features, const std::shared_ptr<Tensor> &grad_output, const bool bias) {
int64_t in_features, int64_t out_features, const std::vector<int64_t> &input_dims,
const std::shared_ptr<Tensor> &grad_output, bool bias,
infini_train::autograd::LinearGradFlags grad_flags) {
/*
transpose: grad_input = grad_output * weight
grad_input[*, in_features] = grad_output[*, out_features] * weight[out_features, in_features]
Expand All @@ -160,32 +163,41 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
grad_weight[in_features, out_features] = input[*, in_features]^T * grad_output[*, out_features]
grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
*/
const auto compute_grad_input = grad_flags.input;
const auto compute_grad_weight = grad_flags.weight;
const auto compute_grad_bias = grad_flags.bias;

const auto &input_dims = input->Dims();
CHECK_GE(input_dims.size(), 2);
const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
const int64_t in_features = *input_dims.rbegin();

const auto &weight_dims = weight->Dims();
CHECK_EQ(weight_dims.size(), 2);
CHECK_EQ(in_features, weight_dims[transpose ? 1 : 0]);
CHECK_EQ(out_features, weight_dims[transpose ? 0 : 1]);
std::vector<int64_t> weight_dims
= transpose ? std::vector<int64_t>{out_features, in_features} : std::vector<int64_t>{in_features, out_features};

auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
std::shared_ptr<Tensor> grad_input = nullptr;
std::shared_ptr<Tensor> grad_weight = nullptr;
std::shared_ptr<Tensor> grad_bias = nullptr;
if (bias) {
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);

if (compute_grad_input) {
CHECK(weight != nullptr) << "compute_grad_input=true but weight is nullptr (selective save mismatch)";
grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
if (transpose) {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
} else {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
}
}

if (transpose) {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
} else {
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
if (compute_grad_weight) {
CHECK(input != nullptr) << "compute_grad_weight=true but input is nullptr (selective save mismatch)";
grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
if (transpose) {
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
} else {
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
}
}
if (bias) {

if (compute_grad_bias && bias) {
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
}

Expand Down
Loading
Loading