InfiniTensor
diff --git a/tests/common/test_macros.cmake b/cmake/test_macros.cmake
tests/common/test_macros.cmake renamed to cmake/test_macros.cmake
Lines changed: 3 additions & 10 deletions
diff --git a/‎docs/test_infrastructure_design.md‎
Lines changed: 39 additions & 34 deletions b/‎docs/test_infrastructure_design.md‎
Lines changed: 39 additions & 34 deletions
diff --git a/‎docs/test_usage_guide.md‎
Lines changed: 16 additions & 17 deletions b/‎docs/test_usage_guide.md‎
Lines changed: 16 additions & 17 deletions
diff --git a/‎example/gpt2/checkpoint_loader.cc‎
Lines changed: 1 addition & 0 deletions b/‎example/gpt2/checkpoint_loader.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎example/gpt2/config.h‎
Lines changed: 16 additions & 0 deletions b/‎example/gpt2/config.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎example/gpt2/main.cc‎
Lines changed: 1 addition & 0 deletions b/‎example/gpt2/main.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎example/llama3/checkpoint_loader.cc‎
Lines changed: 1 addition & 0 deletions b/‎example/llama3/checkpoint_loader.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎example/llama3/config.h‎
Lines changed: 22 additions & 0 deletions b/‎example/llama3/config.h‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎example/llama3/main.cc‎
Lines changed: 1 addition & 0 deletions b/‎example/llama3/main.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎infini_train/include/nn/parallel/global.h‎
Lines changed: 0 additions & 2 deletions b/‎infini_train/include/nn/parallel/global.h‎
Lines changed: 0 additions & 2 deletions
@@ -17,9 +17,6 @@
 
 include_guard(GLOBAL)
 
-# Path to this file's directory (tests/common/)
-set(TEST_MACROS_DIR "${CMAKE_CURRENT_LIST_DIR}")
-
 # -----------------------------------------------------------------------------
 # Load GoogleTest module (provides gtest_discover_tests)
 # -----------------------------------------------------------------------------
@@ -59,20 +56,16 @@ macro(infini_train_add_test)
   endif()
 
   # 1. Create executable target
-  add_executable(${ARG_TEST_NAME} ${ARG_SOURCES})
+  add_executable(${ARG_TEST_NAME} ${ARG_SOURCES} $<TARGET_OBJECTS:test_main>)
 
   # 2. Disable -Werror so tests can run under relaxed warning levels
   target_compile_options(${ARG_TEST_NAME} PRIVATE -Wno-error)
 
-  # 3. Link Google Test
-  target_link_libraries(${ARG_TEST_NAME} PRIVATE
-    GTest::gtest
-    GTest::gtest_main
-  )
+  # 3. Link Google Test (uses custom main from test_main that initializes GlobalEnv)
+  target_link_libraries(${ARG_TEST_NAME} PRIVATE GTest::gtest)
 
   # 4. Add include paths
   target_include_directories(${ARG_TEST_NAME} PRIVATE
-    ${TEST_MACROS_DIR}
     ${glog_SOURCE_DIR}/src
   )
 
 
@@ -9,7 +9,6 @@ tests/
 ├── CMakeLists.txt              # 顶层：include 宏 + add_subdirectory
 ├── common/
 │   ├── CMakeLists.txt          # header-only interface library
-│   ├── test_macros.cmake       # CMake 宏：infini_train_add_test / infini_train_add_test_suite
 │   └── test_utils.h            # C++ 基类、skip 宏、填充工具函数
 ├── tensor/                     # Tensor 创建 / 拷贝 / 销毁 / 算子
 ├── optimizer/                  # Optimizer 创建 / step
@@ -18,6 +17,9 @@ tests/
 ├── lora/                       # LoRA 相关
 ├── dtype/                      # Scalar / dtype dispatch + 编译期负面测试
 └── transformer/                # Transformer 架构测试
+
+cmake/
+└── test_macros.cmake           # CMake 宏：infini_train_add_test / infini_train_add_test_suite
 ```
 
 ### 核心设计：设备参数化
@@ -32,14 +34,15 @@ tests/
 
 | 基类 | 用途 | 提供的能力 |
 |------|------|-----------|
-| `InfiniTrainTest` | 通用参数化测试 | `GetDevice()`, `createTensor(shape, dtype, requires_grad)` |
-| `AutogradTestBase` | Autograd 测试 | `createTensor(shape, value)` 自动 `requires_grad=true` + 顺序填充 |
+| `InfiniTrainTest` | 通用参数化测试 | `GetDevice()`（当前参数化的 `Device`） |
 
-**为什么需要 AutogradTestBase？**
+测试中的张量直接通过 `Tensor` 构造接口创建：
 
-- 所有 autograd 测试都需要 `requires_grad=true`
-- 所有 autograd 测试都需要填充数据
-- 前向/反向传播测试必须有输入数据才能验证结果。`AutogradTestBase` 把 `FillSequentialTensor` 内置了，避免每个测试都手动调用
+```cpp
+auto t = std::make_shared<Tensor>(shape, DataType::kFLOAT32, GetDevice());
+auto g = std::make_shared<Tensor>(shape, DataType::kFLOAT32, GetDevice(), /*requires_grad=*/true);
+t->Fill(1.0f);                   // 常量填充（framework 内置 API）
+```
 
 ### 跳过特定平台
 
@@ -110,14 +113,14 @@ mkdir tests/foo
 // tests/foo/test_foo_basic.cc
 #include <gtest/gtest.h>
 #include "infini_train/include/tensor.h"
-#include "test_utils.h"
+#include "tests/common/test_utils.h"
 
 using namespace infini_train;
 
 class FooBasicTest : public infini_train::test::InfiniTrainTest {};
 
 TEST_P(FooBasicTest, CreateTensor) {
-    auto tensor = createTensor({2, 3});
+    auto tensor = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32, GetDevice());
     EXPECT_NE(tensor, nullptr);
 }
 
@@ -129,12 +132,9 @@ TEST_P(FooBasicTest, CUDAOnlyFeature) {
 INFINI_TRAIN_REGISTER_TEST(FooBasicTest);
 ```
 
-**基类选择（或创建）：**
+**基类选择：**
 
-| 场景 | 基类 |
-|------|------|
-| 通用测试 | `InfiniTrainTest`（提供 `createTensor(shape, dtype, requires_grad)`） |
-| 需要 autograd | `AutogradTestBase`（提供 `createTensor(shape, value)`，自动 `requires_grad=true` + 顺序填充） |
+所有测试类都继承 `InfiniTrainTest`。需要梯度时，给 `Tensor` 构造传 `requires_grad=true`；需要填充数据时用 `Tensor::Fill`。
 
 **Step 2: 写 CMakeLists.txt**
 
@@ -176,8 +176,6 @@ add_subdirectory(foo)
 | 函数 / 宏 | 用途 |
 |-----------|------|
 | `GetDevice()` | 返回当前参数化的 `Device`（基类方法） |
-| `createTensor(shape, dtype, requires_grad)` | 在当前设备创建 tensor（`InfiniTrainTest` 基类方法） |
-| `FillSequentialTensor(tensor, start)` | 填充递增值，自动处理 Device tensor（先填 CPU 再 copy） |
 | `SKIP_CPU()` | 跳过 CPU 实例 |
 | `ONLY_CPU()` | 只在 CPU 实例运行 |
 | `ONLY_CUDA()` | 只在 CUDA 实例运行 |
@@ -201,23 +199,12 @@ enum class DeviceType : int8_t {
 
 ### 4.2 测试工具层：`test_utils.h`
 
-1. 新增运行时检测函数和 `CudaDeviceTypes` 的对称版本：
+1. 新增 MACA 头文件的编译期引入（和 CUDA 对称）：
 
 ```cpp
-#ifdef USE_MACA
-inline int GetMacaDeviceCount() { /* macaGetDeviceCount ... */ }
-#else
-inline int GetMacaDeviceCount() { return 0; }
+#if defined(USE_MACA)
+#include <maca_runtime_api.h>
 #endif
-inline bool HasMacaRuntime() { return GetMacaDeviceCount() > 0; }
-
-inline std::vector<Device::DeviceType> MacaDeviceTypes() {
-    if (HasMacaRuntime()) {
-        return {Device::DeviceType::kMACA};
-    }
-    LOG(INFO) << "No MACA runtime found, skipping MACA tests.";
-    return {};
-}
 ```
 
 2. 新增 `ONLY_MACA()` 宏：
@@ -227,18 +214,36 @@ inline std::vector<Device::DeviceType> MacaDeviceTypes() {
     do { if (GetParam() != infini_train::Device::DeviceType::kMACA) { GTEST_SKIP() << "MACA-only test"; } } while (0)
 ```
 
+如果希望有类似 `REQUIRE_MIN_DEVICES(n)` 但针对 MACA 的语义，可以按 `USE_CUDA` 分支的写法增加一个新的宏；同理，未开启 `USE_MACA` 编译开关时，该宏直接跳过（skip）测试即可。
+
 ### 4.3 注册宏：新增 MACA 实例
 
+沿用 `USE_CUDA` 的做法，未开启编译开关时不注册对应实例：
+
 ```cpp
+#if defined(USE_CUDA) && defined(USE_MACA)
 #define INFINI_TRAIN_REGISTER_TEST(TestName)                                    \
     INSTANTIATE_TEST_SUITE_P(CPU, TestName,                                     \
         ::testing::Values(infini_train::Device::DeviceType::kCPU));             \
     INSTANTIATE_TEST_SUITE_P(CUDA, TestName,                                    \
-        ::testing::ValuesIn(infini_train::test::CudaDeviceTypes()));            \
+        ::testing::Values(infini_train::Device::DeviceType::kCUDA));            \
     INSTANTIATE_TEST_SUITE_P(MACA, TestName,                                    \
-        ::testing::ValuesIn(infini_train::test::MacaDeviceTypes()))
+        ::testing::Values(infini_train::Device::DeviceType::kMACA))
+#elif defined(USE_CUDA)
+#define INFINI_TRAIN_REGISTER_TEST(TestName) /* CPU + CUDA, 同现状 */
+#elif defined(USE_MACA)
+#define INFINI_TRAIN_REGISTER_TEST(TestName)                                    \
+    INSTANTIATE_TEST_SUITE_P(CPU, TestName,                                     \
+        ::testing::Values(infini_train::Device::DeviceType::kCPU));             \
+    INSTANTIATE_TEST_SUITE_P(MACA, TestName,                                    \
+        ::testing::Values(infini_train::Device::DeviceType::kMACA))
+#else
+#define INFINI_TRAIN_REGISTER_TEST(TestName) /* 仅 CPU */
+#endif
 ```
 
+运行时如果机器上没有对应设备（例如 `USE_MACA` 编译但无 MACA 硬件），让测试直接报错而不是静默跳过。
+
 ### 4.4 CMake 层：`test_macros.cmake`
 
 将默认 label 列表从 `cpu cuda` 扩展为 `cpu cuda maca`
@@ -248,7 +253,7 @@ inline std::vector<Device::DeviceType> MacaDeviceTypes() {
 | 步骤 | 文件 | 改动 |
 |------|------|------|
 | 1 | `device.h` | `DeviceType` 枚举新增 `kMACA` |
-| 2 | `test_utils.h` | 新增 `GetMacaDeviceCount()` / `HasMacaRuntime()` / `MacaDeviceTypes()` / `ONLY_MACA()` |
-| 3 | `test_utils.h` | `INFINI_TRAIN_REGISTER_TEST` 新增 MACA 实例 |
+| 2 | `test_utils.h` | 新增 `USE_MACA` 下的 `<maca_runtime_api.h>` 引入、`ONLY_MACA()` 宏 |
+| 3 | `test_utils.h` | `INFINI_TRAIN_REGISTER_TEST` 按 `USE_MACA` 条件新增 MACA 实例 |
 | 4 | `test_macros.cmake` | 将默认 label 列表扩展为 `cpu cuda maca` |
 | 5 | `CMakeLists.txt`（根） | 新增 `USE_MACA` option + MACA SDK 查找 + kernel 编译 |
@@ -49,15 +49,15 @@ ctest -R tensor --output-on-failure
 在 `tests/` 下对应子目录中新建文件，例如 `tests/tensor/test_tensor_copy.cc`：
 
 ```cpp
-#include "common/test_utils.h"
+#include "tests/common/test_utils.h"
 
 class TensorCopyTest : public infini_train::test::InfiniTrainTest {};
 
 TEST_P(TensorCopyTest, CopiesDataCorrectly) {
-    auto src = createTensor({4}, DataType::kFLOAT32);
-    FillSequentialTensor(src);
+    auto src = std::make_shared<Tensor>(std::vector<int64_t>{4}, DataType::kFLOAT32, GetDevice());
+    src->Fill(1.0f);
 
-    auto dst = createTensor({4}, DataType::kFLOAT32);
+    auto dst = std::make_shared<Tensor>(std::vector<int64_t>{4}, DataType::kFLOAT32, GetDevice());
     // ... 执行拷贝并断言 ...
     EXPECT_EQ(dst->Dims(), src->Dims());
 }
@@ -66,7 +66,7 @@ INFINI_TRAIN_REGISTER_TEST(TensorCopyTest);
 ```
 
 注意事项：
-- 继承 `InfiniTrainTest`（autograd 测试继承 `AutogradTestBase`）。
+- 继承 `InfiniTrainTest`。
 - 使用 `TEST_P`，设备参数由框架自动注入。
 - 文件末尾调用 `INFINI_TRAIN_REGISTER_TEST`，自动实例化 CPU 和 CUDA 两个变体。
 
@@ -89,11 +89,9 @@ infini_train_add_test_suite(test_tensor_copy test_tensor_copy.cc)
 | 方法 | 说明 |
 |---|---|
 | `GetDevice()` | 返回当前测试实例的设备（CPU 或 CUDA） |
-| `createTensor(shape)` | 在当前设备上创建 `kFLOAT32` 张量 |
-| `createTensor(shape, dtype)` | 创建指定数据类型的张量 |
-| `createTensor(shape, dtype, requires_grad)` | 创建启用自动微分的张量 |
-| `FillSequentialTensor(tensor)` | 用 0, 1, 2, … 填充张量（自动处理 CPU/GPU 传输） |
-| `FillConstantTensor(tensor, value)` | 用常量填充张量所有元素 |
+| `tensor->Fill(value)` | 用常量填充张量所有元素（`Tensor` 内置方法） |
+
+张量创建直接使用 `std::make_shared<Tensor>(shape, dtype, GetDevice(), requires_grad)`，`requires_grad` 参数默认 `false`，需要梯度的测试传 `true` 即可。
 
 ---
 
@@ -130,20 +128,21 @@ TEST_P(MyTest, 需要多卡) {
 
 ## Autograd 测试
 
-需要预填充输入张量时，继承 `AutogradTestBase`：
+创建启用自动微分的张量时，给 `Tensor` 构造的第四个参数传 `true`：
 
 ```cpp
-#include "common/test_utils.h"
+#include "tests/common/test_utils.h"
 
-class MyOpTest : public infini_train::test::AutogradTestBase {};
+class MyOpTest : public infini_train::test::InfiniTrainTest {};
 
 TEST_P(MyOpTest, 前向传播) {
-    // input_ 和 weight_ 已在当前设备上创建并填充好序列值
-    auto output = MyOp(input_, weight_);
+    auto input = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32, GetDevice(), true);
+    input->Fill(1.0f);
+    auto weight = std::make_shared<Tensor>(std::vector<int64_t>{4, 3}, DataType::kFLOAT32, GetDevice(), true);
+    weight->Fill(0.5f);
+    auto output = MyOp(input, weight);
     EXPECT_NE(output, nullptr);
 }
 
 INFINI_TRAIN_REGISTER_TEST(MyOpTest);
 ```
-
-`AutogradTestBase` 继承自 `InfiniTrainTest`，预先创建了 `input_` 和 `weight_` 张量并填充了序列值。
@@ -87,6 +87,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     gpt2_config.n_layer = n_layer;
     gpt2_config.n_head = n_head;
     gpt2_config.n_embd = n_embd;
+    gpt2::SanitizeGPT2Config(gpt2_config);
     auto local_gpt2 = std::make_shared<nn::TransformerModel>(gpt2_config);
 
     LOG(INFO) << "magic: " << magic << " version: " << version << " block_size: " << block_size
 
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "glog/logging.h"
+
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"
 
 namespace nn = infini_train::nn;
@@ -23,4 +25,18 @@ inline nn::TransformerConfig GPT2Config() {
             .multiple_of = 1};
 }
 
+inline void SanitizeGPT2Config(const nn::TransformerConfig &c) { // Validation only: c is const and never modified.
+    CHECK_GT(c.block_size, 0); // Every constraint is a glog CHECK, so a violation is a fatal error.
+    CHECK_GT(c.vocab_size, 0);
+    CHECK_GE(c.vocab_size, c.original_vocab_size); // May be larger (presumably padded -- TODO confirm), never smaller.
+    CHECK_GT(c.n_layer, 0);
+    CHECK_GT(c.n_head, 0); // Must precede the modulo below: guarantees a non-zero divisor.
+    CHECK_GT(c.n_embd, 0);
+    CHECK_EQ(c.n_embd % c.n_head, 0) << "n_embd must be divisible by n_head";
+    CHECK_EQ(c.n_kv_head, c.n_head) << "GPT-2 does not use GQA; n_kv_head must equal n_head";
+    CHECK(c.attention_type == nn::AttentionType::kStandard) << "GPT-2 requires standard attention";
+    CHECK(c.activation_type == nn::MLPType::kGELU) << "GPT-2 requires GELU activation";
+    CHECK(c.norm_type == nn::NormType::kLayerNorm) << "GPT-2 requires LayerNorm";
+}
+
 } // namespace gpt2
@@ -190,6 +190,7 @@ void Train(const nn::parallel::Rank &rank) {
         model = gpt2::LoadFromLLMC(FLAGS_llmc_filepath);
     } else if (kModelToConfigs.count(FLAGS_model)) {
         model_config = kModelToConfigs.at(FLAGS_model);
+        gpt2::SanitizeGPT2Config(model_config);
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
 
@@ -80,6 +80,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     llama3_config.use_scaled_rope = static_cast<bool>(use_scaled_rope);
     llama3_config.norm_eps = norm_eps;
     llama3_config.max_gen_batch_size = max_gen_bs;
+    llama3::SanitizeLLaMA3Config(llama3_config);
     auto llama3 = std::make_shared<nn::TransformerModel>(llama3_config);
 
     // ========== pp_size：num_stages; vpp_size: num_chunks_per_stage ==========
 
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "glog/logging.h"
+
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"
 
 namespace nn = infini_train::nn;
@@ -22,4 +24,24 @@ inline nn::TransformerConfig LLaMA3Config() {
             .ffn_dim_multiplier = 1.5f,
             .multiple_of = 256};
 }
+
+inline void SanitizeLLaMA3Config(const nn::TransformerConfig &c) { // Validation only: c is const and never modified.
+    CHECK_GT(c.block_size, 0); // Every constraint is a glog CHECK, so a violation is a fatal error.
+    CHECK_GT(c.vocab_size, 0);
+    CHECK_GE(c.vocab_size, c.original_vocab_size); // May be larger (presumably padded -- TODO confirm), never smaller.
+    CHECK_GT(c.n_layer, 0);
+    CHECK_GT(c.n_head, 0); // Must precede the n_embd modulo below: guarantees a non-zero divisor.
+    CHECK_GT(c.n_kv_head, 0); // Likewise guards the n_head % n_kv_head modulo below.
+    CHECK_LE(c.n_kv_head, c.n_head);
+    CHECK_EQ(c.n_head % c.n_kv_head, 0) << "n_head must be divisible by n_kv_head for GQA";
+    CHECK_GT(c.n_embd, 0);
+    CHECK_EQ(c.n_embd % c.n_head, 0) << "n_embd must be divisible by n_head";
+    CHECK(c.attention_type == nn::AttentionType::kRoPE) << "LLaMA-3 requires RoPE attention";
+    CHECK(c.activation_type == nn::MLPType::kSwiGLU) << "LLaMA-3 requires SwiGLU activation";
+    CHECK(c.norm_type == nn::NormType::kRMSNorm) << "LLaMA-3 requires RMSNorm";
+    CHECK(!c.add_bias_linear) << "LLaMA-3 has no bias in linear layers";
+    CHECK(!c.tie_weights) << "LLaMA-3 does not tie embedding and lm_head weights";
+    CHECK(c.ffn_dim_multiplier.has_value()) << "LLaMA-3 requires ffn_dim_multiplier"; // Optional field must be set.
+    CHECK_GT(c.multiple_of, 0);
+}
 } // namespace llama3
@@ -174,6 +174,7 @@ void Train(const nn::parallel::Rank &rank) {
     if (!FLAGS_llmc_filepath.empty()) {
         model = llama3::LoadFromLLMC(FLAGS_llmc_filepath);
     } else {
+        llama3::SanitizeLLaMA3Config(model_config);
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
 
@@ -31,8 +31,6 @@ class GlobalEnv {
     void Init(int threads_per_process, int tensor_parallel_size, bool sequence_parallel_enabled,
               int pipeline_parallel_size, int virtual_pipeline_parallel_size);
 
-    bool IsInitialized() const;
-
     int nnodes() const;
 
     int nproc_per_node() const;
Original file line number	Diff line number	Diff line change
`@@ -190,6 +190,7 @@ void Train(const nn::parallel::Rank &rank) {`
`190`	`190`	`model = gpt2::LoadFromLLMC(FLAGS_llmc_filepath);`
`191`	`191`	`} else if (kModelToConfigs.count(FLAGS_model)) {`
`192`	`192`	`model_config = kModelToConfigs.at(FLAGS_model);`
	`193`	`+ gpt2::SanitizeGPT2Config(model_config);`
`193`	`194`	`model = std::make_shared<nn::TransformerModel>(model_config);`
`194`	`195`	`}`
`195`	`196`
Original file line number	Diff line number	Diff line change
`@@ -174,6 +174,7 @@ void Train(const nn::parallel::Rank &rank) {`
`174`	`174`	`if (!FLAGS_llmc_filepath.empty()) {`
`175`	`175`	`model = llama3::LoadFromLLMC(FLAGS_llmc_filepath);`
`176`	`176`	`} else {`
	`177`	`+ llama3::SanitizeLLaMA3Config(model_config);`
`177`	`178`	`model = std::make_shared<nn::TransformerModel>(model_config);`
`178`	`179`	`}`
`179`	`180`