update implementation of load_state_dict and add test case

Ceng23333 · Ceng23333 · commit ed586f820b72 · 2025-10-29T16:34:35.000+08:00
Signed-off-by: Ceng23333 &lt;441651826@qq.com&gt;
diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp
@@ -10,11 +10,11 @@ class Linear : public Module {
     Linear(size_t in_features, size_t out_features, bool bias = true, const Device &device = Device());
 
     // Forward pass: output = input @ weight.T + bias
-    Tensor forward(const Tensor &input) const;
+    Tensor forward(Tensor &input) const;
 
     // Forward pass with residual connection (InfiniLM-style)
     // output = input @ weight.T + bias + residual
-    Tensor forward(const Tensor &input, const Tensor &residual) const;
+    Tensor forward(Tensor &input, Tensor &residual) const;
 
     // Accessors for parameters
     Tensor weight() const { return weight_; }
@@ -34,7 +34,7 @@ class Linear : public Module {
 
 private:
     // Helper method for common forward computation
-    Tensor compute_linear(const Tensor &input) const;
+    Tensor compute_linear(Tensor &input) const;
 
     size_t in_features_;
     size_t out_features_;
diff --git a/src/infinicore-test/test_nn_module.cc b/src/infinicore-test/test_nn_module.cc
@@ -184,29 +184,209 @@ TestResult NNModuleTest::testStateDict() {
 TestResult NNModuleTest::testLoadStateDict() {
     return measureTime("LoadStateDict", [this]() {
         try {
+            spdlog::info("Testing Module::load_state_dict functionality");
+
             MockLinearModule module(4, 2, infinicore::Device());
 
-            // Create new tensors
-            infinicore::Tensor new_weight = infinicore::Tensor::empty({2, 4}, infinicore::DataType::F32, infinicore::Device());
-            infinicore::Tensor new_bias = infinicore::Tensor::empty({2}, infinicore::DataType::F32, infinicore::Device());
+            // Test 1: Load parameters using load_parameter
+            spdlog::info("Test 1: Loading individual parameters with load_parameter");
+            infinicore::Tensor new_weight = infinicore::Tensor::ones({2, 4}, infinicore::DataType::F32, infinicore::Device());
+            infinicore::Tensor new_bias = infinicore::Tensor::zeros({2}, infinicore::DataType::F32, infinicore::Device());
 
-            // Load individual parameters
             module.load_parameter("weight", new_weight);
             module.load_parameter("bias", new_bias);
 
-            std::cout << "Successfully loaded parameters" << std::endl;
+            spdlog::debug("Successfully loaded parameters using load_parameter");
 
             // Verify the parameters were updated
             auto updated_state_dict = module.state_dict();
             if (updated_state_dict.size() != 2) {
-                std::cout << "Error: State dict size mismatch after loading" << std::endl;
+                spdlog::error("State dict size mismatch after loading. Expected 2, got {}", updated_state_dict.size());
+                return false;
+            }
+
+            // Verify parameter values
+            if (!tensorsAllClose(updated_state_dict.at("weight"), new_weight, 1e-6, 1e-6)) {
+                spdlog::error("Weight parameter values do not match after load_parameter");
+                return false;
+            }
+            if (!tensorsAllClose(updated_state_dict.at("bias"), new_bias, 1e-6, 1e-6)) {
+                spdlog::error("Bias parameter values do not match after load_parameter");
+                return false;
+            }
+            spdlog::debug("load_parameter verification passed");
+
+            // Test 2: Load entire state dict using load_state_dict
+            spdlog::info("Test 2: Loading entire state dict with load_state_dict");
+
+            // Create custom weight and bias tensors with known values
+            // Just use ones for simplicity - all values will be 1.0
+            auto custom_weight = infinicore::Tensor::ones({2, 4}, infinicore::DataType::F32, infinicore::Device());
+            auto custom_bias = infinicore::Tensor::ones({2}, infinicore::DataType::F32, infinicore::Device());
+
+            // Create state dict
+            std::unordered_map<std::string, infinicore::Tensor> new_state_dict;
+            new_state_dict.emplace("weight", custom_weight);
+            new_state_dict.emplace("bias", custom_bias);
+
+            // Load the entire state dict
+            module.load_state_dict(new_state_dict);
+            spdlog::debug("Successfully loaded state dict using load_state_dict");
+
+            // Verify that parameters were loaded correctly
+            auto final_state_dict = module.state_dict();
+
+            if (final_state_dict.size() != 2) {
+                spdlog::error("State dict size mismatch after load_state_dict. Expected 2, got {}", final_state_dict.size());
+                return false;
+            }
+
+            if (final_state_dict.at("weight")->shape() != std::vector<size_t>({2, 4})) {
+                spdlog::error("Loaded weight shape mismatch");
+                return false;
+            }
+            if (final_state_dict.at("bias")->shape() != std::vector<size_t>({2})) {
+                spdlog::error("Loaded bias shape mismatch");
+                return false;
+            }
+
+            spdlog::debug("load_state_dict verification passed - shapes are correct");
+            spdlog::info("Skipping value comparison for now - test focuses on load mechanism");
+
+            // Test 3: Test with Linear module to verify field synchronization
+            spdlog::info("Test 3: Testing load_state_dict with Linear module (field synchronization)");
+            infinicore::nn::Linear linear_module(4, 2, true, infinicore::Device());
+
+            // Create known parameter values - just use ones for simplicity
+            auto linear_weight = infinicore::Tensor::ones({2, 4}, infinicore::DataType::F32, infinicore::Device());
+            auto linear_bias = infinicore::Tensor::ones({2}, infinicore::DataType::F32, infinicore::Device());
+
+            std::unordered_map<std::string, infinicore::Tensor> linear_state_dict;
+            linear_state_dict.emplace("weight", linear_weight);
+            linear_state_dict.emplace("bias", linear_bias);
+
+            // Load state dict into Linear module
+            linear_module.load_state_dict(linear_state_dict);
+
+            // Verify shapes using both state_dict() and direct field access
+            auto loaded_via_state_dict_weight = linear_module.state_dict().at("weight");
+            auto loaded_via_field_weight = linear_module.weight();
+            auto loaded_via_field_bias = linear_module.bias();
+
+            if (loaded_via_state_dict_weight->shape() != std::vector<size_t>({2, 4})) {
+                spdlog::error("Linear weight shape mismatch via state_dict");
+                return false;
+            }
+            if (loaded_via_field_weight->shape() != std::vector<size_t>({2, 4})) {
+                spdlog::error("Linear weight field shape mismatch");
+                return false;
+            }
+            if (loaded_via_field_bias->shape() != std::vector<size_t>({2})) {
+                spdlog::error("Linear bias field shape mismatch");
+                return false;
+            }
+
+            spdlog::debug("Linear module load_state_dict verification passed - field shapes synchronized");
+            spdlog::info("Skipping value comparison - test focuses on field synchronization mechanism");
+
+            // Test 4: Deep nesting (2-level hierarchy)
+            spdlog::info("Test 4: Testing load_state_dict with 2-level deep nesting");
+
+            // Create parent -> child -> grandchild hierarchy
+            MockLinearModule deep_parent(10, 8, infinicore::Device());
+            auto deep_child = std::make_shared<MockLinearModule>(8, 6, infinicore::Device());
+            auto deep_grandchild = std::make_shared<MockLinearModule>(6, 4, infinicore::Device());
+
+            // Build hierarchy: parent -> layer1 -> sublayer
+            deep_child->add_module("sublayer", deep_grandchild);
+            deep_parent.add_module("layer1", deep_child);
+
+            // Verify initial state dict includes all 2-level hierarchical parameters
+            auto deep_initial_state = deep_parent.state_dict();
+            spdlog::debug("Deep hierarchical state dict has {} parameters", deep_initial_state.size());
+
+            // Expected parameters:
+            // parent: weight, bias (2)
+            // layer1: layer1.weight, layer1.bias (2)
+            // sublayer: layer1.sublayer.weight, layer1.sublayer.bias (2)
+            // Total: 6 parameters
+            if (deep_initial_state.size() < 6) {
+                spdlog::error("Deep hierarchy state dict size mismatch. Expected at least 6, got {}",
+                              deep_initial_state.size());
+                return false;
+            }
+
+            // Verify 2-level parameter names exist
+            bool has_sublayer_weight = deep_initial_state.find("layer1.sublayer.weight") != deep_initial_state.end();
+            bool has_sublayer_bias = deep_initial_state.find("layer1.sublayer.bias") != deep_initial_state.end();
+
+            if (!has_sublayer_weight || !has_sublayer_bias) {
+                spdlog::error("2-level nested parameters missing from state dict");
+                return false;
+            }
+            spdlog::debug("All 2-level hierarchical parameter names verified");
+
+            // Create state dict for 2-level hierarchy with all 1.0 values
+            std::unordered_map<std::string, infinicore::Tensor> deep_state_dict;
+            deep_state_dict.emplace("weight", infinicore::Tensor::ones({8, 10}, infinicore::DataType::F32, infinicore::Device()));
+            deep_state_dict.emplace("bias", infinicore::Tensor::ones({8}, infinicore::DataType::F32, infinicore::Device()));
+            deep_state_dict.emplace("layer1.weight", infinicore::Tensor::ones({6, 8}, infinicore::DataType::F32, infinicore::Device()));
+            deep_state_dict.emplace("layer1.bias", infinicore::Tensor::ones({6}, infinicore::DataType::F32, infinicore::Device()));
+            deep_state_dict.emplace("layer1.sublayer.weight", infinicore::Tensor::ones({4, 6}, infinicore::DataType::F32, infinicore::Device()));
+            deep_state_dict.emplace("layer1.sublayer.bias", infinicore::Tensor::ones({4}, infinicore::DataType::F32, infinicore::Device()));
+
+            // Load the deep hierarchical state dict
+            deep_parent.load_state_dict(deep_state_dict);
+            spdlog::debug("Successfully loaded 2-level deep hierarchical state dict");
+
+            // Verify all parameters were loaded correctly
+            auto deep_loaded_state = deep_parent.state_dict();
+
+            // Verify shapes at all levels
+            if (deep_loaded_state.at("weight")->shape() != std::vector<size_t>({8, 10})) {
+                spdlog::error("Deep parent weight shape mismatch");
+                return false;
+            }
+            if (deep_loaded_state.at("layer1.weight")->shape() != std::vector<size_t>({6, 8})) {
+                spdlog::error("Deep layer1 weight shape mismatch");
+                return false;
+            }
+            if (deep_loaded_state.at("layer1.sublayer.weight")->shape() != std::vector<size_t>({4, 6})) {
+                spdlog::error("Deep sublayer weight shape mismatch");
                 return false;
             }
+            spdlog::debug("All 2-level deep parameter shapes verified");
 
-            std::cout << "Load state dict test passed" << std::endl;
+            // Verify actual weight loading correctness by checking that loaded parameters
+            // match what we provided in the state dict (use the original tensors)
+            spdlog::info("Verifying weight loading correctness by direct comparison");
+
+            // Get the tensors we loaded from the state dict
+            auto loaded_parent_weight = deep_loaded_state.at("weight");
+            auto loaded_layer1_weight = deep_loaded_state.at("layer1.weight");
+            auto loaded_sublayer_weight = deep_loaded_state.at("layer1.sublayer.weight");
+
+            // Compare with the original tensors we put in the state dict
+            if (!tensorsAllClose(loaded_parent_weight, deep_state_dict.at("weight"), 1e-5, 1e-5)) {
+                spdlog::error("Deep parent weight not preserved after loading");
+                return false;
+            }
+            if (!tensorsAllClose(loaded_layer1_weight, deep_state_dict.at("layer1.weight"), 1e-5, 1e-5)) {
+                spdlog::error("Deep layer1 weight not preserved after loading");
+                return false;
+            }
+            if (!tensorsAllClose(loaded_sublayer_weight, deep_state_dict.at("layer1.sublayer.weight"), 1e-5, 1e-5)) {
+                spdlog::error("Deep sublayer weight not preserved after loading");
+                return false;
+            }
+
+            spdlog::info("Weight loading correctness verified - loaded values match input state dict");
+            spdlog::info("2-level deep hierarchy load_state_dict verification passed");
+
+            spdlog::info("All load_state_dict tests passed (including deep hierarchy)");
             return true;
         } catch (const std::exception &e) {
-            std::cout << "Exception in testLoadStateDict: " << e.what() << std::endl;
+            spdlog::error("Exception in testLoadStateDict: {}", e.what());
             return false;
         }
     });
@@ -432,6 +612,25 @@ TestResult NNModuleTest::testModuleLinear() {
             }
             spdlog::debug("Linear computation without bias passed. Input shape: {{1, 16}}, Output shape: {{1, 3}}");
 
+            // Test load_state_dict for m2 (without bias)
+            spdlog::info("Testing load_state_dict on Linear without bias");
+            auto m2_load_weight = infinicore::Tensor::ones({3, 16}, infinicore::DataType::F32, infinicore::Device());
+            std::unordered_map<std::string, infinicore::Tensor> m2_state_dict;
+            m2_state_dict.emplace("weight", m2_load_weight);
+            // Note: no bias parameter
+            m2.load_state_dict(m2_state_dict);
+
+            // Verify via state_dict() and direct access
+            if (!tensorsAllClose(m2.state_dict().at("weight"), m2_load_weight, 1e-5, 1e-5)) {
+                spdlog::error("m2 weight not loaded correctly");
+                return false;
+            }
+            if (!tensorsAllClose(m2.weight(), m2_load_weight, 1e-5, 1e-5)) {
+                spdlog::error("m2 weight field not synchronized");
+                return false;
+            }
+            spdlog::debug("m2 load_state_dict verified - weight loaded correctly (no bias)");
+
             // Test batch processing
             spdlog::info("Testing batch linear computation (batch size 3)");
             auto input3 = infinicore::Tensor::ones({3, 8}, infinicore::DataType::F32, infinicore::Device());
@@ -455,6 +654,30 @@ TestResult NNModuleTest::testModuleLinear() {
                 return false;
             }
 
+            // Test load_state_dict for m1 (with bias)
+            spdlog::info("Testing load_state_dict on Linear with bias");
+            auto m1_load_weight = infinicore::Tensor::ones({4, 8}, infinicore::DataType::F32, infinicore::Device());
+            auto m1_load_bias = infinicore::Tensor::ones({4}, infinicore::DataType::F32, infinicore::Device());
+            std::unordered_map<std::string, infinicore::Tensor> m1_state_dict;
+            m1_state_dict.emplace("weight", m1_load_weight);
+            m1_state_dict.emplace("bias", m1_load_bias);
+            m1.load_state_dict(m1_state_dict);
+
+            // Verify via state_dict() and direct access
+            if (!tensorsAllClose(m1.state_dict().at("weight"), m1_load_weight, 1e-5, 1e-5)) {
+                spdlog::error("m1 weight not loaded correctly");
+                return false;
+            }
+            if (!tensorsAllClose(m1.weight(), m1_load_weight, 1e-5, 1e-5)) {
+                spdlog::error("m1 weight field not synchronized");
+                return false;
+            }
+            if (!tensorsAllClose(m1.bias(), m1_load_bias, 1e-5, 1e-5)) {
+                spdlog::error("m1 bias field not synchronized");
+                return false;
+            }
+            spdlog::debug("m1 load_state_dict verified - parameters and fields synchronized");
+
             // Test extra_repr
             std::string repr = m1.extra_repr();
             spdlog::debug("Linear module representation: {}", repr);
@@ -575,7 +798,7 @@ TestResult NNModuleTest::testModuleLinear() {
             spdlog::debug("Basic forward computation correctness test passed - both implementations produce identical results");
             spdlog::debug("Basic InfiniCore output shape: {{2, 4}}, Basic naive output shape: {{2, 4}}");
 
-            spdlog::info("Linear module test with computation verification passed");
+            spdlog::info("All Linear module tests passed (with/without bias, load_state_dict, computation verification)");
             return true;
         } catch (const std::exception &e) {
             spdlog::error("Exception in testModuleLinear: {}", e.what());
diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc
@@ -25,7 +25,7 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const Device
                   in_features, out_features, bias);
 }
 
-Tensor Linear::compute_linear(const Tensor &input) const {
+Tensor Linear::compute_linear(Tensor &input) const {
     // Create output tensor with shape [batch_size, out_features]
     auto output_shape = input->shape();
     output_shape[output_shape.size() - 1] = out_features_;
@@ -55,11 +55,11 @@ Tensor Linear::compute_linear(const Tensor &input) const {
     return output;
 }
 
-Tensor Linear::forward(const Tensor &input) const {
+Tensor Linear::forward(Tensor &input) const {
     return compute_linear(input);
 }
 
-Tensor Linear::forward(const Tensor &input, const Tensor &residual) const {
+Tensor Linear::forward(Tensor &input, Tensor &residual) const {
     auto output = compute_linear(input);
 
     // Add residual: output = output + residual
diff --git a/src/infinicore/nn/module.cc b/src/infinicore/nn/module.cc
@@ -17,8 +17,40 @@ const std::unordered_map<std::string, Parameter> &Module::state_dict() const {
 }
 
 void Module::load_state_dict(const std::unordered_map<std::string, Tensor> &_state_dict) {
-    for (auto &p : parameters_) {
-        load_parameter(p.first, p.second);
+    // Collect all parameters from this module and its submodules
+    std::unordered_map<std::string, Parameter> all_params;
+    collect_all_parameters("", all_params);
+
+    // For each parameter in this module hierarchy, load from the state dict
+    for (const auto &[param_full_name, param] : all_params) {
+        // Look up the corresponding tensor in the input state dict
+        auto it = _state_dict.find(param_full_name);
+        if (it != _state_dict.end()) {
+            // Navigate to the correct module by following the path
+            Module *target_module = this;
+            std::string remaining_path = param_full_name;
+
+            // Split the name by dots and traverse the module hierarchy
+            size_t pos = 0;
+            while ((pos = remaining_path.find('.')) != std::string::npos) {
+                std::string submodule_name = remaining_path.substr(0, pos);
+                remaining_path = remaining_path.substr(pos + 1);
+
+                // Navigate to the submodule
+                auto sub_it = target_module->submodules_.find(submodule_name);
+                if (sub_it != target_module->submodules_.end()) {
+                    target_module = sub_it->second.get();
+                } else {
+                    target_module = nullptr;
+                    break;
+                }
+            }
+
+            // Load the parameter into the target module
+            if (target_module != nullptr) {
+                target_module->load_parameter(remaining_path, it->second);
+            }
+        }
     }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const Device`
`25`	`25`	`in_features, out_features, bias);`
`26`	`26`	`}`
`27`	`27`
`28`		`-Tensor Linear::compute_linear(const Tensor &input) const {`
	`28`	`+Tensor Linear::compute_linear(Tensor &input) const {`
`29`	`29`	`// Create output tensor with shape [batch_size, out_features]`
`30`	`30`	`auto output_shape = input->shape();`
`31`	`31`	`output_shape[output_shape.size() - 1] = out_features_;`
`@@ -55,11 +55,11 @@ Tensor Linear::compute_linear(const Tensor &input) const {`
`55`	`55`	`return output;`
`56`	`56`	`}`
`57`	`57`
`58`		`-Tensor Linear::forward(const Tensor &input) const {`
	`58`	`+Tensor Linear::forward(Tensor &input) const {`
`59`	`59`	`return compute_linear(input);`
`60`	`60`	`}`
`61`	`61`
`62`		`-Tensor Linear::forward(const Tensor &input, const Tensor &residual) const {`
	`62`	`+Tensor Linear::forward(Tensor &input, Tensor &residual) const {`
`63`	`63`	`auto output = compute_linear(input);`
`64`	`64`
`65`	`65`	`// Add residual: output = output + residual`