InfiniTensor
diff --git a/‎include/infinicore.hpp‎
Lines changed: 1 addition & 0 deletions b/‎include/infinicore.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/infinicore/nn.hpp‎
Lines changed: 5 additions & 0 deletions b/‎include/infinicore/nn.hpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎include/infinicore/nn/embedding.hpp‎
Lines changed: 83 additions & 0 deletions b/‎include/infinicore/nn/embedding.hpp‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎include/infinicore/nn/linear.hpp‎
Lines changed: 45 additions & 0 deletions b/‎include/infinicore/nn/linear.hpp‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎include/infinicore/nn/module.hpp‎
Lines changed: 96 additions & 8 deletions b/‎include/infinicore/nn/module.hpp‎
Lines changed: 96 additions & 8 deletions
diff --git a/‎include/infinicore/nn/rmsnorm.hpp‎
Lines changed: 77 additions & 0 deletions b/‎include/infinicore/nn/rmsnorm.hpp‎
Lines changed: 77 additions & 0 deletions
@@ -1,4 +1,5 @@
 #pragma once
 
+#include "infinicore/nn.hpp"
 #include "infinicore/ops.hpp"
 #include "infinicore/tensor.hpp"
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "nn/embedding.hpp"
+#include "nn/linear.hpp"
+#include "nn/rmsnorm.hpp"
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+#include <optional>
+
+namespace infinicore::nn {
+
+/**
+ * @brief Embedding layer that maps indices to dense vectors
+ *
+ * A simple lookup table that stores embeddings of a fixed dictionary and size.
+ * This module is often used to store word embeddings and retrieve them using indices.
+ * The input to the module is a tensor of indices, and the output is the corresponding
+ * embedding vectors.
+ *
+ * Similar to PyTorch's nn.Embedding:
+ * https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
+ *
+ * Example:
+ * @code
+ *   // Create embedding: 10000 words, 300-dimensional embeddings
+ *   auto embedding = Embedding(10000, 300);
+ *
+ *   // Input: tensor of indices [batch_size, seq_len]
+ *   auto indices = Tensor::from_data({2, 5}, {3, 5, 12, 8, 99, 0, 1, 45, 67, 23});
+ *
+ *   // Output: [batch_size, seq_len, embedding_dim] = [2, 5, 300]
+ *   auto embeddings = embedding.forward(indices);
+ * @endcode
+ */
+class Embedding : public Module {
+public:
+    /**
+     * @brief Construct an Embedding layer
+     *
+     * @param num_embeddings Size of the dictionary of embeddings (vocabulary size)
+     * @param embedding_dim The size of each embedding vector
+     * @param padding_idx Optional index (default: std::nullopt). Intended, as in PyTorch, to mark an entry that does not
+     *                    contribute to gradient and is not updated during training - TODO confirm the .cpp honours this
+     * @param device Device to create the embedding weight on (default: a default-constructed Device)
+     */
+    Embedding(size_t num_embeddings,
+              size_t embedding_dim,
+              std::optional<int64_t> padding_idx = std::nullopt,
+              const Device &device = Device());
+
+    /**
+     * @brief Forward pass: lookup embeddings for given indices
+     *
+     * @param indices Tensor containing indices into the embedding matrix.
+     *                Can be any shape (*), typically [batch_size] or [batch_size, seq_len]
+     * @return Tensor containing the embedding vectors.
+     *         Shape: (*, embedding_dim) where * matches the input shape
+     *
+     * Example:
+     *   Input shape: [2, 3] -> Output shape: [2, 3, embedding_dim]
+     *   Input shape: [10] -> Output shape: [10, embedding_dim]
+     */
+    Tensor forward(const Tensor &indices) const;
+
+    // Read-only accessors for the configuration values stored in the private members below
+    size_t num_embeddings() const { return num_embeddings_; }
+    size_t embedding_dim() const { return embedding_dim_; }
+    std::optional<int64_t> padding_idx() const { return padding_idx_; }
+
+    // String representation (only declared here; the produced format is defined outside this header)
+    std::string extra_repr() const;
+
+    // Returns weight_ by value as a Tensor (relies on Parameter being convertible to Tensor)
+    Tensor weight() const { return weight_; }
+
+protected:
+    // Embedding table parameter; presumably shaped [num_embeddings, embedding_dim] - TODO confirm in the constructor
+    Parameter weight_;
+
+private:
+    size_t num_embeddings_;   // Vocabulary size
+    size_t embedding_dim_;    // Embedding dimension
+    std::optional<int64_t> padding_idx_;  // Optional padding index
+};
+
+} // namespace infinicore::nn
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+
+namespace infinicore::nn {
+
+class Linear : public Module { // Linear layer declaration; the forward() comments below give the intended formula
+public:
+    Linear(size_t in_features, size_t out_features, bool bias = true, const Device &device = Device()); // bias defaults to true, device to Device()
+
+    // Forward pass: output = input @ weight.T + bias (presumably bias is only added when has_bias() is true - confirm in the .cpp)
+    Tensor forward(Tensor &input) const; // NOTE(review): non-const Tensor&, unlike the const Tensor& used by Embedding/RMSNorm forward - confirm intentional
+
+    // Forward pass with residual connection (InfiniLM-style)
+    // output = input @ weight.T + bias + residual
+    Tensor forward(Tensor &input, Tensor &residual) const; // NOTE(review): whether residual is modified in place is not visible in this header
+
+    // Read-only accessors for the configuration values stored in the private members below
+    size_t in_features() const { return in_features_; }
+    size_t out_features() const { return out_features_; }
+    bool has_bias() const { return has_bias_; }
+
+    // String representation (only declared here; the produced format is defined outside this header)
+    std::string extra_repr() const;
+
+    // Return the parameters by value as Tensors (relies on Parameter being convertible to Tensor)
+    Tensor weight() const { return weight_; }
+    Tensor bias() const { return bias_; } // NOTE(review): what bias_ holds when has_bias() is false is not visible here - check the .cpp
+
+protected:
+    // Parameters; weight_ is presumably shaped [out_features, in_features] given the "weight.T" formula above - TODO confirm
+    Parameter weight_;
+    Parameter bias_;
+
+private:
+    // Helper method for common forward computation (presumably shared by both forward() overloads - confirm in the .cpp)
+    Tensor compute_linear(Tensor &input) const;
+
+    size_t in_features_;  // Returned by in_features()
+    size_t out_features_; // Returned by out_features()
+    bool has_bias_;       // Returned by has_bias()
+};
+
+} // namespace infinicore::nn
@@ -1,12 +1,17 @@
 #pragma once
 
 #include "parameter.hpp"
+#include "../tensor.hpp"
 
 #include <unordered_map>
+#include <type_traits>
+#include <vector>
 
 namespace infinicore::nn {
 class Module {
 public:
+    Module() = default;
+
     const std::unordered_map<std::string, Parameter> &state_dict() const;
 
     void load_state_dict(const std::unordered_map<std::string, Tensor> &_state_dict);
@@ -15,35 +20,118 @@ class Module {
 
     void load_parameter_from_blob(const std::string &name, const void *data);
 
+protected:
     Tensor register_parameter(const std::string &name, Parameter param);
 
+    // Attach an already-constructed submodule to this module under `name`
+    // M must derive from Module (enforced at compile time); an existing entry with the same name is overwritten
+    // The same shared_ptr<M> is handed back so callers can keep a typed handle
     template <typename M>
     std::shared_ptr<M> add_module(const std::string &name, std::shared_ptr<M> submodule) {
+        // Compile-time guard: reject any M that is not a Module subclass
+        static_assert(std::is_base_of_v<Module, M>,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        // shared_ptr<M> converts implicitly to the shared_ptr<Module> held by submodules_
         submodules_[name] = submodule;
+
         return submodule;
     }
 
+    // Construct a new M from the given arguments and attach it as a child under `name`
+    // M must derive from Module; this is enforced at compile time
+    // The arguments are perfect-forwarded to M's constructor and the new child is returned
     template <typename M, typename... Args>
     std::shared_ptr<M> register_module(const std::string &name, Args &&...args) {
+        // Compile-time guard: reject any M that is not a Module subclass
+        static_assert(std::is_base_of_v<Module, M>,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        // Allocate the child first, then let add_module() record it in submodules_
         auto submodule = std::make_shared<M>(std::forward<Args>(args)...);
+
         return add_module(name, submodule);
     }
 
+    // Create and register `count` submodules of type M, named "name.0", "name.1", ... and return them in index order
+    // Every submodule is built from the SAME args, so they are passed on as lvalues rather than forwarded (see loop)
+    // Template parameter M must be a type derived from Module
     template <typename M, typename... Args>
-    std::vector<std::shared_ptr<M>> register_modules(size_t layers, const std::string &name, Args &&...args) {
-        auto submodules = std::vector<std::shared_ptr<M>>(layers);
-        for (size_t i = 0; i < layers; i++) {
-            register_module<M>(name + "." + std::to_string(i), std::forward<Args>(args)...);
+    std::vector<std::shared_ptr<M>> register_modules(size_t count, const std::string &name, Args &&...args) {
+        static_assert(std::is_base_of<Module, M>::value,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        std::vector<std::shared_ptr<M>> modules;
+        modules.reserve(count);
+        for (size_t i = 0; i < count; i++) {
+            // Deliberately `args...`, not std::forward: forwarding an rvalue here would let the first
+            // constructor move from it, leaving every later submodule to be built from a moved-from value.
+            modules.push_back(register_module<M>(name + "." + std::to_string(i), args...));
         }
-        return submodules;
+        return modules;
     }
 
-private:
-    void collect_all_parameters(const std::string &prefix, std::unordered_map<std::string, Parameter> &all_params) const;
-
 protected:
     Device device_;
     std::unordered_map<std::string, std::shared_ptr<Module>> submodules_;
     std::unordered_map<std::string, Parameter> parameters_;
+
+private:
+    void collect_all_parameters(std::unordered_map<std::string, Parameter> &all_params, const std::string &prefix = "") const;
 };
+
+// ============================================================================
+// PyTorch-like Macros for Convenient Module Registration
+// ============================================================================
+
+/**
+ * @brief Register submodules with automatic name inference from variable name
+ *
+ * Usage:
+ * @code
+ *   class MyModel : public Module {
+ *   protected:
+ *       INFINICORE_NN_MODULE(Linear, layer1);
+ *       INFINICORE_NN_MODULE(Linear, layer2);
+ *       INFINICORE_NN_MODULE_VEC(Linear, layers);
+ *       INFINICORE_NN_PARAMETER(scaling_factor);
+ *
+ *   public:
+ *       MyModel() {
+ *           INFINICORE_NN_MODULE_INIT(layer1, 128, 64);
+ *           INFINICORE_NN_MODULE_INIT(layer2, 64, 32);
+ *           INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 32, 16);
+ *           INFINICORE_NN_PARAMETER_INIT(scaling_factor, ({1}, DataType::F32, Device()));
+ *       }
+ *   };
+ * @endcode
+ */
+
+// Declare a single module member variable.
+// The member is called `name_` (trailing underscore appended by token pasting).
+#define INFINICORE_NN_MODULE(ModuleType, name) \
+    std::shared_ptr<ModuleType> name##_
+
+// Declare a vector of modules member variable (also called `name_`).
+#define INFINICORE_NN_MODULE_VEC(ModuleType, name) \
+    std::vector<std::shared_ptr<ModuleType>> name##_
+
+// Initialize a module in constructor.
+// The module type is recovered from the declared member via decltype, the submodule is
+// registered under the string "name", and the result is assigned to `name_`.
+// `##__VA_ARGS__` is the GNU-style comma-swallowing form, so the constructor arguments may be empty.
+#define INFINICORE_NN_MODULE_INIT(name, ...) \
+    name##_ = this->register_module<std::remove_reference<decltype(*name##_)>::type>(#name, ##__VA_ARGS__)
+
+// Initialize a vector of modules in constructor
+// Usage: INFINICORE_NN_MODULE_VEC_INIT(layers, count, ModuleType, ctor_args...)
+// Example: INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 128, 64)
+#define INFINICORE_NN_MODULE_VEC_INIT(name, count, ModuleType, ...) \
+    name##_ = this->register_modules<ModuleType>(count, #name, ##__VA_ARGS__)
+
+// Declare a parameter member variable (called `name_`).
+#define INFINICORE_NN_PARAMETER(name) \
+    Parameter name##_
+
+// Initialize a parameter in constructor: assigns `name_` and registers it under the string "name".
+// Usage: INFINICORE_NN_PARAMETER_INIT(name, (shape, dtype, device))
+// Example: INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, DataType::F32, device))
+// Wrapped in do { ... } while (0) so the two statements expand as ONE statement; otherwise an
+// unbraced `if (cond) INFINICORE_NN_PARAMETER_INIT(...);` would guard only the assignment.
+#define INFINICORE_NN_PARAMETER_INIT(name, args) \
+    do { \
+        name##_ = Parameter args; \
+        this->register_parameter(#name, name##_); \
+    } while (0)
+
 } // namespace infinicore::nn
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+
+namespace infinicore::nn {
+
+/**
+ * @brief Root Mean Square Layer Normalization (RMSNorm)
+ *
+ * Applies Root Mean Square Layer Normalization over the last dimension.
+ * Unlike LayerNorm, RMSNorm doesn't subtract mean and doesn't use bias.
+ *
+ * Formula: y = (x / RMS(x)) * weight
+ * where RMS(x) = sqrt(mean(x^2) + eps)
+ *
+ * Used in LLaMA, Galactica, and other modern language models as a
+ * simpler and faster alternative to LayerNorm.
+ *
+ * Example:
+ * @code
+ *   // Create RMSNorm for hidden size 4096
+ *   auto norm = RMSNorm(4096);
+ *
+ *   // Input: [batch, seq_len, hidden_size]
+ *   auto input = Tensor::randn({2, 10, 4096});
+ *
+ *   // Output: [batch, seq_len, hidden_size]
+ *   auto output = norm.forward(input);
+ * @endcode
+ */
+class RMSNorm : public Module {
+public:
+    /**
+     * @brief Construct a RMSNorm layer
+     *
+     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
+     * @param eps Small constant for numerical stability (default: 1e-6)
+     * @param device Device to create the weight on (default: a default-constructed Device)
+     */
+    RMSNorm(size_t normalized_shape,
+            double eps = 1e-6,
+            const Device &device = Device());
+
+    /**
+     * @brief Forward pass: apply RMSNorm
+     *
+     * @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions
+     * @return Normalized tensor with same shape as input
+     *
+     * The normalization is applied over the last dimension.
+     * For example:
+     *   Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
+     *   Input: [batch, hidden_size] -> normalize over hidden_size
+     */
+    Tensor forward(const Tensor &x) const;
+
+    // Read-only accessors for the configuration values stored in the private members below
+    size_t normalized_shape() const { return normalized_shape_; }
+    double eps() const { return eps_; }
+
+    // String representation (only declared here; the produced format is defined outside this header)
+    std::string extra_repr() const;
+
+    // Returns weight_ by value as a Tensor (relies on Parameter being convertible to Tensor)
+    Tensor weight() const { return weight_; }
+
+protected:
+    // Scale parameter; presumably shaped [normalized_shape] - TODO confirm in the constructor
+    Parameter weight_;
+
+private:
+    size_t normalized_shape_;  // Size of the feature dimension
+    double eps_;               // Epsilon for numerical stability
+};
+
+} // namespace infinicore::nn