
Commit a3f84f8

feat: add Qwen3-8B

1 parent b594867

11 files changed

Lines changed: 1260 additions & 0 deletions


CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -190,6 +190,14 @@ add_executable(llama3
 )
 link_infini_train_exe(llama3)

+add_executable(qwen3
+    example/qwen3/main.cc
+    example/common/tiny_shakespeare_dataset.cc
+    example/common/utils.cc
+    example/qwen3/checkpoint_loader.cc
+    example/common/tokenizer.cc
+)
+link_infini_train_exe(qwen3)
 # Tools
 add_subdirectory(tools/infini_run)
 set_target_properties(infini_run PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

example/qwen3/checkpoint_loader.cc

Lines changed: 440 additions & 0 deletions
Large diffs are not rendered by default.

example/qwen3/checkpoint_loader.h

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#pragma once

#include <memory>
#include <string>

namespace infini_train::nn {
class TransformerModel;
} // namespace infini_train::nn

namespace qwen3 {
std::shared_ptr<infini_train::nn::TransformerModel> LoadFromLLMC(const std::string &filepath);
} // namespace qwen3
example/qwen3/config.h

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#pragma once

#include "infini_train/include/nn/modules/transformer/transformer_config.h"

namespace nn = infini_train::nn;
namespace qwen3 {
inline nn::TransformerConfig Qwen3Config() {
    return {.block_size = 40960,
            .vocab_size = 151936,
            .original_vocab_size = 151936,
            .n_layer = 36,
            .n_head = 32,
            .n_kv_head = 8,
            .n_embd = 4096,
            .attention_type = nn::AttentionType::kRoPE,
            .activation_type = nn::MLPType::kSwiGLU,
            .norm_type = nn::NormType::kRMSNorm,
            .add_bias_linear = false,
            .add_bias_lm_head = false,
            .tie_weights = false,
            .ffn_expansion_ratio = 4.5f, // 4096*4.5*2/3 = 12288
            .ffn_dim_multiplier = std::nullopt,
            .multiple_of = 1,
            .rope_theta = 1000000.0f,
            .use_scaled_rope = false,
            .norm_eps = 1e-6f};
}
} // namespace qwen3
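The ffn_expansion_ratio comment above encodes the SwiGLU MLP sizing. A minimal sketch of that arithmetic, assuming the framework derives the hidden dimension as 2/3 * ratio * n_embd rounded up to a multiple of multiple_of (the rounding rule is an assumption; only the 4096*4.5*2/3 = 12288 result is taken from the comment):

    import math

    n_embd = 4096
    ffn_expansion_ratio = 4.5
    multiple_of = 1

    hidden = 2 * ffn_expansion_ratio * n_embd / 3            # SwiGLU 2/3 factor
    hidden = multiple_of * math.ceil(hidden / multiple_of)   # round up to multiple_of
    print(hidden)  # 12288, matching Qwen3-8B's intermediate_size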

example/qwen3/main.cc

Lines changed: 450 additions & 0 deletions
Large diffs are not rendered by default.

infini_train/include/nn/modules/transformer/causal_self_attention.h

Lines changed: 7 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <vector>

 #include "infini_train/include/nn/modules/module.h"
+#include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"

 namespace infini_train::nn {
@@ -15,6 +16,9 @@ class CausalSelfAttention : public infini_train::nn::CloneableModule<CausalSelfA
     static constexpr char kCAttnLayerName[] = "c_attn";
     static constexpr char kCProjLayerName[] = "c_proj";

+    static constexpr char kQNormLayerName[] = "q_norm";
+    static constexpr char kKNormLayerName[] = "k_norm";
+
     static constexpr char kParamBiasName[] = "bias";

     explicit CausalSelfAttention(const TransformerConfig &config);
@@ -32,6 +36,9 @@ class CausalSelfAttention : public infini_train::nn::CloneableModule<CausalSelfA
     int64_t n_rep_ = 0;
     int64_t head_dim_ = 0;

+    std::shared_ptr<infini_train::nn::RMSNorm> q_norm_;
+    std::shared_ptr<infini_train::nn::RMSNorm> k_norm_;
+
     // Setup method for different attention modes
     void SetupAttention(const TransformerConfig &config);
infini_train/include/nn/modules/transformer/transformer_config.h

Lines changed: 4 additions & 0 deletions
@@ -61,6 +61,10 @@ struct TransformerConfig {
     bool flash = false; // flash attention
     int64_t max_gen_batch_size = 4; // max batch size during inference

+    // ===== Q-K Norm (Qwen3-specific) =====
+    bool use_qk_norm = false;
+    float qk_norm_eps = 1e-6f;
+
     bool UseGQA() const;
     int GetChunkSize() const;
 };
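For reference, a minimal PyTorch sketch (not the framework's code) of what these two fields turn on: an RMSNorm with a [head_dim]-sized weight applied to every query and key head before RoPE, as wired up in causal_self_attention.cc below. The shapes and the rms_norm helper are illustrative only.

    import torch

    def rms_norm(x, weight, eps=1e-6):
        # normalize over the last dim (head_dim), then scale by the learned weight
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight

    B, T, H, D = 2, 8, 32, 128                # batch, seq, heads, head_dim
    q = torch.randn(B, T, H, D)
    q_norm_weight = torch.ones(D)             # corresponds to q_norm.weight in the checkpoint
    q = rms_norm(q, q_norm_weight, eps=1e-6)  # same role as q_norm_ in CausalSelfAttention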

infini_train/src/nn/modules/transformer/causal_self_attention.cc

Lines changed: 19 additions & 0 deletions
@@ -21,6 +21,13 @@ namespace infini_train::nn {
 CausalSelfAttention::CausalSelfAttention(const TransformerConfig &config) : CloneableModule(kType), config_(config) {
     SetupAttention(config);

+    if (config_.use_qk_norm) {
+        q_norm_ = std::make_shared<nn::RMSNorm>(head_dim_, config_.qk_norm_eps);
+        k_norm_ = std::make_shared<nn::RMSNorm>(head_dim_, config_.qk_norm_eps);
+        modules_[kQNormLayerName] = q_norm_;
+        modules_[kKNormLayerName] = k_norm_;
+    }
+
     int64_t qkv_dim = (config.n_head + 2 * n_kv_head_) * head_dim_;
     // qkv: ColumnParallel (do not gather output)
     modules_[kCAttnLayerName] = std::make_shared<nn::parallel::ColumnParallelLinear>(
@@ -212,6 +219,18 @@ CausalSelfAttention::ForwardWithRoPE(const std::vector<std::shared_ptr<infini_tr
     // v: (B, T, KV_local, D)
     auto v = qkv->Slice(2, q_size_local + kv_size_local, q_size_local + 2 * kv_size_local)->View({B, T, KV_local, D});

+    if (config_.use_qk_norm) {
+        auto q_shape = q->Dims(); // [B, T, H_local, D]
+        q = q->View({B * T * H_local, D});
+        q = (*q_norm_)({q})[0];
+        q = q->View(q_shape);
+
+        auto k_shape = k->Dims(); // [B, T, KV_local, D]
+        k = k->View({B * T * KV_local, D});
+        k = (*k_norm_)({k})[0];
+        k = k->View(k_shape);
+    }
+
     // -> RoPE on q, k
     // q: (B, T, H_local, D)
     // k: (B, T, KV_local, D)

convert_hf_qwen3_to_llmc.py

Lines changed: 252 additions & 0 deletions
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""
Convert HuggingFace Qwen3-8B checkpoint to InfiniTrain LLMC format.

Usage:
    python convert_hf_qwen3_to_llmc.py \
        --hf-path ./Qwen3-8B \
        --output qwen3-8b-fp32.llmc \
        --tp-size 1
"""

import argparse
import struct
import os
import sys

import torch
from safetensors.torch import load_file


# ============================================================
# Qwen3 magic number (different from llama3's 20240803)
# ============================================================
K_QWEN3_MAGIC = 20240804
K_LLMC_FP32_VERSION = 3


def parse_args():
    parser = argparse.ArgumentParser(description="Convert HF Qwen3 to LLMC format")
    parser.add_argument("--hf-path", required=True, help="Path to HF Qwen3 checkpoint dir")
    parser.add_argument("--output", required=True, help="Output LLMC file path")
    parser.add_argument("--tp-size", type=int, default=1, help="Tensor parallel size (default 1)")
    parser.add_argument("--tp-rank", type=int, default=0, help="Tensor parallel rank (default 0)")
    return parser.parse_args()


def load_hf_weights(hf_path):
    """Load all HF safetensors / pytorch_model.bin into a flat dict."""
    print(f"[1/4] Loading HF weights from {hf_path}...")
    state_dict = {}

    if os.path.exists(os.path.join(hf_path, "model.safetensors.index.json")):
        import json
        with open(os.path.join(hf_path, "model.safetensors.index.json")) as f:
            index = json.load(f)
        loaded_files = set()
        for key, filename in index["weight_map"].items():
            if filename not in loaded_files:
                shard = load_file(os.path.join(hf_path, filename), device="cpu")
                state_dict.update(shard)
                loaded_files.add(filename)
                print(f"    Loaded {filename} ({len(shard)} tensors)")
    elif os.path.exists(os.path.join(hf_path, "model.safetensors")):
        state_dict = load_file(os.path.join(hf_path, "model.safetensors"), device="cpu")
    else:
        # Fallback: pytorch_model.bin
        state_dict = torch.load(
            os.path.join(hf_path, "pytorch_model.bin"), map_location="cpu"
        )

    print(f"    Total tensors: {len(state_dict)}")
    return state_dict


def write_header(f, config, tp_rank, tp_size):
    """Write the 1024-byte LLMC header."""
    header = [0] * 256  # 256 int32 slots

    header[0] = K_QWEN3_MAGIC  # magic
    header[1] = K_LLMC_FP32_VERSION  # version
    header[2] = config["block_size"]
    header[3] = config["vocab_size"]
    header[4] = config["n_layer"]
    header[5] = config["n_head"]
    header[6] = config["n_kv_head"]
    header[7] = config["n_embd"]

    # Qwen3 has no ffn_dim_multiplier, write 0.0
    # Pack as float then unpack as int32 for the header slot
    ffn_mult_bytes = struct.pack("f", 0.0)
    ffn_mult_int = struct.unpack("i", ffn_mult_bytes)[0]
    header[8] = ffn_mult_int

    header[9] = config["multiple_of"]

    norm_eps_bytes = struct.pack("f", config["norm_eps"])
    header[10] = struct.unpack("i", norm_eps_bytes)[0]

    rope_theta_bytes = struct.pack("f", config["rope_theta"])
    header[11] = struct.unpack("i", rope_theta_bytes)[0]

    header[12] = int(config["use_scaled_rope"])
    header[13] = config.get("max_gen_bs", 4)
    header[14] = 1  # version_major
    header[15] = 0  # version_minor

    # Write 256 int32 values
    data = struct.pack(f"{len(header)}i", *header)
    assert len(data) == 1024
    f.write(data)
    print(f"[2/4] Header written (magic={K_QWEN3_MAGIC}, vocab={config['vocab_size']}, layers={config['n_layer']})")


def write_matrix(f, tensor):
    """Write a 2D tensor as fp32 row-major."""
    t = tensor.float().cpu()
    assert t.dim() == 2, f"Expected 2D tensor, got {t.dim()}D: {t.shape}"
    arr = t.contiguous().numpy().flatten().tolist()
    data = struct.pack(f"{len(arr)}f", *arr)
    f.write(data)


def write_vector(f, tensor):
    """Write a 1D tensor as fp32."""
    t = tensor.float().cpu()
    assert t.dim() == 1, f"Expected 1D tensor, got {t.dim()}D: {t.shape}"
    arr = t.contiguous().numpy().tolist()
    data = struct.pack(f"{len(arr)}f", *arr)
    f.write(data)


def shard_rows(tensor, tp_rank, tp_size):
    """Row-parallel shard: split along dim 0."""
    if tp_size == 1:
        return tensor
    chunks = torch.chunk(tensor, tp_size, dim=0)
    return chunks[tp_rank]


def shard_cols(tensor, tp_rank, tp_size):
    """Column-parallel shard: split along dim 1."""
    if tp_size == 1:
        return tensor
    chunks = torch.chunk(tensor, tp_size, dim=1)
    return chunks[tp_rank]


def convert(hf_path, output_path, tp_size=1, tp_rank=0):
    """Main conversion pipeline."""

    # ---- Load HF weights ----
    sd = load_hf_weights(hf_path)

    # ---- Build config from HF ----
    import json
    with open(os.path.join(hf_path, "config.json")) as f:
        hf_config = json.load(f)

    config = {
        "block_size": hf_config.get("max_position_embeddings", 40960),
        "vocab_size": hf_config["vocab_size"],
        "n_layer": hf_config["num_hidden_layers"],
        "n_head": hf_config["num_attention_heads"],
        "n_kv_head": hf_config["num_key_value_heads"],
        "n_embd": hf_config["hidden_size"],
        "norm_eps": hf_config.get("rms_norm_eps", 1e-6),
        "rope_theta": hf_config.get("rope_theta", 1000000.0),
        "use_scaled_rope": False,
        "multiple_of": 1,
        "max_gen_bs": 4,
    }

    head_dim = config["n_embd"] // config["n_head"]
    q_out = config["n_embd"]  # 4096
    kv_out = config["n_kv_head"] * head_dim  # 8 * 128 = 1024
    ffn_hidden = hf_config["intermediate_size"]  # 12288

    print(f"[3/4] Config: {config}")
    print(f"    head_dim={head_dim}, q_out={q_out}, kv_out={kv_out}, ffn_hidden={ffn_hidden}")

    # ---- Write LLMC file ----
    with open(output_path, "wb") as f:
        # Header
        write_header(f, config, tp_rank, tp_size)

        # 1. wte.weight [vocab_size, n_embd] → row-shard for TP
        wte = sd["model.embed_tokens.weight"]
        wte_shard = shard_rows(wte, tp_rank, tp_size)
        write_matrix(f, wte_shard)
        print(f"    wte: {wte.shape} → shard {wte_shard.shape}")

        n_layer = config["n_layer"]

        for i in range(n_layer):
            prefix = f"model.layers.{i}"

            if (i + 1) % 6 == 0 or i == 0:
                print(f"    Layer {i}/{n_layer - 1}...")

            # 2. ln_1.weight (input_layernorm) [n_embd] — full copy
            ln1 = sd[f"{prefix}.input_layernorm.weight"]
            write_vector(f, ln1)

            # ===== New: Q-K Norm =====
            # 3. q_norm.weight [head_dim] — full copy
            q_norm_w = sd[f"{prefix}.self_attn.q_norm.weight"]
            write_vector(f, q_norm_w)

            # 4. k_norm.weight [head_dim] — full copy
            k_norm_w = sd[f"{prefix}.self_attn.k_norm.weight"]
            write_vector(f, k_norm_w)

            # 5. c_attn.weight [q_out + 2*kv_out, n_embd] — row-shard
            # HF: q_proj, k_proj, v_proj are separate → concat
            q_proj = sd[f"{prefix}.self_attn.q_proj.weight"]  # [n_embd, n_embd]
            k_proj = sd[f"{prefix}.self_attn.k_proj.weight"]  # [kv_out, n_embd]
            v_proj = sd[f"{prefix}.self_attn.v_proj.weight"]  # [kv_out, n_embd]
            c_attn = torch.cat([q_proj, k_proj, v_proj], dim=0)  # [q+2kv, n_embd]
            c_attn_shard = shard_rows(c_attn, tp_rank, tp_size)
            write_matrix(f, c_attn_shard)

            # 6. c_proj (attn o_proj) [n_embd, n_embd] — col-shard
            o_proj = sd[f"{prefix}.self_attn.o_proj.weight"]  # [n_embd, n_embd]
            o_proj_shard = shard_cols(o_proj, tp_rank, tp_size)
            write_matrix(f, o_proj_shard)

            # 7. ln_2.weight (post_attention_layernorm) [n_embd] — full copy
            ln2 = sd[f"{prefix}.post_attention_layernorm.weight"]
            write_vector(f, ln2)

            # 8. c_fc (gate_proj) [ffn_hidden, n_embd] — row-shard
            gate_proj = sd[f"{prefix}.mlp.gate_proj.weight"]
            gate_shard = shard_rows(gate_proj, tp_rank, tp_size)
            write_matrix(f, gate_shard)

            # 9. c_fc2 (up_proj) [ffn_hidden, n_embd] — row-shard
            up_proj = sd[f"{prefix}.mlp.up_proj.weight"]
            up_shard = shard_rows(up_proj, tp_rank, tp_size)
            write_matrix(f, up_shard)

            # 10. c_proj (mlp down_proj) [n_embd, ffn_hidden] — col-shard
            down_proj = sd[f"{prefix}.mlp.down_proj.weight"]
            down_shard = shard_cols(down_proj, tp_rank, tp_size)
            write_matrix(f, down_shard)

        # 11. ln_f.weight (model.norm) [n_embd] — full copy
        ln_f = sd["model.norm.weight"]
        write_vector(f, ln_f)

        # 12. lm_head.weight [vocab_size, n_embd] — row-shard for TP
        lm_head = sd["lm_head.weight"]
        lm_head_shard = shard_rows(lm_head, tp_rank, tp_size)
        write_matrix(f, lm_head_shard)
        print(f"    lm_head: {lm_head.shape} → shard {lm_head_shard.shape}")

    file_size = os.path.getsize(output_path)
    print(f"[4/4] Done! Output: {output_path} ({file_size / 1e9:.2f} GB)")


if __name__ == "__main__":
    args = parse_args()
    convert(args.hf_path, args.output, args.tp_size, args.tp_rank)
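A quick sanity check (not part of this commit) that re-reads the 1024-byte header written by write_header above and confirms the magic, version, and model dimensions; qwen3-8b-fp32.llmc is the example --output path from the docstring.

    import struct

    with open("qwen3-8b-fp32.llmc", "rb") as f:
        header = struct.unpack("256i", f.read(1024))

    assert header[0] == 20240804, "bad magic"    # K_QWEN3_MAGIC
    assert header[1] == 3, "bad version"         # K_LLMC_FP32_VERSION
    block_size, vocab, n_layer, n_head, n_kv_head, n_embd = header[2:8]
    norm_eps = struct.unpack("f", struct.pack("i", header[10]))[0]    # float slots are bit-cast
    rope_theta = struct.unpack("f", struct.pack("i", header[11]))[0]
    print(n_layer, n_head, n_kv_head, n_embd, norm_eps, rope_theta)
    # expected for Qwen3-8B: 36 32 8 4096 ~1e-6 1000000.0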

scripts/convert_input_to_bin.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
from transformers import AutoTokenizer
import struct

tokenizer = AutoTokenizer.from_pretrained("/var/qy_home/jiyiming/Qwen3-8B")

with open("input.txt", "r") as f:
    text = f.read()

ids = tokenizer.encode(text)

with open("tiny_shakespeare_qwen3.bin", "wb") as f:
    for tid in ids:
        f.write(struct.pack("I", tid))

print(f"Wrote {len(ids)} tokens")
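An optional round-trip check (not part of this commit): read the uint32 token ids back and decode a short prefix with the same tokenizer; the hard-coded paths above are assumed to exist locally.

    import struct
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("/var/qy_home/jiyiming/Qwen3-8B")

    with open("tiny_shakespeare_qwen3.bin", "rb") as f:
        data = f.read()

    ids = list(struct.unpack(f"{len(data) // 4}I", data))
    print(len(ids), tokenizer.decode(ids[:32]))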
