diff --git a/csrc/config/model_config.hpp b/csrc/config/model_config.hpp index 7f694dbbc..a06ba1a57 100644 --- a/csrc/config/model_config.hpp +++ b/csrc/config/model_config.hpp @@ -6,10 +6,16 @@ #include "quant_config.hpp" #include #include +#include #include #include namespace infinilm::config { + +inline size_t json_size(const nlohmann::json &config, const char *key, size_t fallback = 0) { + return config.contains(key) ? config.at(key).get() : fallback; +} + class ModelConfig { // Model config is implemented using nlohmann/json and is primarily used for advanced configuration // beyond the standard model config. It is initialized via ModelConfig(const std::string& path) diff --git a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp index 512fe3079..f2482bfaf 100644 --- a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp +++ b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp @@ -1,6 +1,8 @@ #include "qwen3_vl_for_conditional_generation.hpp" + #include "../../global_state/global_state.hpp" #include "../models_registry.hpp" + #include #include @@ -10,14 +12,59 @@ Qwen3VLModel::Qwen3VLModel(std::shared_ptr model_ const infinicore::Device &device) { nlohmann::json &config_json = model_config->get_config_json(); nlohmann::json &text_config_json = config_json["text_config"]; - std::shared_ptr text_config = std::make_shared(text_config_json); + auto text_config = std::make_shared(text_config_json); + const auto &dtype{model_config->get_dtype()}; + image_token_id_ = config_json.value("image_token_id", 151655); INFINICORE_NN_MODULE_INIT(language_model, text_config, device); + INFINICORE_NN_MODULE_INIT(visual, config_json["vision_config"], dtype, device); +} + +void Qwen3VLModel::replace_image_embeddings_(infinicore::Tensor inputs_embeds, + const infinicore::Tensor &input_ids, + const infinicore::Tensor &image_embeds) const { + auto ids_cpu = input_ids->to(infinicore::Device::cpu()); + const int64_t *ids = reinterpret_cast(ids_cpu->data()); + size_t seq_len = ids_cpu->numel(); + size_t hidden_size = inputs_embeds->size(2); + size_t image_idx = 0; + for (size_t i = 0; i < seq_len; ++i) { + if (static_cast(ids[i]) != image_token_id_) { + continue; + } + if (image_idx >= image_embeds->size(0)) { + throw std::runtime_error("Qwen3VLModel: more image tokens than image embeddings"); + } + inputs_embeds->narrow({{1, i, 1}})->copy_from(image_embeds->narrow({{0, image_idx, 1}})->view({1, 1, hidden_size})); + image_idx++; + } + if (image_idx != image_embeds->size(0)) { + throw std::runtime_error("Qwen3VLModel: image features and image tokens do not match"); + } } infinicore::Tensor Qwen3VLModel::forward(const infinilm::InfinilmModel::Input &input) const { - auto hidden_states = language_model_->forward(input); - return hidden_states; + auto input_ids = input.input_ids.value(); + if (input.pixel_values.has_value() && !input.pixel_values->empty()) { + if (!input.tgt_sizes.has_value()) { + throw std::runtime_error("Qwen3VLModel: image_grid_thw must be provided via tgt_sizes with pixel_values"); + } + auto inputs_embeds = language_model_->embed_tokens(input_ids); + auto input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu()); + const int32_t *offsets = reinterpret_cast(input_offsets_cpu->data()); + auto req_ids = infinilm::global_state::get_forward_context().mm_metadata.image_req_ids; + for (size_t i = 0; i < input.pixel_values->size(); ++i) { + size_t req_id = req_ids.has_value() ? req_ids->at(i) : i; + auto image_embeds = visual_->forward(input.pixel_values->at(i), input.tgt_sizes->at(i)); + size_t start = static_cast(offsets[req_id]); + size_t len = static_cast(offsets[req_id + 1] - offsets[req_id]); + auto embeds_slice = inputs_embeds->narrow({{1, start, len}}); + auto ids_slice = input_ids->narrow({{1, start, len}}); + replace_image_embeddings_(embeds_slice, ids_slice, image_embeds); + } + return language_model_->forward_embeds(inputs_embeds, input.position_ids.value()); + } + return language_model_->forward(input); } Qwen3VLForConditionalGeneration::Qwen3VLForConditionalGeneration(std::shared_ptr model_config, @@ -66,9 +113,17 @@ std::shared_ptr create_qwen3_vl_model_config(std: nlohmann::json &config_json = model_config->get_config_json(); nlohmann::json &text_config_json = config_json["text_config"]; - if (!config_json.contains("torch_dtype")) { - std::string dtype = text_config_json["dtype"]; - config_json["torch_dtype"] = dtype; + if (!config_json.contains("torch_dtype") || config_json["torch_dtype"].is_null()) { + config_json["torch_dtype"] = text_config_json.value("dtype", "bfloat16"); + } + if (!config_json.contains("dtype") || config_json["dtype"].is_null()) { + config_json["dtype"] = text_config_json.value("dtype", "bfloat16"); + } + if (!text_config_json.contains("torch_dtype") || text_config_json["torch_dtype"].is_null()) { + text_config_json["torch_dtype"] = text_config_json.value("dtype", "bfloat16"); + } + if (!text_config_json.contains("model_type") || text_config_json["model_type"] == "qwen3_vl_text") { + text_config_json["model_type"] = "qwen3"; } return model_config; } diff --git a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp index dae5bc39b..e60703285 100644 --- a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp +++ b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp @@ -1,6 +1,10 @@ #pragma once +#include "../../layers/linear/linear.hpp" #include "../../models/qwen3/qwen3_for_causal_lm.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" +#include "qwen3_vl_vision.hpp" namespace infinilm::models::qwen3_vl { @@ -14,7 +18,13 @@ class Qwen3VLModel : public infinicore::nn::Module { infinicore::Tensor forward(const infinilm::InfinilmModel::Input &input) const; protected: + void replace_image_embeddings_(infinicore::Tensor inputs_embeds, + const infinicore::Tensor &input_ids, + const infinicore::Tensor &image_embeds) const; + + size_t image_token_id_; INFINICORE_NN_MODULE(Qwen3VLTextModel, language_model); + INFINICORE_NN_MODULE(Qwen3VLVisionModel, visual); }; class Qwen3VLForConditionalGeneration : public InfinilmModel { diff --git a/csrc/models/qwen3_vl/qwen3_vl_position.cpp b/csrc/models/qwen3_vl/qwen3_vl_position.cpp new file mode 100644 index 000000000..41395d787 --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_position.cpp @@ -0,0 +1,209 @@ +#include "qwen3_vl_position.hpp" + +#include "../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::qwen3_vl { +namespace { + +size_t grid_num_patches(const int64_t *grid, size_t num_grids) { + size_t total_patches = 0; + for (size_t i = 0; i < num_grids; ++i) { + total_patches += static_cast(grid[i * 3]) + * static_cast(grid[i * 3 + 1]) + * static_cast(grid[i * 3 + 2]); + } + return total_patches; +} + +void validate_grid(size_t h, size_t w, size_t spatial_merge_size) { + if (spatial_merge_size == 0 || h % spatial_merge_size != 0 || w % spatial_merge_size != 0) { + throw std::runtime_error("Qwen3VLPositionBuilder: image grid must be divisible by spatial_merge_size"); + } +} + +float load_scalar(const std::byte *ptr, infinicore::DataType dtype) { + switch (dtype) { + case infinicore::DataType::F32: + return *reinterpret_cast(ptr); + case infinicore::DataType::F16: + return f16_to_f32(*reinterpret_cast(ptr)); + case infinicore::DataType::BF16: + return bf16_to_f32(*reinterpret_cast(ptr)); + default: + throw std::runtime_error("Qwen3VLPositionBuilder: unsupported pos_embed dtype"); + } +} + +} // namespace + +Qwen3VLPositionBuilder::Qwen3VLPositionBuilder(size_t hidden_size, + size_t spatial_merge_size, + size_t num_grid_per_side, + size_t num_heads, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(hidden_size), + spatial_merge_size_(spatial_merge_size), + num_grid_per_side_(num_grid_per_side), + num_heads_(num_heads), + dtype_(dtype), + device_(device) {} + +infinicore::Tensor Qwen3VLPositionBuilder::values_to_device_(const std::vector &values, + const infinicore::Shape &shape) const { + if (dtype_ == infinicore::DataType::F32) { + auto cpu = infinicore::Tensor::from_blob(const_cast(values.data()), shape, dtype_, infinicore::Device::cpu()); + return cpu->to(device_); + } + + std::vector packed(values.size()); + if (dtype_ == infinicore::DataType::BF16) { + for (size_t i = 0; i < values.size(); ++i) { + packed[i] = f32_to_bf16(values[i]); + } + } else if (dtype_ == infinicore::DataType::F16) { + for (size_t i = 0; i < values.size(); ++i) { + packed[i] = f32_to_f16(values[i]); + } + } else { + throw std::runtime_error("Qwen3VLPositionBuilder: unsupported dtype for generated position tables"); + } + + auto cpu = infinicore::Tensor::from_blob(packed.data(), shape, dtype_, infinicore::Device::cpu()); + return cpu->to(device_); +} + +infinicore::Tensor Qwen3VLPositionBuilder::position_embeddings(const infinicore::Tensor &image_grid_thw, + const infinicore::nn::Embedding &pos_embed) const { + auto grid_cpu = image_grid_thw->to(infinicore::Device::cpu()); + auto weight_cpu = pos_embed.weight()->to(infinicore::Device::cpu()); + const int64_t *grid = reinterpret_cast(grid_cpu->data()); + const size_t num_grids = grid_cpu->size(0); + const size_t total_patches = grid_num_patches(grid, num_grids); + const size_t num_positions = weight_cpu->size(0); + const size_t hidden_size = weight_cpu->size(1); + const size_t elem_size = weight_cpu->element_size(); + const auto dtype = weight_cpu->dtype(); + const auto *weight = weight_cpu->data(); + std::vector values(total_patches * hidden_size); + + size_t offset = 0; + for (size_t g = 0; g < num_grids; ++g) { + const size_t t = static_cast(grid[g * 3]); + const size_t h = static_cast(grid[g * 3 + 1]); + const size_t w = static_cast(grid[g * 3 + 2]); + validate_grid(h, w, spatial_merge_size_); + + for (size_t token = 0; token < t * h * w; ++token) { + const size_t frame_offset = token % (h * w); + const size_t merged_idx = frame_offset / (spatial_merge_size_ * spatial_merge_size_); + const size_t intra = frame_offset % (spatial_merge_size_ * spatial_merge_size_); + const size_t merged_h = merged_idx / (w / spatial_merge_size_); + const size_t merged_w = merged_idx % (w / spatial_merge_size_); + const size_t ph = merged_h * spatial_merge_size_ + intra / spatial_merge_size_; + const size_t pw = merged_w * spatial_merge_size_ + intra % spatial_merge_size_; + + const float h_pos = h > 1 ? static_cast(ph) * static_cast(num_grid_per_side_ - 1) / static_cast(h - 1) : 0.0f; + const float w_pos = w > 1 ? static_cast(pw) * static_cast(num_grid_per_side_ - 1) / static_cast(w - 1) : 0.0f; + const size_t h_floor = static_cast(std::floor(h_pos)); + const size_t w_floor = static_cast(std::floor(w_pos)); + const size_t h_ceil = std::min(h_floor + 1, num_grid_per_side_ - 1); + const size_t w_ceil = std::min(w_floor + 1, num_grid_per_side_ - 1); + const float dh = h_pos - static_cast(h_floor); + const float dw = w_pos - static_cast(w_floor); + + const size_t pos_ids[4] = { + h_floor * num_grid_per_side_ + w_floor, + h_floor * num_grid_per_side_ + w_ceil, + h_ceil * num_grid_per_side_ + w_floor, + h_ceil * num_grid_per_side_ + w_ceil, + }; + const float weights[4] = { + (1.0f - dh) * (1.0f - dw), + (1.0f - dh) * dw, + dh * (1.0f - dw), + dh * dw, + }; + + for (size_t k = 0; k < 4; ++k) { + if (pos_ids[k] >= num_positions) { + throw std::runtime_error("Qwen3VLPositionBuilder: generated position id is out of range"); + } + } + + for (size_t d = 0; d < hidden_size; ++d) { + float value = 0.0f; + for (size_t k = 0; k < 4; ++k) { + const auto *src = weight + (pos_ids[k] * hidden_size + d) * elem_size; + value += weights[k] * load_scalar(src, dtype); + } + values[offset * hidden_size + d] = value; + } + ++offset; + } + } + + return values_to_device_(values, {total_patches, hidden_size}); +} + +std::tuple +Qwen3VLPositionBuilder::rotary_embeddings(const infinicore::Tensor &image_grid_thw) const { + auto grid_cpu = image_grid_thw->to(infinicore::Device::cpu()); + const int64_t *grid = reinterpret_cast(grid_cpu->data()); + const size_t num_grids = grid_cpu->size(0); + const size_t total_patches = grid_num_patches(grid, num_grids); + + auto pos_cpu = infinicore::Tensor::empty({total_patches}, infinicore::DataType::I64, infinicore::Device::cpu()); + auto *pos = reinterpret_cast(pos_cpu->data()); + for (size_t i = 0; i < total_patches; ++i) { + pos[i] = static_cast(i); + } + + const size_t head_dim = hidden_size_ / num_heads_; + const size_t half_dim = head_dim / 2; + const size_t axis_dim = half_dim / 2; + std::vector sin_values(total_patches * half_dim); + std::vector cos_values(total_patches * half_dim); + + size_t offset = 0; + for (size_t g = 0; g < num_grids; ++g) { + const size_t t = static_cast(grid[g * 3]); + const size_t h = static_cast(grid[g * 3 + 1]); + const size_t w = static_cast(grid[g * 3 + 2]); + validate_grid(h, w, spatial_merge_size_); + + for (size_t token = 0; token < t * h * w; ++token) { + const size_t frame_offset = token % (h * w); + const size_t merged_idx = frame_offset / (spatial_merge_size_ * spatial_merge_size_); + const size_t intra = frame_offset % (spatial_merge_size_ * spatial_merge_size_); + const size_t merged_h = merged_idx / (w / spatial_merge_size_); + const size_t merged_w = merged_idx % (w / spatial_merge_size_); + const size_t py = merged_h * spatial_merge_size_ + intra / spatial_merge_size_; + const size_t px = merged_w * spatial_merge_size_ + intra % spatial_merge_size_; + + for (size_t d = 0; d < axis_dim; ++d) { + const float inv_freq = 1.0f / std::pow(10000.0f, static_cast(2 * d) / static_cast(half_dim)); + const float ay = static_cast(py) * inv_freq; + const float ax = static_cast(px) * inv_freq; + sin_values[offset * half_dim + d] = std::sin(ay); + cos_values[offset * half_dim + d] = std::cos(ay); + sin_values[offset * half_dim + axis_dim + d] = std::sin(ax); + cos_values[offset * half_dim + axis_dim + d] = std::cos(ax); + } + ++offset; + } + } + + return {pos_cpu->to(device_), + values_to_device_(sin_values, {total_patches, half_dim}), + values_to_device_(cos_values, {total_patches, half_dim})}; +} + +} // namespace infinilm::models::qwen3_vl diff --git a/csrc/models/qwen3_vl/qwen3_vl_position.hpp b/csrc/models/qwen3_vl/qwen3_vl_position.hpp new file mode 100644 index 000000000..750b49c8c --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_position.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include "infinicore/nn/embedding.hpp" +#include "infinicore/tensor.hpp" + +#include +#include + +namespace infinilm::models::qwen3_vl { + +class Qwen3VLPositionBuilder { +public: + Qwen3VLPositionBuilder(size_t hidden_size, + size_t spatial_merge_size, + size_t num_grid_per_side, + size_t num_heads, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor position_embeddings(const infinicore::Tensor &image_grid_thw, + const infinicore::nn::Embedding &pos_embed) const; + + std::tuple + rotary_embeddings(const infinicore::Tensor &image_grid_thw) const; + +private: + infinicore::Tensor values_to_device_(const std::vector &values, + const infinicore::Shape &shape) const; + + size_t hidden_size_; + size_t spatial_merge_size_; + size_t num_grid_per_side_; + size_t num_heads_; + infinicore::DataType dtype_; + infinicore::Device device_; +}; + +} // namespace infinilm::models::qwen3_vl diff --git a/csrc/models/qwen3_vl/qwen3_vl_vision.cpp b/csrc/models/qwen3_vl/qwen3_vl_vision.cpp new file mode 100644 index 000000000..9b5fea7e2 --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_vision.cpp @@ -0,0 +1,43 @@ +#include "qwen3_vl_vision.hpp" + +#include "../../config/model_config.hpp" +#include "infinicore/ops.hpp" + +#include + +namespace infinilm::models::qwen3_vl { + +using infinilm::config::json_size; + +Qwen3VLVisionModel::Qwen3VLVisionModel(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(json_size(config, "hidden_size")), + spatial_merge_size_(json_size(config, "spatial_merge_size", 2)), + num_grid_per_side_(static_cast(std::sqrt(static_cast(json_size(config, "num_position_embeddings"))))), + position_builder_(hidden_size_, + spatial_merge_size_, + num_grid_per_side_, + json_size(config, "num_heads"), + dtype, + device) { + INFINICORE_NN_MODULE_INIT(patch_embed, config, dtype, device); + INFINICORE_NN_MODULE_INIT(pos_embed, json_size(config, "num_position_embeddings"), hidden_size_, std::nullopt, dtype, device); + INFINICORE_NN_MODULE_VEC_INIT(blocks, json_size(config, "depth"), Qwen3VLVisionBlock, config, dtype, device); + INFINICORE_NN_MODULE_INIT(merger, config, false, dtype, device); + size_t deepstack_count = config.contains("deepstack_visual_indexes") ? config["deepstack_visual_indexes"].size() : 0; + INFINICORE_NN_MODULE_VEC_INIT(deepstack_merger_list, deepstack_count, Qwen3VLPatchMerger, config, true, dtype, device); +} + +infinicore::Tensor Qwen3VLVisionModel::forward(const infinicore::Tensor &pixel_values, + const infinicore::Tensor &image_grid_thw) const { + auto hidden_states = patch_embed_->forward(pixel_values); + hidden_states = infinicore::op::add(hidden_states, position_builder_.position_embeddings(image_grid_thw, *pos_embed_)); + auto [rotary_pos_ids, sin_table, cos_table] = position_builder_.rotary_embeddings(image_grid_thw); + for (const auto &block : blocks_) { + hidden_states = block->forward(hidden_states, rotary_pos_ids, sin_table, cos_table); + } + return merger_->forward(hidden_states); +} + +} // namespace infinilm::models::qwen3_vl diff --git a/csrc/models/qwen3_vl/qwen3_vl_vision.hpp b/csrc/models/qwen3_vl/qwen3_vl_vision.hpp new file mode 100644 index 000000000..914192ce9 --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_vision.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "infinicore/nn/embedding.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" +#include "qwen3_vl_position.hpp" +#include "qwen3_vl_vision_layers.hpp" + +#include + +namespace infinilm::models::qwen3_vl { + +class Qwen3VLVisionModel : public infinicore::nn::Module { +public: + Qwen3VLVisionModel(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &pixel_values, + const infinicore::Tensor &image_grid_thw) const; + +private: + size_t hidden_size_; + size_t spatial_merge_size_; + size_t num_grid_per_side_; + Qwen3VLPositionBuilder position_builder_; + + INFINICORE_NN_MODULE(Qwen3VLPatchEmbed, patch_embed); + INFINICORE_NN_MODULE(infinicore::nn::Embedding, pos_embed); + INFINICORE_NN_MODULE_VEC(Qwen3VLVisionBlock, blocks); + INFINICORE_NN_MODULE(Qwen3VLPatchMerger, merger); + INFINICORE_NN_MODULE_VEC(Qwen3VLPatchMerger, deepstack_merger_list); +}; + +} // namespace infinilm::models::qwen3_vl diff --git a/csrc/models/qwen3_vl/qwen3_vl_vision_layers.cpp b/csrc/models/qwen3_vl/qwen3_vl_vision_layers.cpp new file mode 100644 index 000000000..f669fcdfc --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_vision_layers.cpp @@ -0,0 +1,149 @@ +#include "qwen3_vl_vision_layers.hpp" + +#include "../../config/model_config.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/ops.hpp" +#include "infinicore/ops/mha.hpp" +#include "infinicore/ops/rope.hpp" + +#include +#include + +namespace infinilm::models::qwen3_vl { + +using infinilm::config::json_size; + +Qwen3VLPatchProjection::Qwen3VLPatchProjection(size_t out_features, + size_t in_channels, + size_t temporal_patch_size, + size_t patch_size, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : patch_dim_(in_channels * temporal_patch_size * patch_size * patch_size) { + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_channels, temporal_patch_size, patch_size, patch_size}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype, device)); +} + +infinicore::Tensor Qwen3VLPatchProjection::forward(const infinicore::Tensor &pixel_values) const { + auto input = const_cast(pixel_values); + auto weight_2d = static_cast(weight_)->view({weight_->size(0), patch_dim_}); + auto bias_tensor = static_cast(bias_); + return infinicore::op::linear(input, weight_2d, bias_tensor); +} + +Qwen3VLPatchEmbed::Qwen3VLPatchEmbed(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + INFINICORE_NN_MODULE_INIT(proj, + json_size(config, "hidden_size"), + json_size(config, "in_channels", 3), + json_size(config, "temporal_patch_size", 2), + json_size(config, "patch_size", 16), + dtype, + device); +} + +infinicore::Tensor Qwen3VLPatchEmbed::forward(const infinicore::Tensor &pixel_values) const { + return proj_->forward(pixel_values); +} + +Qwen3VLVisionMLP::Qwen3VLVisionMLP(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + size_t hidden_size = json_size(config, "hidden_size"); + size_t intermediate_size = json_size(config, "intermediate_size"); + INFINICORE_NN_MODULE_INIT(linear_fc1, hidden_size, intermediate_size, true, dtype, device); + INFINICORE_NN_MODULE_INIT(linear_fc2, intermediate_size, hidden_size, true, dtype, device); +} + +infinicore::Tensor Qwen3VLVisionMLP::forward(const infinicore::Tensor &hidden_states) const { + auto x = linear_fc1_->forward(const_cast(hidden_states)); + x = infinicore::op::gelu_tanh(x); + return linear_fc2_->forward(x); +} + +Qwen3VLVisionAttention::Qwen3VLVisionAttention(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(json_size(config, "hidden_size")), + num_heads_(json_size(config, "num_heads")), + head_dim_(hidden_size_ / num_heads_), + scale_(1.0f / std::sqrt(static_cast(head_dim_))) { + INFINICORE_NN_MODULE_INIT(qkv, hidden_size_, hidden_size_ * 3, true, dtype, device); + INFINICORE_NN_MODULE_INIT(proj, hidden_size_, hidden_size_, true, dtype, device); +} + +infinicore::Tensor Qwen3VLVisionAttention::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + const infinicore::Tensor &sin_table, + const infinicore::Tensor &cos_table) const { + size_t seq_len = hidden_states->size(0); + auto qkv = qkv_->forward(const_cast(hidden_states)) + ->view({seq_len, 3, num_heads_, head_dim_}); + auto q = qkv->narrow({{1, 0, 1}})->squeeze(1); + auto k = qkv->narrow({{1, 1, 1}})->squeeze(1); + auto v = qkv->narrow({{1, 2, 1}})->squeeze(1)->view({1, seq_len, num_heads_, head_dim_}); + + q = infinicore::op::rope(q, position_ids, sin_table, cos_table, infinicore::nn::RoPE::Algo::GPT_NEOX) + ->view({1, seq_len, num_heads_, head_dim_}); + k = infinicore::op::rope(k, position_ids, sin_table, cos_table, infinicore::nn::RoPE::Algo::GPT_NEOX) + ->view({1, seq_len, num_heads_, head_dim_}); + + auto attn_output = infinicore::op::mha(q, k, v, std::nullopt, scale_, false) + ->view({seq_len, hidden_size_}); + return proj_->forward(attn_output); +} + +Qwen3VLVisionBlock::Qwen3VLVisionBlock(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + size_t hidden_size = json_size(config, "hidden_size"); + INFINICORE_NN_MODULE_INIT(norm1, hidden_size, 1e-6, dtype, device); + INFINICORE_NN_MODULE_INIT(attn, config, dtype, device); + INFINICORE_NN_MODULE_INIT(norm2, hidden_size, 1e-6, dtype, device); + INFINICORE_NN_MODULE_INIT(mlp, config, dtype, device); +} + +infinicore::Tensor Qwen3VLVisionBlock::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + const infinicore::Tensor &sin_table, + const infinicore::Tensor &cos_table) const { + auto residual = hidden_states; + auto x = norm1_->forward(hidden_states); + x = attn_->forward(x, position_ids, sin_table, cos_table); + x = infinicore::op::add(x, residual); + + residual = x; + x = norm2_->forward(x); + x = mlp_->forward(x); + return infinicore::op::add(x, residual); +} + +Qwen3VLPatchMerger::Qwen3VLPatchMerger(const nlohmann::json &config, + bool use_postshuffle_norm, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(json_size(config, "hidden_size")), + merged_size_(hidden_size_ * json_size(config, "spatial_merge_size", 2) * json_size(config, "spatial_merge_size", 2)), + use_postshuffle_norm_(use_postshuffle_norm) { + size_t out_hidden_size = json_size(config, "out_hidden_size"); + INFINICORE_NN_MODULE_INIT(norm, use_postshuffle_norm_ ? merged_size_ : hidden_size_, 1e-6, dtype, device); + INFINICORE_NN_MODULE_INIT(linear_fc1, merged_size_, merged_size_, true, dtype, device); + INFINICORE_NN_MODULE_INIT(linear_fc2, merged_size_, out_hidden_size, true, dtype, device); +} + +infinicore::Tensor Qwen3VLPatchMerger::forward(const infinicore::Tensor &hidden_states) const { + infinicore::Tensor x; + if (use_postshuffle_norm_) { + x = hidden_states->view({hidden_states->size(0) / 4, merged_size_}); + x = norm_->forward(x); + } else { + x = norm_->forward(hidden_states); + x = x->view({x->size(0) / 4, merged_size_}); + } + x = linear_fc1_->forward(x); + x = infinicore::op::gelu(x); + return linear_fc2_->forward(x); +} + +} // namespace infinilm::models::qwen3_vl diff --git a/csrc/models/qwen3_vl/qwen3_vl_vision_layers.hpp b/csrc/models/qwen3_vl/qwen3_vl_vision_layers.hpp new file mode 100644 index 000000000..d656e2468 --- /dev/null +++ b/csrc/models/qwen3_vl/qwen3_vl_vision_layers.hpp @@ -0,0 +1,110 @@ +#pragma once + +#include "../../layers/linear/linear.hpp" +#include "infinicore/nn/layer_norm.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" + +#include + +namespace infinilm::models::qwen3_vl { + +class Qwen3VLPatchProjection : public infinicore::nn::Module { +public: + Qwen3VLPatchProjection(size_t out_features, + size_t in_channels, + size_t temporal_patch_size, + size_t patch_size, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &pixel_values) const; + +private: + size_t patch_dim_; + INFINICORE_NN_PARAMETER(weight); + INFINICORE_NN_PARAMETER(bias); +}; + +class Qwen3VLPatchEmbed : public infinicore::nn::Module { +public: + Qwen3VLPatchEmbed(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &pixel_values) const; + +private: + INFINICORE_NN_MODULE(Qwen3VLPatchProjection, proj); +}; + +class Qwen3VLVisionMLP : public infinicore::nn::Module { +public: + Qwen3VLVisionMLP(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; + +private: + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, linear_fc1); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, linear_fc2); +}; + +class Qwen3VLVisionAttention : public infinicore::nn::Module { +public: + Qwen3VLVisionAttention(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + const infinicore::Tensor &sin_table, + const infinicore::Tensor &cos_table) const; + +private: + size_t hidden_size_; + size_t num_heads_; + size_t head_dim_; + float scale_; + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, qkv); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, proj); +}; + +class Qwen3VLVisionBlock : public infinicore::nn::Module { +public: + Qwen3VLVisionBlock(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + const infinicore::Tensor &sin_table, + const infinicore::Tensor &cos_table) const; + +private: + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, norm1); + INFINICORE_NN_MODULE(Qwen3VLVisionAttention, attn); + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, norm2); + INFINICORE_NN_MODULE(Qwen3VLVisionMLP, mlp); +}; + +class Qwen3VLPatchMerger : public infinicore::nn::Module { +public: + Qwen3VLPatchMerger(const nlohmann::json &config, + bool use_postshuffle_norm, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; + +private: + size_t hidden_size_; + size_t merged_size_; + bool use_postshuffle_norm_; + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, norm); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, linear_fc1); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, linear_fc2); +}; + +} // namespace infinilm::models::qwen3_vl diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 75890b0f8..126611756 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -3,7 +3,7 @@ import infinicore -from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from infinilm.cache import PagedKVCacheConfig from infinilm.distributed import DistConfig from infinilm.lib import _infinilm @@ -13,6 +13,16 @@ import os +def _get_config_dtype(config_dict): + dtype = config_dict.get("torch_dtype") or config_dict.get("dtype") + if dtype is not None: + return dtype + text_config = config_dict.get("text_config") + if isinstance(text_config, dict): + return text_config.get("dtype") or text_config.get("torch_dtype") + return None + + def read_hf_config(model_path): config_path = os.path.join(model_path, "config.json") with open(config_path, "r") as f: @@ -24,6 +34,10 @@ def read_hf_config(model_path): and config_dict.get("dtype") is None ): config_dict["torch_dtype"] = "float32" + if config_dict.get("torch_dtype") is None and config_dict.get("dtype") is None: + dtype = _get_config_dtype(config_dict) + if dtype is not None: + config_dict["torch_dtype"] = dtype if "model_type" not in config_dict: raise ValueError( f"`model_type` is not specified in the config file `{config_path}`." @@ -94,10 +108,7 @@ def __init__( @property def dtype(self): - torch_dtype = self.hf_config.get("torch_dtype") - if torch_dtype is None: - torch_dtype = self.hf_config.get("dtype") - return parse_dtype(torch_dtype) + return parse_dtype(_get_config_dtype(self.hf_config)) @property def model_type(self): diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 9d7ff2988..d978afb98 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -181,6 +181,7 @@ def load_model_state_dict_by_file( already_loaded_keys = [] embed_tokens_torch_unscaled = None + qwen3_vl_embed_tokens_torch = None file_list = glob.glob(os.path.join(model_path, "*.safetensors")) if len(file_list) > 0: @@ -210,6 +211,10 @@ def load_model_state_dict_by_file( model_param["model.embed_tokens.weight"] = ( embed_tokens_torch_unscaled * float(scale_emb) ) + if "model.language_model.embed_tokens.weight" in model_param: + qwen3_vl_embed_tokens_torch = model_param[ + "model.language_model.embed_tokens.weight" + ] # --------------------------------------------------------- # # model_param_infini references torch.Tensor @@ -255,8 +260,13 @@ def load_model_state_dict_by_file( # Handle tied weights: if lm_head.weight is missing, share embed_tokens.weight # Use unscaled weight for lm_head (C++ alpha handles dim_model_base scaling) if "lm_head.weight" in model_keys and "lm_head.weight" not in already_loaded_keys: - if embed_tokens_torch_unscaled is not None: - lm_head_tensor = infinicore.from_torch(embed_tokens_torch_unscaled) + tied_weight = ( + embed_tokens_torch_unscaled + if embed_tokens_torch_unscaled is not None + else qwen3_vl_embed_tokens_torch + ) + if tied_weight is not None: + lm_head_tensor = infinicore.from_torch(tied_weight) model.load_state_dict({"lm_head.weight": lm_head_tensor}, strict=False) already_loaded_keys.append("lm_head.weight") @@ -332,18 +342,19 @@ def load_model_state_dict_by_tensor( t2 = time.time() print(f" load weights over! {(t2 - t1) * 1000} ms \n") + # ============================================================================ # Common weight transformation utilities # ============================================================================ + def drop_keys( state_dict: Dict[str, torch.Tensor], substrings: List[str], ) -> Dict[str, torch.Tensor]: """Drop keys containing any of the given substrings.""" return { - k: v for k, v in state_dict.items() - if not any(sub in k for sub in substrings) + k: v for k, v in state_dict.items() if not any(sub in k for sub in substrings) } @@ -425,6 +436,7 @@ def split_fused_weight( return result + def split_fused_weight_with_sizes( state_dict: Dict[str, torch.Tensor], fused_key: str, @@ -465,6 +477,7 @@ def split_fused_weight_with_sizes( return result + # ============================================================================ # Model-specific remap functions # ============================================================================ @@ -511,18 +524,22 @@ def _remap_chatglm(state_dict, config=None): ) # 4. Rename keys - state_dict = rename_keys(state_dict, { - "transformer.encoder.layers.": "model.layers.", - "transformer.embedding.word_embeddings": "model.embed_tokens", - "transformer.encoder.final_layernorm": "model.norm", - "transformer.output_layer": "lm_head", - "self_attention.": "self_attn.", - "self_attn.dense": "self_attn.o_proj", - "mlp.dense_4h_to_h": "mlp.down_proj", - }) + state_dict = rename_keys( + state_dict, + { + "transformer.encoder.layers.": "model.layers.", + "transformer.embedding.word_embeddings": "model.embed_tokens", + "transformer.encoder.final_layernorm": "model.norm", + "transformer.output_layer": "lm_head", + "self_attention.": "self_attn.", + "self_attn.dense": "self_attn.o_proj", + "mlp.dense_4h_to_h": "mlp.down_proj", + }, + ) return state_dict + def _is_baichuan2(config): """ Baichuan1 and Baichuan2 share the same model_type "baichuan" in official HuggingFace configs, @@ -535,6 +552,7 @@ def _is_baichuan2(config): """ return config.get("vocab_size") == 125696 + def _remap_baichuan(state_dict, config=None): """Split Baichuan fused W_pack into q_proj, k_proj, v_proj and apply Baichuan2-specific fixes.""" @@ -543,7 +561,7 @@ def _remap_baichuan(state_dict, config=None): hf_config = config or {} hidden_size = hf_config.get("hidden_size", 4096) num_heads = hf_config.get("num_attention_heads", 32) - vocab_size = hf_config.get("vocab_size", 125696) + # vocab_size = hf_config.get("vocab_size", 125696) per_head_dim = num_heads * (hidden_size // num_heads) # 1. Split W_pack → q_proj, k_proj, v_proj diff --git a/python/infinilm/processors/qwen3_vl_processor.py b/python/infinilm/processors/qwen3_vl_processor.py new file mode 100644 index 000000000..50611ad7e --- /dev/null +++ b/python/infinilm/processors/qwen3_vl_processor.py @@ -0,0 +1,271 @@ +from typing_extensions import override + +import torch +from transformers import AutoConfig, AutoProcessor + +from .processor import InfinilmProcessor, register_processor +from ..llm.static_scheduler import StaticSchedulerOutput +from ..llm.scheduler import SchedulerOutput + + +@register_processor("qwen3_vl") +class Qwen3VLProcessor(InfinilmProcessor): + def __init__(self, model_dir_path: str): + self.processor = AutoProcessor.from_pretrained( + model_dir_path, trust_remote_code=True + ) + self.tokenizer = self.processor.tokenizer + self.config = AutoConfig.from_pretrained(model_dir_path, trust_remote_code=True) + self.image_token_id = self.config.image_token_id + text_config = getattr(self.config, "text_config", None) + self.pixel_values_dtype = getattr(text_config, "dtype", None) or getattr( + text_config, "torch_dtype", None + ) + if self.pixel_values_dtype is None: + self.pixel_values_dtype = torch.bfloat16 + + @override + def __call__( + self, + prompt, + images=None, + videos=None, + audios=None, + return_tensors: str = None, + **kwargs, + ) -> dict: + if not images and not videos and not audios: + return self.tokenizer(prompt, return_tensors=return_tensors, **kwargs) + + processor_kwargs = {"text": [prompt], "return_tensors": "pt", **kwargs} + if images: + processor_kwargs["images"] = images + if videos: + processor_kwargs["videos"] = videos + return self.processor(**processor_kwargs) + + @override + def apply_chat_template( + self, + conversation, + add_generation_prompt: bool = False, + tokenize: bool = True, + **kwargs, + ): + normalized = [] + for msg in conversation: + content = msg["content"] + if not isinstance(content, list): + normalized.append(msg) + continue + + normalized_content = [] + for item in content: + if item.get("type") == "text": + normalized_content.append(item) + elif item.get("type") == "image_url": + normalized_content.append( + {"type": "image", "image": item["image_url"]["url"]} + ) + else: + normalized_content.append(item) + normalized.append( + {"role": msg.get("role", "user"), "content": normalized_content} + ) + + return self.processor.apply_chat_template( + conversation=normalized, + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + **kwargs, + ) + + @override + def build_model_inputs( + self, + scheduler_output: SchedulerOutput | StaticSchedulerOutput, + temperature: float = 1.0, + top_p: float = 0.8, + top_k: int = 1, + **kwargs, + ) -> dict: + if isinstance(scheduler_output, StaticSchedulerOutput): + return self._build_static(scheduler_output, temperature, top_p, top_k) + return self._build_paged(scheduler_output, temperature, top_p, top_k) + + def _append_mm_data(self, mm_data: dict, req_id: int, req, num_cached: int): + if req.processed_inputs is None or "pixel_values" not in req.processed_inputs: + return + if num_cached > 0: + image_token_positions = [ + i + for i, token in enumerate(req.prompt_token_ids) + if token == self.image_token_id + ] + if image_token_positions and image_token_positions[-1] < num_cached: + return + + import infinicore + + pixel_values = req.processed_inputs["pixel_values"].to(self.pixel_values_dtype) + image_grid_thw = req.processed_inputs["image_grid_thw"].to(torch.int64) + + mm_data.setdefault("pixel_values", []).append( + infinicore.from_torch(pixel_values) + ) + mm_data.setdefault("tgt_sizes", []).append( + infinicore.from_torch(image_grid_thw) + ) + mm_data.setdefault("image_req_ids", []).append(req_id) + + def _build_static(self, scheduler_output, temperature, top_p, top_k): + import infinicore + + req = scheduler_output.scheduled_requests[0] + mm_data = {} + if scheduler_output.is_prefill: + tokens = req.get_input_tokens() + prefix_hit_len = scheduler_output.prefix_hit_len + input_tokens = tokens[prefix_hit_len:] + position_ids = [list(range(prefix_hit_len, len(tokens)))] + past_kv_len = prefix_hit_len + total_kv_len = len(tokens) + input_offsets = [0, len(input_tokens)] + self._append_mm_data(mm_data, 0, req, prefix_hit_len) + else: + last_token = req.generated_token_ids[-1] + current_position = req.get_total_length() - 1 + input_tokens = [last_token] + position_ids = [[current_position]] + past_kv_len = current_position + total_kv_len = req.get_total_length() + input_offsets = [0, 1] + + return { + "input_ids": infinicore.from_list([input_tokens], dtype=infinicore.int64), + "position_ids": infinicore.from_list(position_ids, dtype=infinicore.int64), + "past_kv_lengths": infinicore.from_list( + [past_kv_len], dtype=infinicore.int32 + ), + "total_kv_lengths": infinicore.from_list( + [total_kv_len], dtype=infinicore.int32 + ), + "input_offsets": infinicore.from_list( + input_offsets, dtype=infinicore.int32 + ), + "cu_seqlens": infinicore.from_list( + [0, total_kv_len], dtype=infinicore.int32 + ), + "block_tables": None, + "slot_mapping": None, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + **mm_data, + } + + def _build_paged(self, scheduler_output, temperature, top_p, top_k): + import infinicore + + if not scheduler_output.scheduled_requests: + raise RuntimeError( + "build_model_inputs called with empty scheduled_requests" + ) + + tokens = [] + seq_lens = [] + seq_offsets = [0] + block_tables = [] + slot_mapping = [] + cached_lens = [] + position_ids = [] + cu_seqlens = [0] + mm_data = {} + + max_block_table_len = max( + len(req.block_table) for req in scheduler_output.scheduled_requests + ) + current_offset = 0 + + for req_id, req in enumerate(scheduler_output.scheduled_requests): + num_cached = req.num_cached_tokens + if scheduler_output.is_prefill: + req_tokens = req.get_input_tokens() + tokens_to_compute = req_tokens[num_cached:] + tokens.extend(tokens_to_compute) + compute_len = len(tokens_to_compute) + seq_len = len(req_tokens) + seq_lens.append(seq_len) + current_offset += compute_len + seq_offsets.append(current_offset) + slot_mapping.extend(req.slot_mapping) + cached_lens.append(num_cached) + position_ids.extend(range(num_cached, num_cached + compute_len)) + self._append_mm_data(mm_data, req_id, req, num_cached) + else: + seq_len = req.get_total_length() + last_token = req.generated_token_ids[-1] + tokens.append(last_token) + seq_lens.append(seq_len) + current_offset += 1 + seq_offsets.append(current_offset) + slot_mapping.extend(req.slot_mapping) + cached_lens.append(num_cached) + position_ids.append(seq_len - 1) + + block_tables.append( + req.block_table + [-1] * (max_block_table_len - len(req.block_table)) + ) + cu_seqlens.append(cu_seqlens[-1] + seq_len) + + return { + "input_ids": infinicore.from_list([tokens], dtype=infinicore.int64), + "position_ids": infinicore.from_list(position_ids, dtype=infinicore.int64), + "past_kv_lengths": infinicore.from_list( + cached_lens, dtype=infinicore.int32 + ), + "total_kv_lengths": infinicore.from_list(seq_lens, dtype=infinicore.int32), + "input_offsets": infinicore.from_list(seq_offsets, dtype=infinicore.int32), + "cu_seqlens": infinicore.from_list(cu_seqlens, dtype=infinicore.int32), + "block_tables": infinicore.from_list(block_tables, dtype=infinicore.int32), + "slot_mapping": infinicore.from_list(slot_mapping, dtype=infinicore.int64), + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + **mm_data, + } + + @override + def get_tokenizer(self): + return self.tokenizer + + @override + def get_mm_token_index_list( + self, prompt_token_ids, image_ids=None, video_ids=None, **kwargs + ): + mappings = [] + idx = 0 + image_idx = 0 + while idx < len(prompt_token_ids): + if prompt_token_ids[idx] != self.image_token_id: + idx += 1 + continue + start = idx + while ( + idx < len(prompt_token_ids) + and prompt_token_ids[idx] == self.image_token_id + ): + idx += 1 + mappings.append( + { + "start_index": start, + "end_index": idx, + "identifier": ( + image_ids[image_idx] + if image_ids and image_idx < len(image_ids) + else image_idx + ), + } + ) + image_idx += 1 + return mappings