3 changes: 2 additions & 1 deletion xllm/core/distributed_runtime/master.cpp
@@ -106,7 +106,8 @@ void resolve_npu_kernel_backend_for_options(Options* options) {
return;
}

-  const std::string model_type = get_model_type(options->model_path());
+  const std::string model_type =
+      util::get_model_type(options->model_path(), options->backend());
std::string effective_backend;
std::string resolved_name;
std::string error_message;
54 changes: 50 additions & 4 deletions xllm/core/distributed_runtime/vlm_engine.cpp
@@ -137,6 +137,14 @@ bool VLMEngine::init_model() {
n_local_kv_heads_ = std::max<int64_t>(1, n_kv_heads / world_size);
head_dim_ = args_.head_dim();
dtype_ = util::parse_dtype(args_.dtype(), options_.devices()[0]);
if (has_linear_attention_layers(args_)) {
const int64_t linear_n_k_heads = args_.linear_num_key_heads();
const int64_t linear_n_v_heads = args_.linear_num_value_heads();
n_local_linear_k_heads_ =
std::max<int64_t>(1, linear_n_k_heads / world_size);
n_local_linear_v_heads_ =
std::max<int64_t>(1, linear_n_v_heads / world_size);
}

// key + value for all layers
LOG(INFO) << "Block info, block_size: " << options_.block_size()
@@ -247,13 +255,38 @@ Engine::KVCacheCapacity VLMEngine::estimate_kv_cache_capacity() {
slot_size = 2 * dtype_size * head_dim_ * n_local_kv_heads_;
}
kv_cache_cap.slot_size = slot_size;
if (has_linear_attention_layers(args_)) {
const int64_t head_k_dim = args_.linear_key_head_dim();
const int64_t head_v_dim = args_.linear_value_head_dim();
const int64_t linear_ssm_slot_size =
dtype_size * n_local_linear_v_heads_ * head_k_dim * head_v_dim;
const int64_t linear_conv_slot_size =
dtype_size *
(head_k_dim * n_local_linear_k_heads_ * 2 +
head_v_dim * n_local_linear_v_heads_) *
(args_.linear_conv_kernel_dim() - 1);
kv_cache_cap.linear_slot_size =
linear_ssm_slot_size + linear_conv_slot_size;
}
kv_cache_cap.n_layers = args_.n_layers();

// compute kv cache n_blocks
int64_t full_attention_interval = (args_.full_attention_interval() < 1)
? 1
: args_.full_attention_interval();
int64_t num_full_attention_layers =
kv_cache_cap.n_layers / full_attention_interval;
int64_t num_linear_attention_layers =
kv_cache_cap.n_layers - num_full_attention_layers;
const int32_t block_size = options_.block_size();
-  const int64_t block_size_in_bytes = block_size * slot_size;
-  kv_cache_cap.n_blocks = kv_cache_cap.cache_size_in_bytes /
-                          (args_.n_layers() * block_size_in_bytes);
+  const int64_t full_cache_block_size_in_bytes = block_size * slot_size;
+  const int64_t total_cache_block_size_in_bytes =
+      num_full_attention_layers * full_cache_block_size_in_bytes +
+      num_linear_attention_layers * kv_cache_cap.linear_slot_size;
+  CHECK_GT(total_cache_block_size_in_bytes, 0)
+      << "invalid cache block size estimate";
+  kv_cache_cap.n_blocks =
+      kv_cache_cap.cache_size_in_bytes / total_cache_block_size_in_bytes;
CHECK_GT(kv_cache_cap.n_blocks, 0) << "no n_blocks for kv cache";

return kv_cache_cap;
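For intuition, here is a back-of-the-envelope check of the hybrid block sizing above as a standalone sketch. All numbers are made up (bf16, tensor-parallel-local head counts, an 8 GiB cache budget); none come from a real Qwen3.5 config. Full-attention layers pay `block_size * slot_size` per block, while linear-attention layers pay one recurrent-state slot per block.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical values; not taken from any real model config.
  const int64_t dtype_size = 2;                // bf16
  const int64_t head_dim = 128;
  const int64_t n_local_kv_heads = 2;          // after TP split
  const int64_t n_local_linear_k_heads = 4;
  const int64_t n_local_linear_v_heads = 8;
  const int64_t linear_key_head_dim = 128;
  const int64_t linear_value_head_dim = 128;
  const int64_t linear_conv_kernel_dim = 4;
  const int64_t n_layers = 48;
  const int64_t full_attention_interval = 4;   // 1 full-attn layer per 4 layers
  const int64_t block_size = 16;
  const int64_t cache_size_in_bytes = 8LL << 30;  // 8 GiB budget

  // Full-attention slot: key + value per token.
  const int64_t slot_size = 2 * dtype_size * head_dim * n_local_kv_heads;

  // Linear-attention state per block: SSM state + conv state.
  const int64_t linear_ssm_slot_size =
      dtype_size * n_local_linear_v_heads * linear_key_head_dim *
      linear_value_head_dim;
  const int64_t linear_conv_slot_size =
      dtype_size *
      (linear_key_head_dim * n_local_linear_k_heads * 2 +
       linear_value_head_dim * n_local_linear_v_heads) *
      (linear_conv_kernel_dim - 1);
  const int64_t linear_slot_size = linear_ssm_slot_size + linear_conv_slot_size;

  const int64_t num_full = n_layers / full_attention_interval;  // 12
  const int64_t num_linear = n_layers - num_full;               // 36

  const int64_t total_block_bytes =
      num_full * block_size * slot_size + num_linear * linear_slot_size;
  std::cout << "n_blocks: " << cache_size_in_bytes / total_block_bytes << "\n";
}
```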
@@ -266,14 +299,27 @@ bool VLMEngine::allocate_kv_cache(const Engine::KVCacheCapacity& kv_cache_cap) {
<< ", slot_size: " << kv_cache_cap.slot_size;

const int32_t block_size = options_.block_size();
const bool enable_linear_attention = has_linear_attention_layers(args_);

// init kv cache for each worker
std::vector<std::vector<int64_t>> kv_cache_shape;
-  kv_cache_shape.reserve(2);
+  kv_cache_shape.reserve(enable_linear_attention ? 4 : 2);
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks, block_size, n_local_kv_heads_, head_dim_});
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks, block_size, n_local_kv_heads_, head_dim_});
if (enable_linear_attention) {
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks,
args_.linear_key_head_dim() * n_local_linear_k_heads_ * 2 +
args_.linear_key_head_dim() * n_local_linear_v_heads_,
args_.linear_conv_kernel_dim() - 1});
kv_cache_shape.emplace_back(
std::vector<int64_t>{kv_cache_cap.n_blocks,
n_local_linear_v_heads_,
args_.linear_key_head_dim(),
args_.linear_value_head_dim()});
}
#if defined(USE_MLU)
// transpose kv_cache layout for mlu
// default layout: [n_blocks, block_size, n_head, head_dim]
2 changes: 2 additions & 0 deletions xllm/core/distributed_runtime/vlm_engine.h
@@ -91,6 +91,8 @@ class VLMEngine : public Engine {
// config for kv cache
int64_t n_local_kv_heads_ = 0;
int64_t head_dim_ = 0;
int64_t n_local_linear_v_heads_ = 0;
int64_t n_local_linear_k_heads_ = 0;
};

} // namespace xllm
7 changes: 6 additions & 1 deletion xllm/core/framework/batch/mposition.cpp
@@ -42,6 +42,11 @@ std::vector<std::tuple<std::string, int32_t, int32_t>> groupByTokenType(
current_key, start, static_cast<int32_t>(token_types.size()));
return groups;
}

bool is_qwen3_vl_position_model(const std::string& model_type) {
return absl::StartsWith(model_type, "qwen3_vl") ||
absl::StartsWith(model_type, "qwen3_5_vl");
}
} // namespace

torch::Tensor MPositionHelper::get_positions() {
@@ -63,7 +68,7 @@ torch::Tensor MPositionHelper::get_positions() {
std::tuple<torch::Tensor, int32_t> res;
if (absl::StartsWith(args_.model_type(), "glm4v")) {
res = get_positions_glm(image_grid_thw, video_grid_thw);
-  } else if (absl::StartsWith(args_.model_type(), "qwen3_vl")) {
+  } else if (is_qwen3_vl_position_model(args_.model_type())) {
res = get_positions_qwen3(image_grid_thw, video_grid_thw);
} else {
res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
10 changes: 3 additions & 7 deletions xllm/core/framework/hf_model_loader.cpp
@@ -47,6 +47,7 @@ limitations under the License.
#include "core/util/blocking_counter.h"
#include "core/util/json_reader.h"
#include "core/util/rec_model_utils.h"
#include "core/util/model_config_utils.h"
#include "core/util/scope_guard.h"
#include "core/util/tensor_helper.h"
#include "models/model_registry.h"
@@ -724,13 +725,8 @@ bool HFModelLoader::load_model_args(const std::string& model_weights_path) {
return false;
}

-  std::string model_type;
-  if (auto data = reader.value<std::string>("model_type")) {
-    model_type = data.value();
-  } else {
-    LOG(ERROR) << "Failed to find model_type in " << args_file_path;
-    return false;
-  }
+  const std::string model_type = util::get_model_type(
+      reader, std::filesystem::path(model_weights_path), FLAGS_backend);

std::string resolved_model_type;
std::string error_message;
57 changes: 57 additions & 0 deletions xllm/core/framework/hf_model_loader_test.cpp
@@ -17,13 +17,38 @@ limitations under the License.

#include <gtest/gtest.h>

#include <filesystem>

#include "core/platform/device.h"
#include "core/util/model_config_utils.h"
#if defined(USE_NPU)
#include "models/model_registry.h"
#endif

namespace xllm {

TEST(HFModelLoaderTest, Qwen35BackendAwareModelTypeSelection) {
JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
{
"architectures": ["Qwen3_5ForConditionalGeneration"],
"model_type": "qwen3_5",
"text_config": {
"model_type": "qwen3_5_text"
},
"vision_config": {
"model_type": "qwen3_5"
}
}
)json"));

const auto fake_model_path = std::filesystem::path("/tmp/Qwen3.5-27B");
EXPECT_EQ(util::get_model_type(reader, fake_model_path), "qwen3_5_text");
EXPECT_EQ(util::get_model_type(reader, fake_model_path, "vlm"), "qwen3_5_vl");
EXPECT_EQ(util::get_model_type(reader, fake_model_path, "llm"),
"qwen3_5_text");
}
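Reading the expectations above, the backend-aware selection apparently prefers `text_config.model_type` for the default and `"llm"` backends and maps the base type to its `_vl` variant for `"vlm"`. A hypothetical sketch of that resolution order (the real `util::get_model_type` lives in `core/util/model_config_utils.*` and is not shown in this diff):

```cpp
#include <string>

// Hypothetical sketch only; not the actual util::get_model_type.
std::string resolve_model_type_sketch(const std::string& base_type,  // "qwen3_5"
                                      const std::string& text_type,  // "qwen3_5_text"
                                      const std::string& backend) {  // "", "llm", "vlm"
  if (backend == "vlm") {
    return base_type + "_vl";  // e.g. qwen3_5 -> qwen3_5_vl
  }
  return text_type.empty() ? base_type : text_type;
}
```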

TEST(HFModelLoaderTest, LoadCompressedTensorsFp8StaticConfig) {
JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
@@ -123,6 +148,38 @@ TEST(HFModelLoaderTest, Qwen35MtpModelArgsFromMoeConfig) {
EXPECT_EQ(args.layer_types()[0], "full_attention");
EXPECT_EQ(args.layer_types()[1], "full_attention");
}

TEST(HFModelLoaderTest, Qwen35TextModelArgsKeepTextTypeAndMropeConfig) {
auto loader = ModelRegistry::get_model_args_loader("qwen3_5_text");
ASSERT_TRUE(loader != nullptr);

JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
{
"architectures": ["Qwen3_5ForConditionalGeneration"],
"model_type": "qwen3_5",
"text_config": {
"model_type": "qwen3_5_text",
"rope_parameters": {
"mrope_interleaved": true,
"mrope_section": [11, 11, 10],
"partial_rotary_factor": 0.25,
"rope_theta": 10000000
}
},
"vision_config": {
"model_type": "qwen3_5"
}
}
)json"));

ModelArgs args;
ASSERT_TRUE(loader(reader, &args));
EXPECT_EQ(args.model_type(), "qwen3_5_text");
EXPECT_EQ(args.rope_scaling_mrope_section(),
(std::vector<int64_t>{11, 11, 10}));
EXPECT_TRUE(args.rope_scaling_mrope_interleaved());
}
#endif

} // namespace xllm
24 changes: 20 additions & 4 deletions xllm/core/framework/model/causal_vlm.h
@@ -79,19 +79,35 @@ class CausalVLMImpl : public CausalVLM {

#if defined(USE_NPU)
layer::NpuLmHead get_npu_lm_head() override {
-    return model_->get_npu_lm_head();
+    if constexpr (detail::has_get_npu_lm_head<Model>::value) {
+      return model_->get_npu_lm_head();
+    } else {
+      return CausalLM::get_npu_lm_head();
+    }
}

void set_npu_lm_head(layer::NpuLmHead& head) override {
-    model_->set_npu_lm_head(head);
+    if constexpr (detail::has_set_npu_lm_head<Model>::value) {
+      model_->set_npu_lm_head(head);
+    } else {
+      CausalLM::set_npu_lm_head(head);
+    }
}

layer::NpuWordEmbedding get_npu_word_embedding() override {
-    return model_->get_npu_word_embedding();
+    if constexpr (detail::has_get_npu_word_embedding<Model>::value) {
+      return model_->get_npu_word_embedding();
+    } else {
+      return CausalLM::get_npu_word_embedding();
+    }
}

void set_npu_word_embedding(layer::NpuWordEmbedding& embedding) override {
-    model_->set_npu_word_embedding(embedding);
+    if constexpr (detail::has_set_npu_word_embedding<Model>::value) {
+      model_->set_npu_word_embedding(embedding);
+    } else {
+      CausalLM::set_npu_word_embedding(embedding);
+    }
}
#endif
layer::LmHead get_lm_head() override {
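The `detail::has_get_npu_lm_head` family of traits used by the `if constexpr` branches above is not part of this diff. A minimal sketch of how such a member-detection trait can be written with `std::void_t` (an assumed shape, not the project's actual definition):

```cpp
#include <type_traits>
#include <utility>

namespace detail {
// Hypothetical sketch of one detection trait referenced above; the real
// definitions are elsewhere in the codebase and not shown in this diff.
template <typename T, typename = void>
struct has_get_npu_lm_head : std::false_type {};

template <typename T>
struct has_get_npu_lm_head<
    T,
    std::void_t<decltype(std::declval<T&>().get_npu_lm_head())>>
    : std::true_type {};
}  // namespace detail
```

The same pattern repeats for the set/get word-embedding traits; models that lack the NPU accessors fall back to the `CausalLM` base implementation at compile time, with no virtual dispatch or runtime check.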
1 change: 1 addition & 0 deletions xllm/core/framework/model/model_args.h
@@ -83,6 +83,7 @@ struct ModelArgs {
PROPERTY(float, rope_scaling_mscale) = 0.0f;
PROPERTY(float, rope_scaling_mscale_all_dim) = 0.0f;
PROPERTY(std::vector<int64_t>, rope_scaling_mrope_section);
PROPERTY(bool, rope_scaling_mrope_interleaved) = false;

// the maximum sequence length to use for rotary position embeddings.
PROPERTY(int64_t, max_position_embeddings) = 0;
28 changes: 28 additions & 0 deletions xllm/core/layers/common/rotary_embedding_util.cpp
@@ -86,6 +86,17 @@ class CosSinCacheManager {
using torch::indexing::None;
using ISlice = torch::indexing::Slice;

inline torch::Tensor rotate_every_two(const torch::Tensor& x) {
auto x1 = x.index({ISlice(), ISlice(), ISlice(0, None, 2)});
auto x2 = x.index({ISlice(), ISlice(), ISlice(1, None, 2)});
return torch::stack({-x2, x1}, /*dim=*/-1).flatten(/*start_dim=*/-2);
}

inline torch::Tensor rotate_half(const torch::Tensor& x) {
auto chunks = x.chunk(2, /*dim=*/-1);
return torch::cat({-chunks[1], chunks[0]}, /*dim=*/-1);
}

// Inverse dim formula to find dim based on number of rotations
inline double yarn_find_correction_dim(int num_rotations,
int dim,
@@ -420,6 +431,23 @@ torch::Tensor get_deepseek_rotary_embedding(
options);
return cos_sin;
}

std::tuple<torch::Tensor, torch::Tensor> apply_rotary_pos_emb(
const torch::Tensor& q,
const torch::Tensor& k,
const torch::Tensor& cos,
const torch::Tensor& sin,
bool interleaved) {
if (interleaved) {
auto q_embed = (q * cos) + (rotate_every_two(q) * sin);
auto k_embed = (k * cos) + (rotate_every_two(k) * sin);
return std::make_tuple(q_embed, k_embed);
}

auto q_embed = (q * cos) + (rotate_half(q) * sin);
auto k_embed = (k * cos) + (rotate_half(k) * sin);
return std::make_tuple(q_embed, k_embed);
}
} // namespace rotary
} // namespace layer
} // namespace xllm
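As a usage sketch of the new helper: with `cos = 1` and `sin = 0` the rotation is the identity, which makes a cheap sanity check. Shapes here are made up, `cos`/`sin` are assumed to be pre-broadcast against `q` and `k`, and the include path is inferred from this diff. `interleaved=true` rotates adjacent dim pairs via `rotate_every_two` (GPT-J style), while `interleaved=false` rotates the two halves via `rotate_half` (GPT-NeoX style).

```cpp
#include <iostream>

#include <torch/torch.h>

#include "core/layers/common/rotary_embedding_util.h"

int main() {
  // Hypothetical shapes: [n_tokens, n_heads, head_dim].
  auto q = torch::randn({4, 2, 8});
  auto k = torch::randn({4, 2, 8});
  auto cos = torch::ones({4, 1, 8});   // broadcasts over heads
  auto sin = torch::zeros({4, 1, 8});

  auto [q_embed, k_embed] = xllm::layer::rotary::apply_rotary_pos_emb(
      q, k, cos, sin, /*interleaved=*/true);

  // cos == 1, sin == 0 -> identity rotation.
  std::cout << torch::allclose(q_embed, q) << std::endl;  // prints 1
}
```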
7 changes: 7 additions & 0 deletions xllm/core/layers/common/rotary_embedding_util.h
@@ -116,6 +116,13 @@ torch::Tensor get_deepseek_rotary_embedding(
float mscale_all_dim,
const torch::TensorOptions& options);

std::tuple<torch::Tensor, torch::Tensor> apply_rotary_pos_emb(
const torch::Tensor& q,
const torch::Tensor& k,
const torch::Tensor& cos,
const torch::Tensor& sin,
bool interleaved);

#if defined(USE_MUSA)
torch::Tensor get_interleave_rotary_embedding(
int64_t dim,