3 changes: 2 additions & 1 deletion xllm/core/distributed_runtime/master.cpp
@@ -106,7 +106,8 @@ void resolve_npu_kernel_backend_for_options(Options* options) {
return;
}

-  const std::string model_type = get_model_type(options->model_path());
+  const std::string model_type =
+      util::get_model_type(options->model_path(), options->backend());
std::string effective_backend;
std::string resolved_name;
std::string error_message;
54 changes: 50 additions & 4 deletions xllm/core/distributed_runtime/vlm_engine.cpp
@@ -137,6 +137,14 @@ bool VLMEngine::init_model() {
n_local_kv_heads_ = std::max<int64_t>(1, n_kv_heads / world_size);
head_dim_ = args_.head_dim();
dtype_ = util::parse_dtype(args_.dtype(), options_.devices()[0]);
if (has_linear_attention_layers(args_)) {
const int64_t linear_n_k_heads = args_.linear_num_key_heads();
const int64_t linear_n_v_heads = args_.linear_num_value_heads();
n_local_linear_k_heads_ =
std::max<int64_t>(1, linear_n_k_heads / world_size);
n_local_linear_v_heads_ =
std::max<int64_t>(1, linear_n_v_heads / world_size);
}

// key + value for all layers
LOG(INFO) << "Block info, block_size: " << options_.block_size()
@@ -247,13 +255,38 @@ Engine::KVCacheCapacity VLMEngine::estimate_kv_cache_capacity() {
slot_size = 2 * dtype_size * head_dim_ * n_local_kv_heads_;
}
kv_cache_cap.slot_size = slot_size;
if (has_linear_attention_layers(args_)) {
const int64_t head_k_dim = args_.linear_key_head_dim();
const int64_t head_v_dim = args_.linear_value_head_dim();
const int64_t linear_ssm_slot_size =
dtype_size * n_local_linear_v_heads_ * head_k_dim * head_v_dim;
const int64_t linear_conv_slot_size =
dtype_size *
(head_k_dim * n_local_linear_k_heads_ * 2 +
head_v_dim * n_local_linear_v_heads_) *
(args_.linear_conv_kernel_dim() - 1);
kv_cache_cap.linear_slot_size =
linear_ssm_slot_size + linear_conv_slot_size;
}
kv_cache_cap.n_layers = args_.n_layers();

// compute kv cache n_blocks
int64_t full_attention_interval = (args_.full_attention_interval() < 1)
? 1
: args_.full_attention_interval();
int64_t num_full_attention_layers =
kv_cache_cap.n_layers / full_attention_interval;
int64_t num_linear_attention_layers =
kv_cache_cap.n_layers - num_full_attention_layers;
const int32_t block_size = options_.block_size();
-  const int64_t block_size_in_bytes = block_size * slot_size;
-  kv_cache_cap.n_blocks = kv_cache_cap.cache_size_in_bytes /
-                          (args_.n_layers() * block_size_in_bytes);
+  const int64_t full_cache_block_size_in_bytes = block_size * slot_size;
+  const int64_t total_cache_block_size_in_bytes =
+      num_full_attention_layers * full_cache_block_size_in_bytes +
+      num_linear_attention_layers * kv_cache_cap.linear_slot_size;
+  CHECK_GT(total_cache_block_size_in_bytes, 0)
+      << "invalid cache block size estimate";
+  kv_cache_cap.n_blocks =
+      kv_cache_cap.cache_size_in_bytes / total_cache_block_size_in_bytes;
CHECK_GT(kv_cache_cap.n_blocks, 0) << "no n_blocks for kv cache";

return kv_cache_cap;
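For intuition, here is a back-of-the-envelope check of the hybrid block sizing above as a standalone sketch. All numbers are made up (bf16, tensor-parallel-local head counts, an 8 GiB cache budget); none come from a real Qwen3.5 config. Full-attention layers pay `block_size * slot_size` per block, while linear-attention layers pay one recurrent-state slot per block.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical values; not taken from any real model config.
  const int64_t dtype_size = 2;                // bf16
  const int64_t head_dim = 128;
  const int64_t n_local_kv_heads = 2;          // after TP split
  const int64_t n_local_linear_k_heads = 4;
  const int64_t n_local_linear_v_heads = 8;
  const int64_t linear_key_head_dim = 128;
  const int64_t linear_value_head_dim = 128;
  const int64_t linear_conv_kernel_dim = 4;
  const int64_t n_layers = 48;
  const int64_t full_attention_interval = 4;   // 1 full-attn layer per 4 layers
  const int64_t block_size = 16;
  const int64_t cache_size_in_bytes = 8LL << 30;  // 8 GiB budget

  // Full-attention slot: key + value per token.
  const int64_t slot_size = 2 * dtype_size * head_dim * n_local_kv_heads;

  // Linear-attention state per block: SSM state + conv state.
  const int64_t linear_ssm_slot_size =
      dtype_size * n_local_linear_v_heads * linear_key_head_dim *
      linear_value_head_dim;
  const int64_t linear_conv_slot_size =
      dtype_size *
      (linear_key_head_dim * n_local_linear_k_heads * 2 +
       linear_value_head_dim * n_local_linear_v_heads) *
      (linear_conv_kernel_dim - 1);
  const int64_t linear_slot_size = linear_ssm_slot_size + linear_conv_slot_size;

  const int64_t num_full = n_layers / full_attention_interval;  // 12
  const int64_t num_linear = n_layers - num_full;               // 36

  const int64_t total_block_bytes =
      num_full * block_size * slot_size + num_linear * linear_slot_size;
  std::cout << "n_blocks: " << cache_size_in_bytes / total_block_bytes << "\n";
}
```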
@@ -266,14 +299,27 @@ bool VLMEngine::allocate_kv_cache(const Engine::KVCacheCapacity& kv_cache_cap) {
<< ", slot_size: " << kv_cache_cap.slot_size;

const int32_t block_size = options_.block_size();
const bool enable_linear_attention = has_linear_attention_layers(args_);

// init kv cache for each worker
std::vector<std::vector<int64_t>> kv_cache_shape;
-  kv_cache_shape.reserve(2);
+  kv_cache_shape.reserve(enable_linear_attention ? 4 : 2);
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks, block_size, n_local_kv_heads_, head_dim_});
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks, block_size, n_local_kv_heads_, head_dim_});
if (enable_linear_attention) {
kv_cache_shape.emplace_back(std::vector<int64_t>{
kv_cache_cap.n_blocks,
args_.linear_key_head_dim() * n_local_linear_k_heads_ * 2 +
args_.linear_key_head_dim() * n_local_linear_v_heads_,
args_.linear_conv_kernel_dim() - 1});
kv_cache_shape.emplace_back(
std::vector<int64_t>{kv_cache_cap.n_blocks,
n_local_linear_v_heads_,
args_.linear_key_head_dim(),
args_.linear_value_head_dim()});
}
#if defined(USE_MLU)
// transpose kv_cache layout for mlu
// default layout: [n_blocks, block_size, n_head, head_dim]
2 changes: 2 additions & 0 deletions xllm/core/distributed_runtime/vlm_engine.h
@@ -91,6 +91,8 @@ class VLMEngine : public Engine {
// config for kv cache
int64_t n_local_kv_heads_ = 0;
int64_t head_dim_ = 0;
int64_t n_local_linear_v_heads_ = 0;
int64_t n_local_linear_k_heads_ = 0;
};

} // namespace xllm
7 changes: 6 additions & 1 deletion xllm/core/framework/batch/mposition.cpp
@@ -42,6 +42,11 @@ std::vector<std::tuple<std::string, int32_t, int32_t>> groupByTokenType(
current_key, start, static_cast<int32_t>(token_types.size()));
return groups;
}

bool is_qwen3_vl_position_model(const std::string& model_type) {
return absl::StartsWith(model_type, "qwen3_vl") ||
absl::StartsWith(model_type, "qwen3_5_vl");
}
} // namespace

torch::Tensor MPositionHelper::get_positions() {
@@ -63,7 +68,7 @@ torch::Tensor MPositionHelper::get_positions() {
std::tuple<torch::Tensor, int32_t> res;
if (absl::StartsWith(args_.model_type(), "glm4v")) {
res = get_positions_glm(image_grid_thw, video_grid_thw);
-  } else if (absl::StartsWith(args_.model_type(), "qwen3_vl")) {
+  } else if (is_qwen3_vl_position_model(args_.model_type())) {
res = get_positions_qwen3(image_grid_thw, video_grid_thw);
} else {
res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
10 changes: 3 additions & 7 deletions xllm/core/framework/hf_model_loader.cpp
@@ -47,6 +47,7 @@ limitations under the License.
#include "core/util/blocking_counter.h"
#include "core/util/json_reader.h"
#include "core/util/rec_model_utils.h"
#include "core/util/model_config_utils.h"
#include "core/util/scope_guard.h"
#include "core/util/tensor_helper.h"
#include "models/model_registry.h"
@@ -724,13 +725,8 @@ bool HFModelLoader::load_model_args(const std::string& model_weights_path) {
return false;
}

-  std::string model_type;
-  if (auto data = reader.value<std::string>("model_type")) {
-    model_type = data.value();
-  } else {
-    LOG(ERROR) << "Failed to find model_type in " << args_file_path;
-    return false;
-  }
+  const std::string model_type = util::get_model_type(
+      reader, std::filesystem::path(model_weights_path), FLAGS_backend);

std::string resolved_model_type;
std::string error_message;
57 changes: 57 additions & 0 deletions xllm/core/framework/hf_model_loader_test.cpp
@@ -17,13 +17,38 @@ limitations under the License.

#include <gtest/gtest.h>

#include <filesystem>

#include "core/platform/device.h"
#include "core/util/model_config_utils.h"
#if defined(USE_NPU)
#include "models/model_registry.h"
#endif

namespace xllm {

TEST(HFModelLoaderTest, Qwen35BackendAwareModelTypeSelection) {
JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
{
"architectures": ["Qwen3_5ForConditionalGeneration"],
"model_type": "qwen3_5",
"text_config": {
"model_type": "qwen3_5_text"
},
"vision_config": {
"model_type": "qwen3_5"
}
}
)json"));

const auto fake_model_path = std::filesystem::path("/tmp/Qwen3.5-27B");
EXPECT_EQ(util::get_model_type(reader, fake_model_path), "qwen3_5_text");
EXPECT_EQ(util::get_model_type(reader, fake_model_path, "vlm"), "qwen3_5_vl");
EXPECT_EQ(util::get_model_type(reader, fake_model_path, "llm"),
"qwen3_5_text");
}
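Reading the expectations above, the backend-aware selection apparently prefers `text_config.model_type` for the default and `"llm"` backends and maps the base type to its `_vl` variant for `"vlm"`. A hypothetical sketch of that resolution order (the real `util::get_model_type` lives in `core/util/model_config_utils.*` and is not shown in this diff):

```cpp
#include <string>

// Hypothetical sketch only; not the actual util::get_model_type.
std::string resolve_model_type_sketch(const std::string& base_type,  // "qwen3_5"
                                      const std::string& text_type,  // "qwen3_5_text"
                                      const std::string& backend) {  // "", "llm", "vlm"
  if (backend == "vlm") {
    return base_type + "_vl";  // e.g. qwen3_5 -> qwen3_5_vl
  }
  return text_type.empty() ? base_type : text_type;
}
```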

TEST(HFModelLoaderTest, LoadCompressedTensorsFp8StaticConfig) {
JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
@@ -123,6 +148,38 @@ TEST(HFModelLoaderTest, Qwen35MtpModelArgsFromMoeConfig) {
EXPECT_EQ(args.layer_types()[0], "full_attention");
EXPECT_EQ(args.layer_types()[1], "full_attention");
}

TEST(HFModelLoaderTest, Qwen35TextModelArgsKeepTextTypeAndMropeConfig) {
auto loader = ModelRegistry::get_model_args_loader("qwen3_5_text");
ASSERT_TRUE(loader != nullptr);

JsonReader reader;
ASSERT_TRUE(reader.parse_text(R"json(
{
"architectures": ["Qwen3_5ForConditionalGeneration"],
"model_type": "qwen3_5",
"text_config": {
"model_type": "qwen3_5_text",
"rope_parameters": {
"mrope_interleaved": true,
"mrope_section": [11, 11, 10],
"partial_rotary_factor": 0.25,
"rope_theta": 10000000
}
},
"vision_config": {
"model_type": "qwen3_5"
}
}
)json"));

ModelArgs args;
ASSERT_TRUE(loader(reader, &args));
EXPECT_EQ(args.model_type(), "qwen3_5_text");
EXPECT_EQ(args.rope_scaling_mrope_section(),
(std::vector<int64_t>{11, 11, 10}));
EXPECT_TRUE(args.rope_scaling_mrope_interleaved());
}
#endif

} // namespace xllm
24 changes: 20 additions & 4 deletions xllm/core/framework/model/causal_vlm.h
@@ -79,19 +79,35 @@ class CausalVLMImpl : public CausalVLM {

#if defined(USE_NPU)
layer::NpuLmHead get_npu_lm_head() override {
-    return model_->get_npu_lm_head();
+    if constexpr (detail::has_get_npu_lm_head<Model>::value) {
+      return model_->get_npu_lm_head();
+    } else {
+      return CausalLM::get_npu_lm_head();
+    }
}

void set_npu_lm_head(layer::NpuLmHead& head) override {
-    model_->set_npu_lm_head(head);
+    if constexpr (detail::has_set_npu_lm_head<Model>::value) {
+      model_->set_npu_lm_head(head);
+    } else {
+      CausalLM::set_npu_lm_head(head);
+    }
}

layer::NpuWordEmbedding get_npu_word_embedding() override {
-    return model_->get_npu_word_embedding();
+    if constexpr (detail::has_get_npu_word_embedding<Model>::value) {
+      return model_->get_npu_word_embedding();
+    } else {
+      return CausalLM::get_npu_word_embedding();
+    }
}

void set_npu_word_embedding(layer::NpuWordEmbedding& embedding) override {
-    model_->set_npu_word_embedding(embedding);
+    if constexpr (detail::has_set_npu_word_embedding<Model>::value) {
+      model_->set_npu_word_embedding(embedding);
+    } else {
+      CausalLM::set_npu_word_embedding(embedding);
+    }
}
#endif
layer::LmHead get_lm_head() override {
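The `detail::has_get_npu_lm_head` family of traits used by the `if constexpr` branches above is not part of this diff. A minimal sketch of how such a member-detection trait can be written with `std::void_t` (an assumed shape, not the project's actual definition):

```cpp
#include <type_traits>
#include <utility>

namespace detail {
// Hypothetical sketch of one detection trait referenced above; the real
// definitions are elsewhere in the codebase and not shown in this diff.
template <typename T, typename = void>
struct has_get_npu_lm_head : std::false_type {};

template <typename T>
struct has_get_npu_lm_head<
    T,
    std::void_t<decltype(std::declval<T&>().get_npu_lm_head())>>
    : std::true_type {};
}  // namespace detail
```

The same pattern repeats for the set/get word-embedding traits; models that lack the NPU accessors fall back to the `CausalLM` base implementation at compile time, with no virtual dispatch or runtime check.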
1 change: 1 addition & 0 deletions xllm/core/framework/model/model_args.h
@@ -83,6 +83,7 @@ struct ModelArgs {
PROPERTY(float, rope_scaling_mscale) = 0.0f;
PROPERTY(float, rope_scaling_mscale_all_dim) = 0.0f;
PROPERTY(std::vector<int64_t>, rope_scaling_mrope_section);
PROPERTY(bool, rope_scaling_mrope_interleaved) = false;

// the maximum sequence length to use for rotary position embeddings.
PROPERTY(int64_t, max_position_embeddings) = 0;
28 changes: 28 additions & 0 deletions xllm/core/layers/common/rotary_embedding_util.cpp
@@ -86,6 +86,17 @@ class CosSinCacheManager {
using torch::indexing::None;
using ISlice = torch::indexing::Slice;

inline torch::Tensor rotate_every_two(const torch::Tensor& x) {
auto x1 = x.index({ISlice(), ISlice(), ISlice(0, None, 2)});
auto x2 = x.index({ISlice(), ISlice(), ISlice(1, None, 2)});
return torch::stack({-x2, x1}, /*dim=*/-1).flatten(/*start_dim=*/-2);
}

inline torch::Tensor rotate_half(const torch::Tensor& x) {
auto chunks = x.chunk(2, /*dim=*/-1);
return torch::cat({-chunks[1], chunks[0]}, /*dim=*/-1);
}

// Inverse dim formula to find dim based on number of rotations
inline double yarn_find_correction_dim(int num_rotations,
int dim,
@@ -420,6 +431,23 @@ torch::Tensor get_deepseek_rotary_embedding(
options);
return cos_sin;
}

std::tuple<torch::Tensor, torch::Tensor> apply_rotary_pos_emb(
const torch::Tensor& q,
const torch::Tensor& k,
const torch::Tensor& cos,
const torch::Tensor& sin,
bool interleaved) {
if (interleaved) {
auto q_embed = (q * cos) + (rotate_every_two(q) * sin);
auto k_embed = (k * cos) + (rotate_every_two(k) * sin);
return std::make_tuple(q_embed, k_embed);
}

auto q_embed = (q * cos) + (rotate_half(q) * sin);
auto k_embed = (k * cos) + (rotate_half(k) * sin);
return std::make_tuple(q_embed, k_embed);
}
} // namespace rotary
} // namespace layer
} // namespace xllm
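As a usage sketch of the new helper: with `cos = 1` and `sin = 0` the rotation is the identity, which makes a cheap sanity check. Shapes here are made up, `cos`/`sin` are assumed to be pre-broadcast against `q` and `k`, and the include path is inferred from this diff. `interleaved=true` rotates adjacent dim pairs via `rotate_every_two` (GPT-J style), while `interleaved=false` rotates the two halves via `rotate_half` (GPT-NeoX style).

```cpp
#include <iostream>

#include <torch/torch.h>

#include "core/layers/common/rotary_embedding_util.h"

int main() {
  // Hypothetical shapes: [n_tokens, n_heads, head_dim].
  auto q = torch::randn({4, 2, 8});
  auto k = torch::randn({4, 2, 8});
  auto cos = torch::ones({4, 1, 8});   // broadcasts over heads
  auto sin = torch::zeros({4, 1, 8});

  auto [q_embed, k_embed] = xllm::layer::rotary::apply_rotary_pos_emb(
      q, k, cos, sin, /*interleaved=*/true);

  // cos == 1, sin == 0 -> identity rotation.
  std::cout << torch::allclose(q_embed, q) << std::endl;  // prints 1
}
```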
7 changes: 7 additions & 0 deletions xllm/core/layers/common/rotary_embedding_util.h
@@ -116,6 +116,13 @@ torch::Tensor get_deepseek_rotary_embedding(
float mscale_all_dim,
const torch::TensorOptions& options);

std::tuple<torch::Tensor, torch::Tensor> apply_rotary_pos_emb(
const torch::Tensor& q,
const torch::Tensor& k,
const torch::Tensor& cos,
const torch::Tensor& sin,
bool interleaved);

#if defined(USE_MUSA)
torch::Tensor get_interleave_rotary_embedding(
int64_t dim,