Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions xllm/core/common/global_flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ DEFINE_int32(max_tokens_for_graph_mode,
2048,
"Maximum number of tokens for graph execution. "
"If 0, no limit is applied.");

DEFINE_int32(acl_graph_decode_batch_size_limit,
16,
"Decode batch size threshold for ACL graph on NPU. "
"When actual decode batch_size > this value, ACL graph decode "
"falls back to eager mode to avoid OOM.");
// --- vlm config ---

DEFINE_int32(limit_image_per_prompt,
Expand Down
1 change: 1 addition & 0 deletions xllm/core/common/global_flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ DECLARE_bool(enable_prefill_piecewise_graph);
DECLARE_bool(enable_graph_vmm_pool);

DECLARE_int32(max_tokens_for_graph_mode);
DECLARE_int32(acl_graph_decode_batch_size_limit);

DECLARE_bool(enable_chunked_prefill);

Expand Down
1 change: 1 addition & 0 deletions xllm/core/common/help_formatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const OptionCategory kCommonOptions = {"COMMON OPTIONS",
"enable_graph_mode_decode_no_padding",
"enable_prefill_piecewise_graph",
"max_tokens_for_graph_mode",
"acl_graph_decode_batch_size_limit",
"communication_backend",
"task"}};

Expand Down
28 changes: 20 additions & 8 deletions xllm/core/layers/npu/npu_glm4_decoder_layer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ NpuGlm4DecoderLayerImpl::NpuGlm4DecoderLayerImpl(const ModelContext& context)
auto options = context.get_tensor_options();

param_from_args(prefill_param_, model_args, parallel_args, true);
param_from_args(decode_param_, model_args, parallel_args, false);
param_from_args(decode_graph_param_, model_args, parallel_args, false);
decode_eager_param_ = decode_graph_param_;
decode_eager_param_.enableAclGraphPagedAttention = false;
Comment on lines +90 to +91
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Copying decode_graph_param_ into decode_eager_param_ and then hand-overriding a single member (enableAclGraphPagedAttention) is error-prone: any member later added to the graph-specific configuration must also be remembered here, or the eager path silently diverges. Consider a dedicated factory method on ChatglmLayerParam (e.g. make_eager_variant(const ChatglmLayerParam&)) so this derivation lives in one place.

atb_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER);
placeholder_vec_ = {1};
dtype_ = c10::typeMetaToScalarType(options.dtype());
Expand All @@ -102,7 +104,10 @@ int64_t NpuGlm4DecoderLayerImpl::init_layer() {
name_ = "glm4_decoder_layer";
model_name_ = "glm4";
CHECK_OPERATION_STATUS_RETURN(init_node(prefill_node_, prefill_param_));
CHECK_OPERATION_STATUS_RETURN(init_node(decode_node_, decode_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_graph_node_, decode_graph_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_eager_node_, decode_eager_param_));

return atb::NO_ERROR;
}
Expand Down Expand Up @@ -164,21 +169,27 @@ torch::Tensor NpuGlm4DecoderLayerImpl::forward(torch::Tensor& x,
attn_mask,
kv_cache,
input_params,
true);
true,
false);
// mstxRangeEnd(id);
st = execute_node(prefill_node_, node_id, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< "excute prefill layer fail, error code: " << st;
} else {
build_node_variant_pack(decode_node_,
const bool use_graph_decode_input =
FLAGS_enable_graph && input_params.graph_buffer.tiling_data.defined();
auto& decode_node =
use_graph_decode_input ? decode_graph_node_ : decode_eager_node_;
build_node_variant_pack(decode_node,
x,
cos_pos,
sin_pos,
decode_attn_mask_,
kv_cache,
input_params,
false);
st = execute_node(decode_node_, node_id + 1000, event, event_flag);
false,
use_graph_decode_input);
st = execute_node(decode_node, node_id + 1000, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< "excute decode layer fail, error code: " << st;
}
Expand All @@ -194,7 +205,8 @@ void NpuGlm4DecoderLayerImpl::build_node_variant_pack(
at::Tensor& attn_mask,
KVCache& kv_cache,
ModelInputParams& input_params,
bool is_prefill) {
bool is_prefill,
bool use_graph_decode_input) {
internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x);
// std::cout<<"node.variantPack.inTensors.size:"<<node.variantPack.inTensors.size()<<std::endl;
node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER) = internal_tensors_;
Expand Down Expand Up @@ -229,7 +241,7 @@ void NpuGlm4DecoderLayerImpl::build_node_variant_pack(
input_params.q_seq_lens_vec.data();
}

if (FLAGS_enable_graph && !is_prefill &&
if (!is_prefill && use_graph_decode_input &&
input_params.graph_buffer.tiling_data.defined()) {
node.variantPack.inTensors.at(input_idx++) =
atb_speed::Utils::AtTensor2Tensor(
Expand Down
9 changes: 6 additions & 3 deletions xllm/core/layers/npu/npu_glm4_decoder_layer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ class NpuGlm4DecoderLayerImpl : public BaseLayer {
torch::Tensor& attn_mask,
KVCache& kv_cache,
ModelInputParams& input_params,
bool is_prefill);
bool is_prefill,
bool use_graph_decode_input);

void initialize_quantization_parameters(
atb_speed::chatglm::ChatglmLayerParam& param);
Expand All @@ -86,10 +87,12 @@ class NpuGlm4DecoderLayerImpl : public BaseLayer {
int64_t init_attn_mask();

atb_speed::Model::Node prefill_node_;
atb_speed::Model::Node decode_node_;
atb_speed::Model::Node decode_graph_node_;
atb_speed::Model::Node decode_eager_node_;
std::string model_name_;
atb_speed::chatglm::ChatglmLayerParam prefill_param_;
atb_speed::chatglm::ChatglmLayerParam decode_param_;
atb_speed::chatglm::ChatglmLayerParam decode_graph_param_;
atb_speed::chatglm::ChatglmLayerParam decode_eager_param_;
atb::Tensor internal_tensors_;
atb::Tensor placeholder_;

Expand Down
28 changes: 20 additions & 8 deletions xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ NpuGlm4MoeDecoderImpl::NpuGlm4MoeDecoderImpl(const ModelContext& context,
end_expert_id_ = start_expert_id_ + num_experts_per_partition_ - 1;

param_from_args(prefill_param_, model_args, parallel_args, true);
param_from_args(decode_param_, model_args, parallel_args, false);
param_from_args(decode_graph_param_, model_args, parallel_args, false);
decode_eager_param_ = decode_graph_param_;
decode_eager_param_.enableAclGraphPagedAttention = false;
Comment on lines +51 to +52
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

As in the other decoder implementations, manually copying decode_graph_param_ and then flipping enableAclGraphPagedAttention is fragile — the same derivation is now duplicated across three layer classes. Please encapsulate this logic in the parameter struct or a shared factory method so all call sites stay consistent.

atb_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER);
placeholder_vec_ = {1};
device_id_ = options.device().index();
Expand Down Expand Up @@ -300,7 +302,10 @@ int64_t NpuGlm4MoeDecoderImpl::init_layer() {
BaseLayer::name_ = "glm4_moe_decoder_layer " + std::to_string(layer_id_);
model_name_ = "Glm4_Moe";
CHECK_OPERATION_STATUS_RETURN(init_node(prefill_node_, prefill_param_));
CHECK_OPERATION_STATUS_RETURN(init_node(decode_node_, decode_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_graph_node_, decode_graph_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_eager_node_, decode_eager_param_));

return atb::NO_ERROR;
}
Expand Down Expand Up @@ -356,20 +361,26 @@ torch::Tensor NpuGlm4MoeDecoderImpl::forward(
attn_mask,
kv_cache,
input_params,
true);
true,
false);
st = execute_node(prefill_node_, node_id, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< " excute prefill layer fail, error code: " << st;
} else {
build_node_variant_pack(decode_node_,
const bool use_graph_decode_input =
FLAGS_enable_graph && input_params.graph_buffer.tiling_data.defined();
auto& decode_node =
use_graph_decode_input ? decode_graph_node_ : decode_eager_node_;
build_node_variant_pack(decode_node,
x,
cos_pos,
sin_pos,
/*attn_mask*/ tensor_placeholder_,
kv_cache,
input_params,
false);
st = execute_node(decode_node_, node_id + 1000, event, event_flag);
false,
use_graph_decode_input);
st = execute_node(decode_node, node_id + 1000, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< " excute decode layer fail, error code: " << st;
}
Expand All @@ -385,7 +396,8 @@ void NpuGlm4MoeDecoderImpl::build_node_variant_pack(
torch::Tensor& attn_mask,
KVCache& kv_cache,
const ModelInputParams& input_params,
bool is_prefill) {
bool is_prefill,
bool use_graph_decode_input) {
internal_tensor_ = atb_speed::Utils::AtTensor2Tensor(x);
auto& dp_ep_padding = input_params.dp_ep_padding_data;

Expand Down Expand Up @@ -458,7 +470,7 @@ void NpuGlm4MoeDecoderImpl::build_node_variant_pack(
node.variantPack.inTensors.at(input_idx++) =
atb_speed::Utils::AtTensor2Tensor(tensor_placeholder_);

if (FLAGS_enable_graph && !is_prefill &&
if (!is_prefill && use_graph_decode_input &&
input_params.graph_buffer.tiling_data.defined()) {
node.variantPack.inTensors.at(input_idx++) =
atb_speed::Utils::AtTensor2Tensor(
Expand Down
11 changes: 7 additions & 4 deletions xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ class NpuGlm4MoeDecoderImpl : public BaseLayer {
torch::Tensor& attn_mask,
KVCache& kv_cache,
const ModelInputParams& input_params,
bool is_prefill);
bool is_prefill,
bool use_graph_decode_input);

std::string model_name_;

Expand All @@ -116,10 +117,12 @@ class NpuGlm4MoeDecoderImpl : public BaseLayer {

int32_t num_speculative_tokens_ = 0;
atb_speed::glm::MoeLayerParam prefill_param_;
atb_speed::glm::MoeLayerParam decode_param_;
atb_speed::glm::MoeLayerParam decode_graph_param_;
atb_speed::glm::MoeLayerParam decode_eager_param_;

atb_speed::Model::Node prefill_node_;
atb_speed::Model::Node decode_node_;
atb_speed::Model::Node decode_graph_node_;
atb_speed::Model::Node decode_eager_node_;

atb::Tensor internal_tensor_;

Expand All @@ -144,4 +147,4 @@ std::vector<torch::Tensor> get_dtp_inputs(torch::Tensor token_size_per_dp_group,
int32_t rank,
at::Device device);
} // namespace layer
} // namespace xllm
} // namespace xllm
28 changes: 20 additions & 8 deletions xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,9 @@ NpuQwen3DecoderLayerImpl::NpuQwen3DecoderLayerImpl(const ModelContext& context)
auto options = context.get_tensor_options();

param_from_args(prefill_param_, model_args, parallel_args, true);
param_from_args(decode_param_, model_args, parallel_args, false);
param_from_args(decode_graph_param_, model_args, parallel_args, false);
decode_eager_param_ = decode_graph_param_;
decode_eager_param_.enableAclGraphPagedAttention = false;
Comment on lines +161 to +162
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Manually overriding enableAclGraphPagedAttention after copying the parameter struct is a maintenance hazard: a future graph-only field added to the struct would be silently inherited by the eager variant. Encapsulate this initialization (e.g. a factory method that derives the eager params from the graph params) to prevent such regressions.

atb_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER);
placeholder_vec_ = {1};
dtype_ = c10::typeMetaToScalarType(options.dtype());
Expand Down Expand Up @@ -188,7 +190,10 @@ int64_t NpuQwen3DecoderLayerImpl::init_layer() {
name_ = "qwen3_decoder_layer";
model_name_ = "qwen3";
CHECK_OPERATION_STATUS_RETURN(init_node(prefill_node_, prefill_param_));
CHECK_OPERATION_STATUS_RETURN(init_node(decode_node_, decode_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_graph_node_, decode_graph_param_));
CHECK_OPERATION_STATUS_RETURN(
init_node(decode_eager_node_, decode_eager_param_));

return atb::NO_ERROR;
}
Expand Down Expand Up @@ -246,22 +251,28 @@ torch::Tensor NpuQwen3DecoderLayerImpl::forward(torch::Tensor& x,
kv_cache,
input_params,
/*is_prefill=*/true,
node_id);
node_id,
/*use_graph_decode_input=*/false);
// mstxRangeEnd(id);
st = execute_node(prefill_node_, node_id, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< "excute prefill layer fail, error code: " << st;
} else {
build_node_variant_pack(decode_node_,
const bool use_graph_decode_input =
FLAGS_enable_graph && input_params.graph_buffer.tiling_data.defined();
auto& decode_node =
use_graph_decode_input ? decode_graph_node_ : decode_eager_node_;
build_node_variant_pack(decode_node,
x,
cos_pos,
sin_pos,
decode_attn_mask_,
kv_cache,
input_params,
/*is_prefill=*/false,
node_id);
st = execute_node(decode_node_, node_id + 1000, event, event_flag);
node_id,
use_graph_decode_input);
st = execute_node(decode_node, node_id + 1000, event, event_flag);
LOG_IF(FATAL, st != 0) << model_name_
<< "excute decode layer fail, error code: " << st;
}
Expand All @@ -278,7 +289,8 @@ void NpuQwen3DecoderLayerImpl::build_node_variant_pack(
KVCache& kv_cache,
ModelInputParams& input_params,
bool is_prefill,
int node_id) {
int node_id,
bool use_graph_decode_input) {
internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x);
node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER) = internal_tensors_;
node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 1) =
Expand Down Expand Up @@ -342,7 +354,7 @@ void NpuQwen3DecoderLayerImpl::build_node_variant_pack(
input_params.q_seq_lens_vec.data();
}

if (FLAGS_enable_graph && !is_prefill &&
if (!is_prefill && use_graph_decode_input &&
input_params.graph_buffer.tiling_data.defined()) {
node.variantPack.inTensors.at(input_idx++) =
atb_speed::Utils::AtTensor2Tensor(
Expand Down
9 changes: 6 additions & 3 deletions xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ class NpuQwen3DecoderLayerImpl : public BaseLayer {
KVCache& kv_cache,
ModelInputParams& input_params,
bool is_prefill,
int node_id);
int node_id,
bool use_graph_decode_input);

void initialize_parallel_parameters(atb_speed::qwen::QwenLayerParam& param,
const ParallelArgs& parallel_args);
Expand All @@ -90,10 +91,12 @@ class NpuQwen3DecoderLayerImpl : public BaseLayer {
int64_t init_attn_mask();

atb_speed::Model::Node prefill_node_;
atb_speed::Model::Node decode_node_;
atb_speed::Model::Node decode_graph_node_;
atb_speed::Model::Node decode_eager_node_;
std::string model_name_;
atb_speed::qwen::QwenLayerParam prefill_param_;
atb_speed::qwen::QwenLayerParam decode_param_;
atb_speed::qwen::QwenLayerParam decode_graph_param_;
atb_speed::qwen::QwenLayerParam decode_eager_param_;
atb::Tensor internal_tensors_;
atb::Tensor placeholder_;

Expand Down
Loading
Loading