ModelTC
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/CN/source/getting_started/installation.rst‎
Lines changed: 8 additions & 4 deletions b/‎docs/CN/source/getting_started/installation.rst‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎docs/CN/source/index.rst‎
Lines changed: 2 additions & 0 deletions b/‎docs/CN/source/index.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/CN/source/models/add_new_model.md‎
Lines changed: 0 additions & 37 deletions b/‎docs/CN/source/models/add_new_model.md‎
Lines changed: 0 additions & 37 deletions
diff --git a/‎…N/source/tutorial/api_server_args_zh.rst‎ ‎…s/CN/source/tutorial/api_server_args.rst‎docs/CN/source/tutorial/api_server_args_zh.rst renamed to docs/CN/source/tutorial/api_server_args.rst
Lines changed: 40 additions & 42 deletions b/‎…N/source/tutorial/api_server_args_zh.rst‎ ‎…s/CN/source/tutorial/api_server_args.rst‎docs/CN/source/tutorial/api_server_args_zh.rst renamed to docs/CN/source/tutorial/api_server_args.rst
Lines changed: 40 additions & 42 deletions
@@ -6,3 +6,4 @@ dist
 .idea
 .vscode
 tmp/
+requirements-musa.txt
@@ -10,4 +10,4 @@ repos:
     rev: 6.1.0 
     hooks:
       - id: flake8
-        args: ['--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606, E231']
+        args: ['--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606, E231, F541']
@@ -27,7 +27,7 @@ Lightllm 是一个纯python开发的推理框架，其中的算子使用triton
     $ # 前请确保你的docker设置中已经分配了足够的共享内存，否则可能导致
     $ # 服务无法正常启动。
     $ # 1.如果是纯文本服务，建议分配2GB以上的共享内存, 如果你的内存充足，建议分配16GB以上的共享内存.
-    $ # 2.如果是多模态服务，建议分配16GB以上的共享内存，具体可以根据实际情况进行调整. 
+    $ # 2.如果是多模态服务，建议分配16GB以上的共享内存，具体可以根据实际情况进行调整.
     $ # 如果你没有足够的共享内存，可以尝试在启动服务的时候调低 --running_max_req_size 参数，这会降低
     $ # 服务的并发请求数量，但可以减少共享内存的占用。如果是多模态服务，也可以通过降低 --cache_capacity
     $ # 参数来减少共享内存的占用。
@@ -38,7 +38,7 @@ Lightllm 是一个纯python开发的推理框架，其中的算子使用triton
 你也可以使用源码手动构建镜像并运行,建议手动构建镜像,因为更新比较频繁：
 
 .. code-block:: console
-    
+
     $ # 进入代码仓库的根目录
     $ cd /lightllm
     $ # 手动构建镜像, docker 目录下有不同功能场景的镜像构建文件，按需构建。
@@ -52,7 +52,7 @@ Lightllm 是一个纯python开发的推理框架，其中的算子使用triton
 或者你也可以直接使用脚本一键启动镜像并且运行：
 
 .. code-block:: console
-    
+
     $ # 查看脚本参数
     $ python tools/quick_launch_docker.py --help
 
@@ -80,6 +80,10 @@ Lightllm 是一个纯python开发的推理框架，其中的算子使用triton
     $ # 安装lightllm的依赖 (cuda 12.4)
     $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124
     $
+    $ # 安装lightllm的依赖 (摩尔线程 GPU)
+    $ ./generate_requirements_musa.sh
+    $ pip install -r requirements-musa.txt
+    $
     $ # 安装lightllm
     $ python setup.py install
 
@@ -97,6 +101,6 @@ Lightllm 是一个纯python开发的推理框架，其中的算子使用triton
     .. code-block:: console
 
         $ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly --no-deps
-    
+
     具体原因可以参考：`issue <https://github.com/triton-lang/triton/issues/3619>`_ 和 `fix PR <https://github.com/triton-lang/triton/pull/3638>`_
 
@@ -53,6 +53,8 @@ Lightllm 整合了众多的开源方案的优点，包括但不限于 FasterTran
    多模态部署 <tutorial/multimodal>
    奖励模型部署 <tutorial/reward_model>
    OpenAI 接口使用 <tutorial/openai>
+   工具调用（Function Calling） <tutorial/function_calling>
+   思考解析（Reasoning Parser） <tutorial/reasoning_parser>
    APIServer 参数详解 <tutorial/api_server_args_zh>
    lightllm api介绍 <tutorial/api_param>
 
 
@@ -162,19 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
                                                                  self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
             self.lm_head_weight_ = self.wte_weight_
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.pre_norm_weight_, 
-                   self.pre_norm_bias_, 
-                   self.final_norm_weight_, 
-                   self.final_norm_bias_,
-                   self.wte_weight_,
-                   self.lm_head_weight_]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
-
 ~~~
 
 ***transformer_layer_weight.py***
@@ -204,30 +191,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
         self._load_qkvo_weights(weights)
         self._load_ffn_weights(weights)
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.att_norm_weight_,
-                   self.att_norm_bias_,
-                   self.q_weight_,
-                   self.k_weight_,
-                   self.v_weight_,
-                   self.q_bias_,
-                   self.k_bias_,
-                   self.v_bias_,
-                   self.o_weight_,
-                   self.o_bias_,
-
-                   self.ffn_norm_weight_,
-                   self.ffn_norm_bias_,
-                   self.ffn_1_weight_,
-                   self.ffn_1_bias_,
-                   self.ffn_2_weight_,
-                   self.ffn_2_bias_,
-                   ]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
 
     def _load_qkvo_weights(self, weights):
         if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
 
@@ -1,4 +1,4 @@
-.. _tutorial/api_server_args_zh:
+.. _tutorial/api_server_args:
 
 APIServer 参数详解
 ==================
@@ -183,22 +183,6 @@ PD 分离模式参数
     设置为 True 时，--nccl_host 必须等于 config_server_host，--nccl_port 对于 config_server 必须是唯一的，
     不要为不同的推理节点使用相同的 nccl_port，这将是严重错误
 
-attention类型选择参数
----------------------
-
-.. option:: --mode
-
-    模型推理模式，可以指定多个值：
-    
-    * ``triton_int8kv``: 使用 int8 存储 kv cache，可增加 token 容量，使用 triton kernel
-    * ``ppl_int8kv``: 使用 int8 存储 kv cache，使用 ppl 快速 kernel
-    * ``ppl_fp16``: 使用 ppl 快速 fp16 解码注意力 kernel
-    * ``triton_flashdecoding``: 用于长上下文的 flashdecoding 模式，当前支持 llama llama2 qwen
-    * ``triton_gqa_attention``: 使用 GQA 的模型的快速 kernel
-    * ``triton_gqa_flashdecoding``: 使用 GQA 的模型的快速 flashdecoding kernel
-    * ``triton_fp8kv``: 使用 float8 存储 kv cache，目前仅用于 deepseek2
-    
-    需要阅读源代码以确认所有模型支持的具体模式
 
 调度参数
 --------
@@ -300,6 +284,17 @@ attention类型选择参数
 
     为 ViT 构建分布式环境的 NCCL 端口列表，例如 29500 29501 29502，默认为 [29500]
 
+.. option:: --vit_att_backend
+
+    设置 ViT 使用的注意力后端。可选值为：
+
+    * ``auto``: 自动选择最佳后端（默认值），优先级为 fa3 > xformers > sdpa > triton
+    * ``fa3``: 使用 Flash-Attention 3 后端
+    * ``xformers``: 使用 xformers 后端
+    * ``sdpa``: 使用 sdpa 后端
+    * ``triton``: 使用 Triton 后端
+
+
 性能优化参数
 ------------
 
@@ -326,18 +321,28 @@ attention类型选择参数
 .. option:: --enable_decode_microbatch_overlap
 
     推理后端将为解码使用微批次重叠模式
-    
-.. option:: --enable_flashinfer_prefill
 
-    推理后端将为预填充使用 flashinfer 的注意力 kernel
-    
-.. option:: --enable_flashinfer_decode
+.. option:: --llm_prefill_att_backend
+
+    设置预填充（Prefill）阶段使用的注意力后端。可选值为：
+
+    * ``auto``: 自动选择最佳后端（默认值），优先级为 fa3 > flashinfer > triton
+    * ``fa3``: 使用 Flash-Attention 3 后端
+    * ``flashinfer``: 使用 FlashInfer 后端
+    * ``triton``: 使用 Triton 后端
+
+.. option:: --llm_decode_att_backend
+
+    设置解码（Decode）阶段使用的注意力后端。可选值为：
 
-    推理后端将为解码使用 flashinfer 的注意力 kernel
+    * ``auto``: 自动选择最佳后端（默认值），优先级为 fa3 > flashinfer > triton
+    * ``fa3``: 使用 Flash-Attention 3 后端
+    * ``flashinfer``: 使用 FlashInfer 后端
+    * ``triton``: 使用 Triton 后端
 
-.. option:: --enable_fa3
+.. option:: --llm_kv_type
 
-    推理后端将为预填充和解码使用 fa3 注意力 kernel
+    设置推理后端存储 kv cache 所使用的数据类型，可选值为 ``None``、``int8kv``、``int4kv``、``fp8kv``
 
 .. option:: --disable_cudagraph
 
@@ -373,17 +378,14 @@ attention类型选择参数
 .. option:: --quant_type
 
     量化方法，可选值：
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``vllm-fp8w8a8-b128``
+    * ``deepgemm-fp8w8a8-b128``
     * ``triton-fp8w8a8-block128``
+    * ``awq``
+    * ``awq_marlin``
     * ``none`` (默认)
 
 .. option:: --quant_cfg
@@ -395,13 +397,7 @@ attention类型选择参数
 .. option:: --vit_quant_type
 
     ViT 量化方法，可选值：
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``none`` (默认)
@@ -447,10 +443,12 @@ MTP 多预测参数
 
 .. option:: --mtp_mode
 
-    支持的 mtp 模式，建议使用 deepseekv3_eagle获得更好的性能体验，可选值：
+    支持的 mtp 模式，建议使用 ``eagle_with_att`` 以获得更好的性能体验，可选值：
 
-    * ``deepseekv3_vanilla``
-    * ``deepseekv3_eagle``
+    * ``vanilla_with_att``
+    * ``eagle_with_att``
+    * ``vanilla_no_att``
+    * ``eagle_no_att``
     * ``None``: 不启用 mtp（默认）
 
 .. option:: --mtp_draft_model_dir
-Original file line number
+Diff line change
 .idea
 .vscode
 tmp/
 +requirements-musa.txt