From 80f469530a1d12d3eda97085ede1534a144792ec Mon Sep 17 00:00:00 2001
From: ali-88123 <1940747290@qq.com>
Date: Fri, 31 Oct 2025 15:28:24 +0800
Subject: [PATCH 1/2] delete tokenizer folder; refactor the deployment test
 script to take external parameters

---
 README.md                                     |  13 +-
 README_en.md                                  |  13 +-
 angelslim/models/llm/kimi_k2.py               |   2 +-
 .../llm/tiktoken_tokenizer.py}                |   0
 angelslim/tokenizer/__init__.py               |  15 --
 scripts/deploy/lm_eval.sh                     | 162 ++++++++++++++----
 scripts/deploy/lmms_eval.sh                   | 115 +++++++++++--
 scripts/deploy/offline.py                     |   2 +-
 scripts/deploy/openai.sh                      | 104 ++++++++++-
 scripts/deploy/run_sglang.sh                  |  71 +++++++-
 scripts/deploy/run_vllm.sh                    |  87 +++++++++-
 11 files changed, 488 insertions(+), 96 deletions(-)
 rename angelslim/{tokenizer/kimi_k2.py => models/llm/tiktoken_tokenizer.py} (100%)
 delete mode 100644 angelslim/tokenizer/__init__.py

diff --git a/README.md b/README.md
index 64efc221..7937cc0b 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,7 @@ python3 tools/spec_benchmark.py \
 测试`transformers`加载量化模型离线推理：
 
 ```shell
-python deploy/offline.py $MODEL_PATH
+python deploy/offline.py $MODEL_PATH "Hello, my name is"
 ```
 
 其中 `MODEL_PATH` 为量化产出模型路径。
@@ -168,15 +168,16 @@ python deploy/offline.py $MODEL_PATH
 [vLLM](https://github.com/vllm-project/vllm) 服务启动脚本，建议版本`vllm>=0.8.5.post1`，部署MOE INT8量化模型需要`vllm>=0.9.2`。
 
 ```shell
-bash deploy/run_vllm.sh $MODEL_PATH
+bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
 ```
+其中`-d`为可见设备，`-t`为张量并行度，`-p`为流水线并行度，`-g`为显存使用率。
 
 **SGLang**
 
 [SGLang](https://github.com/sgl-project/sglang) 服务启动脚本，建议版本 `sglang>=0.4.6.post1`：
 
 ```shell
-bash deploy/run_sglang.sh $MODEL_PATH
+bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
 ```
 
 #### 3. 服务调用
@@ -184,16 +185,18 @@ bash deploy/run_sglang.sh $MODEL_PATH
 通过 [OpenAI 格式](https://platform.openai.com/docs/api-reference/introduction) 接口发起请求：
 
 ```shell
-bash deploy/openai.sh $MODEL_PATH
+bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
 ```
+其中`-p`为输入prompt
 
 #### 4. 效果验证
 
 使用 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 评估量化模型精度，建议版本`lm-eval>=0.4.8`：
 
 ```shell
-bash deploy/lm_eval.sh $MODEL_PATH
+bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
 ```
+其中`RESULT_PATH`为测试结果保存目录，`-b`为batch size大小，`--tasks`为评测任务，`-n`为few-shot数量
 
 详细操作指南请参阅[部署文档](https://angelslim.readthedocs.io/zh-cn/latest/deployment/deploy.html)。
 
diff --git a/README_en.md b/README_en.md
index 8b7fb219..e70519d4 100644
--- a/README_en.md
+++ b/README_en.md
@@ -154,7 +154,7 @@ If you need to load a quantized model via `transformers`, please set the `deploy
 To test offline inference with a quantized model loaded via `transformers`, run the following command:
 
 ```shell
-python deploy/offline.py $MODEL_PATH
+python deploy/offline.py $MODEL_PATH "Hello, my name is"
 ```
 
 Where `MODEL_PATH` is the path to the quantized model output.
@@ -169,8 +169,9 @@ Use the following script to launch a [vLLM](https://github.com/vllm-project/vllm
 
 
 ```shell
-bash deploy/run_vllm.sh $MODEL_PATH
+bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
 ```
+Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipeline parallel size, and `-g` is the GPU memory utilization.
 
 **SGLang**
 
@@ -178,7 +179,7 @@ bash deploy/run_vllm.sh $MODEL_PATH
 Use the following script to launch a [SGLang](https://github.com/sgl-project/sglang) server, recommended version `sglang>=0.4.6.post1`.
 
 ```shell
-bash deploy/run_sglang.sh $MODEL_PATH
+bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
 ```
 
 #### 3. Service Invocation
@@ -186,16 +187,18 @@ bash deploy/run_sglang.sh $MODEL_PATH
 Invoke requests via [OpenAI's API format](https://platform.openai.com/docs/api-reference/introduction):
 
 ```shell
-bash deploy/openai.sh $MODEL_PATH
+bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
 ```
+where `-p` is the input prompt.
 
 #### 4. Performance Evaluation
 
 Evaluate the performance of quantized model using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), recommended version`lm-eval>=0.4.8`:
 
 ```shell
-bash deploy/lm_eval.sh $MODEL_PATH
+bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
 ```
+where `RESULT_PATH` is the directory for saving test results, `-b` is batch size, `--tasks` specifies the evaluation tasks, and `-n` is the number of few-shot examples.
 
 For more detaileds, please refer to the [Deployment Documentation](https://angelslim.readthedocs.io/zh-cn/latest/deployment/deploy.html).
 
diff --git a/angelslim/models/llm/kimi_k2.py b/angelslim/models/llm/kimi_k2.py
index bb14285f..a6a500cf 100644
--- a/angelslim/models/llm/kimi_k2.py
+++ b/angelslim/models/llm/kimi_k2.py
@@ -16,11 +16,11 @@
 from transformers import AutoModelForCausalLM
 from transformers.models.deepseek_v3 import DeepseekV3Config
 
-from ...tokenizer import TikTokenTokenizer
 from ...utils import print_info
 from ..model_factory import SlimModelFactory
 from .deepseek import DeepSeek
 from .modeling_deepseek import DeepseekV3ForCausalLM
+from .tiktoken_tokenizer import TikTokenTokenizer
 
 
 @SlimModelFactory.register
diff --git a/angelslim/tokenizer/kimi_k2.py b/angelslim/models/llm/tiktoken_tokenizer.py
similarity index 100%
rename from angelslim/tokenizer/kimi_k2.py
rename to angelslim/models/llm/tiktoken_tokenizer.py
diff --git a/angelslim/tokenizer/__init__.py b/angelslim/tokenizer/__init__.py
deleted file mode 100644
index 35ef522c..00000000
--- a/angelslim/tokenizer/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright 2025 Tencent Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .kimi_k2 import TikTokenTokenizer  # noqa: F401
diff --git a/scripts/deploy/lm_eval.sh b/scripts/deploy/lm_eval.sh
index 0af8fcdc..70a2da4d 100644
--- a/scripts/deploy/lm_eval.sh
+++ b/scripts/deploy/lm_eval.sh
@@ -1,12 +1,82 @@
 #!/bin/bash
 
-# Set environment variables
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-export PYTHON_MULTIPROCESSING_METHOD=spawn
-export VLLM_WORKER_MULTIPROC_METHOD=spawn
-export HF_ALLOW_CODE_EVAL=1
+usage() {
+    cat << EOF
+Usage: $0 [OPTIONS] <model_path1> <model_path2> ...
+
+Options:
+  -d, --devices DEVICES          CUDA devices to use (default: 0,1,2,3)
+  -t, --tensor-parallel SIZE     Tensor parallel size (default: 4)
+  -g, --gpu-memory UTILIZATION   GPU memory utilization (default: 0.9)
+  -r, --result-dir DIR           Base result directory (default: ./results)
+  -b, --batch-size SIZE          Batch size for auto tasks (default: auto)
+  --tasks TASK1,TASK2,...        Comma-separated list of tasks to evaluate (default: ceval-valid,mmlu,gsm8k,humaneval)
+  -n, --num-fewshot NUM          Number of few-shot examples (default: 0)
+  -h, --help                     Show this help message
+
+Examples:
+  bash $0 -d 0,1 -t 2 --gpu-memory 0.8 /path/to/model1 /path/to/model2
+  bash $0 --tasks ceval-valid,mmlu,gsm8k,humaneval /path/to/model1
+EOF
+}
 
+CUDA_VISIBLE_DEVICES="0,1,2,3"
 INFERENCE_TP_SIZE=4
+GPU_MEMORY_UTILIZATION=0.9
+RESULT_BASE_DIR="./results"
+BATCH_SIZE="auto"
+TASKS=("ceval-valid" "mmlu" "gsm8k" "humaneval")
+NUM_FEWSHOT=0
+
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -d|--devices)
+            CUDA_VISIBLE_DEVICES="$2"
+            shift 2
+            ;;
+        -t|--tensor-parallel)
+            INFERENCE_TP_SIZE="$2"
+            shift 2
+            ;;
+        -g|--gpu-memory)
+            GPU_MEMORY_UTILIZATION="$2"
+            shift 2
+            ;;
+        -r|--result-dir)
+            RESULT_BASE_DIR="$2"
+            shift 2
+            ;;
+        -b|--batch-size)
+            BATCH_SIZE="$2"
+            shift 2
+            ;;
+        --tasks)
+            IFS=',' read -ra TASKS <<< "$2"
+            shift 2
+            ;;
+        -n|--num-fewshot)
+            NUM_FEWSHOT="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*|--*)
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
 
 # Check if model paths are provided
 if [ $# -eq 0 ]; then
@@ -14,6 +84,30 @@ if [ $# -eq 0 ]; then
     exit 1
 fi
 
+# Set environment variables
+export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+export PYTHON_MULTIPROCESSING_METHOD=spawn
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
+export HF_ALLOW_CODE_EVAL=1
+
+echo "======================================================"
+echo "           Model Evaluation Configuration"
+echo "======================================================"
+echo "CUDA Visible Devices:      $CUDA_VISIBLE_DEVICES"
+echo "Tensor Parallel Size:      $INFERENCE_TP_SIZE"
+echo "GPU Memory Utilization:    $GPU_MEMORY_UTILIZATION"
+echo "Result Base Directory:     $RESULT_BASE_DIR"
+echo "Batch Size:                $BATCH_SIZE"
+echo "Number of Few-shot:        $NUM_FEWSHOT"
+echo "Tasks to Evaluate:         ${TASKS[*]}"
+echo "Number of Models:          $#"
+echo "Model Paths:"
+for model_path in "$@"; do
+    echo "  - $model_path"
+done
+echo "======================================================"
+echo
+
 # Iterate over all provided model paths
 for MODEL_PATH in "$@"; do
     # Extract model name from path (last directory name)
@@ -21,43 +115,35 @@ for MODEL_PATH in "$@"; do
     echo "======================================================"
     echo "Evaluating model: $MODEL_NAME"
     echo "Model path: $MODEL_PATH"
-    echo "======================================================"
     
     # Create dedicated result directory for the model
-    RESULT_PATH="./results/$MODEL_NAME"
+    RESULT_PATH="$RESULT_BASE_DIR/$MODEL_NAME"
     mkdir -p "$RESULT_PATH"
     
-    # Evaluate ceval, mmlu, gsm8k
-    lm_eval --model vllm \
-        --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \
-        --tasks ceval-valid \
-        --num_fewshot 5 \
-        --batch_size auto \
-        --output_path "$RESULT_PATH/ceval_results.json" 2>&1 | tee "$RESULT_PATH/ceval.log"
-    
-    lm_eval --model vllm \
-        --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \
-        --tasks mmlu \
-        --num_fewshot 4 \
-        --batch_size 1 \
-        --output_path "$RESULT_PATH/mmlu_results.json" 2>&1 | tee "$RESULT_PATH/mmlu.log"
-    
-    lm_eval --model vllm \
-        --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \
-        --tasks gsm8k \
-        --num_fewshot 5 \
-        --batch_size auto \
-        --output_path "$RESULT_PATH/gsm8k_results.json" 2>&1 | tee "$RESULT_PATH/gsm8k.log"
-    
-    # Evaluate humaneval
-    lm_eval --model vllm \
-        --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \
-        --tasks humaneval \
-        --num_fewshot 0 \
-        --batch_size auto \
-        --confirm_run_unsafe_code \
-        --output_path "$RESULT_PATH/humaneval_results.json" 2>&1 | tee "$RESULT_PATH/humaneval.log"
-    
+    for TASK in "${TASKS[@]}"; do
+        echo "=============================================="
+        echo "Evaluating task: $TASK"
+        echo "Number of few-shot: $NUM_FEWSHOT"
+        echo "=============================================="
+        if [[ "$TASK" == *"humaneval"* ]]; then
+            # Evaluate humaneval
+            lm_eval --model vllm \
+                --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \
+                --tasks $TASK \
+                --num_fewshot $NUM_FEWSHOT \
+                --batch_size $BATCH_SIZE \
+                --confirm_run_unsafe_code \
+                --output_path "$RESULT_PATH/$TASK.json" 2>&1 | tee "$RESULT_PATH/$TASK.log"
+        else
+            lm_eval --model vllm \
+                --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \
+                --tasks $TASK \
+                --num_fewshot $NUM_FEWSHOT \
+                --batch_size $BATCH_SIZE \
+                --output_path "$RESULT_PATH/$TASK.json" 2>&1 | tee "$RESULT_PATH/$TASK.log"
+        fi
+    done
+
     echo "Evaluation completed for $MODEL_NAME"
     echo "Results saved to: $RESULT_PATH"
 done
diff --git a/scripts/deploy/lmms_eval.sh b/scripts/deploy/lmms_eval.sh
index 0d218ddb..1b352215 100644
--- a/scripts/deploy/lmms_eval.sh
+++ b/scripts/deploy/lmms_eval.sh
@@ -1,26 +1,116 @@
 #!/bin/bash
 
-# Set environment variables
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-export PYTHON_MULTIPROCESSING_METHOD=spawn
-export VLLM_WORKER_MULTIPROC_METHOD=spawn
-export HF_HOME="~/.cache/huggingface"
-export HF_HUB_ENABLE_HF_TRANSFER="1"
-export NCCL_BLOCKING_WAIT=1
-export NCCL_TIMEOUT=18000000
-export NCCL_DEBUG=DEBUG
+usage() {
+    cat << EOF
+Usage: $0 [OPTIONS] <model_path1> <model_path2> ...
 
+Options:
+  -d, --devices DEVICES          CUDA devices to use (default: 0,1,2,3)
+  -t, --tensor-parallel SIZE     Tensor parallel size (default: 4)
+  -g, --gpu-memory UTILIZATION   GPU memory utilization (default: 0.9)
+  -r, --result-dir DIR           Base result directory (default: ./results)
+  -b, --batch-size SIZE          Batch size (default: 16)
+  -c, --hf-home                  Cache dir for HF datasets
+  --tasks TASK1,TASK2,...        Comma-separated list of tasks to evaluate (default: mmmu_val,docvqa_val,chartqa)
+  -h, --help                     Show this help message
+
+Examples:
+  bash $0 -d 0,1 -t 2 --gpu-memory 0.8 /path/to/model1 /path/to/model2
+  bash $0 --tasks mmmu_val,docvqa_val,chartqa /path/to/model1
+EOF
+}
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
 INFERENCE_TP_SIZE=4
+GPU_MEMORY_UTILIZATION=0.9
+RESULT_BASE_DIR="./results"
 BATCH_SIZE=16
-
+HF_HOME="$HOME/.cache/huggingface"  # "~" is not expanded inside double quotes
 TASKS=("mmmu_val" "docvqa_val" "chartqa")
 
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -d|--devices)
+            CUDA_VISIBLE_DEVICES="$2"
+            shift 2
+            ;;
+        -t|--tensor-parallel)
+            INFERENCE_TP_SIZE="$2"
+            shift 2
+            ;;
+        -g|--gpu-memory)
+            GPU_MEMORY_UTILIZATION="$2"
+            shift 2
+            ;;
+        -r|--result-dir)
+            RESULT_BASE_DIR="$2"
+            shift 2
+            ;;
+        -b|--batch-size)
+            BATCH_SIZE="$2"
+            shift 2
+            ;;
+        -c|--hf-home)
+            HF_HOME="$2"
+            shift 2
+            ;;
+        --tasks)
+            IFS=',' read -ra TASKS <<< "$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*|--*)
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
 # Check if model paths are provided
 if [ $# -eq 0 ]; then
     echo "Usage: $0 <model_path1> <model_path2> ..."
     exit 1
 fi
 
+# Set environment variables
+export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+export PYTHON_MULTIPROCESSING_METHOD=spawn
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
+export HF_HOME=$HF_HOME
+export HF_HUB_ENABLE_HF_TRANSFER="1"
+export NCCL_BLOCKING_WAIT=1
+export NCCL_TIMEOUT=18000000
+export NCCL_DEBUG=DEBUG
+
+echo "======================================================"
+echo "           Model Evaluation Configuration"
+echo "======================================================"
+echo "CUDA Visible Devices:      $CUDA_VISIBLE_DEVICES"
+echo "Tensor Parallel Size:      $INFERENCE_TP_SIZE"
+echo "GPU Memory Utilization:    $GPU_MEMORY_UTILIZATION"
+echo "Result Base Directory:     $RESULT_BASE_DIR"
+echo "Batch Size:                $BATCH_SIZE"
+echo "Tasks to Evaluate:         ${TASKS[*]}"
+echo "Number of Models:          $#"
+echo "Model Paths:"
+for model_path in "$@"; do
+    echo "  - $model_path"
+done
+echo "======================================================"
+echo
+
 # Iterate over all provided model paths
 for MODEL_PATH in "$@"; do
     # Extract model name from path (last directory name)
@@ -28,10 +118,9 @@ for MODEL_PATH in "$@"; do
     echo "======================================================"
     echo "Evaluating model: $MODEL_NAME"
     echo "Model path: $MODEL_PATH"
-    echo "======================================================"
 
     # Create dedicated result directory for the model
-    RESULT_PATH="./results/$MODEL_NAME"
+    RESULT_PATH="$RESULT_BASE_DIR/$MODEL_NAME"
     mkdir -p "$RESULT_PATH"
 
     for TASK in "${TASKS[@]}"; do
@@ -42,7 +131,7 @@ for MODEL_PATH in "$@"; do
         
         python3 -m lmms_eval \
             --model vllm \
-            --model_args model_version=$MODEL_PATH,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \
+            --model_args model_version=$MODEL_PATH,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \
             --tasks $TASK \
             --batch_size $BATCH_SIZE \
             --log_samples \
diff --git a/scripts/deploy/offline.py b/scripts/deploy/offline.py
index 6322df86..8c1a17fe 100644
--- a/scripts/deploy/offline.py
+++ b/scripts/deploy/offline.py
@@ -3,6 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model_path = sys.argv[1]
+prompt = sys.argv[2] if len(sys.argv) > 2 else "Hello, my name is"  # optional arg
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
@@ -12,7 +13,6 @@
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-prompt = "Hello, my name is"
 inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 outputs = model.generate(**inputs)
 print(tokenizer.decode(outputs[0]))
diff --git a/scripts/deploy/openai.sh b/scripts/deploy/openai.sh
index 6c43c54f..7c42430a 100644
--- a/scripts/deploy/openai.sh
+++ b/scripts/deploy/openai.sh
@@ -1,5 +1,93 @@
-MODEL_PATH=$1
+#!/bin/bash
+
+PROMPT="一种零件的内径尺寸在图纸上是30±0.02(单位：毫米） 表示这种零件的标准尺寸是30毫米．加工要求最大不超过标准尺寸__毫米 最小不低于标准尺寸__毫米。"
 PORT=8080
+MAX_TOKENS=2048
+TEMPERATURE=0.7
+TOP_P=0.8
+TOP_K=20
+REPETITION_PENALTY=1.05
+SYSTEM_PROMPT="You are a helpful assistant."
+
+usage() {
+    cat << EOF
+Usage: $0 -m /path/to/model [OPTIONS]
+
+Options:
+  -m,  --model                   Model path(Needed)
+  -p, --prompt PROMPT            Prompt text to send to the model
+  --port PORT                    API server port (default: 8080)
+  --max-tokens TOKENS            Maximum tokens to generate (default: 2048)
+  --temperature TEMP             Sampling temperature (default: 0.7)
+  --top-p TOP_P                  Top-p sampling parameter (default: 0.8)
+  --top-k TOP_K                  Top-k sampling parameter (default: 20)
+  --repetition-penalty PENALTY   Repetition penalty (default: 1.05)
+  --system-prompt PROMPT         System prompt (default: "You are a helpful assistant.")
+  -h, --help                     Show this help message
+
+Examples:
+  bash $0 -m /path/to/model --port 8000 -p "你的提示词" 
+EOF
+}
+
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -m|--model)
+            MODEL_PATH="$2"
+            shift 2
+            ;;
+        -p|--prompt)
+            PROMPT="$2"
+            shift 2
+            ;;
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --max-tokens)
+            MAX_TOKENS="$2"
+            shift 2
+            ;;
+        --temperature)
+            TEMPERATURE="$2"
+            shift 2
+            ;;
+        --top-p)
+            TOP_P="$2"
+            shift 2
+            ;;
+        --top-k)
+            TOP_K="$2"
+            shift 2
+            ;;
+        --repetition-penalty)
+            REPETITION_PENALTY="$2"
+            shift 2
+            ;;
+        --system-prompt)
+            SYSTEM_PROMPT="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*)
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
 curl http://0.0.0.0:$PORT/v1/chat/completions \
     -H 'Content-Type: application/json' \
     -d '{
@@ -7,16 +95,16 @@ curl http://0.0.0.0:$PORT/v1/chat/completions \
         "messages": [
             {
                 "role": "system",
-                "content": "You are a helpful assistant."
+                "content": "'"$SYSTEM_PROMPT"'"
             },
             {
                 "role": "user",
-                "content": "一种零件的内径尺寸在图纸上是30±0.02(单位：毫米） 表示这种零件的标准尺寸是30毫米．加工要求最大不超过标准尺寸__毫米 最小不低于标准尺寸__毫米。"
+                "content": "'"$PROMPT"'"
             }
         ],
-        "max_tokens": 2048,
-        "temperature": 0.7,
-        "top_p": 0.8,
-        "top_k": 20,
-        "repetition_penalty": 1.05
+        "max_tokens": '"$MAX_TOKENS"',
+        "temperature": '"$TEMPERATURE"',
+        "top_p": '"$TOP_P"',
+        "top_k": '"$TOP_K"',
+        "repetition_penalty": '"$REPETITION_PENALTY"'
     }'
\ No newline at end of file
diff --git a/scripts/deploy/run_sglang.sh b/scripts/deploy/run_sglang.sh
index 5e29c3f1..f4f68adf 100644
--- a/scripts/deploy/run_sglang.sh
+++ b/scripts/deploy/run_sglang.sh
@@ -1,12 +1,75 @@
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-MODEL_PATH=$1  # your/path/to/model
-PORT=8080
+#!/bin/bash
+
+usage() {  # Print the command-line help text to stdout.
+    cat << EOF
+Usage: $0 --model-path <model_path> [OPTIONS]
+
+Options:
+  --model-path PATH              Model path (required)
+  --port PORT                    Service port (default: 8080)
+  -d, --devices DEVICES          CUDA devices to use (default: 0,1,2,3)
+  -t, --tensor-parallel SIZE     Tensor parallel size (default: 4)
+  -g, --gpu-memory UTILIZATION   GPU memory utilization (default: 0.9)
+  -h, --help                     Show this help message
+
+Examples:
+  bash $0 --model-path /path/to/model -d 0,1 -t 2 --gpu-memory 0.8
+EOF
+}
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
 INFERENCE_TP_SIZE=4
+PORT=8080
+GPU_MEMORY_UTILIZATION=0.9
+
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model-path)
+            MODEL_PATH="$2"
+            shift 2
+            ;;
+        -d|--devices)
+            CUDA_VISIBLE_DEVICES="$2"
+            shift 2
+            ;;
+        -t|--tensor-parallel)
+            INFERENCE_TP_SIZE="$2"
+            shift 2
+            ;;
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        -g|--gpu-memory)
+            GPU_MEMORY_UTILIZATION="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*|--*)
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
+export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
 
 python -m sglang.launch_server \
     --host 0.0.0.0 \
     --port ${PORT} \
     --model-path $MODEL_PATH \
     --tp $INFERENCE_TP_SIZE \
-    --mem-fraction-static 0.9 \
+    --mem-fraction-static $GPU_MEMORY_UTILIZATION \
     --trust-remote-code
\ No newline at end of file
diff --git a/scripts/deploy/run_vllm.sh b/scripts/deploy/run_vllm.sh
index 84be4787..8c86f288 100644
--- a/scripts/deploy/run_vllm.sh
+++ b/scripts/deploy/run_vllm.sh
@@ -1,14 +1,89 @@
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-MODEL_PATH=$1  # your/path/to/model
-PORT=8080
+#!/bin/bash
+
+usage() {  # Print the command-line help text to stdout.
+    cat << EOF
+Usage: $0 --model-path <model_path> [OPTIONS]
+
+Options:
+  --model-path PATH              Model path (required)
+  --port PORT                    Service port (default: 8080)
+  -d, --devices DEVICES          CUDA devices to use (default: 0,1,2,3)
+  -t, --tensor-parallel SIZE     Tensor parallel size (default: 4)
+  -p, --pipeline-parallel-size   Pipeline parallel size (default: 1)
+  -g, --gpu-memory UTILIZATION   GPU memory utilization (default: 0.9)
+  --max-model-len                Max model len (default: 4096)
+  -h, --help                     Show this help message
+
+Examples:
+  bash $0 --model-path /path/to/model -d 0,1 -t 2 --gpu-memory 0.8
+EOF
+}
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
 INFERENCE_TP_SIZE=4
+PIPELINE_PARALLEL_SIZE=1
+PORT=8080
+GPU_MEMORY_UTILIZATION=0.9
+MAX_MODEL_LEN=4096
+
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model-path)
+            MODEL_PATH="$2"
+            shift 2
+            ;;
+        -d|--devices)
+            CUDA_VISIBLE_DEVICES="$2"
+            shift 2
+            ;;
+        -t|--tensor-parallel)
+            INFERENCE_TP_SIZE="$2"
+            shift 2
+            ;;
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        -g|--gpu-memory)
+            GPU_MEMORY_UTILIZATION="$2"
+            shift 2
+            ;;
+        -p|--pipeline-parallel-size)  # -p is the short form shown in usage() and the README
+            PIPELINE_PARALLEL_SIZE="$2"
+            shift 2
+            ;;
+        --max-model-len)
+            MAX_MODEL_LEN="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*|--*)
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
+export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
 
 python3 -m vllm.entrypoints.openai.api_server \
     --host 0.0.0.0 \
     --port ${PORT} \
     --model ${MODEL_PATH} \
-    --pipeline_parallel_size 1 \
+    --pipeline_parallel_size ${PIPELINE_PARALLEL_SIZE} \
     --tensor-parallel-size ${INFERENCE_TP_SIZE} \
     --trust-remote-code \
-    --gpu-memory-utilization 0.9 \
-    --max-model-len 4096
+    --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
+    --max-model-len ${MAX_MODEL_LEN}

From 06978fc788433325ea18cbdbeb49f984ca53c8ac Mon Sep 17 00:00:00 2001
From: ali-88123 <1940747290@qq.com>
Date: Fri, 31 Oct 2025 15:52:44 +0800
Subject: [PATCH 2/2] modify README

---
 README.md    | 10 +++++-----
 README_en.md | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 7937cc0b..13116990 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,7 @@ python3 tools/spec_benchmark.py \
 测试`transformers`加载量化模型离线推理：
 
 ```shell
-python deploy/offline.py $MODEL_PATH "Hello, my name is"
+python scripts/deploy/offline.py $MODEL_PATH "Hello, my name is"
 ```
 
 其中 `MODEL_PATH` 为量化产出模型路径。
@@ -168,7 +168,7 @@ python deploy/offline.py $MODEL_PATH "Hello, my name is"
 [vLLM](https://github.com/vllm-project/vllm) 服务启动脚本，建议版本`vllm>=0.8.5.post1`，部署MOE INT8量化模型需要`vllm>=0.9.2`。
 
 ```shell
-bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
+bash scripts/deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
 ```
 其中`-d`为可见设备，`-t`为张量并行度，`-p`为流水线并行度，`-g`为显存使用率。
 
@@ -177,7 +177,7 @@ bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p
 [SGLang](https://github.com/sgl-project/sglang) 服务启动脚本，建议版本 `sglang>=0.4.6.post1`：
 
 ```shell
-bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
+bash scripts/deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
 ```
 
 #### 3. 服务调用
@@ -185,7 +185,7 @@ bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -
 通过 [OpenAI 格式](https://platform.openai.com/docs/api-reference/introduction) 接口发起请求：
 
 ```shell
-bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
+bash scripts/deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
 ```
 其中`-p`为输入prompt
 
@@ -194,7 +194,7 @@ bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-to
 使用 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 评估量化模型精度，建议版本`lm-eval>=0.4.8`：
 
 ```shell
-bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
+bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
 ```
 其中`RESULT_PATH`为测试结果保存目录，`-b`为batch size大小，`--tasks`为评测任务，`-n`为few-shot数量
 
diff --git a/README_en.md b/README_en.md
index e70519d4..c437f340 100644
--- a/README_en.md
+++ b/README_en.md
@@ -154,7 +154,7 @@ If you need to load a quantized model via `transformers`, please set the `deploy
 To test offline inference with a quantized model loaded via `transformers`, run the following command:
 
 ```shell
-python deploy/offline.py $MODEL_PATH "Hello, my name is"
+python scripts/deploy/offline.py $MODEL_PATH "Hello, my name is"
 ```
 
 Where `MODEL_PATH` is the path to the quantized model output.
@@ -169,7 +169,7 @@ Use the following script to launch a [vLLM](https://github.com/vllm-project/vllm
 
 
 ```shell
-bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
+bash scripts/deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096
 ```
 Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipeline parallel size, and `-g` is the GPU memory utilization.
 
@@ -179,7 +179,7 @@ Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipelin
 Use the following script to launch a [SGLang](https://github.com/sgl-project/sglang) server, recommended version `sglang>=0.4.6.post1`.
 
 ```shell
-bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
+bash scripts/deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8
 ```
 
 #### 3. Service Invocation
@@ -187,7 +187,7 @@ bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -
 Invoke requests via [OpenAI's API format](https://platform.openai.com/docs/api-reference/introduction):
 
 ```shell
-bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
+bash scripts/deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant."
 ```
 where `-p` is the input prompt.
 
@@ -196,7 +196,7 @@ where `-p` is the input prompt.
 Evaluate the performance of quantized model using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), recommended version`lm-eval>=0.4.8`:
 
 ```shell
-bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
+bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH
 ```
 where `RESULT_PATH` is the directory for saving test results, `-b` is batch size, `--tasks` specifies the evaluation tasks, and `-n` is the number of few-shot examples.