From 80f469530a1d12d3eda97085ede1534a144792ec Mon Sep 17 00:00:00 2001 From: ali-88123 <1940747290@qq.com> Date: Fri, 31 Oct 2025 15:28:24 +0800 Subject: [PATCH 1/2] delete tokenizer folder;refactor the deployment test script to take external parameters --- README.md | 13 +- README_en.md | 13 +- angelslim/models/llm/kimi_k2.py | 2 +- .../llm/tiktoken_tokenizer.py} | 0 angelslim/tokenizer/__init__.py | 15 -- scripts/deploy/lm_eval.sh | 162 ++++++++++++++---- scripts/deploy/lmms_eval.sh | 115 +++++++++++-- scripts/deploy/offline.py | 2 +- scripts/deploy/openai.sh | 104 ++++++++++- scripts/deploy/run_sglang.sh | 71 +++++++- scripts/deploy/run_vllm.sh | 87 +++++++++- 11 files changed, 488 insertions(+), 96 deletions(-) rename angelslim/{tokenizer/kimi_k2.py => models/llm/tiktoken_tokenizer.py} (100%) delete mode 100644 angelslim/tokenizer/__init__.py diff --git a/README.md b/README.md index 64efc221..7937cc0b 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ python3 tools/spec_benchmark.py \ 测试`transformers`加载量化模型离线推理: ```shell -python deploy/offline.py $MODEL_PATH +python deploy/offline.py $MODEL_PATH "Hello, my name is" ``` 其中 `MODEL_PATH` 为量化产出模型路径。 @@ -168,15 +168,16 @@ python deploy/offline.py $MODEL_PATH [vLLM](https://github.com/vllm-project/vllm) 服务启动脚本,建议版本`vllm>=0.8.5.post1`,部署MOE INT8量化模型需要`vllm>=0.9.2`。 ```shell -bash deploy/run_vllm.sh $MODEL_PATH +bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 ``` +其中`-d`为可见设备,`-t`为张量并行度,`-p`为流水线并行度,`-g`为显存使用率。 **SGLang** [SGLang](https://github.com/sgl-project/sglang) 服务启动脚本,建议版本 `sglang>=0.4.6.post1`: ```shell -bash deploy/run_sglang.sh $MODEL_PATH +bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 ``` #### 3. 
服务调用 @@ -184,16 +185,18 @@ bash deploy/run_sglang.sh $MODEL_PATH 通过 [OpenAI 格式](https://platform.openai.com/docs/api-reference/introduction) 接口发起请求: ```shell -bash deploy/openai.sh $MODEL_PATH +bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." ``` +其中`-p`为输入prompt #### 4. 效果验证 使用 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 评估量化模型精度,建议版本`lm-eval>=0.4.8`: ```shell -bash deploy/lm_eval.sh $MODEL_PATH +bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH ``` +其中`RESULT_PATH`为测试结果保存目录,`-b`为batch size大小,`--tasks`为评测任务,`-n`为few-shot数量 详细操作指南请参阅[部署文档](https://angelslim.readthedocs.io/zh-cn/latest/deployment/deploy.html)。 diff --git a/README_en.md b/README_en.md index 8b7fb219..e70519d4 100644 --- a/README_en.md +++ b/README_en.md @@ -154,7 +154,7 @@ If you need to load a quantized model via `transformers`, please set the `deploy To test offline inference with a quantized model loaded via `transformers`, run the following command: ```shell -python deploy/offline.py $MODEL_PATH +python deploy/offline.py $MODEL_PATH "Hello, my name is" ``` Where `MODEL_PATH` is the path to the quantized model output. @@ -169,8 +169,9 @@ Use the following script to launch a [vLLM](https://github.com/vllm-project/vllm ```shell -bash deploy/run_vllm.sh $MODEL_PATH +bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 ``` +Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipeline parallel size, and `-g` is the GPU memory utilization. **SGLang** @@ -178,7 +179,7 @@ bash deploy/run_vllm.sh $MODEL_PATH Use the following script to launch a [SGLang](https://github.com/sgl-project/sglang) server, recommended version `sglang>=0.4.6.post1`. 
```shell -bash deploy/run_sglang.sh $MODEL_PATH +bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 ``` #### 3. Service Invocation @@ -186,16 +187,18 @@ bash deploy/run_sglang.sh $MODEL_PATH Invoke requests via [OpenAI's API format](https://platform.openai.com/docs/api-reference/introduction): ```shell -bash deploy/openai.sh $MODEL_PATH +bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." ``` +where `-p` is the input prompt. #### 4. Performance Evaluation Evaluate the performance of quantized model using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), recommended version`lm-eval>=0.4.8`: ```shell -bash deploy/lm_eval.sh $MODEL_PATH +bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH ``` +where `RESULT_PATH` is the directory for saving test results, `-b` is batch size, `--tasks` specifies the evaluation tasks, and `-n` is the number of few-shot examples. For more detaileds, please refer to the [Deployment Documentation](https://angelslim.readthedocs.io/zh-cn/latest/deployment/deploy.html). 
diff --git a/angelslim/models/llm/kimi_k2.py b/angelslim/models/llm/kimi_k2.py index bb14285f..a6a500cf 100644 --- a/angelslim/models/llm/kimi_k2.py +++ b/angelslim/models/llm/kimi_k2.py @@ -16,11 +16,11 @@ from transformers import AutoModelForCausalLM from transformers.models.deepseek_v3 import DeepseekV3Config -from ...tokenizer import TikTokenTokenizer from ...utils import print_info from ..model_factory import SlimModelFactory from .deepseek import DeepSeek from .modeling_deepseek import DeepseekV3ForCausalLM +from .tiktoken_tokenizer import TikTokenTokenizer @SlimModelFactory.register diff --git a/angelslim/tokenizer/kimi_k2.py b/angelslim/models/llm/tiktoken_tokenizer.py similarity index 100% rename from angelslim/tokenizer/kimi_k2.py rename to angelslim/models/llm/tiktoken_tokenizer.py diff --git a/angelslim/tokenizer/__init__.py b/angelslim/tokenizer/__init__.py deleted file mode 100644 index 35ef522c..00000000 --- a/angelslim/tokenizer/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2025 Tencent Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .kimi_k2 import TikTokenTokenizer # noqa: F401 diff --git a/scripts/deploy/lm_eval.sh b/scripts/deploy/lm_eval.sh index 0af8fcdc..70a2da4d 100644 --- a/scripts/deploy/lm_eval.sh +++ b/scripts/deploy/lm_eval.sh @@ -1,12 +1,82 @@ #!/bin/bash -# Set environment variables -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export PYTHON_MULTIPROCESSING_METHOD=spawn -export VLLM_WORKER_MULTIPROC_METHOD=spawn -export HF_ALLOW_CODE_EVAL=1 +usage() { + cat << EOF +Usage: $0 [OPTIONS] ... + +Options: + -d, --devices DEVICES CUDA devices to use (default: 0,1,2,3) + -t, --tensor-parallel SIZE Tensor parallel size (default: 4) + -g, --gpu-memory UTILIZATION GPU memory utilization (default: 0.9) + -r, --result-dir DIR Base result directory (default: ./results) + -b, --batch-size SIZE Batch size for auto tasks (default: auto) + --tasks TASK1,TASK2,... Comma-separated list of tasks to evaluate (default: ceval-valid,mmlu,gsm8k,humaneval) + -n, --num-fewshot NUM Number of few-shot examples (default: 0) + -h, --help Show this help message + +Examples: + bash $0 -d 0,1 -t 2 --gpu-memory 0.8 /path/to/model1 /path/to/model2 + bash $0 --tasks ceval-valid,mmlu,gsm8k,humaneval /path/to/model1 +EOF +} +CUDA_VISIBLE_DEVICES="0,1,2,3" INFERENCE_TP_SIZE=4 +GPU_MEMORY_UTILIZATION=0.9 +RESULT_BASE_DIR="./results" +BATCH_SIZE="auto" +TASKS=("ceval-valid" "mmlu" "gsm8k" "humaneval") +NUM_FEWSHOT=0 + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + -d|--devices) + CUDA_VISIBLE_DEVICES="$2" + shift 2 + ;; + -t|--tensor-parallel) + INFERENCE_TP_SIZE="$2" + shift 2 + ;; + -g|--gpu-memory) + GPU_MEMORY_UTILIZATION="$2" + shift 2 + ;; + -r|--result-dir) + RESULT_BASE_DIR="$2" + shift 2 + ;; + -b|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --tasks) + IFS=',' read -ra TASKS <<< "$2" + shift 2 + ;; + -n|--num-fewshot) + NUM_FEWSHOT="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + -*|--*) + echo "Error: Unknown option: $1" + usage + exit 1 + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + 
;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" # Check if model paths are provided if [ $# -eq 0 ]; then @@ -14,6 +84,30 @@ if [ $# -eq 0 ]; then exit 1 fi +# Set environment variables +export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES +export PYTHON_MULTIPROCESSING_METHOD=spawn +export VLLM_WORKER_MULTIPROC_METHOD=spawn +export HF_ALLOW_CODE_EVAL=1 + +echo "======================================================" +echo " Model Evaluation Configuration" +echo "======================================================" +echo "CUDA Visible Devices: $CUDA_VISIBLE_DEVICES" +echo "Tensor Parallel Size: $INFERENCE_TP_SIZE" +echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION" +echo "Result Base Directory: $RESULT_BASE_DIR" +echo "Batch Size: $BATCH_SIZE" +echo "Number of Few-shot: $NUM_FEWSHOT" +echo "Tasks to Evaluate: ${TASKS[*]}" +echo "Number of Models: $#" +echo "Model Paths:" +for model_path in "$@"; do + echo " - $model_path" +done +echo "======================================================" +echo + # Iterate over all provided model paths for MODEL_PATH in "$@"; do # Extract model name from path (last directory name) @@ -21,43 +115,35 @@ for MODEL_PATH in "$@"; do echo "======================================================" echo "Evaluating model: $MODEL_NAME" echo "Model path: $MODEL_PATH" - echo "======================================================" # Create dedicated result directory for the model - RESULT_PATH="./results/$MODEL_NAME" + RESULT_PATH="$RESULT_BASE_DIR/$MODEL_NAME" mkdir -p "$RESULT_PATH" - # Evaluate ceval, mmlu, gsm8k - lm_eval --model vllm \ - --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \ - --tasks ceval-valid \ - --num_fewshot 5 \ - --batch_size auto \ - --output_path "$RESULT_PATH/ceval_results.json" 2>&1 | tee "$RESULT_PATH/ceval.log" - - lm_eval --model vllm \ - --model_args 
pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \ - --tasks mmlu \ - --num_fewshot 4 \ - --batch_size 1 \ - --output_path "$RESULT_PATH/mmlu_results.json" 2>&1 | tee "$RESULT_PATH/mmlu.log" - - lm_eval --model vllm \ - --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \ - --tasks gsm8k \ - --num_fewshot 5 \ - --batch_size auto \ - --output_path "$RESULT_PATH/gsm8k_results.json" 2>&1 | tee "$RESULT_PATH/gsm8k.log" - - # Evaluate humaneval - lm_eval --model vllm \ - --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \ - --tasks humaneval \ - --num_fewshot 0 \ - --batch_size auto \ - --confirm_run_unsafe_code \ - --output_path "$RESULT_PATH/humaneval_results.json" 2>&1 | tee "$RESULT_PATH/humaneval.log" - + for TASK in "${TASKS[@]}"; do + echo "==============================================" + echo "Evaluating task: $TASK" + echo "Number of few-shot: $NUM_FEWSHOT" + echo "==============================================" + if [[ "$TASK" == *"humaneval"* ]]; then + # Evaluate humaneval + lm_eval --model vllm \ + --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \ + --tasks $TASK \ + --num_fewshot $NUM_FEWSHOT \ + --batch_size $BATCH_SIZE \ + --confirm_run_unsafe_code \ + --output_path "$RESULT_PATH/$TASK.json" 2>&1 | tee "$RESULT_PATH/$TASK.log" + else + lm_eval --model vllm \ + --model_args pretrained=$MODEL_PATH,add_bos_token=True,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \ + --tasks $TASK \ + --num_fewshot $NUM_FEWSHOT \ + --batch_size $BATCH_SIZE \ + --output_path "$RESULT_PATH/$TASK.json" 2>&1 | tee "$RESULT_PATH/$TASK.log" + fi + done + echo "Evaluation completed for $MODEL_NAME" echo "Results saved to: $RESULT_PATH" done diff 
--git a/scripts/deploy/lmms_eval.sh b/scripts/deploy/lmms_eval.sh index 0d218ddb..1b352215 100644 --- a/scripts/deploy/lmms_eval.sh +++ b/scripts/deploy/lmms_eval.sh @@ -1,26 +1,116 @@ #!/bin/bash -# Set environment variables -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export PYTHON_MULTIPROCESSING_METHOD=spawn -export VLLM_WORKER_MULTIPROC_METHOD=spawn -export HF_HOME="~/.cache/huggingface" -export HF_HUB_ENABLE_HF_TRANSFER="1" -export NCCL_BLOCKING_WAIT=1 -export NCCL_TIMEOUT=18000000 -export NCCL_DEBUG=DEBUG +usage() { + cat << EOF +Usage: $0 [OPTIONS] ... +Options: + -d, --devices DEVICES CUDA devices to use (default: 0,1,2,3) + -t, --tensor-parallel SIZE Tensor parallel size (default: 4) + -g, --gpu-memory UTILIZATION GPU memory utilization (default: 0.9) + -r, --result-dir DIR Base result directory (default: ./results) + -b, --batch-size SIZE Batch size for auto tasks (default: auto) + -c, --hf-home Cache dir for HF datasets + --tasks TASK1,TASK2,... Comma-separated list of tasks to evaluate (default: mmmu_val,docvqa_val,chartqa) + -h, --help Show this help message + +Examples: + bash $0 -d 0,1 -t 2 --gpu-memory 0.8 /path/to/model1 /path/to/model2 + bash $0 --tasks mmmu_val,docvqa_val,chartqa /path/to/model1 +EOF +} + +CUDA_VISIBLE_DEVICES="0,1,2,3" INFERENCE_TP_SIZE=4 +GPU_MEMORY_UTILIZATION=0.9 +RESULT_BASE_DIR="./results" BATCH_SIZE=16 - +HF_HOME="~/.cache/huggingface" TASKS=("mmmu_val" "docvqa_val" "chartqa") +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + -d|--devices) + CUDA_VISIBLE_DEVICES="$2" + shift 2 + ;; + -t|--tensor-parallel) + INFERENCE_TP_SIZE="$2" + shift 2 + ;; + -g|--gpu-memory) + GPU_MEMORY_UTILIZATION="$2" + shift 2 + ;; + -r|--result-dir) + RESULT_BASE_DIR="$2" + shift 2 + ;; + -b|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + -c|--hf-home) + HF_HOME="$2" + shift 2 + ;; + --tasks) + IFS=',' read -ra TASKS <<< "$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + -*|--*) + echo "Error: Unknown option: $1" + usage + exit 1 + 
;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" + # Check if model paths are provided if [ $# -eq 0 ]; then echo "Usage: $0 ..." exit 1 fi +# Set environment variables +export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES +export PYTHON_MULTIPROCESSING_METHOD=spawn +export VLLM_WORKER_MULTIPROC_METHOD=spawn +export HF_HOME=$HF_HOME +export HF_HUB_ENABLE_HF_TRANSFER="1" +export NCCL_BLOCKING_WAIT=1 +export NCCL_TIMEOUT=18000000 +export NCCL_DEBUG=DEBUG + +echo "======================================================" +echo " Model Evaluation Configuration" +echo "======================================================" +echo "CUDA Visible Devices: $CUDA_VISIBLE_DEVICES" +echo "Tensor Parallel Size: $INFERENCE_TP_SIZE" +echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION" +echo "Result Base Directory: $RESULT_BASE_DIR" +echo "Batch Size: $BATCH_SIZE" +echo "Tasks to Evaluate: ${TASKS[*]}" +echo "Number of Models: $#" +echo "Model Paths:" +for model_path in "$@"; do + echo " - $model_path" +done +echo "======================================================" +echo + # Iterate over all provided model paths for MODEL_PATH in "$@"; do # Extract model name from path (last directory name) @@ -28,10 +118,9 @@ for MODEL_PATH in "$@"; do echo "======================================================" echo "Evaluating model: $MODEL_NAME" echo "Model path: $MODEL_PATH" - echo "======================================================" # Create dedicated result directory for the model - RESULT_PATH="./results/$MODEL_NAME" + RESULT_PATH="$RESULT_BASE_DIR/$MODEL_NAME" mkdir -p "$RESULT_PATH" for TASK in "${TASKS[@]}"; do @@ -42,7 +131,7 @@ for MODEL_PATH in "$@"; do python3 -m lmms_eval \ --model vllm \ - --model_args model_version=$MODEL_PATH,gpu_memory_utilization=0.9,tensor_parallel_size=$INFERENCE_TP_SIZE \ + --model_args model_version=$MODEL_PATH,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,tensor_parallel_size=$INFERENCE_TP_SIZE \ 
--tasks $TASK \ --batch_size $BATCH_SIZE \ --log_samples \ diff --git a/scripts/deploy/offline.py b/scripts/deploy/offline.py index 6322df86..8c1a17fe 100644 --- a/scripts/deploy/offline.py +++ b/scripts/deploy/offline.py @@ -3,6 +3,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer model_path = sys.argv[1] +prompt = sys.argv[2] model = AutoModelForCausalLM.from_pretrained( model_path, device_map="auto", @@ -12,7 +13,6 @@ ) tokenizer = AutoTokenizer.from_pretrained(model_path) -prompt = "Hello, my name is" inputs = tokenizer(prompt, return_tensors="pt").to(model.device) outputs = model.generate(**inputs) print(tokenizer.decode(outputs[0])) diff --git a/scripts/deploy/openai.sh b/scripts/deploy/openai.sh index 6c43c54f..7c42430a 100644 --- a/scripts/deploy/openai.sh +++ b/scripts/deploy/openai.sh @@ -1,5 +1,93 @@ -MODEL_PATH=$1 +#!/bin/bash + +PROMPT="一种零件的内径尺寸在图纸上是30±0.02(单位:毫米) 表示这种零件的标准尺寸是30毫米.加工要求最大不超过标准尺寸__毫米 最小不低于标准尺寸__毫米。" PORT=8080 +MAX_TOKENS=2048 +TEMPERATURE=0.7 +TOP_P=0.8 +TOP_K=20 +REPETITION_PENALTY=1.05 +SYSTEM_PROMPT="You are a helpful assistant." 
+ +usage() { + cat << EOF +Usage: $0 -m /path/to/model [OPTIONS] + +Options: + -m, --model Model path(Needed) + -p, --prompt PROMPT Prompt text to send to the model + --port PORT API server port (default: 8080) + --max-tokens TOKENS Maximum tokens to generate (default: 2048) + --temperature TEMP Sampling temperature (default: 0.7) + --top-p TOP_P Top-p sampling parameter (default: 0.8) + --top-k TOP_K Top-k sampling parameter (default: 20) + --repetition-penalty PENALTY Repetition penalty (default: 1.05) + --system-prompt PROMPT System prompt (default: "You are a helpful assistant.") + -h, --help Show this help message + +Examples: + bash $0 -m /path/to/model --port 8000 -p "你的提示词" +EOF +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + -m|--model) + MODEL_PATH="$2" + shift 2 + ;; + -p|--prompt) + PROMPT="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --max-tokens) + MAX_TOKENS="$2" + shift 2 + ;; + --temperature) + TEMPERATURE="$2" + shift 2 + ;; + --top-p) + TOP_P="$2" + shift 2 + ;; + --top-k) + TOP_K="$2" + shift 2 + ;; + --repetition-penalty) + REPETITION_PENALTY="$2" + shift 2 + ;; + --system-prompt) + SYSTEM_PROMPT="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + -*) + echo "Error: Unknown option: $1" + usage + exit 1 + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" + curl http://0.0.0.0:$PORT/v1/chat/completions \ -H 'Content-Type: application/json' \ -d '{ @@ -7,16 +95,16 @@ curl http://0.0.0.0:$PORT/v1/chat/completions \ "messages": [ { "role": "system", - "content": "You are a helpful assistant." 
+ "content": "'"$SYSTEM_PROMPT"'" }, { "role": "user", - "content": "一种零件的内径尺寸在图纸上是30±0.02(单位:毫米) 表示这种零件的标准尺寸是30毫米.加工要求最大不超过标准尺寸__毫米 最小不低于标准尺寸__毫米。" + "content": "'"$PROMPT"'" } ], - "max_tokens": 2048, - "temperature": 0.7, - "top_p": 0.8, - "top_k": 20, - "repetition_penalty": 1.05 + "max_tokens": '"$MAX_TOKENS"', + "temperature": '"$TEMPERATURE"', + "top_p": '"$TOP_P"', + "top_k": '"$TOP_K"', + "repetition_penalty": '"$REPETITION_PENALTY"' }' \ No newline at end of file diff --git a/scripts/deploy/run_sglang.sh b/scripts/deploy/run_sglang.sh index 5e29c3f1..f4f68adf 100644 --- a/scripts/deploy/run_sglang.sh +++ b/scripts/deploy/run_sglang.sh @@ -1,12 +1,75 @@ -export CUDA_VISIBLE_DEVICES=0,1,2,3 -MODEL_PATH=$1 # your/path/to/model -PORT=8080 +#!/bin/bash + +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Options: + --model-path Model path (required) + --port PORT Service port (default: 8080) + -d, --devices DEVICES CUDA devices to use (default: 0,1,2,3) + -t, --tensor-parallel SIZE Tensor parallel size (default: 4) + -g, --gpu-memory UTILIZATION GPU memory utilization (default: 0.9) + -h, --help Show this help message + +Examples: + bash $0 --model-path /path/to/model -d 0,1 -t 2 --gpu-memory 0.8 +EOF +} + +CUDA_VISIBLE_DEVICES="0,1,2,3" INFERENCE_TP_SIZE=4 +PORT=8080 +GPU_MEMORY_UTILIZATION=0.9 + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + -d|--devices) + CUDA_VISIBLE_DEVICES="$2" + shift 2 + ;; + -t|--tensor-parallel) + INFERENCE_TP_SIZE="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + -g|--gpu-memory) + GPU_MEMORY_UTILIZATION="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + -*|--*) + echo "Error: Unknown option: $1" + usage + exit 1 + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" + +export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python -m sglang.launch_server \ --host 0.0.0.0 \ --port ${PORT} \ --model-path 
$MODEL_PATH \ --tp $INFERENCE_TP_SIZE \ - --mem-fraction-static 0.9 \ + --mem-fraction-static $GPU_MEMORY_UTILIZATION \ --trust-remote-code \ No newline at end of file diff --git a/scripts/deploy/run_vllm.sh b/scripts/deploy/run_vllm.sh index 84be4787..8c86f288 100644 --- a/scripts/deploy/run_vllm.sh +++ b/scripts/deploy/run_vllm.sh @@ -1,14 +1,89 @@ -export CUDA_VISIBLE_DEVICES=0,1,2,3 -MODEL_PATH=$1 # your/path/to/model -PORT=8080 +#!/bin/bash + +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Options: + --model-path Model path (required) + --port PORT Service port (default: 8080) + -d, --devices DEVICES CUDA devices to use (default: 0,1,2,3) + -t, --tensor-parallel SIZE Tensor parallel size (default: 4) + -p, --pipeline-parallel-size Pipeline parallel size (default: 1) + -g, --gpu-memory UTILIZATION GPU memory utilization (default: 0.9) + --max-model-len Max model len (default: 4096) + -h, --help Show this help message + +Examples: + bash $0 --model-path /path/to/model -d 0,1 -t 2 --gpu-memory 0.8 +EOF +} + +CUDA_VISIBLE_DEVICES="0,1,2,3" INFERENCE_TP_SIZE=4 +PIPELINE_PARALLEL_SIZE=1 +PORT=8080 +GPU_MEMORY_UTILIZATION=0.9 +MAX_MODEL_LEN=4096 + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + -d|--devices) + CUDA_VISIBLE_DEVICES="$2" + shift 2 + ;; + -t|--tensor-parallel) + INFERENCE_TP_SIZE="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + -g|--gpu-memory) + GPU_MEMORY_UTILIZATION="$2" + shift 2 + ;; + --pipeline-parallel-size) + PIPELINE_PARALLEL_SIZE="$2" + shift 2 + ;; + --max-model-len) + MAX_MODEL_LEN="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + -*|--*) + echo "Error: Unknown option: $1" + usage + exit 1 + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" + +export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python3 -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ --port ${PORT} \ --model ${MODEL_PATH} \ - 
--pipeline_parallel_size 1 \ + --pipeline_parallel_size ${PIPELINE_PARALLEL_SIZE} \ --tensor-parallel-size ${INFERENCE_TP_SIZE} \ --trust-remote-code \ - --gpu-memory-utilization 0.9 \ - --max-model-len 4096 + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ + --max-model-len ${MAX_MODEL_LEN} From 06978fc788433325ea18cbdbeb49f984ca53c8ac Mon Sep 17 00:00:00 2001 From: ali-88123 <1940747290@qq.com> Date: Fri, 31 Oct 2025 15:52:44 +0800 Subject: [PATCH 2/2] modify README --- README.md | 10 +++++----- README_en.md | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7937cc0b..13116990 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ python3 tools/spec_benchmark.py \ 测试`transformers`加载量化模型离线推理: ```shell -python deploy/offline.py $MODEL_PATH "Hello, my name is" +python scripts/deploy/offline.py $MODEL_PATH "Hello, my name is" ``` 其中 `MODEL_PATH` 为量化产出模型路径。 @@ -168,7 +168,7 @@ python deploy/offline.py $MODEL_PATH "Hello, my name is" [vLLM](https://github.com/vllm-project/vllm) 服务启动脚本,建议版本`vllm>=0.8.5.post1`,部署MOE INT8量化模型需要`vllm>=0.9.2`。 ```shell -bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 +bash scripts/deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 ``` 其中`-d`为可见设备,`-t`为张量并行度,`-p`为流水线并行度,`-g`为显存使用率。 @@ -177,7 +177,7 @@ bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p [SGLang](https://github.com/sgl-project/sglang) 服务启动脚本,建议版本 `sglang>=0.4.6.post1`: ```shell -bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 +bash scripts/deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 ``` #### 3. 
服务调用 @@ -185,7 +185,7 @@ bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 - 通过 [OpenAI 格式](https://platform.openai.com/docs/api-reference/introduction) 接口发起请求: ```shell -bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." +bash scripts/deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." ``` 其中`-p`为输入prompt @@ -194,7 +194,7 @@ bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-to 使用 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 评估量化模型精度,建议版本`lm-eval>=0.4.8`: ```shell -bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH +bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH ``` 其中`RESULT_PATH`为测试结果保存目录,`-b`为batch size大小,`--tasks`为评测任务,`-n`为few-shot数量 diff --git a/README_en.md b/README_en.md index e70519d4..c437f340 100644 --- a/README_en.md +++ b/README_en.md @@ -154,7 +154,7 @@ If you need to load a quantized model via `transformers`, please set the `deploy To test offline inference with a quantized model loaded via `transformers`, run the following command: ```shell -python deploy/offline.py $MODEL_PATH "Hello, my name is" +python scripts/deploy/offline.py $MODEL_PATH "Hello, my name is" ``` Where `MODEL_PATH` is the path to the quantized model output. 
@@ -169,7 +169,7 @@ Use the following script to launch a [vLLM](https://github.com/vllm-project/vllm ```shell -bash deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 +bash scripts/deploy/run_vllm.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -p 1 -g 0.8 --max-model-len 4096 ``` Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipeline parallel size, and `-g` is the GPU memory utilization. @@ -179,7 +179,7 @@ Where `-d` is the visible devices, `-t` is tensor parallel size, `-p` is pipelin Use the following script to launch a [SGLang](https://github.com/sgl-project/sglang) server, recommended version `sglang>=0.4.6.post1`. ```shell -bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 +bash scripts/deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 -g 0.8 ``` #### 3. Service Invocation @@ -187,7 +187,7 @@ bash deploy/run_sglang.sh --model-path $MODEL_PATH --port 8080 -d 0,1,2,3 -t 4 - Invoke requests via [OpenAI's API format](https://platform.openai.com/docs/api-reference/introduction): ```shell -bash deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." +bash scripts/deploy/openai.sh -m $MODEL_PATH -p "Hello, my name is" --port 8080 --max-tokens 4096 --temperature 0.7 --top-p 0.8 --top-k 20 --repetition-penalty 1.05 --system-prompt "You are a helpful assistant." ``` where `-p` is the input prompt. @@ -196,7 +196,7 @@ where `-p` is the input prompt. 
Evaluate the performance of quantized model using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), recommended version`lm-eval>=0.4.8`: ```shell -bash deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH +bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --tasks ceval-valid,mmlu,gsm8k,humaneval -n 0 $MODEL_PATH ``` where `RESULT_PATH` is the directory for saving test results, `-b` is batch size, `--tasks` specifies the evaluation tasks, and `-n` is the number of few-shot examples.