diff --git a/.gitignore b/.gitignore
index b728e6ea..149d4bc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,9 @@ python/infinilm/lib/*.so
 # Vscode
 .vscode/
 
+*.sh
+model_weight/
+
 # Python
 __pycache__/
 *.egg-info/
diff --git a/examples/bench.py b/examples/bench.py
index a52c44ec..6fc27498 100644
--- a/examples/bench.py
+++ b/examples/bench.py
@@ -3,6 +3,7 @@
 from infinilm.modeling_utils import load_model_state_dict_by_file
 from infinilm.distributed import DistConfig
 from infinilm.infer_engine import GenerationConfig, InferEngine
+from infinilm.base_config import BaseConfig
 from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
 import argparse
 import sys
@@ -125,150 +126,6 @@ def get_test_cases(
     return case_dict
 
 
-def get_args():
-    parser = argparse.ArgumentParser(description="run Llama args")
-
-    parser.add_argument(
-        "--cpu",
-        action="store_true",
-        help="Run cpu test",
-    )
-    parser.add_argument(
-        "--nvidia",
-        action="store_true",
-        help="Run nvidia test",
-    )
-    parser.add_argument(
-        "--qy",
-        action="store_true",
-        help="Run qy test",
-    )
-    parser.add_argument(
-        "--metax",
-        action="store_true",
-        help="Run metax test",
-    )
-    parser.add_argument(
-        "--moore",
-        action="store_true",
-        help="Run moore test",
-    )
-    parser.add_argument(
-        "--iluvatar",
-        action="store_true",
-        help="Run iluvatar test",
-    )
-    parser.add_argument(
-        "--cambricon",
-        action="store_true",
-        help="Run cambricon test",
-    )
-    parser.add_argument(
-        "--ali",
-        action="store_true",
-        help="Run alippu test",
-    )
-    parser.add_argument(
-        "--hygon",
-        action="store_true",
-        help="Run hygon test",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="model path",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=parse_list,
-        default=1,
-        help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size",
-        "--tp",
-        type=int,
-        default=1,
-        help="total rank for tensor parallel",
-    )
-    parser.add_argument(
-        "--input-len",
-        type=parse_list,
-        default=10,
-        help="output tokens",
-    )
-
-    parser.add_argument(
-        "--output-len",
-        type=parse_list,
-        default=20,
-        help="output tokens",
-    )
-    parser.add_argument(
-        "--skip-load",
-        action="store_true",
-        help="skip loading model weights",
-    )
-    parser.add_argument(
-        "--top-k",
-        type=int,
-        default=1,
-        help="top k sampling",
-    )
-
-    parser.add_argument(
-        "--top-p",
-        type=float,
-        default=1.0,
-        help="top p sampling",
-    )
-
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=1.0,
-        help="sampling temperature",
-    )
-    parser.add_argument(
-        "--enable-paged-attn",
-        action="store_true",
-        help="use paged cache",
-    )
-    parser.add_argument(
-        "--paged-kv-block-size",
-        type=int,
-        default=256,
-        help="num tokens each kv block can hold",
-    )
-    parser.add_argument(
-        "--enable-graph",
-        action="store_true",
-        help="enable graph compiling",
-    )
-    parser.add_argument(
-        "--warmup",
-        action="store_true",
-        help="Perform a warmup run before benchmarking/inference.",
-    )
-    parser.add_argument(
-        "--attn",
-        type=str,
-        default="default",
-        choices=["default", "paged-attn", "flash-attn"],
-        help="attention backend to use: 'default' or 'flash-attn'",
-    )
-    parser.add_argument(
-        "--kv-cache-dtype",
-        type=str,
-        default=None,
-        choices=["int8"],
-    )
-
-    return parser.parse_args()
-
-
 with open("examples/bench_prompt.md", "r") as f:
     prompt = f.read()
@@ -305,7 +162,7 @@ def __init__(
             cache_config=cache_config,
             enable_graph_compiling=enable_graph,
             attention_backend=attn_backend,
-            kv_cache_dtype=args.kv_cache_dtype,
+            kv_cache_dtype=cfg.kv_cache_dtype,
         )
 
     # ---------------------------------------------------------------------------- #
@@ -396,52 +253,28 @@ def run(
 
 if __name__ == "__main__":
-    args = get_args()
-    print(args)
-
-    # Parse command line arguments
-    device_str = "cpu"
-    if args.cpu:
-        device_str = "cpu"
-    elif args.nvidia:
-        device_str = "cuda"
-    elif args.qy:
-        device_str = "cuda"
-    elif args.metax:
-        device_str = "cuda"
-    elif args.moore:
-        device_str = "musa"
-    elif args.iluvatar:
-        device_str = "cuda"
-    elif args.cambricon:
-        device_str = "mlu"
-    elif args.ali:
-        device_str = "cuda"
-    elif args.hygon:
-        device_str = "cuda"
-    else:
-        print(
-            "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
-        )
-        sys.exit(1)
-    _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size
+    cfg = BaseConfig()
+
+    device_str = cfg.get_device_str(cfg.device)
+
+    _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size
 
     # -------------------------------------------------------- #
     # Parse arguments
     # -------------------------------------------------------- #
-    model_path = args.model
+    model_path = cfg.model
     infini_device = infinicore.device(device_str, 0)
-    tp = args.tensor_parallel_size
+    tp = cfg.tp
 
-    skip_load = args.skip_load
+    skip_load = cfg.skip_load
 
-    batch_size = args.batch_size
-    input_len = args.input_len
-    output_len = args.output_len
-    enable_paged_attn = args.enable_paged_attn
-    enable_graph = args.enable_graph
-    attn_backend = args.attn
+    batch_size = cfg.batch_size
+    input_len = cfg.input_len
+    output_len = cfg.output_len
+    enable_paged_attn = cfg.enable_paged_attn
+    enable_graph = cfg.enable_graph
+    attn_backend = cfg.attn
 
     if isinstance(batch_size, int):
         batch_size = [batch_size]
@@ -488,7 +321,7 @@ def run(
     # ---------------------------------------------------------------------------- #
     # Warmup
     # ---------------------------------------------------------------------------- #
-    if args.warmup:
+    if cfg.warmup:
         warmup_steps = 1
 
         # warmup cache capacity
@@ -518,9 +351,9 @@ def run(
             input_ids_infini,
             GenerationConfig(
                 max_new_tokens=5,  # decode kernel warmup
-                temperature=args.temperature,
-                top_k=args.top_k,
-                top_p=args.top_p,
+                temperature=cfg.temperature,
+                top_k=cfg.top_k,
+                top_p=cfg.top_p,
                 stop_on_eos=False,
             ),
             _measure_and_log_time=False,
@@ -557,7 +390,7 @@ def run(
         batch_size=batch_size,
         input_len=input_len,
         output_len=output_len,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
+        top_k=cfg.top_k,
+        top_p=cfg.top_p,
+        temperature=cfg.temperature,
     )
diff --git a/examples/jiuge.py b/examples/jiuge.py
index fa547435..9040f1c6 100644
--- a/examples/jiuge.py
+++ b/examples/jiuge.py
@@ -12,153 +12,13 @@
 import numpy as np
 from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
 from packaging import version
+from infinilm.base_config import BaseConfig
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
 
 _PAGED_KV_BLOCK_SIZE = 256
 
 
-def get_args():
-    parser = argparse.ArgumentParser(description="run Llama args")
-
-    parser.add_argument(
-        "--cpu",
-        action="store_true",
-        help="Run cpu test",
-    )
-    parser.add_argument(
-        "--nvidia",
-        action="store_true",
-        help="Run nvidia test",
-    )
-    parser.add_argument(
-        "--qy",
-        action="store_true",
-        help="Run qy test",
-    )
-    parser.add_argument(
-        "--metax",
-        action="store_true",
-        help="Run metax test",
-    )
-    parser.add_argument(
-        "--moore",
action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model-path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="cpp", - help="python or cpp model", - ) - parser.add_argument( - "--batch-size", - type=int, - default=1, - help="number of prompts in a batch", - ) - parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - parser.add_argument( - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 'default' or 'flash-attn'", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - def test( prompts: str | list[str], @@ -186,7 +46,7 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # ---------------------------------------------------------------------------- # # Load Weights @@ -300,44 +160,26 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" - "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model - model_path = args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend - tp = args.tp - enable_paged_attn = 
-    enable_graph = args.enable_graph
+    max_new_tokens = cfg.max_new_tokens
+
+    backend = cfg.backend
+
+    tp = cfg.tp
+
+    enable_paged_attn = cfg.enable_paged_attn
+
+    enable_graph = cfg.enable_graph
 
     if backend != "cpp":
         raise ValueError(f"Unsupported backend: {backend}.")
@@ -351,8 +193,8 @@ def test(
         tp=tp,
         enable_paged_attn=enable_paged_attn,
         enable_graph=enable_graph,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        attn_backend=args.attn,
+        top_k=cfg.top_k,
+        top_p=cfg.top_p,
+        temperature=cfg.temperature,
+        attn_backend=cfg.attn,
     )
diff --git a/examples/llama.py b/examples/llama.py
index aa890ca9..6160848f 100644
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -7,72 +7,10 @@
 import sys
 import time
 import os
-
+from infinilm.base_config import BaseConfig
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
 
-def get_args():
-    parser = argparse.ArgumentParser(description="run Llama args")
-
-    parser.add_argument(
-        "--cpu",
-        action="store_true",
-        help="Run cpu test",
-    )
-    parser.add_argument(
-        "--nvidia",
-        action="store_true",
-        help="Run nvidia test",
-    )
-    parser.add_argument(
-        "--metax",
-        action="store_true",
-        help="Run metax test",
-    )
-    parser.add_argument(
-        "--moore",
-        action="store_true",
-        help="Run moore test",
-    )
-    parser.add_argument(
-        "--iluvatar",
-        action="store_true",
-        help="Run iluvatar test",
-    )
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        required=True,
-        help="model_path",
-    )
-    parser.add_argument(
-        "--max_new_tokens",
-        type=int,
-        default=100,
-        help="max_new_tokens",
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="python",
-        help="python or cpp model",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=1,
-        help="number of prompts in a batch",
-    )
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="How are you",
-        help="input prompt",
-    )
-
-    return parser.parse_args()
-
-
 def test(
     prompts: str | list[str],
     model_path,
@@ -163,32 +101,15 @@ def test(
 
 if __name__ == "__main__":
-    args = get_args()
-    print(args)
-
-    # Parse command line arguments
-    device_str = "cpu"
-    if args.cpu:
-        device_str = "cpu"
-    elif args.nvidia:
-        device_str = "cuda"
-    elif args.metax:
-        device_str = "cuda"
-    elif args.moore:
-        device_str = "musa"
-    elif args.iluvatar:
-        device_str = "cuda"
-    else:
-        print(
-            "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n"
-            "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
-        )
-        sys.exit(1)
-    prompts = [args.prompt for _ in range(args.batch_size)]
+    cfg = BaseConfig()
+
+    device_str = cfg.get_device_str(cfg.device)
+
+    prompts = [cfg.prompt for _ in range(cfg.batch_size)]
 
-    model_path = args.model_path
-    max_new_tokens = args.max_new_tokens
-    backend = args.backend
+    model_path = cfg.model
+    max_new_tokens = cfg.max_new_tokens
+    backend = cfg.backend
 
     if backend != "python":
         raise ValueError(f"Unsupported backend: {backend}.")
diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py
index e34514a7..f552a2cc 100644
--- a/python/infinilm/__init__.py
+++ b/python/infinilm/__init__.py
@@ -2,6 +2,7 @@
 from . import distributed
 from . import cache
 from . import llm
+from . import base_config
 
 from .llm import (
     LLM,
@@ -16,6 +17,7 @@
     "distributed",
     "cache",
     "llm",
+    "base_config",
     # LLM classes
     "LLM",
     "AsyncLLMEngine",
diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py
new file mode 100644
index 00000000..b9ac376c
--- /dev/null
+++ b/python/infinilm/base_config.py
@@ -0,0 +1,141 @@
+import argparse
+
+
+class BaseConfig:
+    """Unified command-line configuration shared by the InfiniLM examples, server, and benchmarks."""
+
+    def __init__(self):
+        self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config")
+        self._add_common_args()
+        self.args, self.extra = self.parser.parse_known_args()
+
+        self.model = self.args.model
+        self.device = self.args.device
+        self.tp = self.args.tp
+
+        self.attn = self.args.attn
+        self.enable_graph = self.args.enable_graph
+        self.cache_type = self.args.cache_type
+        self.enable_paged_attn = self.args.enable_paged_attn
+        self.paged_kv_block_size = self.args.paged_kv_block_size
+        self.num_blocks = self.args.num_blocks
+        self.block_size = self.args.block_size
+        self.max_cache_len = self.args.max_cache_len
+        self.kv_cache_dtype = self.args.kv_cache_dtype
+        self.skip_load = self.args.skip_load
+
+        self.batch_size = self.args.batch_size
+        self.max_batch = self.args.max_batch
+        self.max_batch_size = self.args.max_batch_size
+        self.input_len = self.args.input_len
+        self.output_len = self.args.output_len
+        self.max_new_tokens = self.args.max_new_tokens
+        self.max_tokens = self.args.max_tokens
+        self.prompt = self.args.prompt
+        self.top_k = self.args.top_k
+        self.top_p = self.args.top_p
+        self.temperature = self.args.temperature
+
+        self.warmup = self.args.warmup
+        self.verbose = self.args.verbose
+        self.log_level = self.args.log_level
+
+        # Evaluation parameters
+        self.bench = self.args.bench
+        self.backend = self.args.backend
+        self.subject = self.args.subject
+        self.split = self.args.split
+        self.num_samples = self.args.num_samples
+        self.output_csv = self.args.output_csv
+        self.cache_dir = self.args.cache_dir
+
+        # Quantization parameters
+        self.awq = self.args.awq
+        self.gptq = self.args.gptq
+        self.dtype = self.args.dtype
+
+        # Server parameters
+        self.host = self.args.host
+        self.port = self.args.port
+        self.endpoint = self.args.endpoint
+        self.ignore_eos = self.args.ignore_eos
+
+        if self.enable_paged_attn and self.attn == "default":
+            self.attn = "paged-attn"
+
+    def _add_common_args(self):
+        # --- Base configuration ---
+        self.parser.add_argument("--model", type=str, required=True)
+        self.parser.add_argument("--device", type=str, default="cpu")
+        self.parser.add_argument("--tp", "--tensor-parallel-size", type=int, default=1)
+
+        # --- Inference backend optimization ---
+        self.parser.add_argument("--attn", type=str, default="default", choices=["default", "paged-attn", "flash-attn"])
+        self.parser.add_argument("--enable-graph", action="store_true")
+        self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"])
+        self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache")
+        self.parser.add_argument("--paged-kv-block-size", type=int, default=256)
+        self.parser.add_argument("--num-blocks", type=int, default=512, help="number of KV cache blocks")
+        self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block")
+        self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length")
+        self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type")
default=None, choices=["int8"], help="KV cache data type") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") + + + # --- Length and infer parameters --- + self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--max-batch", type=int, default=3, help="maximum batch size") + self.parser.add_argument("--max-batch-size", type=int, default=8, help="maximum batch size for server") + self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") + self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") + self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") + self.parser.add_argument("--max-tokens", type=int, default=512, help="maximum tokens") + self.parser.add_argument("--prompt", type=str, default="How are you", help="default prompt text") + self.parser.add_argument("--top-k", type=int, default=1) + self.parser.add_argument("--top-p", type=float, default=1.0) + self.parser.add_argument("--temperature", type=float, default=1.0) + + # --- debug --- + self.parser.add_argument("--warmup", action="store_true") + self.parser.add_argument("--verbose", action="store_false") + self.parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="logging level") + + + # --- Evaluation parameters --- + self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") + self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") + + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") + self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") + self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") + self.parser.add_argument("--output-csv", type=str, default=None, help="path to output CSV file for results") + self.parser.add_argument("--cache-dir", type=str, default=None, help="directory for dataset cache") + + + # --- Quantization parameters --- + self.parser.add_argument("--awq", action="store_false", help="use AWQ quantization") + self.parser.add_argument("--gptq", action="store_false", help="use GPTQ quantization") + self.parser.add_argument("--dtype", type=str, default="float16", choices=["float32", "float16", "bfloat16"], help="data type for model") + + + # --- Server parameters --- + self.parser.add_argument("--host", type=str, default="0.0.0.0", help="server host") + self.parser.add_argument("--port", type=int, default=8000, help="server port") + self.parser.add_argument("--endpoint", type=str, default="/completions", help="API endpoint") + + self.parser.add_argument("--ignore-eos", action="store_true",dest="ignore_eos", default=False, help="Ignore EOS token and continue generation",) + + def get_device_str(self, device): + """Convert device name to backend string (cuda/cpu/musa/mlu)""" + DEVICE_STR_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "ascend", + "metax": "cuda", + "moore": "musa", + "iluvatar": "cuda", + "kunlun": "kunlun", + "hygon": "cuda", + "ali": "cuda" + } + return DEVICE_STR_MAP.get(device.lower(), "cpu") + + + def __repr__(self): + """String representation 
of configuration""" + return f"BaseConfig(model='{self.model}', device='{self.device}', tp={self.tp})" diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 7f510a41..56cc1a40 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -12,7 +12,7 @@ import logging import os import asyncio - +from infinilm.base_config import BaseConfig from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, StreamingResponse @@ -550,159 +550,32 @@ def setup_logging(log_level: str = "INFO"): ) -def parse_args(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser(description="InfiniLM Inference Server") - parser.add_argument( - "--model_path", type=str, required=True, help="Path to model directory" - ) - parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism degree") - parser.add_argument( - "--cache_type", - type=str, - default="paged", - choices=["paged", "static"], - help="Cache type: paged or static", - ) - parser.add_argument( - "--max_tokens", - type=int, - default=512, - help="Maximum number of tokens to generate", - ) - parser.add_argument( - "--max_batch_size", - type=int, - default=8, - help="Maximum batch size (paged cache only)", - ) - parser.add_argument( - "--num_blocks", - type=int, - default=512, - help="Number of blocks for KV cache (paged cache only)", - ) - parser.add_argument( - "--block_size", - type=int, - default=256, - help="Block size for KV cache (paged cache only)", - ) - parser.add_argument( - "--max_cache_len", - type=int, - default=4096, - help="Maximum sequence length (static cache only)", - ) - parser.add_argument( - "--dtype", - type=str, - default="float16", - choices=["float32", "float16", "bfloat16"], - help="Data type", - ) - parser.add_argument( - "--temperature", type=float, default=1.0, help="Sampling temperature" - ) - parser.add_argument( - "--top_p", type=float, default=0.8, help="Top-p sampling parameter" - ) - parser.add_argument("--top_k", type=int, default=1, help="Top-k sampling parameter") - parser.add_argument("--host", type=str, default="0.0.0.0", help="Server host") - parser.add_argument("--port", type=int, default=8000, help="Server port") - parser.add_argument("--cpu", action="store_true", help="Use CPU") - parser.add_argument("--nvidia", action="store_true", help="Use NVIDIA GPU") - parser.add_argument("--qy", action="store_true", help="Use QY GPU") - parser.add_argument("--metax", action="store_true", help="Use MetaX device") - parser.add_argument("--moore", action="store_true", help="Use Moore device") - parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device") - parser.add_argument("--cambricon", action="store_true", help="Use Cambricon device") - parser.add_argument("--ali", action="store_true", help="Use Ali PPU device") - parser.add_argument("--hygon", action="store_true", help="Use Hygon DCU device") - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compiling", - ) - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="Attention backend to use: 'default' or 'flash-attn'", - ) - parser.add_argument( - "--log_level", - type=str, - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Logging level", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - dest="ignore_eos", - default=False, - help="Ignore EOS 
token and continue generation", - ) - - return parser.parse_args() def main(): - args = parse_args() - - setup_logging(args.log_level) - - if args.cpu: - device = "cpu" - elif args.nvidia: - device = "cuda" - elif args.qy: - device = "cuda" - elif args.metax: - device = "cuda" - elif args.moore: - device = "musa" - elif args.iluvatar: - device = "cuda" - elif args.cambricon: - device = "mlu" - elif args.ali: - device = "cuda" - elif args.hygon: - device = "cuda" - else: - print( - "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] " - "--model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE" - "\n" - "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ " - "--max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1" - "\n" - "Optional: --enable-paged-attn --enable-graph --attn=default" - ) - sys.exit(1) + cfg = BaseConfig() + setup_logging(cfg.log_level) + device = cfg.get_device_str(cfg.device) server = InferenceServer( - model_path=args.model_path, + model_path=cfg.model, device=device, - dtype=args.dtype, - tensor_parallel_size=args.tp, - cache_type=args.cache_type, - max_tokens=args.max_tokens, - max_batch_size=args.max_batch_size, - num_blocks=args.num_blocks, - block_size=args.block_size, - max_cache_len=args.max_cache_len, - temperature=args.temperature, - top_p=args.top_p, - top_k=args.top_k, - host=args.host, - port=args.port, - enable_graph=args.enable_graph, - attn_backend=args.attn, - ignore_eos=args.ignore_eos, + dtype=cfg.dtype, + tensor_parallel_size=cfg.tp, + cache_type=cfg.cache_type, + max_tokens=cfg.max_tokens, + max_batch_size=cfg.max_batch_size, + num_blocks=cfg.num_blocks, + block_size=cfg.block_size, + max_cache_len=cfg.max_cache_len, + temperature=cfg.temperature, + top_p=cfg.top_p, + top_k=cfg.top_k, + host=cfg.host, + port=cfg.port, + enable_graph=cfg.enable_graph, + attn_backend=cfg.attn, + ignore_eos=cfg.ignore_eos, ) server.start() diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py index c5c399dc..78305992 100644 --- a/test/bench/test_benchmark.py +++ b/test/bench/test_benchmark.py @@ -8,7 +8,7 @@ import numpy as np from datasets import load_dataset, Dataset from abc import ABC, abstractmethod - +from infinilm.base_config import BaseConfig TOTAL_TOKENS = 0 TOTAL_TIME = 0.0 @@ -826,121 +826,7 @@ def parse_list(value: str): ) -def parse_arguments(): - """Parse command line arguments using argparse""" - parser = argparse.ArgumentParser( - description="Benchmark evaluation for language models on CEval and MMLU datasets", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python test_benchmark.py --cpu /path/to/model --bench ceval --backend cpp - python test_benchmark.py --nvidia /path/to/model --bench mmlu --backend vllm --ndev 2 - python test_benchmark.py --cpu /path/to/model --bench ceval --subject "accountant" --num_samples 10 --output_csv results.csv - """, - ) - - # Device flags (mutually exclusive) - device_group = parser.add_mutually_exclusive_group(required=True) - device_group.add_argument("--cpu", action="store_true", help="Use CPU device") - device_group.add_argument( - "--nvidia", action="store_true", help="Use NVIDIA GPU device" - ) - device_group.add_argument( - "--cambricon", action="store_true", help="Use Cambricon MLU device" - ) - device_group.add_argument("--ascend", action="store_true", help="Use Ascend device") - 
device_group.add_argument("--metax", action="store_true", help="Use Metax device") - device_group.add_argument("--moore", action="store_true", help="Use Moore device") - device_group.add_argument( - "--iluvatar", action="store_true", help="Use Iluvatar device" - ) - device_group.add_argument("--kunlun", action="store_true", help="Use Kunlun device") - device_group.add_argument("--hygon", action="store_true", help="Use Hygon device") - device_group.add_argument("--ali", action="store_true", help="Use Ali device") - - # Positional argument for model path - parser.add_argument("model_path", type=str, help="Path to the model directory") - - # Required benchmark argument - parser.add_argument( - "--bench", - required=True, - choices=["ceval", "mmlu"], - help="Benchmark to evaluate (ceval or mmlu)", - ) - - # Optional arguments - parser.add_argument( - "--backend", - type=str, - default="cpp", - choices=["python", "cpp", "torch", "vllm"], - help="Backend to use for inference (default: cpp)", - ) - parser.add_argument( - "--ndev", - type=int, - default=1, - help="Number of devices for tensor parallelism (default: 1)", - ) - parser.add_argument( - "--subject", - type=str, - default="all", - help="Subject(s) to evaluate, comma-separated or 'all' (default: all)", - ) - parser.add_argument( - "--split", - type=str, - default="test", - choices=["test", "val", "all"], - help="Dataset split to use: test, val, or all (default: test)", - ) - parser.add_argument( - "--num-samples", - type=int, - default=None, - help="Number of samples to evaluate per subject (default: all)", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=500, - help="Maximum number of new tokens to generate (default: 500)", - ) - parser.add_argument( - "--output-csv", - type=str, - default=None, - help="Path to output CSV file for results", - ) - parser.add_argument( - "--cache-dir", - type=str, - default=None, - help="Directory to use for dataset cache (offline mode when specified)", - ) - - # InfiniLM specific options - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="Enable paged attention for InfiniLM backend", - ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compilation for InfiniLM backend", - ) - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="Attention backend for InfiniLM (default: default)", - ) - return parser.parse_args() def load_dataset_samples(args): @@ -1222,34 +1108,13 @@ def load_subject_samples(subj_name): def main(): """Main function""" - args = parse_arguments() - - # Map device flags to device type string - device_type_str = "cpu" - if args.cpu: - device_type_str = "cpu" - elif args.nvidia: - device_type_str = "nvidia" - elif args.cambricon: - device_type_str = "cambricon" - elif args.ascend: - device_type_str = "ascend" - elif args.metax: - device_type_str = "metax" - elif args.moore: - device_type_str = "moore" - elif args.iluvatar: - device_type_str = "iluvatar" - elif args.kunlun: - device_type_str = "kunlun" - elif args.hygon: - device_type_str = "hygon" - elif args.ali: - device_type_str = "ali" + cfg = BaseConfig() + + device_type_str = cfg.device # Normalize cache_dir and force offline when provided - if args.cache_dir: - args.cache_dir = os.path.expanduser(args.cache_dir) + if cfg.cache_dir: + cfg.cache_dir = os.path.expanduser(cfg.cache_dir) os.environ["HF_DATASETS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" @@ -1258,7 +1123,7 @@ def 
     print("STEP 1: LOADING DATASET")
     print("=" * 60 + "\n")
 
-    subject_samples = load_dataset_samples(args)
+    subject_samples = load_dataset_samples(cfg)
 
     if not subject_samples:
         print("No samples loaded. Exiting.")
@@ -1269,21 +1134,21 @@ def main():
     print("STEP 2: LOADING MODEL")
     print("=" * 60 + "\n")
 
-    if args.backend == "torch":
-        assert args.ndev == 1, "Torch backend only supports single-device evaluation"
-        model = TorchBenchmark(args.model_path, device_type_str, args.bench)
-    elif args.backend == "vllm":
-        model = VLLMBenchmark(args.model_path, device_type_str, args.ndev, args.bench)
+    if cfg.backend == "torch":
+        assert cfg.tp == 1, "Torch backend only supports single-device evaluation"
+        model = TorchBenchmark(cfg.model, device_type_str, cfg.bench)
+    elif cfg.backend == "vllm":
+        model = VLLMBenchmark(cfg.model, device_type_str, cfg.tp, cfg.bench)
     else:  # cpp backend
         model = InfiniLMBenchmark(
-            args.model_path,
+            cfg.model,
             device_type_str,
-            args.ndev,
-            args.backend,
-            args.bench,
-            args.enable_paged_attn,
-            args.enable_graph,
-            args.attn,
+            cfg.tp,
+            cfg.backend,
+            cfg.bench,
+            cfg.enable_paged_attn,
+            cfg.enable_graph,
+            cfg.attn,
         )
 
     # Step 3: Evaluate each subject
@@ -1300,7 +1165,7 @@ def main():
 
         # Evaluate samples for this subject
         result = evaluate_samples(
-            model, samples, args.bench, args.max_new_tokens, subject_name
+            model, samples, cfg.bench, cfg.max_new_tokens, subject_name
         )
         all_results.append(result)
         print(
@@ -1326,7 +1191,7 @@ def main():
     overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0.0
 
     print(f"{'=' * 60}")
-    if args.bench == "ceval":
+    if cfg.bench == "ceval":
         print(
             f"Overall score: {overall_correct}/{overall_total} = {overall_accuracy:.2%}"
         )
@@ -1341,9 +1206,9 @@ def main():
         print(f"Overall Throughput: {TOTAL_TOKENS / TOTAL_TIME:.2f} tokens/s")
 
     # Write CSV if output path is specified
-    if args.output_csv:
-        print(f"\nWriting results to CSV: {args.output_csv}")
-        with open(args.output_csv, "w", newline="", encoding="utf-8") as csvfile:
+    if cfg.output_csv:
+        print(f"\nWriting results to CSV: {cfg.output_csv}")
+        with open(cfg.output_csv, "w", newline="", encoding="utf-8") as csvfile:
             writer = csv.writer(csvfile)
             writer.writerow(["Subject", "Correct", "Total", "Accuracy"])
             for result in all_results:
@@ -1358,7 +1223,7 @@ def main():
             writer.writerow(
                 ["Overall", overall_correct, overall_total, f"{overall_accuracy:.4f}"]
             )
-            print(f"CSV file written successfully: {args.output_csv}")
+            print(f"CSV file written successfully: {cfg.output_csv}")
 
 
 if __name__ == "__main__":
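
Reviewer note: after this patch every entry point parses one shared flag set through BaseConfig, and the per-vendor device switches (--cpu, --nvidia, --moore, ...) collapse into a single --device option mapped by get_device_str(). A minimal usage sketch, assuming an installed infinilm package; the model path is a placeholder:

    # Hypothetical standalone use of the unified config (names come from base_config.py above).
    from infinilm.base_config import BaseConfig

    cfg = BaseConfig()                            # parse_known_args(); unrecognized flags end up in cfg.extra
    device_str = cfg.get_device_str(cfg.device)   # e.g. "nvidia" -> "cuda", "moore" -> "musa"
    print(cfg)                                    # BaseConfig(model='...', device='nvidia', tp=1)

Invocations change accordingly, e.g. "python examples/jiuge.py --nvidia --model_path=..." becomes "python examples/jiuge.py --device nvidia --model ...".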