diff --git a/test/common/evalscope_utils.py b/test/common/evalscope_utils.py new file mode 100644 index 000000000..3901e603e --- /dev/null +++ b/test/common/evalscope_utils.py @@ -0,0 +1,128 @@ +import glob +import json +import os +from typing import Any, Dict, List, Optional + +import evalscope +from common.capture_utils import export_vars + + +class EvalScopeRunner: + """ + Encapsulate the logic for running evalscope tasks and collecting results. + """ + + def __init__(self, output_dir: str): + self.output_dir = output_dir + + def run(self, task_cfg: evalscope.config.TaskConfig) -> None: + evalscope.run_task(task_cfg=task_cfg) + + @staticmethod + def _get_latest_run_dir(output_dir: str) -> Optional[str]: + if not os.path.exists(output_dir): + return None + subdirs = [ + d + for d in os.listdir(output_dir) + if os.path.isdir(os.path.join(output_dir, d)) + ] + if not subdirs: + return None + subdirs.sort( + reverse=True + ) # The timestamp directory can be sorted in descending order by string + return os.path.join(output_dir, subdirs[0]) + + @staticmethod + def _collect_report_json_files(run_dir: str) -> List[str]: + reports_root = os.path.join(run_dir, "reports") + if not os.path.exists(reports_root): + return [] + + json_files = [] + for model_dir in os.listdir(reports_root): + model_path = os.path.join(reports_root, model_dir) + if os.path.isdir(model_path): + json_files.extend(glob.glob(os.path.join(model_path, "*.json"))) + return json_files + + @staticmethod + def _parse_metrics_from_json(json_path: str) -> Dict[str, Any]: + """Parse a single JSON report file and return a structured metrics dictionary""" + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + + dataset_name = data.get( + "dataset_name", os.path.splitext(os.path.basename(json_path))[0] + ) + model_name = data.get("model_name", "") + + metrics = [] + for m in data.get("metrics", []): + categories = [ + { + "name": c.get("name"), + "score": c.get("score", 0.0), + "macro_score": c.get("macro_score", 0.0), + "num": c.get("num", 0), + "subsets": c.get("subsets", []), + } + for c in m.get("categories", []) + ] + metrics.append( + { + "name": m.get("name"), + "score": m.get("score", 0.0), + "macro_score": m.get("macro_score", 0.0), + "num": m.get("num", 0), + "categories": categories, + } + ) + + return { + "dataset_name": dataset_name, + "model_name": model_name, + "pretty_name": data.get("dataset_pretty_name", dataset_name), + "score": data.get("score", 0.0), + "metrics": metrics, + "analysis": data.get("analysis", "N/A"), + } + + @export_vars + def collect_results(self) -> Dict[str, Any]: + latest_run = self._get_latest_run_dir(self.output_dir) + if not latest_run: + return {"_name": "eval_scope", "_proj": {}} + + json_files = self._collect_report_json_files(latest_run) + if not json_files: + return {"_name": "eval_scope", "_proj": {}} + + all_metrics = {} + extracted_model_name = "" + + for json_path in json_files: + try: + parsed = self._parse_metrics_from_json(json_path) + except (json.JSONDecodeError, KeyError): + continue + + if not extracted_model_name: + extracted_model_name = parsed["model_name"] + + dataset_name = parsed["dataset_name"] + all_metrics[dataset_name] = { + "pretty_name": parsed["pretty_name"], + "model": parsed["model_name"], + "score": parsed["score"], + "metrics": parsed["metrics"], + "analysis": parsed["analysis"], + } + # The total score is presented in a flat format, facilitating quick access by external parties + all_metrics[f"{dataset_name}.score"] = parsed["score"] + + if 
extracted_model_name: + all_metrics["model_name"] = extracted_model_name + + return {"_name": "eval_scope", "_proj": all_metrics} diff --git a/test/docs/assets/pic1.png b/test/docs/assets/pic1.png new file mode 100644 index 000000000..33b689ac0 Binary files /dev/null and b/test/docs/assets/pic1.png differ diff --git a/test/docs/assets/pic2.png b/test/docs/assets/pic2.png new file mode 100644 index 000000000..9c86a8a55 Binary files /dev/null and b/test/docs/assets/pic2.png differ diff --git a/test/docs/assets/pic3.png b/test/docs/assets/pic3.png new file mode 100644 index 000000000..74fde6163 Binary files /dev/null and b/test/docs/assets/pic3.png differ diff --git a/test/docs/evalscopeTest.md b/test/docs/evalscopeTest.md new file mode 100644 index 000000000..d5ef4a8a5 --- /dev/null +++ b/test/docs/evalscopeTest.md @@ -0,0 +1,179 @@ +# EvalScope Accuracy Evaluation Guide + +This test case is built upon **EvalScope (v1.5.2)** to provide automated evaluation capabilities, enabling convenient assessment of large language model performance on mainstream academic benchmarks and long-context retrieval tasks. + +## Supported Evaluation Types + +| Type | Description | Example Datasets | +|------|-------------|------------------| +| **Mainstream Benchmark Evaluation** | Standard question-answering tasks covering mathematics, reasoning, knowledge, coding, and more | `aime24`, `aime25`, `aime26`, `gsm8k`, `longbench_v2`, `ceval`, `cmmlu`, `humaneval`, `mmlu`, `mmlu_pro`, etc. | +| **Needle In A Haystack** | Evaluates the model's ability to locate specific information within extremely long contexts | - | + +> **Note**: Except for the Needle In A Haystack test, only simple question-answering datasets are currently supported. Datasets requiring additional runtime environments or judge models are not yet adapted. + +--- + +## Quick Start + +### 1. Environment Setup + +- It is recommended to use a virtual environment to install dependencies: + ```bash + cd test + pip install -r requirements.txt + ``` + +### 2. Dataset Preparation + +#### Online Environment (With Internet Access) +- The framework will automatically download required datasets from ModelScope. **No manual operation is needed**. + +#### Offline Environment (No Internet Access) +- Datasets must be downloaded in advance to a unified directory. +- Ensure that subdirectory names exactly match the identifiers in the task list. + +**Method 1: Clone Individual Datasets** +```bash +git clone https://www.modelscope.cn/datasets/evalscope/aime26.git +git clone https://www.modelscope.cn/datasets/ZhipuAI/LongBench-v2.git # Note: Rename the cloned directory to `longbench_v2` +git clone https://www.modelscope.cn/datasets/AI-ModelScope/Needle-in-a-Haystack-Corpus.git +``` + +**Method 2: Use the Pre-Packaged Dataset Archive** +- Visit the [ModelScope Dataset Repository](https://modelscope.cn/datasets/keriko/UCM_tools/files/dataset) to download the complete archive and extract it to the target path. 
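+
+To make the naming requirement concrete, here is a minimal sketch of how local dataset directories are resolved (it mirrors `_build_general_task_config` in `test_evalscope.py`; the root path below is only an example):
+
+```python
+# Sketch: map each dataset identifier to a local dataset_id for offline runs.
+# Subdirectory names under dataset_root must match the identifiers exactly,
+# e.g. "longbench_v2" rather than "LongBench-v2".
+dataset_root = "/mnt/data/evalscope/dataset"  # example path, adjust to your environment
+datasets = ["gsm8k", "longbench_v2"]
+
+dataset_args = {ds: {"dataset_id": f"{dataset_root}/{ds}/"} for ds in datasets}
+# -> {"gsm8k": {"dataset_id": "/mnt/data/evalscope/dataset/gsm8k/"}, ...}
+```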
+ +--- + +## Configuration + +### General Parameters + +| Environment Variable | Default | Description | +|---------------------|---------|-------------| +| `SCOPE_DATASET_ROOT` | | Root directory where datasets are stored | +| `SCOPE_TEST_LIST` | `aime24,gsm8k` (example) | Comma-separated list of datasets to evaluate | + +### Needle In A Haystack Specific Parameters + +| Environment Variable | Default | Description | +|----------------------|---------|-------------| +| `SCOPE_NEEDLE_MIN` | `1000` | Minimum context length (in tokens) | +| `SCOPE_NEEDLE_MAX` | `32000` | Maximum context length (in tokens) | + +### Local Manual Testing +Directly modify the following constants in `test_evalscope.py`: +```python +DEFAULT_DATASET_ROOT = "/mnt/data/evalscope/dataset" # Dataset path; can be left empty in online environments +DEFAULT_TASK_LIST = ["aime24", "gsm8k"] # Datasets to evaluate +``` + +--- + +## Running Tests + +### Single Task Execution + +```bash +cd test + +# Mainstream benchmark evaluation +pytest suites/E2E/test_evalscope.py::test_eval_accuracy + +# Needle In A Haystack evaluation +pytest suites/E2E/test_evalscope.py::test_needle_task +``` + +### Batch Execution by Feature Tag + +```bash +cd test +pytest --feature=evalscope +``` + +--- + +## Output and Results + +### 1. EvalScope Native Output +All run records are saved under the `test/results/evalscope_outputs/` directory, organized into timestamped subdirectories, including: +- Evaluation configuration files +- Detailed request/response logs +- Aggregated metrics files (JSON) +- Visualization reports (HTML) + +For detailed format information, please refer to the [EvalScope Official Documentation](https://evalscope.readthedocs.io/). + +### 2. Database Persistence +Evaluation results are automatically parsed and stored in the configured database backend for centralized querying and comparison. + +The following files are generated in the `test/results/` directory: +- `eval_scope.jsonl` +- `eval_scope.csv` + +To customize database connections, modify the `results` section in the configuration (PostgreSQL, MongoDB, etc. are supported): + +```yaml +results: + localFile: + path: "./results" + # postgresql: + # host: "localhost" + # ... + # mongodb: + # host: "127.0.0.1" + # ... +``` + +--- + +## Notes + +1. Some dataset names must strictly match the ModelScope repository names (e.g., `longbench_v2` instead of `LongBench-v2`). Pay attention to directory renaming when using offline mode. +2. If using a remote API for evaluation, ensure that the `llm_connection` configuration is correct and the service is accessible (example: `http://127.0.0.1:8080/`). +3. The Needle In A Haystack task uses the **model under test itself** as the judge model. Ensure that the model possesses basic instruction-following capabilities, and configure the model path as `tokenizer_path` in `llm_connection`. 
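+
+To illustrate notes 2 and 3, the test roughly derives the API endpoint and the needle-test judge settings from `llm_connection` as sketched below (field values are placeholders; `EMPTY_TOKEN` is the dummy API key used by the tests):
+
+```python
+# Sketch: how llm_connection values feed the evaluation (see test_evalscope.py).
+llm_cfg = {
+    "server_url": "http://127.0.0.1:8080/",  # note 2: must be reachable
+    "model": "Qwen3-32B",                     # model under test (example name)
+    "tokenizer_path": "/path/to/model",       # note 3: used to build long contexts
+}
+
+base_url = llm_cfg["server_url"].rstrip("/")
+api_url = f"{base_url}/v1/chat/completions"   # OpenAI-compatible chat endpoint
+
+# Note 3: the model under test is also configured as the judge model.
+judge_model_args = {
+    "model_id": llm_cfg["model"],
+    "api_url": api_url,
+    "api_key": "EMPTY_TOKEN",
+}
+```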
+ +## Test Process +![](assets/pic1.png) + +## Test Result Example +```json +{ + "aime25": { + "pretty_name": "AIME-2025", + "model": "Qwen3-32B", + "score": 0.0, + "metrics": [{ + "name": "mean_acc", + "score": 0.0, + "macro_score": 0.0, + "num": 30, + "categories": [{ + "name": ["default"], + "score": 0.0, + "macro_score": 0.0, + "num": 30, + "subsets": [{ + "name": "default", + "score": 0.0, + "num": 30 + }] + }] + }], + "analysis": "N/A" + }, + "aime25.score": 0.0, + "model_name": "Qwen3-32B", + "test_id": "ad9ba909-1646-47b3-89d6-9240c6497593", + "test_items": "pytestall_cases", + "create_at": "2026-04-09 17:00:05.910252", + "extra_info": "" +} +``` + +## HTML Test Report +![](assets/pic2.png) + +## Needle In A Haystack Heatmap +![](assets/pic3.png) + +*Note: The screenshots above were generated using a mock model for testing, hence all scores are zero.* diff --git a/test/docs/evalscopeTest_zh.md b/test/docs/evalscopeTest_zh.md new file mode 100644 index 000000000..a4cb1cc9f --- /dev/null +++ b/test/docs/evalscopeTest_zh.md @@ -0,0 +1,176 @@ +# EvalScope 精度评测指南 + +本测试case基于 **EvalScope (v1.5.2)** 封装了自动化评测能力,用于便捷地测试大语言模型在主流学术基准及长上下文检索任务上的表现。 + +## 支持的评测类型 + +| 类型 | 说明 | 示例数据集 | +|------|------|------------| +| **主流数据集评测** | 覆盖数学、推理、知识、代码等能力的标准问答任务 | `aime24`、`aime25`、`aime26`、`gsm8k`、`longbench_v2`、`ceval`、`cmmlu`、`humaneval`、`mmlu`、`mmlu_pro` 等 | +| **大海捞针评测** | 评估模型在超长上下文中定位特定信息的能力(Needle In A Haystack) | - | + +> **注意**:除大海捞针测试外,当前仅支持简单问答形式的数据集。需要额外运行环境或裁判模型介入的数据集暂未适配。 + +--- + +## 快速开始 + +### 1. 环境准备 + +- 推荐使用虚拟环境安装依赖: + ```bash + cd test + pip install -r requirements.txt + ``` + +### 2. 数据集准备 + +#### 在线环境(有网络) +- 框架会自动从 ModelScope 下载所需数据集,**无需手动操作**。 + +#### 离线环境(无网络) +- 需提前将数据集下载至统一目录。 +- 确保子目录名称与任务列表中的标识完全一致。 + +**下载方式一:克隆单个数据集** +```bash +git clone https://www.modelscope.cn/datasets/evalscope/aime26.git +git clone https://www.modelscope.cn/datasets/ZhipuAI/LongBench-v2.git # 注意克隆后需将目录重命名为 longbench_v2 +git clone https://www.modelscope.cn/datasets/AI-ModelScope/Needle-in-a-Haystack-Corpus.git +``` + +**下载方式二:使用打包好的数据集压缩包** +- 访问 [ModelScope 数据集仓库](https://modelscope.cn/datasets/keriko/UCM_tools/files/dataset) 下载全量压缩包并解压至目标路径。 + +--- + +## 配置说明 + +### 通用参数 + +| 环境变量 | 默认值 | 说明 | +|----------|------|------| +| `SCOPE_DATASET_ROOT` | | 数据集存放根目录 | +| `SCOPE_TEST_LIST` | `aime24,gsm8k`(示例) | 待评测数据集列表,逗号分隔 | + +### 大海捞针专用参数 + +| 环境变量 | 默认值 | 说明 | +|----------|--------|------| +| `SCOPE_NEEDLE_MIN` | `1000` | 最小上下文长度(token 数) | +| `SCOPE_NEEDLE_MAX` | `32000` | 最大上下文长度(token 数) | + +### 本地手动测试 +直接修改 `test_evalscope.py` 中的以下常量即可: +```python +DEFAULT_DATASET_ROOT = "/mnt/data/evalscope/dataset" # 数据集路径,联网环境下可为空 +DEFAULT_TASK_LIST = ["aime24", "gsm8k"] # 待测数据集 +``` + +--- + +## 运行测试 + +### 单任务执行 + +```bash +cd test + +# 主流数据集评测 +pytest suites/E2E/test_evalscope.py::test_eval_accuracy + +# 大海捞针评测 +pytest suites/E2E/test_evalscope.py::test_needle_task +``` + +### 按标签批量执行 + +```bash +cd test +pytest --feature=evalscope +``` + +--- + +## 结果输出 + +### 1. EvalScope 原生输出 +所有运行记录均保存在 `test/results/evalscope_outputs/` 目录下,按时间戳分子目录,包含: +- 评测配置文件 +- 详细请求/响应日志 +- 汇总指标文件(JSON) +- 可视化报告(HTML) + +具体格式说明请参阅 [EvalScope 官方文档](https://evalscope.readthedocs.io/)。 + +### 2. 数据库持久化存储 +评测结果会被自动解析并存入配置的数据库后端,便于集中查询与对比。 + +`test/results/` 目录下会生成以下文件: +- `eval_scope.jsonl` +- `eval_scope.csv` + +如需自定义数据库连接,可修改配置中的 `results` 段落(支持 PostgreSQL、MongoDB 等): + +```yaml +results: + localFile: + path: "./results" + # postgresql: + # host: "localhost" + # ... + # mongodb: + # host: "127.0.0.1" + # ... 
+``` + +--- + +## 注意事项 + +1. 部分数据集名称需与 ModelScope 仓库名严格对应(如 `longbench_v2` 而非 `LongBench-v2`),离线使用时请留意目录重命名。 +2. 若使用远程 API 进行评测,请确保 `llm_connection` 配置正确且服务可访问(示例:http://127.0.0.1:8080/)。 +3. 大海捞针任务会使用**被测模型自身**作为裁判模型,请确保模型具备基本的指令遵循能力;且在`llm_connection`中配置模型路径作为`tokenizer_path` + +测试过程 +![](assets/pic1.png) +测试结果 +```json +{ + "aime25": { + "pretty_name": "AIME-2025", + "model": "Qwen3-32B", + "score": 0.0, + "metrics": [{ + "name": "mean_acc", + "score": 0.0, + "macro_score": 0.0, + "num": 30, + "categories": [{ + "name": ["default"], + "score": 0.0, + "macro_score": 0.0, + "num": 30, + "subsets": [{ + "name": "default", + "score": 0.0, + "num": 30 + }] + }] + }], + "analysis": "N/A" + }, + "aime25.score": 0.0, + "model_name": "Qwen3-32B", + "test_id": "ad9ba909-1646-47b3-89d6-9240c6497593", + "test_items": "pytestall_cases", + "create_at": "2026-04-09 17:00:05.910252", + "extra_info": "" +} +``` +HTML测试报告 +![](assets/pic2.png) +大海捞针测试热力图 +![](assets/pic3.png) + +注:使用Mock模型进行测试,所以得分均为0 diff --git a/test/common/llmperf/LLMPerf.md b/test/docs/llmperfTest.md similarity index 100% rename from test/common/llmperf/LLMPerf.md rename to test/docs/llmperfTest.md diff --git a/test/common/llmperf/LLMPerf_zh.md b/test/docs/llmperfTest_zh.md similarity index 100% rename from test/common/llmperf/LLMPerf_zh.md rename to test/docs/llmperfTest_zh.md diff --git a/test/common/uc_eval/README.md b/test/docs/ucevalTest_zh.md similarity index 98% rename from test/common/uc_eval/README.md rename to test/docs/ucevalTest_zh.md index b8df223d2..e2a796247 100644 --- a/test/common/uc_eval/README.md +++ b/test/docs/ucevalTest_zh.md @@ -202,7 +202,7 @@ multiturn_dialogue_perf_cases = [ pytest.param( PerfConfig( data_type="multi_turn_dialogue", - dataset_file_path="datasets/multi_turn_dialogues/multiturndialog.json", + dataset_file_path="../common/uc_eval/datasets/multi_turn_dialogues/multiturndialog.json", enable_prefix_cache=False, parallel_num=1, benchmark_mode="default-perf", @@ -217,7 +217,7 @@ multiturn_dialogue_perf_cases = [ @pytest.mark.parametrize("perf_config", multiturn_dialogue_perf_cases) @export_vars def test_multiturn_dialogue_perf( - perf_config: PerfConfig, model_config: ModelConfig + perf_config: PerfConfig, model_config: ModelConfig ): file_save_path = config_instance.get_config("reports").get("base_dir") task = MultiTurnDialogPerfTask(model_config, perf_config, file_save_path) diff --git a/test/requirements.txt b/test/requirements.txt index 90fed962b..ac3bab62f 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -12,8 +12,10 @@ pymongo>=4.0.0 requests>=2.10.0 pandas>=2.3.0 pydantic>=2.12.0 -transformers>=4.0.0 +transformers==4.57.1 httpx>=0.25.0 #uc_eval jieba>=0.42.1 -openpyxl>=3.1.5 \ No newline at end of file +openpyxl>=3.1.5 +#eval-scope +evalscope==1.5.2 \ No newline at end of file diff --git a/test/suites/E2E/test_evalscope.py b/test/suites/E2E/test_evalscope.py new file mode 100644 index 000000000..cfdf8ffcd --- /dev/null +++ b/test/suites/E2E/test_evalscope.py @@ -0,0 +1,161 @@ +import os + +import evalscope +import pytest +from common.config_utils import config_utils as config_instance +from common.evalscope_utils import EvalScopeRunner + +# ---------- Default Var Value ---------- +DEFAULT_DATASET_ROOT = "" +# DEFAULT_TASK_LIST = ["aime24","aime25","aime26","ceval","cmmlu","gsm8k","humaneval","longbench_v2","mmlu","mmlu_pro","mmmlu"] +DEFAULT_TASK_LIST = ["gsm8k"] +DEFAULT_NEEDLE_MIN = 1000 +DEFAULT_NEEDLE_MAX = 32000 + + +def _build_general_task_config( + model: str, + 
api_url: str,
+    api_key: str,
+    datasets: list,
+    dataset_root: str,
+    output_dir: str,
+) -> evalscope.config.TaskConfig:
+    """Build a general benchmark evaluation task configuration."""
+    dataset_args = {}
+    if dataset_root:
+        # Point each dataset at its local copy when a dataset root is provided (offline mode).
+        dataset_args = {ds: {"dataset_id": f"{dataset_root}/{ds}/"} for ds in datasets}
+    return evalscope.config.TaskConfig(
+        model=model,
+        api_url=api_url,
+        api_key=api_key,
+        datasets=datasets,
+        dataset_args=dataset_args,
+        work_dir=output_dir,
+        no_timestamp=False,
+    )
+
+
+def _build_needle_task_config(
+    model: str,
+    api_url: str,
+    api_key: str,
+    tokenizer_path: str,
+    dataset_root: str,
+    output_dir: str,
+) -> evalscope.config.TaskConfig:
+    """Build the Needle In A Haystack task configuration."""
+    needle_min = int(os.getenv("SCOPE_NEEDLE_MIN", DEFAULT_NEEDLE_MIN))
+    needle_max = int(os.getenv("SCOPE_NEEDLE_MAX", DEFAULT_NEEDLE_MAX))
+
+    return evalscope.config.TaskConfig(
+        model=model,
+        api_url=api_url,
+        api_key=api_key,
+        eval_type="openai_api",
+        datasets=["needle_haystack"],
+        eval_batch_size=5,
+        dataset_args={
+            "needle_haystack": {
+                "subset_list": ["chinese", "english"],
+                "dataset_id": f"{dataset_root}/Needle-in-a-Haystack-Corpus/",
+                "extra_params": {
+                    "retrieval_question": "According to the text, what is the unique name of the award-winning purple vegetable grown on the floating farm?",
+                    "needles": [
+                        "\nIn a surprising turn of events recorded only in this document, the award for 'Most Exotic Produce of 2077' was given to a bioluminescent purple vegetable called the 'Voidlight Yam', which is cultivated exclusively on a floating aeroponic farm barge named 'The Drifting Mandrake'.\n"
+                    ],
+                    "context_lengths_min": needle_min,
+                    "context_lengths_max": needle_max,
+                    "context_lengths_num_intervals": 20,
+                    "document_depth_percent_min": 0,
+                    "document_depth_percent_max": 100,
+                    "document_depth_percent_intervals": 10,
+                    "tokenizer_path": tokenizer_path,
+                    "show_score": True,
+                },
+            }
+        },
+        generation_config={"max_tokens": 512},
+        # If necessary, other models can be selected as the judge model
+        judge_model_args={
+            "model_id": model,
+            "api_url": api_url,
+            "api_key": api_key,
+        },
+        work_dir=output_dir,
+        no_timestamp=False,
+    )
+
+
+@pytest.mark.feature("eval_scope")
+def test_eval_accuracy():
+    """Run the mainstream benchmark evaluation against the configured endpoint."""
+    env_list = os.getenv("SCOPE_TEST_LIST")
+    if env_list:
+        task_list = [x.strip() for x in env_list.split(",") if x.strip()]
+    else:
+        task_list = DEFAULT_TASK_LIST
+
+    llm_cfg = config_instance.get_nested_config("llm_connection")
+    base_url = llm_cfg.get("server_url", "").rstrip("/")
+    model = llm_cfg.get("model")
+    api_url = f"{base_url}/v1/chat/completions"
+    api_key = "EMPTY_TOKEN"
+
+    dataset_root = os.getenv("SCOPE_DATASET_ROOT") or DEFAULT_DATASET_ROOT
+    dataset_root = dataset_root.rstrip("/") + "/" if dataset_root else ""
+
+    output_dir = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+        "results",
+        "evalscope_outputs",
+    )
+
+    task_cfg = _build_general_task_config(
+        model=model,
+        api_url=api_url,
+        api_key=api_key,
+        datasets=task_list,
+        dataset_root=dataset_root,
+        output_dir=output_dir,
+    )
+
+    runner = EvalScopeRunner(output_dir)
+    runner.run(task_cfg)
+    runner.collect_results()
+
+    assert True
+
+
+@pytest.mark.feature("eval_scope")
+def test_needle_task():
+    """Needle In A Haystack test (the model under test serves as its own judge model)."""
+    llm_cfg = config_instance.get_nested_config("llm_connection")
+    base_url = llm_cfg.get("server_url", "").rstrip("/")
+    model = llm_cfg.get("model")
+    api_url = 
f"{base_url}/v1/chat/completions" + api_key = "EMPTY_TOKEN" + tokenizer_path = llm_cfg.get("tokenizer_path") + + dataset_root = os.getenv("SCOPE_DATASET_ROOT") or DEFAULT_DATASET_ROOT + dataset_root = dataset_root.rstrip("/") + "/" if dataset_root else "" + + output_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + "results", + "evalscope_outputs", + ) + + task_cfg = _build_needle_task_config( + model=model, + api_url=api_url, + api_key=api_key, + tokenizer_path=tokenizer_path, + dataset_root=dataset_root, + output_dir=output_dir, + ) + + runner = EvalScopeRunner(output_dir) + runner.run(task_cfg) + runner.collect_results() + + assert True