
Commit 8a6a84a

feat(tests): add EvalScope evaluation suite with needle-in-haystack support
- Introduce EvalScopeRunner utility class to encapsulate task execution and result collection.
- Add test cases for mainstream benchmarks (aime, gsm8k, mmlu, etc.) and needle-in-haystack evaluation.
- Support environment variable overrides for dataset root, task list, and needle context lengths.
- Refactor configuration building into dedicated helper functions for clarity.
- Include detailed README with setup instructions, usage examples, and result interpretation.
1 parent f531458 commit 8a6a84a

11 files changed

Lines changed: 643 additions & 4 deletions


test/common/evalscope_utils.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
```python
import glob
import json
import os
from typing import Any, Dict, List, Optional

import evalscope
from common.capture_utils import export_vars


class EvalScopeRunner:
    """
    Encapsulate the logic for running evalscope tasks and collecting results.
    """

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    def run(self, task_cfg: evalscope.config.TaskConfig) -> None:
        evalscope.run_task(task_cfg=task_cfg)

    @staticmethod
    def _get_latest_run_dir(output_dir: str) -> Optional[str]:
        if not os.path.exists(output_dir):
            return None
        subdirs = [
            d
            for d in os.listdir(output_dir)
            if os.path.isdir(os.path.join(output_dir, d))
        ]
        if not subdirs:
            return None
        subdirs.sort(
            reverse=True
        )  # Timestamp-named directories sort newest-first when compared as strings
        return os.path.join(output_dir, subdirs[0])

    @staticmethod
    def _collect_report_json_files(run_dir: str) -> List[str]:
        reports_root = os.path.join(run_dir, "reports")
        if not os.path.exists(reports_root):
            return []

        json_files = []
        for model_dir in os.listdir(reports_root):
            model_path = os.path.join(reports_root, model_dir)
            if os.path.isdir(model_path):
                json_files.extend(glob.glob(os.path.join(model_path, "*.json")))
        return json_files

    @staticmethod
    def _parse_metrics_from_json(json_path: str) -> Dict[str, Any]:
        """Parse a single JSON report file and return a structured metrics dictionary."""
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        dataset_name = data.get(
            "dataset_name", os.path.splitext(os.path.basename(json_path))[0]
        )
        model_name = data.get("model_name", "")

        metrics = []
        for m in data.get("metrics", []):
            categories = [
                {
                    "name": c.get("name"),
                    "score": c.get("score", 0.0),
                    "macro_score": c.get("macro_score", 0.0),
                    "num": c.get("num", 0),
                    "subsets": c.get("subsets", []),
                }
                for c in m.get("categories", [])
            ]
            metrics.append(
                {
                    "name": m.get("name"),
                    "score": m.get("score", 0.0),
                    "macro_score": m.get("macro_score", 0.0),
                    "num": m.get("num", 0),
                    "categories": categories,
                }
            )

        return {
            "dataset_name": dataset_name,
            "model_name": model_name,
            "pretty_name": data.get("dataset_pretty_name", dataset_name),
            "score": data.get("score", 0.0),
            "metrics": metrics,
            "analysis": data.get("analysis", "N/A"),
        }

    @export_vars
    def collect_results(self) -> Dict[str, Any]:
        latest_run = self._get_latest_run_dir(self.output_dir)
        if not latest_run:
            return {"_name": "eval_scope", "_proj": {}}

        json_files = self._collect_report_json_files(latest_run)
        if not json_files:
            return {"_name": "eval_scope", "_proj": {}}

        all_metrics = {}
        extracted_model_name = ""

        for json_path in json_files:
            try:
                parsed = self._parse_metrics_from_json(json_path)
            except (json.JSONDecodeError, KeyError):
                continue

            if not extracted_model_name:
                extracted_model_name = parsed["model_name"]

            dataset_name = parsed["dataset_name"]
            all_metrics[dataset_name] = {
                "pretty_name": parsed["pretty_name"],
                "model": parsed["model_name"],
                "score": parsed["score"],
                "metrics": parsed["metrics"],
                "analysis": parsed["analysis"],
            }
            # Expose the overall score under a flat key so external consumers can read it quickly
            all_metrics[f"{dataset_name}.score"] = parsed["score"]

        if extracted_model_name:
            all_metrics["model_name"] = extracted_model_name

        return {"_name": "eval_scope", "_proj": all_metrics}
```

test/docs/assets/pic1.png

721 KB

test/docs/assets/pic2.png

311 KB

test/docs/assets/pic3.png

247 KB

test/docs/evalscopeTest.md

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
# EvalScope Accuracy Evaluation Guide

This test case is built upon **EvalScope (v1.5.2)** to provide automated evaluation capabilities, enabling convenient assessment of large language model performance on mainstream academic benchmarks and long-context retrieval tasks.

## Supported Evaluation Types

| Type | Description | Example Datasets |
|------|-------------|------------------|
| **Mainstream Benchmark Evaluation** | Standard question-answering tasks covering mathematics, reasoning, knowledge, coding, and more | `aime24`, `aime25`, `aime26`, `gsm8k`, `longbench_v2`, `ceval`, `cmmlu`, `humaneval`, `mmlu`, `mmlu_pro`, etc. |
| **Needle In A Haystack** | Evaluates the model's ability to locate specific information within extremely long contexts | - |

> **Note**: Except for the Needle In A Haystack test, only simple question-answering datasets are currently supported. Datasets requiring additional runtime environments or judge models are not yet adapted.

---
## Quick Start

### 1. Environment Setup

- It is recommended to use a virtual environment to install dependencies:
```bash
cd test
pip install -r requirements.txt
```

### 2. Dataset Preparation

#### Online Environment (With Internet Access)
- The framework will automatically download required datasets from ModelScope. **No manual operation is needed**.

#### Offline Environment (No Internet Access)
- Datasets must be downloaded in advance to a unified directory.
- Ensure that subdirectory names exactly match the identifiers in the task list.

**Method 1: Clone Individual Datasets**
```bash
git clone https://www.modelscope.cn/datasets/evalscope/aime26.git
git clone https://www.modelscope.cn/datasets/ZhipuAI/LongBench-v2.git # Note: Rename the cloned directory to `longbench_v2`
git clone https://www.modelscope.cn/datasets/AI-ModelScope/Needle-in-a-Haystack-Corpus.git
```

**Method 2: Use the Pre-Packaged Dataset Archive**
- Visit the [ModelScope Dataset Repository](https://modelscope.cn/datasets/keriko/UCM_tools/files/dataset) to download the complete archive and extract it to the target path.

---
## Configuration

### General Parameters

| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| `SCOPE_DATASET_ROOT` | | Root directory where datasets are stored |
| `SCOPE_TREST_LIST` | `aime24,gsm8k` (example) | Comma-separated list of datasets to evaluate |

### Needle In A Haystack Specific Parameters

| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| `SCOPE_NEEDLE_MIN` | `1000` | Minimum context length (in tokens) |
| `SCOPE_NEEDLE_MAX` | `32000` | Maximum context length (in tokens) |
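
For example, to override both groups of parameters for a single run (the dataset root path is illustrative):

```bash
export SCOPE_DATASET_ROOT=/mnt/data/evalscope/dataset   # offline dataset root (example path)
export SCOPE_TREST_LIST=aime24,gsm8k                    # datasets to evaluate
export SCOPE_NEEDLE_MIN=1000                            # needle test: minimum context length (tokens)
export SCOPE_NEEDLE_MAX=32000                           # needle test: maximum context length (tokens)
cd test
pytest --feature=evalscope
```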
### Local Manual Testing
Directly modify the following constants in `test_evalscope.py`:
```python
DEFAULT_DATASET_ROOT = "/mnt/data/evalscope/dataset"  # Dataset path; can be left empty in online environments
DEFAULT_TASK_LIST = ["aime24", "gsm8k"]  # Datasets to evaluate
```

---

## Running Tests

### Single Task Execution

```bash
cd test

# Mainstream benchmark evaluation
pytest suites/E2E/test_evalscope.py::test_eval_accuracy

# Needle In A Haystack evaluation
pytest suites/E2E/test_evalscope.py::test_needle_task
```

### Batch Execution by Feature Tag

```bash
cd test
pytest --feature=evalscope
```

---
## Output and Results

### 1. EvalScope Native Output
All run records are saved under the `test/results/evalscope_outputs/` directory, organized into timestamped subdirectories, including:
- Evaluation configuration files
- Detailed request/response logs
- Aggregated metrics files (JSON)
- Visualization reports (HTML)
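
Result collection in `EvalScopeRunner` reads the per-dataset report JSON files under `reports/<model>/` in the most recent timestamped run directory. A rough sketch of that layout (names other than `reports/` are illustrative):

```text
test/results/evalscope_outputs/
└── 20260409_170005/          # latest timestamped run directory (name illustrative)
    └── reports/
        └── Qwen3-32B/        # one subdirectory per evaluated model
            ├── aime25.json   # per-dataset report parsed by collect_results()
            └── gsm8k.json
```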
For detailed format information, please refer to the [EvalScope Official Documentation](https://evalscope.readthedocs.io/).

### 2. Database Persistence
Evaluation results are automatically parsed and stored in the configured database backend for centralized querying and comparison.

The following files are generated in the `test/results/` directory:
- `eval_scope.jsonl`
- `eval_scope.csv`

To customize database connections, modify the `results` section in the configuration (PostgreSQL, MongoDB, etc. are supported):

```yaml
results:
  localFile:
    path: "./results"
  # postgresql:
  #   host: "localhost"
  #   ...
  # mongodb:
  #   host: "127.0.0.1"
  #   ...
```

---

## Notes

1. Some dataset names must strictly match the ModelScope repository names (e.g., `longbench_v2` instead of `LongBench-v2`). Pay attention to directory renaming when using offline mode.
2. If using a remote API for evaluation, ensure that the `llm_connection` configuration is correct and the service is accessible (example: `http://127.0.0.1:8080/`).
3. The Needle In A Haystack task uses the **model under test itself** as the judge model. Ensure that the model has basic instruction-following capabilities, and configure the model path as `tokenizer_path` in `llm_connection` (see the sketch below).
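
A minimal sketch of such a connection block; the field names other than `tokenizer_path` are assumptions and should be matched to the project's actual configuration schema:

```yaml
llm_connection:
  base_url: "http://127.0.0.1:8080/"       # service endpoint under test (example URL from note 2)
  model: "Qwen3-32B"                        # model identifier (illustrative)
  tokenizer_path: "/mnt/models/Qwen3-32B"   # local path used to tokenize needle-in-haystack contexts
```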

## Test Process
![](assets/pic1.png)

## Test Result Example
```json
{
  "aime25": {
    "pretty_name": "AIME-2025",
    "model": "Qwen3-32B",
    "score": 0.0,
    "metrics": [{
      "name": "mean_acc",
      "score": 0.0,
      "macro_score": 0.0,
      "num": 30,
      "categories": [{
        "name": ["default"],
        "score": 0.0,
        "macro_score": 0.0,
        "num": 30,
        "subsets": [{
          "name": "default",
          "score": 0.0,
          "num": 30
        }]
      }]
    }],
    "analysis": "N/A"
  },
  "aime25.score": 0.0,
  "model_name": "Qwen3-32B",
  "test_id": "ad9ba909-1646-47b3-89d6-9240c6497593",
  "test_items": "pytestall_cases",
  "create_at": "2026-04-09 17:00:05.910252",
  "extra_info": ""
}
```

## HTML Test Report
![](assets/pic2.png)

## Needle In A Haystack Heatmap
![](assets/pic3.png)

*Note: The screenshots above were generated using a mock model for testing, hence all scores are zero.*
