128 changes: 128 additions & 0 deletions test/common/evalscope_utils.py
@@ -0,0 +1,128 @@
import glob
import json
import os
from typing import Any, Dict, List, Optional

import evalscope
from common.capture_utils import export_vars


class EvalScopeRunner:
"""
Encapsulate the logic for running evalscope tasks and collecting results.
"""

def __init__(self, output_dir: str):
self.output_dir = output_dir

def run(self, task_cfg: evalscope.config.TaskConfig) -> None:
evalscope.run_task(task_cfg=task_cfg)

@staticmethod
def _get_latest_run_dir(output_dir: str) -> Optional[str]:
if not os.path.exists(output_dir):
return None
subdirs = [
d
for d in os.listdir(output_dir)
if os.path.isdir(os.path.join(output_dir, d))
]
if not subdirs:
return None
        # Run directories are timestamped, so a descending string sort puts the latest one first.
        subdirs.sort(reverse=True)
return os.path.join(output_dir, subdirs[0])

@staticmethod
def _collect_report_json_files(run_dir: str) -> List[str]:
reports_root = os.path.join(run_dir, "reports")
if not os.path.exists(reports_root):
return []

json_files = []
for model_dir in os.listdir(reports_root):
model_path = os.path.join(reports_root, model_dir)
if os.path.isdir(model_path):
json_files.extend(glob.glob(os.path.join(model_path, "*.json")))
return json_files

@staticmethod
def _parse_metrics_from_json(json_path: str) -> Dict[str, Any]:
"""Parse a single JSON report file and return a structured metrics dictionary"""
with open(json_path, "r", encoding="utf-8") as f:
data = json.load(f)

dataset_name = data.get(
"dataset_name", os.path.splitext(os.path.basename(json_path))[0]
)
model_name = data.get("model_name", "")

metrics = []
for m in data.get("metrics", []):
categories = [
{
"name": c.get("name"),
"score": c.get("score", 0.0),
"macro_score": c.get("macro_score", 0.0),
"num": c.get("num", 0),
"subsets": c.get("subsets", []),
}
for c in m.get("categories", [])
]
metrics.append(
{
"name": m.get("name"),
"score": m.get("score", 0.0),
"macro_score": m.get("macro_score", 0.0),
"num": m.get("num", 0),
"categories": categories,
}
)

return {
"dataset_name": dataset_name,
"model_name": model_name,
"pretty_name": data.get("dataset_pretty_name", dataset_name),
"score": data.get("score", 0.0),
"metrics": metrics,
"analysis": data.get("analysis", "N/A"),
}

@export_vars
def collect_results(self) -> Dict[str, Any]:
latest_run = self._get_latest_run_dir(self.output_dir)
if not latest_run:
return {"_name": "eval_scope", "_proj": {}}

json_files = self._collect_report_json_files(latest_run)
if not json_files:
return {"_name": "eval_scope", "_proj": {}}

all_metrics = {}
extracted_model_name = ""

for json_path in json_files:
try:
parsed = self._parse_metrics_from_json(json_path)
except (json.JSONDecodeError, KeyError):
continue

if not extracted_model_name:
extracted_model_name = parsed["model_name"]

dataset_name = parsed["dataset_name"]
all_metrics[dataset_name] = {
"pretty_name": parsed["pretty_name"],
"model": parsed["model_name"],
"score": parsed["score"],
"metrics": parsed["metrics"],
"analysis": parsed["analysis"],
}
            # Also expose the total score under a flat key so downstream consumers can read it directly.
all_metrics[f"{dataset_name}.score"] = parsed["score"]

if extracted_model_name:
all_metrics["model_name"] = extracted_model_name

return {"_name": "eval_scope", "_proj": all_metrics}
Binary file added test/docs/assets/pic1.png
Binary file added test/docs/assets/pic2.png
Binary file added test/docs/assets/pic3.png
179 changes: 179 additions & 0 deletions test/docs/evalscopeTest.md
@@ -0,0 +1,179 @@
# EvalScope Accuracy Evaluation Guide

This test suite is built on **EvalScope (v1.5.2)** and provides automated accuracy evaluation, making it easy to assess large language model performance on mainstream academic benchmarks and long-context retrieval tasks.

## Supported Evaluation Types

| Type | Description | Example Datasets |
|------|-------------|------------------|
| **Mainstream Benchmark Evaluation** | Standard question-answering tasks covering mathematics, reasoning, knowledge, coding, and more | `aime24`, `aime25`, `aime26`, `gsm8k`, `longbench_v2`, `ceval`, `cmmlu`, `humaneval`, `mmlu`, `mmlu_pro`, etc. |
| **Needle In A Haystack** | Evaluates the model's ability to locate specific information within extremely long contexts | - |

> **Note**: Except for the Needle In A Haystack test, only simple question-answering datasets are currently supported. Datasets requiring additional runtime environments or judge models are not yet adapted.

---

## Quick Start

### 1. Environment Setup

- It is recommended to use a virtual environment to install dependencies:
```bash
cd test
pip install -r requirements.txt
```

### 2. Dataset Preparation

#### Online Environment (With Internet Access)
- The framework will automatically download required datasets from ModelScope. **No manual operation is needed**.

#### Offline Environment (No Internet Access)
- Datasets must be downloaded in advance to a unified directory.
- Ensure that subdirectory names exactly match the identifiers in the task list.
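
For example, with `SCOPE_TEST_LIST=aime24,gsm8k,longbench_v2`, the dataset root might be laid out as follows (the layout is illustrative; only the directory names matter):

```bash
ls $SCOPE_DATASET_ROOT
# aime24  gsm8k  longbench_v2
```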

**Method 1: Clone Individual Datasets**
```bash
git clone https://www.modelscope.cn/datasets/evalscope/aime26.git
git clone https://www.modelscope.cn/datasets/ZhipuAI/LongBench-v2.git # Note: Rename the cloned directory to `longbench_v2`
git clone https://www.modelscope.cn/datasets/AI-ModelScope/Needle-in-a-Haystack-Corpus.git
```

**Method 2: Use the Pre-Packaged Dataset Archive**
- Visit the [ModelScope Dataset Repository](https://modelscope.cn/datasets/keriko/UCM_tools/files/dataset) to download the complete archive and extract it to the target path.

---

## Configuration

### General Parameters

| Environment Variable | Default | Description |
|---------------------|---------|-------------|
| `SCOPE_DATASET_ROOT` | (empty) | Root directory where datasets are stored; leave empty in online environments to download from ModelScope automatically |
| `SCOPE_TEST_LIST` | `aime24,gsm8k` (example) | Comma-separated list of datasets to evaluate |

### Needle In A Haystack Specific Parameters

| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| `SCOPE_NEEDLE_MIN` | `1000` | Minimum context length (in tokens) |
| `SCOPE_NEEDLE_MAX` | `32000` | Maximum context length (in tokens) |
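
For example, these variables can be exported before invoking pytest (values shown are the defaults or illustrative):

```bash
export SCOPE_DATASET_ROOT=/mnt/data/evalscope/dataset   # leave unset in online environments
export SCOPE_TEST_LIST=aime24,gsm8k
export SCOPE_NEEDLE_MIN=1000
export SCOPE_NEEDLE_MAX=32000
```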

### Local Manual Testing
Directly modify the following constants in `test_evalscope.py`:
```python
DEFAULT_DATASET_ROOT = "/mnt/data/evalscope/dataset" # Dataset path; can be left empty in online environments
DEFAULT_TASK_LIST = ["aime24", "gsm8k"] # Datasets to evaluate
```

---

## Running Tests

### Single Task Execution

```bash
cd test

# Mainstream benchmark evaluation
pytest suites/E2E/test_evalscope.py::test_eval_accuracy

# Needle In A Haystack evaluation
pytest suites/E2E/test_evalscope.py::test_needle_task
```
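
The same flow can also be driven programmatically through the `EvalScopeRunner` helper in `test/common/evalscope_utils.py`. The sketch below is illustrative only: the model name, dataset, and `work_dir` are placeholders, and the `TaskConfig` fields should be verified against the installed EvalScope version.

```python
from evalscope.config import TaskConfig

from common.evalscope_utils import EvalScopeRunner

# Placeholder configuration; adjust model, datasets, and paths to your setup.
task_cfg = TaskConfig(
    model="Qwen3-32B",
    datasets=["gsm8k"],
    work_dir="results/evalscope_outputs",  # EvalScope writes a timestamped run directory here
)

runner = EvalScopeRunner(output_dir="results/evalscope_outputs")
runner.run(task_cfg)                # delegates to evalscope.run_task
results = runner.collect_results()  # parses the latest run's JSON reports into a metrics dict
```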

### Batch Execution by Feature Tag

```bash
cd test
pytest --feature=evalscope
```

---

## Output and Results

### 1. EvalScope Native Output
All run records are saved under the `test/results/evalscope_outputs/` directory, organized into timestamped subdirectories, including:
- Evaluation configuration files
- Detailed request/response logs
- Aggregated metrics files (JSON)
- Visualization reports (HTML)

For detailed format information, please refer to the [EvalScope Official Documentation](https://evalscope.readthedocs.io/).

### 2. Database Persistence
Evaluation results are automatically parsed and stored in the configured database backend for centralized querying and comparison.

The following files are generated in the `test/results/` directory:
- `eval_scope.jsonl`
- `eval_scope.csv`
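
Because `collect_results` also writes each total under a flat `<dataset>.score` key, the scores can be pulled back out of the JSONL file with a few lines of Python (a rough sketch; the path is relative to the `test` directory and assumes one JSON record per line):

```python
import json

# Path is illustrative; adjust to wherever the run actually writes its results.
with open("results/eval_scope.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

latest = records[-1]
# Flat keys such as "gsm8k.score" sit alongside the per-dataset detail blocks.
print({k: v for k, v in latest.items() if k.endswith(".score")})
```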

To customize database connections, modify the `results` section in the configuration (PostgreSQL, MongoDB, etc. are supported):

```yaml
results:
localFile:
path: "./results"
# postgresql:
# host: "localhost"
# ...
# mongodb:
# host: "127.0.0.1"
# ...
```

---

## Notes

1. Some dataset directory names must strictly match the identifiers used in the task list rather than the ModelScope repository names (e.g., `longbench_v2` instead of `LongBench-v2`). Pay attention to directory renaming when using offline mode.
2. If using a remote API for evaluation, ensure that the `llm_connection` configuration is correct and the service is accessible (example: `http://127.0.0.1:8080/`).
3. The Needle In A Haystack task uses the **model under test itself** as the judge model. Ensure that the model possesses basic instruction-following capabilities, and configure the model path as `tokenizer_path` in `llm_connection`.
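
   A hypothetical sketch of such an `llm_connection` section is shown below; apart from `tokenizer_path`, the key names and values are illustrative and should be checked against the actual configuration template in this repository:

```yaml
llm_connection:
  url: "http://127.0.0.1:8080/"         # endpoint of the model under test (illustrative key name)
  model: "Qwen3-32B"                     # placeholder model name
  tokenizer_path: "/path/to/Qwen3-32B"   # local model path used by the Needle In A Haystack judge
```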

## Test Process
![](assets/pic1.png)

## Test Result Example
```json
{
"aime25": {
"pretty_name": "AIME-2025",
"model": "Qwen3-32B",
"score": 0.0,
"metrics": [{
"name": "mean_acc",
"score": 0.0,
"macro_score": 0.0,
"num": 30,
"categories": [{
"name": ["default"],
"score": 0.0,
"macro_score": 0.0,
"num": 30,
"subsets": [{
"name": "default",
"score": 0.0,
"num": 30
}]
}]
}],
"analysis": "N/A"
},
"aime25.score": 0.0,
"model_name": "Qwen3-32B",
"test_id": "ad9ba909-1646-47b3-89d6-9240c6497593",
"test_items": "pytestall_cases",
"create_at": "2026-04-09 17:00:05.910252",
"extra_info": ""
}
```

## HTML Test Report
![](assets/pic2.png)

## Needle In A Haystack Heatmap
![](assets/pic3.png)

*Note: The screenshots above were generated using a mock model for testing, hence all scores are zero.*