Skip to content

Commit 6ed8bf4

Browse files
committed
Add new accuracy test cases
1 parent 80e8cfc commit 6ed8bf4

7 files changed

Lines changed: 207 additions & 9 deletions

File tree

test/common/db_utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,6 @@ def _get_db():
6565
db_config = _get_db_config()
6666
_db_enabled = db_config.get("enabled", False)
6767

68-
backup_str = db_config.get("backup", "results/")
69-
_backup_path = Path(backup_str).resolve()
70-
_backup_path.mkdir(parents=True, exist_ok=True)
71-
logger.info(f"Backup directory set to: {_backup_path}")
72-
7368
if not _db_enabled:
7469
return None
7570

@@ -205,10 +200,16 @@ def write_to_db(table_name: str, data: Dict[str, Any]) -> bool:
205200

206201

207202
def database_connection(build_id: str) -> None:
203+
global _backup_path
208204
logger.info(f"Setting test build ID: {build_id}")
209205
_set_test_build_id(build_id)
210206

211207
db_config = _get_db_config()
208+
backup_str = db_config.get("backup", "results/")
209+
_backup_path = Path(backup_str).resolve()
210+
_backup_path.mkdir(parents=True, exist_ok=True)
211+
logger.info(f"Backup directory set to: {_backup_path}")
212+
212213
if not db_config.get("enabled", False):
213214
logger.info("Database connection skipped because enabled=false.")
214215
return

test/common/uc_eval/datasets/doc_qa/Galaxy_Railroad.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

test/common/uc_eval/datasets/doc_qa/prompt.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

test/common/uc_eval/task.py

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
import json
2+
import os
13
import time
24
from abc import ABC, abstractmethod
35
from typing import Any, Dict, List, Union
46

7+
import pandas as pd
58
from common.uc_eval.utils.config_loader import ConfigLoader, TaskFactory
69
from common.uc_eval.utils.data_class import (
710
BenchmarkModeType,
@@ -241,6 +244,7 @@ def __init__(
241244
self.prompt_tokens = perf_config.prompt_tokens
242245
self.output_tokens = perf_config.output_tokens
243246
self.prefix_cache_num = perf_config.prefix_cache_num
247+
self.enable_warmup = perf_config.enable_warmup
244248
self.prompt_seed = 0 if self.enable_prefix_cache else -1
245249
self.stable_perf = self.benchmark_mode == BenchmarkModeType.STABLE_PREF
246250
self.stable_rate = stable_rate
@@ -272,7 +276,11 @@ def process(self):
272276
logger.info(
273277
f"Performance benchmark running with: enable prefix cache: ({self.enable_prefix_cache}), {syntheric_params=}"
274278
)
275-
if self.enable_prefix_cache and self.prefix_cache_num[idx] > 0:
279+
if (
280+
self.enable_prefix_cache
281+
and self.prefix_cache_num[idx] > 0
282+
and self.enable_warmup
283+
):
276284
logger.info(f"Begin build kvcache...")
277285
input_data = self.dataset.prepare_data(syntheric_params)
278286
self.client.handle_requests_with_pool(
@@ -359,10 +367,11 @@ def __init__(
359367
)
360368
self.dataset_file_path = perf_config.dataset_file_path
361369
self.max_tokens = model_config.payload.get("max_tokens")
370+
self.enable_warmup = perf_config.enable_warmup
362371

363372
def process(self):
364373
cases_list = self.dataset.prepare_data(self.dataset_file_path)
365-
if self.enable_prefix_cache:
374+
if self.enable_prefix_cache and self.enable_warmup:
366375
logger.info("Begin build kvcache...")
367376
self.client.handle_requests_with_pool(
368377
cases_list, self.parallel_num, BAD_COMPLETION_TOKENS_THR
@@ -389,10 +398,39 @@ def __init__(
389398
self.dataset_file_path = eval_config.dataset_file_path
390399
self.max_tokens = model_config.payload.get("max_tokens")
391400
self.eval_cls = eval_config.eval_class
401+
self.prompt_split_ratio = eval_config.prompt_split_ratio
402+
self.enable_warmup = eval_config.enable_warmup
403+
self.enable_clear_hbm = model_config.enable_clear_hbm
404+
self.round = getattr(eval_config, "round", 0)
405+
406+
def _split_prompt_by_tokens(
407+
self, prompt: str, tokenizer, split_ratio: float
408+
) -> str:
409+
"""Split prompt by token ratio and return the first part."""
410+
tokens = tokenizer.encode(prompt)
411+
split_idx = int(len(tokens) * split_ratio)
412+
first_tokens = tokens[:split_idx]
413+
return tokenizer.decode(first_tokens, skip_special_tokens=False)
392414

393415
def process(self):
394416
cases_list = self.dataset.prepare_data(self.dataset_file_path)
395-
if self.enable_prefix_cache:
417+
418+
if self.prompt_split_ratio is not None and 0 < self.prompt_split_ratio < 1:
419+
logger.info(
420+
f"Applying prompt split ratio: {self.prompt_split_ratio} (only sending first {self.prompt_split_ratio*100:.0f}% of prompt)"
421+
)
422+
tokenizer = self.client.tokenizer
423+
modified_cases = []
424+
for case in cases_list:
425+
case_name, context, question, answer = case
426+
full_prompt = context + question
427+
split_prompt = self._split_prompt_by_tokens(
428+
full_prompt, tokenizer, self.prompt_split_ratio
429+
)
430+
modified_cases.append([case_name, split_prompt, "", answer])
431+
cases_list = modified_cases
432+
433+
if self.enable_prefix_cache and self.enable_warmup:
396434
logger.info("Begin build kvcache...")
397435
self.client.handle_requests_with_pool(
398436
cases_list, self.parallel_num, BAD_COMPLETION_TOKENS_THR
@@ -402,8 +440,56 @@ def process(self):
402440
records: List[RequestRecord] = self.client.handle_requests_with_pool(
403441
cases_list, self.parallel_num, self.max_tokens
404442
)
443+
444+
if self.prompt_split_ratio is not None and 0 < self.prompt_split_ratio < 1:
445+
logger.info(
446+
f"Skipping accuracy evaluation when prompt_split_ratio={self.prompt_split_ratio} (service ran but no accuracy check)"
447+
)
448+
from common.uc_eval.utils.data_class import LatencyStatistics
449+
450+
empty_latency = LatencyStatistics()
451+
empty_latency.metric_dict = {}
452+
return empty_latency, len(records)
453+
405454
metric_result, match_record_list = self.benchmark.perf_show(
406455
records, self.parallel_num
407456
)
457+
458+
if self.enable_clear_hbm:
459+
self.client.clear_hbm()
460+
408461
self.save_eval_cases_excel(match_record_list, self.eval_cls)
462+
self.compare_first_round_results(match_record_list, self.round)
409463
return metric_result, len(records)
464+
465+
def compare_first_round_results(
    self, match_record_list: List[RequestRecord], round: int
):
    """Persist round-1 outputs and, on round 2, diff this run against them.

    ``round == 0`` disables the check entirely. ``round == 1`` caches each
    case's output to a JSON file in the working directory. ``round == 2``
    reloads that cache, logs per-case whether the outputs match, and then
    deletes the cache file. Cases absent from the cache are skipped.
    """
    if round == 0:
        return
    cache_file = "first_round_outputs.json"
    if round == 1:
        outputs = {rec.case_name: rec.output_data for rec in match_record_list}
        with open(cache_file, "w", encoding="utf-8") as fh:
            json.dump(outputs, fh, ensure_ascii=False, indent=2)
        logger.info(f"First round outputs saved to {cache_file}")
    elif round == 2:
        # Nothing to compare against if round 1 never ran.
        if not os.path.exists(cache_file):
            return
        with open(cache_file, "r", encoding="utf-8") as fh:
            cached = json.load(fh)
        for rec in match_record_list:
            if rec.case_name not in cached:
                continue
            previous = cached[rec.case_name]
            logger.info(f"First Round Output: {previous}")
            logger.info(f"Second Round Output: {rec.output_data}")
            if previous == rec.output_data:
                logger.info(
                    f"Case {rec.case_name}: The output results are consistent"
                )
            else:
                logger.error(
                    f"Case {rec.case_name}: The output results are inconsistent."
                )
        os.remove(cache_file)

test/common/uc_eval/utils/config_loader.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,12 @@ def create_task(
194194
client_kwargs = {}
195195
if data_type is DatasetType.MULTI_DIALOGUE:
196196
client_kwargs["enable_prefix_cache"] = perf_config.enable_prefix_cache
197+
elif data_type is DatasetType.DOC_QA and eval_config:
198+
if (
199+
hasattr(eval_config, "prompt_split_ratio")
200+
and eval_config.prompt_split_ratio is not None
201+
):
202+
client_kwargs["prompt_split_ratio"] = eval_config.prompt_split_ratio
197203
return (
198204
cls._dataset[data_type](tokenizer_path),
199205
cls._client[data_type](model_config, stream, **client_kwargs),

test/common/uc_eval/utils/data_class.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,14 @@ class ModelConfig:
3636
class EvalConfig:
3737
data_type: str = ""
3838
dataset_file_path: str = ""
39-
enable_prefix_cache: str = False
39+
enable_prefix_cache: bool = False
4040
parallel_num: int = 1
4141
benchmark_mode: str = "evaluate"
4242
metrics: Optional[List[str]] = field(default_factory=list)
4343
eval_class: Optional[str] = None
44+
prompt_split_ratio: Optional[float] = None
45+
enable_warmup: bool = True
46+
round: int = 0
4447

4548

4649
@dataclass
@@ -53,6 +56,7 @@ class PerfConfig:
5356
output_tokens: List[int] = field(default_factory=list)
5457
prefix_cache_num: List[float] = field(default_factory=list)
5558
benchmark_mode: str = ""
59+
enable_warmup: bool = True
5660

5761

5862
@dataclass

test/suites/E2E/test_accuracy.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import dataclasses
2+
import json
3+
4+
import pytest
5+
from common.capture_utils import export_vars
6+
from common.config_utils import config_utils as config_instance
7+
from common.uc_eval.task import DocQaEvalTask
8+
from common.uc_eval.utils.data_class import EvalConfig, ModelConfig
9+
10+
11+
@pytest.fixture(scope="session")
def model_config() -> ModelConfig:
    """Session-scoped ModelConfig built from the "models" test-config section.

    Keys not declared on ModelConfig and keys whose value is None are
    dropped. A ``payload`` given as a JSON string is parsed into a dict.

    Raises:
        ValueError: if ``payload`` is a string but not valid JSON.
    """
    cfg = config_instance.get_config("models") or {}
    # Set membership is O(1) per lookup, vs. O(n) on the original list.
    valid_fields = {field.name for field in dataclasses.fields(ModelConfig)}
    kwargs = {k: v for k, v in cfg.items() if k in valid_fields and v is not None}
    if "payload" in kwargs and isinstance(kwargs["payload"], str):
        try:
            kwargs["payload"] = json.loads(kwargs["payload"])
        except json.JSONDecodeError as e:
            # Chain the decode error so the original position info survives.
            raise ValueError(f"Invalid payload JSON format: {e}") from e
    return ModelConfig(**kwargs)
22+
23+
24+
_DOC_QA_BASE_CONFIG = {
25+
"data_type": "doc_qa",
26+
"dataset_file_path": "../../common/uc_eval/datasets/doc_qa/Galaxy_Railroad.json",
27+
"enable_prefix_cache": True,
28+
"parallel_num": 1,
29+
"benchmark_mode": "evaluate",
30+
"metrics": ["accuracy", "bootstrap-accuracy", "f1-score"],
31+
"eval_class": "common.uc_eval.utils.metric:Includes",
32+
}
33+
34+
# Parametrized DocQA accuracy cases. The ids now describe the actual
# configuration (the originals were mislabeled: the round-1 full-prompt case
# claimed "warmup" despite enable_warmup=False, the 0.5-split case claimed
# "full-prompt", and the round-2 full-prompt case claimed "half-prompt").
doc_qa_eval_cases = [
    pytest.param(
        EvalConfig(
            **{
                **_DOC_QA_BASE_CONFIG,
                "prompt_split_ratio": None,
                "enable_warmup": False,
                "round": 1,
            }
        ),
        id="doc-qa-full-prompt-no-warmup-round1-evaluate",
    ),
    pytest.param(
        EvalConfig(
            **{**_DOC_QA_BASE_CONFIG, "prompt_split_ratio": 0.5, "enable_warmup": False}
        ),
        id="doc-qa-half-prompt-no-warmup-evaluate",
    ),
    pytest.param(
        EvalConfig(
            **{
                **_DOC_QA_BASE_CONFIG,
                "prompt_split_ratio": None,
                "enable_warmup": False,
                "round": 2,
            }
        ),
        id="doc-qa-full-prompt-no-warmup-round2-evaluate",
    ),
]
64+
65+
# Payload overrides plus the HBM-clearing switch. The id now mirrors the
# actual values (the original said "max_tokens_2048_clear_hbm_true" while
# the values were max_tokens=1024 and enable_clear_hbm=False).
test_configs = [
    pytest.param(
        {"max_tokens": 1024, "ignore_eos": True, "temperature": 0.7},
        False,  # enable_clear_hbm
        id="max_tokens_1024_clear_hbm_false",
    ),
]
72+
73+
74+
@pytest.mark.feature("accu_test")
@pytest.mark.stage(2)
@pytest.mark.parametrize("eval_config", doc_qa_eval_cases)
@pytest.mark.parametrize("payload_updates,enable_clear_hbm", test_configs)
@export_vars
def test_doc_qa_perf(
    eval_config: EvalConfig,
    model_config: ModelConfig,
    payload_updates: dict,
    enable_clear_hbm: bool,
    request: pytest.FixtureRequest,
):
    """Run one DocQA accuracy evaluation for a (payload, eval_config) combo.

    NOTE(review): this mutates the session-scoped ``model_config`` fixture
    in place (``payload`` and ``enable_clear_hbm``), so changes leak across
    parametrized cases — confirm that is intentional.
    """
    report_dir = config_instance.get_config("reports").get("base_dir")

    payload = model_config.payload
    if isinstance(payload, str):
        payload = json.loads(payload)
        model_config.payload = payload
    payload.update(payload_updates)

    # Full-prompt runs always clear HBM; split-prompt runs follow the param.
    model_config.enable_clear_hbm = (
        True if eval_config.prompt_split_ratio is None else enable_clear_hbm
    )

    task = DocQaEvalTask(model_config, eval_config, report_dir)
    return {"_name": request.node.callspec.id, "_data": task.run()}

0 commit comments

Comments
 (0)