diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_v2.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_v2.py
new file mode 100644
index 000000000..c7eb8b5ee
--- /dev/null
+++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_v2.py
@@ -0,0 +1,82 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import ArenaHardV2Dataset, arenahard_postprocess
+from opencompass.summarizers import ArenaHardSummarizer
+from mmengine.config import read_base
+
+subjective_reader_cfg = dict(
+    input_columns=['question'],
+    output_column='judge',
+    )
+
+subjective_all_sets = [
+    'arenahard-v2',
+]
+
+
+arenahard_v2_datasets = []
+
+system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
+
+judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
+
+o3_mini = [dict(
+    abbr='o3-mini-2025-01-31',
+)]
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}'
+                ),
+            ]),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt=system_prompt)
+                    ],
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt=judge_prompt
+                        ),
+                    ]),
+            ),
+            dict_postprocessor=dict(type=arenahard_postprocess, base_model_name='o3-mini-2025-01-31'),
+        ),
+        pred_role='BOT',
+    )
+
+    arenahard_v2_datasets.append(
+        dict(
+            abbr='arenahard-v2',
+            type=ArenaHardV2Dataset,
+            path='./data/subjective/arena-hard-v2',
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=o3_mini,
+            given_pred=[{'abbr': 'o3-mini-2025-01-31', 'path': './data/subjective/arena-hard-v2'}],
+            summarizer=dict(type=ArenaHardSummarizer, base_model_name='o3-mini-2025-01-31', tiktoken_model_name='gpt-4'),
+        ))
\ No newline at end of file
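Note on the data layout: the config above points ArenaHardV2Dataset at ./data/subjective/arena-hard-v2/arenahard-v2.jsonl. Based on the fields the new loader reads (see the arena_hard.py hunk below), each line of that file is a JSON object with uid, category, subcategory, and prompt keys. A minimal sketch with made-up values:

```python
import json

# Illustrative record only: the key names come from ArenaHardV2Dataset.load,
# the values here are invented.
example = {
    'uid': 'q-0001',
    'category': 'hard_prompt',
    'subcategory': 'coding',
    'prompt': 'Write a function that ...',
}
print(json.dumps(example))  # one such object per line of arenahard-v2.jsonl
```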
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 646c619ed..439dd9379 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -5,6 +5,7 @@
 from .alpacaeval import alpacaeval_bradleyterry_postprocess  # noqa: F401, F403
 from .alpacaeval import alpacaeval_postprocess  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
+from .arena_hard import ArenaHardV2Dataset  # noqa: F401, F403
 from .arena_hard import arenahard_bradleyterry_postprocess  # noqa: F401, F403
 from .arena_hard import arenahard_postprocess  # noqa: F401, F403
 from .commonbench import commonbench_postprocess
@@ -35,4 +36,4 @@
 from .wildbench import WildBenchDataset  # noqa: F401, F403
 from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
 from .wildbench import wildbench_postprocess  # noqa: F401, F403
-from .writingbench import *
\ No newline at end of file
+from .writingbench import *
diff --git a/opencompass/datasets/subjective/arena_hard.py b/opencompass/datasets/subjective/arena_hard.py
index 1403c978d..22303c657 100644
--- a/opencompass/datasets/subjective/arena_hard.py
+++ b/opencompass/datasets/subjective/arena_hard.py
@@ -47,6 +47,36 @@ def load(self, path: str, name: str, *args, **kwargs):
         return dataset
 
 
+@LOAD_DATASET.register_module()
+class ArenaHardV2Dataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        path = get_data_path(path, local_mode=True)
+        filename = osp.join(path, f'{name}.jsonl')
+        dataset = DatasetDict()
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as file:
+            for line in file:
+                problem = json.loads(line)
+                question_id = problem['uid']
+                category = problem['category']
+                subcategory = problem['subcategory']
+                question = problem['prompt']
+                raw_data.append({
+                    'question': question,
+                    'category': category,
+                    'subcategory': subcategory,
+                    'judge': {
+                        'category': category,
+                        'subcategory': subcategory,
+                        'question': question,
+                        'question_id': question_id,
+                    },
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
+
+
 def post_process_arenahard(completion):
     s = completion['prediction']
     if result := re.findall('\[\[([AB<>=]+)\]\]', s):
@@ -90,7 +120,11 @@ def get_battles_from_judgment(judged_answers, references, WEIGHT=3):
     return arena_hard_battles
 
 
-def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+def compute_mle_elo(df,
+                    SCALE=400,
+                    BASE=10,
+                    INIT_RATING=1000,
+                    base_model_name='gpt4-0314'):
     models = pd.concat([df['model_a'], df['model_b']]).unique()
     models = pd.Series(np.arange(len(models)), index=models)
 
@@ -120,8 +154,8 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
 
-    # set anchor as gpt4-0314 = 1000
-    if 'gpt4-0314' in models.index:
-        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
+    # anchor the chosen base model at 1000
+    if base_model_name in models.index:
+        elo_scores += 1000 - elo_scores[models[base_model_name]]
 
     return pd.Series(elo_scores,
                      index=models.index).sort_values(ascending=False)
@@ -176,6 +210,7 @@ def get_win_rate_column(df, column, baseline='gpt4-0314'):
 def arenahard_postprocess(
     output: dict,
     output_path: str,
+    base_model_name: str = 'gpt4-0314',
 ) -> dict:
     judged_answers, references = get_judgeanswer_and_reference(
         output, output_path, post_process_arenahard)
@@ -188,7 +223,8 @@
         references,
     )
 
-    bootstrap_online_elo = compute_mle_elo(battles)
+    bootstrap_online_elo = compute_mle_elo(battles,
+                                           base_model_name=base_model_name)
 
     np.random.seed(42)
     bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
@@ -204,10 +240,11 @@
     #     stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
     #     stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
 
-    stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
+    stats['score'] = get_win_rate_column(stats, 'score',
+                                         base_model_name).tolist()
     models = stats['model']
     scores = stats['score']
-    if models[0] == 'gpt4-0314':
+    if models[0] == base_model_name:
         score = scores[1]
     else:
         score = scores[0]
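The anchoring change in compute_mle_elo is a constant shift: every rating moves by the same amount, so the gaps between models are preserved while the configured base model lands exactly at 1000. A toy illustration (simplified sketch, not the PR's code path: it indexes a pandas Series by model name directly, whereas compute_mle_elo goes through the models name-to-index Series):

```python
import pandas as pd

# Made-up fitted ratings before anchoring.
elo_scores = pd.Series({'model_x': 1123.4, 'o3-mini-2025-01-31': 1056.9})
base_model_name = 'o3-mini-2025-01-31'

# Same shift as in compute_mle_elo: pin the base model to 1000.
if base_model_name in elo_scores.index:
    elo_scores += 1000 - elo_scores[base_model_name]

print(elo_scores)
# model_x               1066.5  <- gap to the base model is unchanged
# o3-mini-2025-01-31    1000.0
```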
diff --git a/opencompass/summarizers/subjective/arenahard.py b/opencompass/summarizers/subjective/arenahard.py
index b9fe9ecae..871fe1072 100644
--- a/opencompass/summarizers/subjective/arenahard.py
+++ b/opencompass/summarizers/subjective/arenahard.py
@@ -307,7 +307,7 @@ def get_score(self, time_str):
             model_preds = load_model_preds(file_name)
             pred_length = 0
             for model_pred in model_preds:
-                pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(model_pred, disallowed_special=()))
+                pred_length += len(tiktoken.encoding_for_model('gpt-4').encode(model_pred, disallowed_special=()))
             pred_length /= len(model_preds)
             stats.at[i, 'avg_tokens'] = pred_length
             stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
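The tiktoken switch above should be count-neutral: in tiktoken, both gpt-3.5-turbo and gpt-4 resolve to the cl100k_base encoding, so the avg_tokens values are unchanged; the edit mainly keeps the summarizer consistent with the tiktoken_model_name='gpt-4' default used in the new config. A quick check (sketch):

```python
import tiktoken

# Both model names map to the same underlying encoding.
old_enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
new_enc = tiktoken.encoding_for_model('gpt-4')
assert old_enc.name == new_enc.name == 'cl100k_base'

text = 'An example model prediction.'
assert (len(old_enc.encode(text, disallowed_special=())) ==
        len(new_enc.encode(text, disallowed_special=())))
```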