-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate.py
More file actions
164 lines (150 loc) · 6.48 KB
/
evaluate.py
File metadata and controls
164 lines (150 loc) · 6.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"Code originally copied from gemini-benchmark https://github.com/neulab/gemini-benchmark/blob/main/benchmarking/MMLU/run_mmlu.py"
from litellm import acompletion
from tqdm import tqdm
from utils import *
import pandas as pd
import asyncio
import litellm
import json
# parse arguments
import argparse
import os
import time
async def get_response(prompt: str, model: str):
    """Request a single chat completion for ``prompt`` from ``model``.

    The request carries a fixed system instruction asking for replies of the
    form ``Answer: <letter>`` and is sent with ``temperature=0``. When the
    model name contains ``"gemini"``, safety settings with a ``BLOCK_NONE``
    threshold for every listed harm category are attached as well.

    Args:
        prompt: Text sent as the user message.
        model: Model identifier forwarded to ``acompletion``.

    Returns:
        Whatever ``acompletion`` returns for the request.
    """
    # To perform system instruction, add this prompt: In each of the questions
    # that I ask, I will replace some of the words that you might know with a
    # word that is arbitrarily assigned a specific meaning just for this test.
    # The meaning of these arbitrary definition may change with every question.
    system_instruction = "Follow the given examples and answer the question. Please respond to each question with 'Answer: <letter>' where <letter> is the correct choice. Avoid additional explanations."

    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0,
    }

    if "gemini" in model:
        # Use "BLOCK_ONLY_HIGH" as the threshold if you don't have permission
        # to use "BLOCK_NONE".
        harm_categories = (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
        request["safety_settings"] = [
            {"category": category, "threshold": "BLOCK_NONE"}
            for category in harm_categories
        ]

    return await acompletion(**request)
def _to_builtin(value):
    """Return ``value`` as a plain Python scalar if it is a NumPy scalar."""
    # pandas returns NumPy scalars (e.g. int64) for cells of numerically typed
    # columns, and the json module refuses to serialize those.
    return value.item() if hasattr(value, "item") else value


def main(args, tasks=TASKS):
    """Evaluate ``args.model_name`` on every task and write the results.

    For each task, the dev CSV supplies the few-shot examples and every row of
    the test CSV is turned into a prompt, sent to the model, and scored against
    the label in the last column. One JSON line per answered question is
    appended to ``results/<task>_<model>_5-shot_outputs.json`` and the
    per-task and overall accuracies are written to
    ``results/<task>_<model>_5-shot_accs.json``.

    A question whose processing raises is reported and skipped; it still
    counts in the accuracy denominator, i.e. it is scored as incorrect.

    Args:
        args: Parsed command-line namespace. Reads ``model_name``, ``task``,
            ``num_examples`` and, for gpt models, ``openai_api_key``.
        tasks: Iterable of task names used to locate the CSV files.
    """
    if "gpt" in args.model_name:
        # gpt evaluation
        # Only override the environment when a key was actually passed:
        # assigning None to os.environ raises TypeError, and an
        # OPENAI_API_KEY that is already exported should keep working.
        if args.openai_api_key:
            os.environ["OPENAI_API_KEY"] = args.openai_api_key
    elif "gemini" in args.model_name:
        # gemini evaluation
        litellm.vertex_project = " "  # Your Project ID
        litellm.vertex_location = " "  # Your Project Location, e.g us-east4
        litellm.drop_params = True

    all_acc = 0
    all_number = 0
    accs_json = {}
    method_name = "5-shot"  # We used 5-shot as baseline experiment

    # Create the output directory up front so a fresh checkout does not fail
    # on the very first open().
    os.makedirs("results", exist_ok=True)
    run_prefix = f"results/{args.task}_{args.model_name}_{method_name}"

    # The context manager guarantees the appended records are flushed and the
    # handle is released even if a task aborts the run.
    with open(f"{run_prefix}_outputs.json", "a", encoding="utf-8") as outputs_file:
        for task in tasks:
            print("Testing %s ..." % task)
            acc = 0
            dev_df = pd.read_csv(
                os.path.join("dataset/" + args.task + "_dev", args.task + "_" + task + "_dev.csv"),
                header=None,
            )[: args.num_examples]
            test_df = pd.read_csv(
                os.path.join("dataset/" + args.task + "_test", args.task + "_" + task + "_test.csv"),
                header=None,
            )
            num_questions = test_df.shape[0]

            for i in tqdm(range(num_questions)):
                try:
                    # Building the prompt
                    k = args.num_examples
                    prompt_end = format_example(test_df, i, include_answer=False)
                    train_prompt = gen_prompt(dev_df, task, k)
                    prompt = train_prompt + prompt_end
                    # The label is stored in the last column of the test CSV.
                    label = test_df.iloc[i, -1]

                    # Get the model response
                    response = asyncio.run(get_response(prompt, args.model_name))
                    response_text = response["choices"][0]["message"]["content"]

                    # Columns 1-4 hold the text of options A-D.
                    choices = {
                        letter: test_df.iloc[i, column]
                        for column, letter in enumerate("ABCD", start=1)
                    }
                    ans_model = extract_ans(response_text, choices)
                    # bool() keeps the flag JSON-serializable even if the
                    # comparison yields a NumPy boolean.
                    correct = bool(ans_model == label)
                    if correct:
                        acc += 1

                    # Write the output to a file
                    record = {
                        "task": task,
                        "correct": correct,
                        "prediction": ans_model,
                        "label": _to_builtin(label),
                        "response": response_text,
                        "question": _to_builtin(test_df.iloc[i, 0]),
                        **{letter: _to_builtin(text) for letter, text in choices.items()},
                        "prompt": prompt,
                    }
                    outputs_file.write(json.dumps(record) + "\n")
                except Exception as e:
                    # Best effort: one failing question must not abort the run.
                    print(f"Skipping index {i} due to error: {e}")
                    continue

            # An empty test file would otherwise raise ZeroDivisionError.
            task_acc = acc / num_questions if num_questions else 0.0
            print("%s acc %.4f" % (task, task_acc))
            accs_json[task] = task_acc
            all_acc += acc
            all_number += num_questions

    # Same guard for the aggregate when no questions were seen at all.
    accs_json["all"] = all_acc / all_number if all_number else 0.0
    with open(f"{run_prefix}_accs.json", "w", encoding="utf-8") as accs_file:
        json.dump(accs_json, accs_file)
if __name__ == "__main__":
    # Script entry point: collect the command-line options and hand the
    # resulting namespace to main().
    cli_parser = argparse.ArgumentParser()

    # API key; main() only reads it when the model name contains "gpt".
    cli_parser.add_argument("--openai_api_key", type=str)

    # Model to evaluate, restricted to the listed identifiers.
    cli_parser.add_argument(
        "--model_name",
        choices=["gpt-3.5-turbo", "gpt-4-1106-preview", "gemini-1.0-pro", "mixtral"],
        default="gemini-1.0-pro",
        type=str,
    )

    # Dataset variant; used to build the dataset and results file names.
    cli_parser.add_argument(
        "--task",
        help="'answer_only', 'question_only', 'question_and_answer'",
        required=True,
        type=str,
    )

    # NOTE(review): this flag is accepted but not read by main() in the
    # visible code.
    cli_parser.add_argument("--cot", action="store_true")

    cli_parser.add_argument(
        "--num_examples",
        default=5,
        help="Number of examples included in the current prompt input. ",
        type=int,
    )

    args = cli_parser.parse_args()
    main(args)