-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy patheval.py
More file actions
118 lines (91 loc) · 3.16 KB
/
Copy patheval.py
File metadata and controls
118 lines (91 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pathlib
import json
from main import Chatbot
from tqdm import tqdm
import pandas as pd
import os
import dotenv
import threading
from queue import Queue
import src.prompts as prompts
import src.utils as utils
# Read settings from the dotenv file named "env" before any of the
# os.environ.get(...) lookups in this script run.
# NOTE(review): the path is relative, so this presumably expects the script to
# be started from the directory that contains "env" - confirm.
dotenv.load_dotenv("env")
def judge(item, q):
    """Grade one generated answer with the judge model and enqueue the result.

    Fills ``prompts.EVALUATION_PROMPT`` with the question, the generated
    answer and the reference answer from ``item``, sends it through
    ``utils.send_question`` and expects the reply to contain exactly one
    ``[RESULT]`` marker separating the feedback text from the score.

    Args:
        item: Dict with the keys ``"question"``, ``"generated_answer"`` and
            ``"true_answer"``. On success the keys ``"feedback"`` and
            ``"score"`` (both stripped strings) are added in place.
        q: Queue that receives ``item`` once it has been graded.

    Returns:
        None. A reply that cannot be split into feedback and score is
        reported on stdout and the item is not enqueued.
    """
    # str() mirrors the original f-string conversion: an unset variable is
    # forwarded as the text "None" rather than as the None object.
    judge_model = str(os.environ.get("JUDGE_MODEL"))
    judge_api_link = str(os.environ.get("JUDGE_API_LINK"))
    token = str(os.environ.get("TOKEN"))

    # `sem` is the module-level semaphore created by the driver code; it caps
    # how many judge requests are in flight at the same time.
    with sem:
        reply = utils.send_question(
            prompt=prompts.EVALUATION_PROMPT.format(
                instruction=item["question"],
                response=item["generated_answer"],
                reference_answer=item["true_answer"],
            ),
            model=judge_model,
            api_link=judge_api_link,
            token=token,
            temperature=0.1,
            max_tokens=512,
        )

    try:
        # Exactly two parts are required: zero or several "[RESULT]" markers
        # raise ValueError, a non-string reply raises AttributeError.
        feedback, score = (part.strip() for part in reply.split("[RESULT]"))
    except (AttributeError, ValueError) as exc:
        # Still best-effort (the item is skipped), but no longer silent, so
        # missing rows in the result file can be traced back to their cause.
        print(f"Skipping item, could not parse judge reply ({exc!r}): {reply!r}")
        return

    print(f"Score: {score}\nFeedback: {feedback}")
    item["feedback"] = feedback
    item["score"] = score
    # `q_lock` is the module-level lock created next to `sem` by the driver.
    with q_lock:
        q.put(item)
# Evaluation driver: answer the questions of every eval file with the chatbot,
# let the judge model grade the answers, then print score statistics per file.
eval_files = ["data/questions_en.json", "data/questions_ru.json"]

# os.environ.get() returns None for unset variables; the f-strings forward
# that as the text "None".
c = Chatbot(
    f"{os.environ.get('CHATBOT_MODEL')}",
    f"{os.environ.get('API_LINK')}",
    f"{os.environ.get('TOKEN')}",
    ["data/orientation.md"],
    verbose=True,
)
c.build_database()

# All intermediate and result files are written below eval/; create it up
# front so the first open(..., "w") cannot fail on a missing directory.
os.makedirs("eval", exist_ok=True)

for idx, eval_file in enumerate(eval_files):
    # Explicit UTF-8 instead of the locale default: the "_ru" question file
    # presumably contains Cyrillic text, which a non-UTF-8 default encoding
    # would fail to decode or would decode incorrectly.
    data = json.loads(pathlib.Path(eval_file).read_text(encoding="utf-8"))

    outputs = []
    seen_questions = set()  # O(1) duplicate check instead of rescanning outputs
    # NOTE(review): only the first five examples are evaluated - this looks
    # like a smoke-test limit; confirm before relying on the statistics.
    for example in tqdm(data[:5]):
        question = example["question"]
        if question in seen_questions:
            continue
        answer = c.question(question)
        seen_questions.add(question)
        outputs.append(
            {
                "question": question,
                "true_answer": example["answer"],
                "source_doc": example["context"],
                "generated_answer": answer,
            }
        )

    # Persist the raw answers, then read them back so the judge works on
    # exactly what was stored on disk.
    answers_path = pathlib.Path(f"eval/eval_ans_{idx}.json")
    with open(answers_path, "w", encoding="utf-8") as f:
        json.dump(outputs, f)
    flattened_data = json.loads(answers_path.read_text(encoding="utf-8"))

    # Module-level names that judge() looks up when it runs: `sem` allows at
    # most six concurrent judge requests, `q_lock` guards the result queue.
    sem = threading.Semaphore(6)
    q_lock = threading.Lock()
    threads = []
    q = Queue()
    for item in tqdm(flattened_data):
        thread = threading.Thread(target=judge, args=(item, q))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Every worker has finished, so draining until empty() is race-free here.
    res = []
    while not q.empty():
        res.append(q.get())
    with open(f"eval/eval_res_{idx}.json", "w", encoding="utf-8") as f:
        json.dump(res, f, indent=4)

for idx, eval_file in enumerate(eval_files):
    df = pd.read_json(f"eval/eval_res_{idx}.json")
    print(f"Eval file: {eval_file}\n")
    print(f"Reader model: {os.environ.get('CHATBOT_MODEL')}")
    print(f"Reranker model: {os.environ.get('RERANKER_MODEL')}")
    print(f"Embedder model: {os.environ.get('EMBEDDER_MODEL')}\n")
    print(f"Judge model: {os.environ.get('JUDGE_MODEL')}\n")

    # An empty result file (no answer could be graded) has no "score" column.
    if "score" not in df:
        print("No graded answers available.")
        print("\n\n")
        continue

    print(df.score.value_counts(), end="\n\n")
    # judge() stores scores as strings; coerce so that a single non-numeric
    # score becomes NaN (ignored by mean/median) instead of breaking the math.
    scores = pd.to_numeric(df.score, errors="coerce")
    nonzero_scores = scores[scores != 0]
    mean_score = nonzero_scores.mean()
    print("Mean score: " + str(mean_score))
    print("Median score: " + str(nonzero_scores.median()))
    # The percentage is the mean relative to a maximum score of 5.
    print("Percentage: " + str(mean_score / 5 * 100))
    print("\n\n")