SelfEval-Guided-Decoding/src/execute_and_evaluate/aqua_interpret_and_evaluate.py at main · WING-NUS/SelfEval-Guided-Decoding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import sys
import openai
from concurrent.futures import ProcessPoolExecutor as Pool
from tenacity import wait_random_exponential, stop_after_attempt, retry

sys.path.append('/home/yuxi/Projects/SelfEval-Guided-Decoding/src')
from utils.tool import *
from utils.dataset import jsonlines_load, jsonlines_dump
from prompts.aqua import choice_prompt
from execute_and_evaluate.interpret_and_evaluate import cal_weight

USE_CHATGPT = False
KEYS = ["OPENAI_KEY1", "OPENAI_KEY2"]
KEYS_USED = {k:None for k in KEYS}


def check_eq(p, g):
    return p == g


@retry(wait=wait_random_exponential(min=5, max=10000), stop=stop_after_attempt(256))
def make_choice(prompt, keys, chatgpt=False):
    if chatgpt:
        return openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=prompt,
            api_key=select_key(KEYS_USED, keys, all_keys=KEYS),
            max_tokens=8,
            temperature=0.0,
            n=1,
            stop=['\n'],
        )
    else:
        return openai.Completion.create(
            engine='code-davinci-002',
            prompt=prompt,
            api_key=select_key(KEYS_USED, keys, all_keys=KEYS),
            max_tokens=8,
            temperature=0.0,
            n=1,
            stop=['\n'],
        )


def generate_prompt(qu, options, prediction, chatgpt=False):
    if chatgpt:
        chatgpt_choice_prompt = choice_prompt.strip().split('\n\n')
        instr, examples = chatgpt_choice_prompt[0], chatgpt_choice_prompt[1:]
        prompt = [{'role': 'system',
                   'content': 'You are a helpful assistant that finds {} '.format(' '.join(instr.split()[1:]))}]
        for exp in examples:
            _input, _output = exp.split('\nClosest Option: ')
            prompt += [
                {'role': 'user', 'content': f'Choose the closest answer to the question according the prediction:\n{_input}'},
                {'role': 'assistant', 'content': _output},
            ]
        prompt += [{
            'role': 'user',
            'content': 'Choose the closest answer to the question according the prediction:\n' \
                + f'Question: {qu}\nAnswer Choices:\n{options}\nPrediction: {prediction}',
        }]
    else:
        prompt = f'{choice_prompt}\nQuestion: {qu}\nAnswer Choices:\n{options}\nPrediction: {prediction}\nClosest Option:'
    return prompt


def select_option(argv):
    examples, keys = argv

    exp_id = -1
    for example in tqdm(examples):
        exp_id += 1
        if example.get('executed_results', None) is None or len(example['executed_results']) <= len(example['generated']):
            saved = list(example.get('executed_results', {}).keys())
            candidates = []
            for g_id, g in enumerate(example['generated']):
                if isinstance(g, str):
                    code = g
                elif isinstance(g, list):
                    code = g[0]
                else:
                    code = '\n'.join(g['generated'])
                if code not in candidates + saved:
                    candidates.append(code)

            candidate_results = {c: {} for c in candidates}
            qu, options = example['question'], '\n'.join(example['options'])
            for code in candidate_results:
                prediction = simplify_ans(safe_execute(code))
                prediction = 'None' if prediction is None else prediction

                prompt = generate_prompt(qu, options, prediction, chatgpt=USE_CHATGPT)
                rst = make_choice(prompt, keys, chatgpt=USE_CHATGPT)
                selected = rst['choices'][0]['text'].strip()

                candidate_results[code] = {'predict': prediction, 'select': selected}

            if len(saved):
                if len(candidate_results):
                    examples[exp_id]['executed_results'].update(candidate_results)
            else:
                examples[exp_id]['executed_results'] = candidate_results

    return examples


if __name__ == '__main__':
    fname = sys.argv[1]
    data = jsonlines_load(fname)
    n_jobs = 2

    prompts, data = data[0], data[1:]
    n_d_per_job = (len(data) + n_jobs - 1) // n_jobs
    n_k_per_job = len(KEYS) // n_jobs
    data_list, keys_list = [], []
    for i in range(n_jobs):
        data_list.append(data[i * n_d_per_job: (i + 1) * n_d_per_job])
        keys_list.append(KEYS[i * n_k_per_job: (i + 1) * n_k_per_job])

    with Pool(n_jobs) as pool:
        results = list(pool.map(select_option, zip(data_list, keys_list)))
    data = [example for _set in results for example in _set]

    # data = select_option((data, KEYS))

    accu, accu_all = {}, {}

    for d in tqdm(data):
        idx = d['index']
        gt_ans = d['correct'].strip()
        executed_results = d['executed_results']

        if d.get('executed', None) is not None and isinstance(d['generated'][0], list) and d['executed'][0] is not None:
            selected = d['executed'][1]
            answers = [selected]
            ans = answers[0]
        else:
            answers, weights, predictions = [], [], []
            candidates = []

            for g_id, g in enumerate(d['generated']):
                if isinstance(g, str):
                    code = g
                elif isinstance(g, list):
                    code = g[0]
                else:
                    code = '\n'.join(g['generated'])
                candidates.append(code)

                executed = executed_results[code]
                prd, selected = executed['predict'], executed['select']

                answers.append(selected)
                predictions.append(prd)

            anss, prds = answers, predictions

            result_counter = Counter()
            result_counter.update([x for x, y in zip(anss, prds) if y != 'None'])
            ans = result_counter.most_common(1)[0][0] if len(result_counter) else None
            prd = prds[anss.index(ans)] if ans is not None else None
            d['executed'] = (prd, ans)

        accu[idx] = check_eq(answers[0], gt_ans)
        accu_all[idx] = check_eq(ans, gt_ans)

    print('accu ({}):'.format(len(accu)), sum(accu.values())/len(accu))
    print('accu_all:', sum(accu_all.values())/len(accu_all))

    jsonlines_dump([prompts] + data, fname)