Skip to content

Commit c70a45d

Browse files
committed
move evaluation logic inside the tfbench package
1 parent fefa726 commit c70a45d

6 files changed

Lines changed: 188 additions & 103 deletions

File tree

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ dependencies = [
2020
"numpy>=1.26.4",
2121
"ollama==0.5.3",
2222
"openai==1.99.9",
23+
"orjson>=3.11.3",
2324
"pyarrow>=21.0.0",
2425
"pytest>=8.0.0",
2526
"python-dotenv==1.0.1",
@@ -68,6 +69,12 @@ disable = [
6869
fail-under = 9
6970
max-line-length = 120
7071

72+
[tool.pylint]
73+
extension-pkg-whitelist = [
74+
"orjson", # https://github.com/pylint-dev/pylint/issues/9762
75+
]
76+
77+
7178
[tool.pylint.typecheck]
7279
generated-members = ["cv2.*", "skimage.metrics.*"]
7380

src/main.py

Lines changed: 57 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,71 @@
1-
import os
2-
import json
31
import logging
2+
from os.path import join as pjoin, abspath
3+
import os
44

5-
from funcy_chain import Chain
6-
from funcy import lmap
7-
from tqdm import tqdm
8-
from returns.result import ResultE
95
import fire
6+
import numpy as np
7+
import orjson
8+
from returns.result import Success, Failure
9+
from tfbench import run_one_model, EvalResult
1010

11-
from tfbench.common import get_prompt
12-
from tfbench.postprocessing import postprocess, RESPONSE_STRATEGIES
13-
from tfbench.evaluation import evaluate
14-
from tfbench.lm import router, LMAnswer, extract_response
15-
from tfbench.load import load_from_hf
11+
12+
def analysis(results: list["EvalResult"]) -> tuple[float, float]:
    """Summarize the accuracy of repeated evaluation runs.

    Args:
        results: one evaluation result per run; only the ``"accuracy"`` entry
            of each result is read.

    Returns:
        ``(mean, std)`` of the accuracies as built-in ``float`` values. The
        standard deviation uses NumPy's default (population, ``ddof=0``).
    """
    accs = [r["accuracy"] for r in results]
    # Convert the NumPy scalars to built-in floats: the caller serializes these
    # with orjson.dumps without OPT_SERIALIZE_NUMPY, which rejects numpy.float64.
    return float(np.mean(accs)), float(np.std(accs))
1616

1717

1818
def main(
1919
model: str,
20-
pure: bool = False,
2120
effort: str | None = None,
22-
output_file: str | None = None,
21+
n_repeats: int = 3,
2322
log_file: str = "evaluation_log.jsonl",
2423
):
25-
"""
26-
Run an experiment using various AI models to generate and evaluate type signatures.
27-
28-
Parameters:
29-
model (str): Name of the model to use for generating type signatures. Must be one of:
30-
- GPT_MODELS: ["gpt-3.5-turbo-0125", "gpt-4-turbo-2024-04-09", ...]
31-
- OLLAMA_MODELS, CLAUDE_MODELS, or O1_MODELS.
32-
Default is "gpt-3.5-turbo".
33-
34-
pure (bool): If True, uses the original variable naming in type inference.
35-
If False, uses rewritten variable naming (e.g., `v1`, `v2`, ...). Default is False.
36-
37-
"""
38-
39-
if output_file is None:
40-
os.makedirs("result", exist_ok=True)
41-
if "/" in model:
42-
dir_name = model.split("/")[0]
43-
os.makedirs(f"result/{dir_name}", exist_ok=True)
44-
output_file = os.path.abspath(f"result/{model}.txt")
45-
logging.info(f"Writing generation results in {output_file}.")
46-
47-
client = router(model, pure, effort)
48-
assert client, f"Failed to create client for {model}."
49-
50-
tasks = load_from_hf("pure" if pure else "base")
51-
prompts = lmap(get_prompt, tasks)
52-
responses: list[ResultE[LMAnswer]] = lmap(
53-
client.generate, tqdm(prompts, desc=model)
54-
)
55-
56-
gen_results = (
57-
Chain(responses)
58-
.map(extract_response)
59-
.map(lambda s: postprocess(s, RESPONSE_STRATEGIES))
60-
.map(str.strip)
61-
.value
62-
)
63-
64-
# writing results
65-
with open(output_file, "w", errors="ignore") as file:
66-
file.write("\n".join(gen_results))
67-
68-
eval_acc = evaluate(tasks, gen_results)
69-
print(eval_acc)
70-
71-
os.makedirs(os.path.dirname(output_file), exist_ok=True)
72-
with open(log_file, "a") as fp:
73-
logging_result = {"model_name": model, **eval_acc, "pure": pure}
74-
fp.write(f"{json.dumps(logging_result)}\n")
24+
"""Main script to run experiments reported in the paper"""
25+
26+
def _run(pure: bool):
27+
results = []
28+
for i in range(n_repeats):
29+
ext = "pure" if pure else "base"
30+
31+
result_dir = abspath(pjoin("results", model))
32+
os.makedirs(result_dir, exist_ok=True)
33+
result_file = pjoin(result_dir, f"run-{i}.{ext}.jsonl")
34+
match run_one_model(
35+
model, pure=pure, output_file=result_file, effort=effort
36+
):
37+
case Success(r):
38+
results.append(r)
39+
case Failure(e):
40+
return Failure(e)
41+
return Success(analysis(results))
42+
43+
def _eval(pure: bool):
44+
split = "pure" if pure else "base"
45+
logging.info(f"Running {model} on TF-Bench ({split}):")
46+
match _run(pure=False):
47+
case Success((mean, std)):
48+
logging.info(f"Accuracy: {mean:.4f} ± {std:.4f}")
49+
with open(log_file, "ab") as f:
50+
f.write(
51+
orjson.dumps(
52+
{
53+
"model": model,
54+
"split": split,
55+
"effort": effort,
56+
"n_repeats": n_repeats,
57+
"mean": mean,
58+
"std": std,
59+
},
60+
option=orjson.OPT_APPEND_NEWLINE,
61+
)
62+
)
63+
case Failure(e):
64+
print(f"Error in base run: {e}")
65+
return
66+
67+
_eval(pure=False)
68+
_eval(pure=True)
7569

7670

7771
if __name__ == "__main__":

src/tfbench/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
11
from dotenv import load_dotenv
22

3+
from .experiment import run_one_model
4+
from .evaluation import EvalResult
5+
36
load_dotenv(override=True)
7+
8+
__all__ = ["run_one_model", "EvalResult"]

src/tfbench/evaluation.py

Lines changed: 15 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
1-
import json
2-
import logging
31
from itertools import starmap
42
import re
3+
from typing import TypedDict
54

6-
import fire
7-
from funcy_chain import Chain
8-
from dacite import from_dict
9-
10-
from tfbench.common import BenchmarkTask
11-
from tfbench.postprocessing import postprocess, TASK_STRATEGIES, RESPONSE_STRATEGIES
5+
from .common import BenchmarkTask
6+
from .postprocessing import postprocess, TASK_STRATEGIES, RESPONSE_STRATEGIES
7+
from .lm import LMAnswer
128

139

1410
def tokenize_type_signature(sig: str) -> list[str]:
@@ -65,16 +61,20 @@ def alpha_equiv(s1: str, s2: str) -> bool:
6561
return n1 == n2
6662

6763

68-
def evaluate_one_task(task: BenchmarkTask, result: str) -> bool:
64+
def evaluate_one_task(task: BenchmarkTask, result: LMAnswer) -> bool:
    """Check one model answer against the task's reference signature.

    The reference (``task.signature``) is normalized with the task strategies
    and the model answer (``result.answer``) with the response strategies; both
    are stripped of surrounding whitespace and compared with ``alpha_equiv``.
    """
    expected = postprocess(task.signature, TASK_STRATEGIES).strip()
    actual = postprocess(result.answer, RESPONSE_STRATEGIES).strip()
    return alpha_equiv(expected, actual)
69+
70+
71+
class EvalResult(TypedDict):
    """Dictionary shape returned by ``evaluate`` for one set of generation results."""

    # presumably the number of tasks evaluated — confirm against evaluate()
    total: int
    # presumably the number of answers judged correct — confirm against evaluate()
    n_correct: int
    # accuracy score read by downstream aggregation code
    accuracy: float
7375

7476

75-
def evaluate(
76-
benchmark_f: list[BenchmarkTask], results: list[str]
77-
) -> dict[str, int | float]:
77+
def evaluate(benchmark_f: list[BenchmarkTask], results: list[LMAnswer]) -> EvalResult:
7878
"""evaluate all generation results"""
7979

8080
assert len(benchmark_f) == len(results)
@@ -87,27 +87,3 @@ def evaluate(
8787
"n_correct": n_correct,
8888
"accuracy": acc,
8989
}
90-
91-
92-
def main(
93-
benchmark_file: str = "Benchmark-F.jsonl",
94-
results_file: str = "data/experiment/gpt_generated_responses.jsonl",
95-
):
96-
"""script to run all evaluation tasks"""
97-
with open(benchmark_file, "r") as file:
98-
benchmark_f: list[BenchmarkTask] = (
99-
Chain(file.readlines())
100-
.map(json.loads)
101-
.map(lambda d: from_dict(data_class=BenchmarkTask, data=d))
102-
.value
103-
)
104-
with open(results_file, "r") as file:
105-
results: list[str] = Chain(file.readlines()).map(json.loads).value
106-
107-
eval_acc = evaluate(benchmark_f, results)
108-
logging.info(json.dumps(eval_acc, indent=2))
109-
110-
111-
if __name__ == "__main__":
112-
logging.basicConfig(level=logging.INFO)
113-
fire.Fire(main)

src/tfbench/experiment.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,55 @@
11
"""
2-
Experiment script for OpenAI models
2+
Experiment script
33
"""
4+
5+
import logging
6+
7+
from tqdm import tqdm
8+
from returns.result import Success, Failure, ResultE
9+
import orjson
10+
11+
from .common import get_prompt
12+
from .evaluation import evaluate, EvalResult
13+
from .lm import router, LMAnswer
14+
from .load import load_from_hf
15+
16+
17+
def run_one_model(
18+
model: str,
19+
pure: bool = False,
20+
output_file: str | None = None,
21+
effort: str | None = None,
22+
) -> ResultE[EvalResult]:
23+
"""Running the generation & evaluation pipeline for one pre-supported model
24+
25+
Args:
26+
model (str): name of the model to evaluate
27+
pure (bool, optional): To evaluate on the `pure` split or not. Defaults to False.
28+
output_file (str | None, optional): The file to save generation result. Defaults to None.
29+
Warning: If None, generation results will not be saved to disk.
30+
effort (str | None, optional): Reasoning effort. Defaults to None.
31+
Warning: Different model handles None(default) effort differently.
32+
33+
Returns:
34+
EvalResult: evaluation result including accuracy
35+
"""
36+
client = router(model, pure, effort)
37+
if not client:
38+
return Failure(Exception(f"Failed to create client for {model}."))
39+
40+
tasks = load_from_hf("pure" if pure else "base")
41+
gen_results: list[LMAnswer] = []
42+
for task in tqdm(tasks, desc=model):
43+
prompt = get_prompt(task)
44+
match client.generate(prompt):
45+
case Success(r):
46+
gen_results.append(r)
47+
if output_file:
48+
with open(output_file, "ab", errors="ignore") as file:
49+
file.write(orjson.dumps(r, option=orjson.OPT_APPEND_NEWLINE))
50+
case Failure(e):
51+
logging.error(f"Error generating response: {e}")
52+
return Failure(e)
53+
54+
eval_acc = evaluate(tasks, gen_results)
55+
return Success(eval_acc)

0 commit comments

Comments
 (0)