move evaluation logic inside the tfbench package

EYH0602 · EYH0602 · commit c70a45d1118e · 2025-08-27T14:59:41.000-07:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "numpy>=1.26.4",
     "ollama==0.5.3",
     "openai==1.99.9",
+    "orjson>=3.11.3",
     "pyarrow>=21.0.0",
     "pytest>=8.0.0",
     "python-dotenv==1.0.1",
@@ -68,6 +69,12 @@ disable = [
 fail-under = 9
 max-line-length = 120
 
+[tool.pylint]
+extension-pkg-whitelist = [
+    "orjson", # https://github.com/pylint-dev/pylint/issues/9762
+]
+
+
 [tool.pylint.typecheck]
 generated-members = ["cv2.*", "skimage.metrics.*"]
 
diff --git a/src/main.py b/src/main.py
@@ -1,77 +1,71 @@
-import os
-import json
 import logging
+from os.path import join as pjoin, abspath
+import os
 
-from funcy_chain import Chain
-from funcy import lmap
-from tqdm import tqdm
-from returns.result import ResultE
 import fire
+import numpy as np
+import orjson
+from returns.result import Success, Failure
+from tfbench import run_one_model, EvalResult
 
-from tfbench.common import get_prompt
-from tfbench.postprocessing import postprocess, RESPONSE_STRATEGIES
-from tfbench.evaluation import evaluate
-from tfbench.lm import router, LMAnswer, extract_response
-from tfbench.load import load_from_hf
+
+def analysis(results: list[EvalResult]):
+    """Return (mean, std) of the runs' ``accuracy`` as plain floats (numpy scalars are not JSON-friendly)."""
+    accs = [r["accuracy"] for r in results]
+    return float(np.mean(accs)), float(np.std(accs))
 
 
 def main(
     model: str,
-    pure: bool = False,
     effort: str | None = None,
-    output_file: str | None = None,
+    n_repeats: int = 3,
     log_file: str = "evaluation_log.jsonl",
 ):
-    """
-    Run an experiment using various AI models to generate and evaluate type signatures.
-
-    Parameters:
-        model (str): Name of the model to use for generating type signatures. Must be one of:
-                     - GPT_MODELS: ["gpt-3.5-turbo-0125", "gpt-4-turbo-2024-04-09", ...]
-                     - OLLAMA_MODELS, CLAUDE_MODELS, or O1_MODELS.
-                     Default is "gpt-3.5-turbo".
-
-        pure (bool): If True, uses the original variable naming in type inference.
-                     If False, uses rewritten variable naming (e.g., `v1`, `v2`, ...). Default is False.
-
-    """
-
-    if output_file is None:
-        os.makedirs("result", exist_ok=True)
-        if "/" in model:
-            dir_name = model.split("/")[0]
-            os.makedirs(f"result/{dir_name}", exist_ok=True)
-        output_file = os.path.abspath(f"result/{model}.txt")
-    logging.info(f"Writing generation results in {output_file}.")
-
-    client = router(model, pure, effort)
-    assert client, f"Failed to create client for {model}."
-
-    tasks = load_from_hf("pure" if pure else "base")
-    prompts = lmap(get_prompt, tasks)
-    responses: list[ResultE[LMAnswer]] = lmap(
-        client.generate, tqdm(prompts, desc=model)
-    )
-
-    gen_results = (
-        Chain(responses)
-        .map(extract_response)
-        .map(lambda s: postprocess(s, RESPONSE_STRATEGIES))
-        .map(str.strip)
-        .value
-    )
-
-    # writing results
-    with open(output_file, "w", errors="ignore") as file:
-        file.write("\n".join(gen_results))
-
-    eval_acc = evaluate(tasks, gen_results)
-    print(eval_acc)
-
-    os.makedirs(os.path.dirname(output_file), exist_ok=True)
-    with open(log_file, "a") as fp:
-        logging_result = {"model_name": model, **eval_acc, "pure": pure}
-        fp.write(f"{json.dumps(logging_result)}\n")
+    """Main script to run experiments reported in the paper"""
+
+    def _run(pure: bool):
+        """Repeat one split ``n_repeats`` times; Success((mean, std)) or the first Failure hit."""
+        split_ext = "pure" if pure else "base"
+        evals = []
+        for run_idx in range(n_repeats):
+            # generations of each repeat are written to results/<model>/run-<i>.<split>.jsonl
+            out_dir = abspath(pjoin("results", model))
+            os.makedirs(out_dir, exist_ok=True)
+            out_path = pjoin(out_dir, f"run-{run_idx}.{split_ext}.jsonl")
+            outcome = run_one_model(model, pure=pure, output_file=out_path, effort=effort)
+            match outcome:
+                case Success(r):
+                    evals.append(r)
+                case Failure(e):
+                    return Failure(e)  # stop early: remaining repeats are skipped
+        return Success(analysis(evals))
+
+    def _eval(pure: bool):
+        """Run one split and append its accuracy summary (one JSON line) to ``log_file``."""
+        split = "pure" if pure else "base"
+        logging.info(f"Running {model} on TF-Bench ({split}):")
+        match _run(pure=pure):  # forward the requested split instead of always running base
+            case Success((mean, std)):
+                logging.info(f"Accuracy: {mean:.4f} ± {std:.4f}")
+                record = {
+                    "model": model,
+                    "split": split,
+                    "effort": effort,
+                    "n_repeats": n_repeats,
+                    # plain floats: orjson rejects numpy scalars unless OPT_SERIALIZE_NUMPY is set
+                    "mean": float(mean),
+                    "std": float(std),
+                }
+                # binary append because orjson.dumps returns bytes
+                with open(log_file, "ab") as f:
+                    f.write(orjson.dumps(record, option=orjson.OPT_APPEND_NEWLINE))
+            case Failure(e):
+                # report which split failed; nothing is logged to ``log_file`` for it
+                print(f"Error in {split} run: {e}")
+                return
+
+    _eval(pure=False)
+    _eval(pure=True)
 
 
 if __name__ == "__main__":
diff --git a/src/tfbench/__init__.py b/src/tfbench/__init__.py
@@ -1,3 +1,8 @@
 from dotenv import load_dotenv
 
+from .experiment import run_one_model
+from .evaluation import EvalResult
+
 load_dotenv(override=True)
+
+__all__ = ["run_one_model", "EvalResult"]
diff --git a/src/tfbench/evaluation.py b/src/tfbench/evaluation.py
@@ -1,14 +1,10 @@
-import json
-import logging
 from itertools import starmap
 import re
+from typing import TypedDict
 
-import fire
-from funcy_chain import Chain
-from dacite import from_dict
-
-from tfbench.common import BenchmarkTask
-from tfbench.postprocessing import postprocess, TASK_STRATEGIES, RESPONSE_STRATEGIES
+from .common import BenchmarkTask
+from .postprocessing import postprocess, TASK_STRATEGIES, RESPONSE_STRATEGIES
+from .lm import LMAnswer
 
 
 def tokenize_type_signature(sig: str) -> list[str]:
@@ -65,16 +61,20 @@ def alpha_equiv(s1: str, s2: str) -> bool:
     return n1 == n2
 
 
-def evaluate_one_task(task: BenchmarkTask, result: str) -> bool:
+def evaluate_one_task(task: BenchmarkTask, result: LMAnswer) -> bool:
     """evaluate a single task against its result by alpha equivalence"""
-    ground_truth = postprocess(task.signature, TASK_STRATEGIES)
-    result = postprocess(result, RESPONSE_STRATEGIES)
-    return alpha_equiv(ground_truth, result)
+    expected_sig = postprocess(task.signature, TASK_STRATEGIES).strip()  # reference side
+    answered_sig = postprocess(result.answer, RESPONSE_STRATEGIES).strip()  # model side
+    return alpha_equiv(expected_sig, answered_sig)
+
+
+class EvalResult(TypedDict):  # summary dict returned by `evaluate`
+    total: int  # presumably the number of evaluated tasks — TODO confirm in `evaluate`
+    n_correct: int  # presumably how many predictions were judged correct — TODO confirm
+    accuracy: float  # presumably n_correct / total — TODO confirm against `evaluate`
 
 
-def evaluate(
-    benchmark_f: list[BenchmarkTask], results: list[str]
-) -> dict[str, int | float]:
+def evaluate(benchmark_f: list[BenchmarkTask], results: list[LMAnswer]) -> EvalResult:
     """evaluate all generation results"""
 
     assert len(benchmark_f) == len(results)
@@ -87,27 +87,3 @@ def evaluate(
         "n_correct": n_correct,
         "accuracy": acc,
     }
-
-
-def main(
-    benchmark_file: str = "Benchmark-F.jsonl",
-    results_file: str = "data/experiment/gpt_generated_responses.jsonl",
-):
-    """script to run all evaluation tasks"""
-    with open(benchmark_file, "r") as file:
-        benchmark_f: list[BenchmarkTask] = (
-            Chain(file.readlines())
-            .map(json.loads)
-            .map(lambda d: from_dict(data_class=BenchmarkTask, data=d))
-            .value
-        )
-    with open(results_file, "r") as file:
-        results: list[str] = Chain(file.readlines()).map(json.loads).value
-
-    eval_acc = evaluate(benchmark_f, results)
-    logging.info(json.dumps(eval_acc, indent=2))
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    fire.Fire(main)
diff --git a/src/tfbench/experiment.py b/src/tfbench/experiment.py
@@ -1,3 +1,55 @@
 """
-Experiment script for OpenAI models
+Experiment script
 """
+
+import logging
+
+from tqdm import tqdm
+from returns.result import Success, Failure, ResultE
+import orjson
+
+from .common import get_prompt
+from .evaluation import evaluate, EvalResult
+from .lm import router, LMAnswer
+from .load import load_from_hf
+
+
+def run_one_model(
+    model: str,
+    pure: bool = False,
+    output_file: str | None = None,
+    effort: str | None = None,
+) -> ResultE[EvalResult]:
+    """Running the generation & evaluation pipeline for one pre-supported model
+
+    Args:
+        model (str): name of the model to evaluate
+        pure (bool, optional): To evaluate on the `pure` split or not. Defaults to False.
+        output_file (str | None, optional): The file to save generation result. Defaults to None.
+            Warning: If None, generation results will not be saved to disk.
+        effort (str | None, optional): Reasoning effort. Defaults to None.
+            Warning: Different model handles None(default) effort differently.
+
+    Returns:
+        ResultE[EvalResult]: Success(evaluation result) or the first Failure encountered
+    """
+    client = router(model, pure, effort)
+    if not client:
+        return Failure(Exception(f"Failed to create client for {model}."))
+
+    tasks = load_from_hf("pure" if pure else "base")
+    gen_results: list[LMAnswer] = []
+    for task in tqdm(tasks, desc=model):
+        match client.generate(get_prompt(task)):
+            case Success(r):
+                gen_results.append(r)
+                if output_file:
+                    # binary append since orjson.dumps returns bytes; binary mode accepts no `errors=`
+                    with open(output_file, "ab") as file:
+                        file.write(orjson.dumps(r, option=orjson.OPT_APPEND_NEWLINE))
+            case Failure(e):
+                logging.error(f"Error generating response: {e}")
+                return Failure(e)
+
+    eval_acc = evaluate(tasks, gen_results)
+    return Success(eval_acc)
diff --git a/uv.lock b/uv.lock