Commit cba92af

doc: improve instructions
1 parent ac87084 commit cba92af

4 files changed: +134 −45 lines changed

README.md

Lines changed: 93 additions & 36 deletions
@@ -2,7 +2,7 @@

 Evaluating Program Semantics Reasoning with Type Inference in System _F_

-## Setup
+## Development

 ### Python

@@ -29,17 +29,17 @@ and [impredicative polymorphism](https://ghc.gitlab.haskell.org/ghc/doc/users_gu
 so we require GHC version >= 9.2.1.
 Our evaluation used GHC-9.6.7.

-## Building TF-Bench From Scratch (Optional)
+## Building TF-Bench from scratch (optional)

-### TF-Bench
+### TF-Bench (base)

 This script will build the benchmark (Prelude with NL) from the raw data.

 ```sh
 uv run scripts/preprocess_benchmark.py -o tfb.json
 ```

-### TF-Bench_pure
+### TF-Bench (pure)

 ```sh
 git clone https://github.com/SecurityLab-UCD/alpharewrite.git
@@ -53,38 +53,52 @@ cd ..

 For details, please check out the README of [alpharewrite](https://github.com/SecurityLab-UCD/alpharewrite).

-## Download Pre-built Benchmark
+## Download pre-built benchmark

-You can also download our pre-built benchmark from [Zenodo](https://doi.org/10.5281/zenodo.14751813).
+You can also load TF-Bench via the HuggingFace `datasets` library.

-<a href="https://doi.org/10.5281/zenodo.14751813"><img src="https://zenodo.org/badge/DOI/10.5281/zenodo.14751813.svg" alt="DOI"></a>
+```python
+from datasets import load_dataset

-## Benchmarking!
+split = "pure"  # or "base"
+dataset = load_dataset("SecLabUCD/TF-Bench", split=split)
+```

-Please have your API key ready in `.env`.
-Please note that the `.env` in the repository is tracked by git,
-we recommend telling your git to ignore its changes by
+Or through our provided package:
+
+```python
+from tfbench import load_tfb_from_hf
+
+dataset = load_tfb_from_hf(split)
+```
+
+## Using as an application

 ```sh
-git update-index --assume-unchanged .env
+git clone https://github.com/SecurityLab-UCD/TF-Bench.git
+cd TF-Bench
+uv sync
 ```

-### GPT Models
+Please have your API key ready in `.env`.

-To run single model:
+### Proprietary models

-```sh
-export OPENAI_API_KEY=<OPENAI_API_KEY> # make sure your API key is in the environment
-uv run main.py -i TF-Bench.json -m gpt-3.5-turbo
+We use each provider's official SDK to access their models.
+You can check the pre-supported models in the `tfbench.lm` module.
+
+```python
+from tfbench.lm import supported_models
+print(supported_models)
 ```

-To run all GPT models:
+To run a single model (this runs both the `base` and `pure` splits):

 ```sh
-uv run run_all.py --option gpt
+uv run main.py -m gpt-5-2025-08-07
 ```

-### Open Source Models with Ollama
+### Open-weights models with Ollama

 We use [Ollama](https://ollama.com/) to manage and run the OSS models reported in the Appendix.
 We switched to vLLM for better performance and SDK design.
@@ -108,34 +122,77 @@ ollama version is 0.11.7
 Run the benchmark.

 ```sh
-uv run scripts/experiment_ollama.py -m llama3:8b
+uv run src/main.py -m llama3:8b
 ```

-### (WIP) Running Your Model with vLLM
+### Running any model on HuggingFace Hub
+
+We also support running any model on HuggingFace Hub out-of-the-box.
+We provide an example using Qwen3.
+
+```sh
+uv run src/main.py Qwen/Qwen3-4B-Instruct-2507 # or other models
+```

-#### OpenAI-Compatible Server
+Note that our `main.py` uses a pre-defined model router,
+which routes all unrecognized model names to HuggingFace.
+We use the `</think>` token to parse the thinking process;
+if your model does this differently, please see the next section.

-First, launch the vLLM OpenAI-Compatible Server (with default values, please check vLLM's doc for setting your own):
+### Running your own model
+
+To support your customized model,
+pass the path to your HuggingFace-compatible checkpoint to our `main.py`.

 ```sh
-uv run vllm serve openai/gpt-oss-120b --tensor-parallel-size 2 --async-scheduling
+uv run src/main.py <path to your checkpoint>
 ```

-Then, run the benchmark:
+## Using as a package
+
+Our package is also available on PyPI.

 ```sh
-uv run main.py -i Benchmark-F.json -m vllm_openai_chat_completion
+uv add tfbench
 ```

-NOTE: if you set your API key, host, and port when launching the vLLM server,
-please add them to the `.env` file as well.
-Please modify `.env` for your vLLM api-key, host, and port.
-If they are left empty, the default values ("", "localhost", "8000") will be used.
-We do not recommend using the default values on machine connect to the public web,
-as they are not secure.
+Or directly with pip:

+```sh
+pip install tfbench
 ```
-VLLM_API_KEY=
-VLLM_HOST=
-VLLM_PORT=
+
+### Proprietary model checkpoints that are not currently supported
+
+Our supported-model list is used to route the model name to the correct SDK.
+Even if a newly released model is not in our supported-models list,
+you can still use it by specifying the SDK client directly.
+We take OpenAI GPT-4.1 as an example here.
+
+```python
+from tfbench.lm import OpenAIResponses
+from tfbench import run_one_model
+
+model = "gpt-4.1"
+split = "pure"
+client = OpenAIResponses(model_name=model, pure=split == "pure", effort=None)
+eval_result = run_one_model(client, pure=split == "pure", effort=None)
+```
+
+### Support other customized models
+
+You may implement an `LM` instance.
+
+```python
+from tfbench.lm._types import LM, LMAnswer
+
+class YourLM(LM):
+    def __init__(self, model_name: str, pure: bool = False):
+        """Initialize your model."""
+        super().__init__(model_name=model_name, pure=pure)
+        ...
+
+    def _gen(self, prompt: str) -> LMAnswer:
+        """Your generation logic here."""
+        return LMAnswer(answer=content, reasoning_steps=thinking_content)
 ```

src/main.py

Lines changed: 11 additions & 2 deletions
@@ -5,6 +5,7 @@
 from orjsonl import orjsonl

 from tfbench import run_one_model, analysis_multi_runs, EvalResult
+from tfbench.lm import router


 def main(
@@ -13,17 +14,25 @@ def main(
     n_repeats: int = 3,
     log_file: str = "evaluation_log.jsonl",
 ):
-    """Main script to run experiments reported in the paper"""
+    """Ready-to-use evaluation script for a single model.
+
+    Args:
+        model (str): The model's name; please refer to `tfbench.lm.supported_models` for supported models.
+        effort (str | None, optional): The effort level to use for evaluation. Defaults to None.
+        n_repeats (int, optional): The number of times to repeat the evaluation. Defaults to 3.
+        log_file (str, optional): The file to log results to. Defaults to "evaluation_log.jsonl".
+    """

     def _run(pure: bool):
+        client = router(model, pure, effort)
         results: list[EvalResult] = []
         split = "pure" if pure else "base"
         result_dir = abspath(pjoin("results", model, split))
         for i in range(n_repeats):
             os.makedirs(result_dir, exist_ok=True)
             result_file = pjoin(result_dir, f"run-{i}.jsonl")
             r = run_one_model(
-                model,
+                client,
                 pure=pure,
                 output_file=result_file,
                 effort=effort,

src/tfbench/experiment.py

Lines changed: 4 additions & 5 deletions
@@ -7,20 +7,20 @@

 from .common import get_prompt
 from .evaluation import prover_evaluate, EvalResult
-from .lm import router, LMAnswer
+from .lm import router, LMAnswer, LM
 from .load import load_tfb_from_hf


 def run_one_model(
-    model: str,
+    client: LM,
     pure: bool = False,
     output_file: str | None = None,
     effort: str | None = None,
 ) -> EvalResult:
     """Running the generation & evaluation pipeline for one pre-supported model

     Args:
-        model (str): name of the model to evaluate
+        client (LM): an LM client wrapper exposing `generate`
         pure (bool, optional): To evaluate on the `pure` split or not. Defaults to False.
         output_file (str | None, optional): The file to save generation result. Defaults to None.
             Warning: If None, generation results will not be saved to disk.
@@ -30,11 +30,10 @@ def run_one_model(
     Returns:
         EvalResult: evaluation result including accuracy
     """
-    client = router(model, pure, effort)

     tasks = load_tfb_from_hf("pure" if pure else "base")
     gen_results: list[LMAnswer | None] = []
-    for task in tqdm(tasks, desc=model):
+    for task in tqdm(tasks, desc=client.model_name):
         prompt = get_prompt(task)

         response = client.generate(prompt)
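The refactor above is dependency injection: `run_one_model` now accepts any client object exposing `model_name` and `generate`, instead of routing by name internally, which is what lets the README's custom `LM` subclasses plug in. A minimal sketch of that seam, with stand-in names rather than the real tfbench API:

```python
class FakeClient:
    """Hypothetical client satisfying the injected interface."""

    model_name = "fake-lm"

    def generate(self, prompt: str) -> str:
        return "a -> a"


def run_one_model_sketch(client, tasks: list[str]) -> list[str]:
    # The real loop wraps tasks in tqdm(..., desc=client.model_name)
    # and evaluates each response; here we only collect generations.
    outputs = []
    for task in tasks:
        outputs.append(client.generate(task))
    return outputs


print(run_one_model_sketch(FakeClient(), ["id", "const"]))  # → ['a -> a', 'a -> a']
```

Any object with the same two members, a provider SDK wrapper or a test double, works unchanged.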

src/tfbench/lm/__init__.py

Lines changed: 26 additions & 2 deletions
@@ -2,14 +2,34 @@

 from .prompts import get_sys_prompt
 from .settings import MAX_TOKENS
-from ._openai import OpenAIChatCompletion, OpenAIResponses
-from ._google import GeminiChat, GeminiReasoning
+from ._openai import (
+    OAI_MODELS,
+    OAI_TTC_MODELS,
+    OAI_O5,
+    OpenAIChatCompletion,
+    OpenAIResponses,
+)
+from ._google import GEMINI_MODELS, GEMINI_TTC_MODELS, GeminiChat, GeminiReasoning
+from ._anthropic import CLAUDE_MODELS, CLAUDE_TTC_MODELS, ClaudeChat, ClaudeReasoning
+from ._ollama import OLLAMA_TTC_MODELS, OllamaChat
+from ._hf import HFChat
 from ._types import LM, LMAnswer
 from .utils import router, extract_response

 logging.getLogger("openai").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)

+supported_models = (
+    OAI_MODELS
+    + OAI_TTC_MODELS
+    + OAI_O5
+    + GEMINI_MODELS
+    + GEMINI_TTC_MODELS
+    + CLAUDE_MODELS
+    + CLAUDE_TTC_MODELS
+    + OLLAMA_TTC_MODELS
+)
+
 __all__ = [
     "get_sys_prompt",
     "MAX_TOKENS",
@@ -19,6 +39,10 @@
     "OpenAIResponses",
     "GeminiChat",
     "GeminiReasoning",
+    "ClaudeChat",
+    "ClaudeReasoning",
+    "HFChat",
+    "supported_models",
     "router",
     "extract_response",
 ]
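The new `supported_models` above is a concatenation of per-provider name lists, and the README notes that the router falls back to HuggingFace for anything unrecognized. A self-contained sketch of that membership-based routing, with hypothetical model names and a toy `route` function (the real lists and router live in `tfbench.lm`):

```python
# Hypothetical per-provider lists, mirroring how the package builds
# supported_models by concatenating them.
OAI_MODELS = ["gpt-4.1"]
GEMINI_MODELS = ["gemini-2.0-flash"]
CLAUDE_MODELS = ["claude-3-7-sonnet"]

supported_models = OAI_MODELS + GEMINI_MODELS + CLAUDE_MODELS


def route(model_name: str) -> str:
    # Membership checks decide which SDK client to construct;
    # unrecognized names fall through to HuggingFace.
    if model_name in OAI_MODELS:
        return "openai"
    if model_name in GEMINI_MODELS:
        return "google"
    if model_name in CLAUDE_MODELS:
        return "anthropic"
    return "huggingface"


print(route("gpt-4.1"), route("Qwen/Qwen3-4B-Instruct-2507"))  # → openai huggingface
```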
