SecurityLab-UCD · EYH0602 · Aug 25, 2025 · Aug 14, 2025 · Aug 15, 2025 · Aug 15, 2025
diff --git a/.env b/.env
@@ -0,0 +1,3 @@
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+GEMINI_API_KEY=
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
@@ -17,6 +17,8 @@ jobs:
       - name: Set up Python
         run: uv sync
 
+      - name: install mypy
+        run: uv pip install mypy
+
       - name: Type Check Source Code
         run: uv run mypy src
-
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -16,6 +16,9 @@ jobs:
 
       - name: Set up Python
         run: uv sync
+
+      - name: Install Pylint
+        run: uv pip install pylint
 
       - name: Lint Source Code
-        run: uv run pylint src
+        run: uv run pylint src
diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.11
+3.12
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ uv sync # create a virtual environment, and install dependencies
 This script will build the benchmark (Prelude with NL) from the raw data.
 
 ```sh
-uv run scripts/preprocess_benchmark.py
+uv run scripts/preprocess_benchmark.py -o tfb.json
 ```
 
 ### TF-Bench_pure
@@ -36,7 +36,7 @@ stack exec alpharewrite-exe 1 ../tfb.json > ../tfb.pure.json
 cd ..
 ```
 
-For details, please take a look at the README of [alpharewrite](https://github.com/SecurityLab-UCD/alpharewrite).
+For details, please check out the README of [alpharewrite](https://github.com/SecurityLab-UCD/alpharewrite).
 
 ## Download Pre-built Benchmark
 
@@ -46,6 +46,14 @@ You can also download our pre-built benchmark from [Zenodo](https://doi.org/10.5
 
 ## Benchmarking!
 
+Please have your API key ready in `.env`.
+Please note that the `.env` file in the repository is tracked by git;
+we recommend telling git to ignore your local changes to it by running
+
+```sh
+git update-index --assume-unchanged .env
+```
+
 ### GPT Models
 
 To run single model:
@@ -61,22 +69,58 @@ To run all GPT models:
 uv run run_all.py --option gpt
 ```
 
-### Open Source Models
+### Open Source Models with Ollama
 
-We use [Ollama](https://ollama.com/) to manage and run the OSS models.
+We used [Ollama](https://ollama.com/) to manage and run the OSS models reported in the Appendix.
+We have since switched to vLLM for better performance and SDK design.
+Although the Ollama option is still available,
+it is no longer maintained.
+We recommend using vLLM instead.
 
 ```sh
 curl -fsSL https://ollama.com/install.sh | sh # install ollama, you need sudo for this
 ollama serve # start your own instance instead of a system service
-uv run --project . scripts/ollama_pull.sh # install required models
 ```
 
+NOTE: Ollama version 0.9.0 or later is required to enable thinking parsers.
+We use 0.11.7 for our experiments.
+
 ```sh
-uv run main.py -i Benchmark-F.json -m llama3:70b
+> ollama --version
+ollama version is 0.11.7
 ```
 
-To run all Ollama models:
+Run the benchmark.
 
 ```sh
-uv run run_all.py --option ollama
+uv run scripts/experiment_ollama.py -m llama3:8b
+```
+
+### (WIP) Running Your Model with vLLM
+
+#### OpenAI-Compatible Server
+
+First, launch the vLLM OpenAI-Compatible Server (with default values, please check vLLM's doc for setting your own):
+
+```sh
+uv run vllm serve openai/gpt-oss-120b --tensor-parallel-size 2 --async-scheduling
+```
+
+Then, run the benchmark:
+
+```sh
+uv run main.py -i Benchmark-F.json -m vllm_openai_chat_completion
+```
+
+NOTE: if you set your API key, host, and port when launching the vLLM server,
+please add them to the `.env` file as well.
+Please modify `.env` for your vLLM api-key, host, and port.
+If they are left empty, the default values ("", "localhost", "8000") will be used.
+We do not recommend using the default values on machines connected to the public internet,
+as they are not secure.
+
+```
+VLLM_API_KEY=
+VLLM_HOST=
+VLLM_PORT=
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,34 +3,36 @@ name = "tfbench"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.12"
 dependencies = [
-    "anthropic>=0.49.0",
+    "anthropic==0.49.0",
     "dacite>=1.8.1",
+    "deprecated>=1.2.18",
+    "dotenv>=0.9.9",
     "fire==0.5.0",
     "funcy==2.0",
     "funcy-chain==0.2.0",
-    "google-genai>=1.11.0",
-    "groq==0.8.0",
+    "google-genai==1.31.0",
     "hypothesis>=6.98.6",
     "markdown-to-json==2.1.2",
     "matplotlib>=3.8.3",
     "numpy>=1.26.4",
-    "ollama>=0.2.1",
-    "openai==1.75.0",
+    "ollama==0.5.3",
+    "openai==1.99.9",
     "pathos>=0.3.3",
-    "pylint>=3.3.6",
     "pytest>=8.0.0",
     "python-dotenv==1.0.1",
     "requests==2.32.3",
-    "returns[compatible-mypy]==0.22.0",
+    "returns[compatible-mypy]>=0.26.0",
     "seaborn==0.13.2",
     "tabulate>=0.9.0",
+    "tenacity>=9.1.2",
     "tiktoken==0.7.0",
     "tqdm>=4.66.2",
     "tree-sitter==0.22.3",
     "tree-sitter-haskell==0.21.0",
     "types-requests>=2.31.0",
+    "vllm>=0.10.1.1",
 ]
 
 [build-system]
@@ -60,6 +62,7 @@ disable = [
     "too-many-statements",
     "unspecified-encoding",
     "missing-class-docstring",
+    "too-few-public-methods",        # LM only have 1 public method
 ]
 fail-under = 9
 max-line-length = 120

diff --git a/scripts/experiment_ollama.py b/scripts/experiment_ollama.py
@@ -0,0 +1,176 @@
+"""
+(deprecated) Experiment script for OSS models using Ollama
+This script reproduces the legacy results for OSS models using Ollama in our paper's Appendix.
+New models should use our vLLM option instead.
+"""
+
+from typing import Union
+import os
+import json
+
+from ollama import Client as OllamaClient, ResponseError
+import fire
+from dacite import from_dict
+from tqdm import tqdm
+from funcy_chain import Chain
+
+from tfbench.lm import get_sys_prompt
+from tfbench.common import BenchmarkTask, get_prompt
+from tfbench.postprocessing import postprocess, RESPONSE_STRATEGIES
+from tfbench.evaluation import evaluate
+
+# Ollama model tags for the "OSS" group
+# (presumably the general-purpose models; grouping inferred from the name).
+OLLAMA_OSS = [
+    "phi3:3.8b",
+    "phi3:14b",
+    "mistral",
+    "mixtral:8x7b",
+    "mixtral:8x22b",
+    "llama3:8b",
+    "llama3:70b",
+    "llama3.1:8b",
+    "llama3.1:70b",
+    "llama3.1:405b",
+    "llama3.2:1b",
+    "llama3.2:3b",
+    "llama3.3:70b",
+    "gemma:2b",
+    "gemma:7b",
+    "gemma2:9b",
+    "gemma2:27b",
+    "qwen2:1.5b",
+    "qwen2:7b",
+    "qwen2:72b",
+    "qwen2.5:1.5b",
+    "qwen2.5:7b",
+    "qwen2.5:72b",
+    "deepseek-v2:16b",
+    "deepseek-v2:236b",
+    "deepseek-v2.5:236b",
+]
+
+
+# Ollama model tags for the "code" group
+# (presumably code-specialized models; grouping inferred from the name).
+OLLAMA_CODE = [
+    "qwen2.5-coder:1.5b",
+    "qwen2.5-coder:7b",
+    "granite-code:3b",
+    "granite-code:8b",
+    "granite-code:20b",
+    "granite-code:34b",
+    "deepseek-coder-v2:16b",
+    "deepseek-coder-v2:236b",
+]
+
+# Every tag this script accepts: main() asserts that the requested model is in this list.
+OLLAMA_MODELS = OLLAMA_OSS + OLLAMA_CODE
+
+
+def get_ollama_model(
+    client: OllamaClient,
+    model: str = "llama3:8b",
+    pure: bool = False,
+):
+    """
+    Build a prompt -> response callable backed by an Ollama chat model.
+
+    Parameters:
+        client (OllamaClient): Client whose `chat` method is used to query the model.
+
+        model (str): Model name passed to `client.chat`. Default is "llama3:8b".
+
+        pure (bool): Forwarded to `get_sys_prompt` to choose the system prompt.
+                     Default is False.
+
+    Returns:
+        Callable[[str], Union[str, None]]:
+            A function that sends one prompt to the model and returns the reply text.
+            It returns None when the request raises `ResponseError` (the error is printed)
+            or when the reply has no content.
+    """
+
+    def generate_type_signature(prompt: str) -> Union[str, None]:
+        try:
+            # The system prompt is looked up on every request, inside the guarded region.
+            conversation = [
+                {"role": "system", "content": get_sys_prompt(pure)},
+                {"role": "user", "content": prompt},
+            ]
+            reply = client.chat(messages=conversation, model=model)
+        except ResponseError as err:
+            print(err)
+            return None
+
+        text = reply.message.content
+        # Missing or empty content is reported as None rather than "".
+        return str(text) if text else None
+
+    return generate_type_signature
+
+
+def main(
+    model: str = "llama3:8b",
+    pure: bool = False,
+    port: int = 11434,
+    output_file: str | None = None,
+    log_file: str = "evaluation_log.jsonl",
+):
+    """
+    Run one Ollama model over the benchmark, save its answers, and log the evaluation result.
+
+    Parameters:
+        model (str): Name of the model to use for generating type signatures. Must be one of OLLAMA_MODELS.
+
+        pure (bool): Selects the benchmark file ("tfb.pure.json" if True, "tfb.json" if False)
+                     and is forwarded to `get_ollama_model`. Default is False.
+
+        port (int): Port of the Ollama server on localhost. Default is 11434.
+
+        output_file (str | None): File that receives the post-processed responses, joined by newlines.
+                                  Defaults to "result/{model}.txt".
+
+        log_file (str): JSON-lines file to which the evaluation result is appended.
+                        Default is "evaluation_log.jsonl".
+    """
+    assert model in OLLAMA_MODELS, f"{model} is not supported."
+
+    # hard-coding benchmark file path for experiment
+    input_file = "tfb.pure.json" if pure else "tfb.json"
+    input_file = os.path.abspath(input_file)
+    assert os.path.exists(
+        input_file
+    ), f"{input_file} does not exist! Please download or build it first."
+
+    if output_file is None:
+        output_file = f"result/{model}.txt"
+
+    # Create parent directories before any model call, so a bad path fails fast
+    # instead of after the whole benchmark has been generated.
+    # A bare file name has an empty dirname, and os.makedirs("") would raise, so skip it.
+    for path in (output_file, log_file):
+        parent_dir = os.path.dirname(path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+    client = OllamaClient(host=f"http://localhost:{port}")
+    generate = get_ollama_model(client, model, pure)
+
+    with open(input_file, "r") as fp:
+        tasks = [from_dict(data_class=BenchmarkTask, data=d) for d in json.load(fp)]
+
+    prompts = map(get_prompt, tasks)
+    responses = map(generate, tqdm(prompts, desc=model))
+    gen_results = (
+        Chain(responses)
+        .map(lambda x: x if x is not None else "")  # convert None to empty string
+        .map(lambda s: postprocess(s, RESPONSE_STRATEGIES))
+        .map(str.strip)
+        .value
+    )
+
+    with open(output_file, "w", errors="ignore") as file:
+        file.write("\n".join(gen_results))
+
+    eval_acc = evaluate(tasks, gen_results)
+    print(eval_acc)
+
+    # Append, so results from successive runs accumulate in one log.
+    with open(log_file, "a") as fp:
+        logging_result = {"model_name": model, **eval_acc, "pure": pure}
+        fp.write(f"{json.dumps(logging_result)}\n")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/scripts/preprocess_benchmark.py b/scripts/preprocess_benchmark.py
@@ -10,6 +10,7 @@
 
 
 def main(input_raw_benchmark_path: str = "benchmark", output_path: str = "tfb.json"):
+    """Process pre-extracted tasks from Markdown to JSON"""
 
     # read in all files ending with .md in the input_raw_benchmark_path
     tasks: list[BenchmarkTask] = []