
Commit a116450

add transformers generation as default
1 parent 6733da2 commit a116450

6 files changed: 91 additions & 9 deletions


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ dependencies = [
     "tenacity>=9.1.2",
     "tiktoken==0.7.0",
     "tqdm>=4.66.2",
+    "transformers[torch]>=4.55.4",
     "tree-sitter==0.22.3",
     "tree-sitter-haskell==0.21.0",
     "types-deprecated>=1.2.15.20250304",

src/tfbench/env.py

Lines changed: 2 additions & 4 deletions
@@ -1,5 +1,3 @@
-from dotenv import dotenv_values
+from dotenv import load_dotenv
 
-ENV = dotenv_values(".env")
-
-assert ENV, "No .env file found! Please create one with the required variables."
+load_dotenv(override=True)  # override existing env vars with those in .env
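The switch from dotenv_values to load_dotenv changes the contract: variables now land in os.environ rather than a module-level ENV dict, values from .env override anything already set in the shell, and a missing .env is no longer a hard error. A minimal sketch of the new access pattern (the variable name is illustrative):

import os
from dotenv import load_dotenv

# .env values win over pre-existing shell variables because of override=True.
load_dotenv(override=True)

# Consumers read the process environment directly and supply their own
# defaults, instead of hitting the old assert when .env is absent.
host = os.getenv("VLLM_HOST", "localhost")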

src/tfbench/lm/_hf.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from ._types import LM, LMAnswer, NoneResponseError
+
+
+def extract_thinking_content(output: str) -> tuple[str, str | None]:
+    """Extract the thinking content and the final answer from the model output,
+    based on <think> and </think> tags.
+
+    Args:
+        output (str): The model output.
+    Returns:
+        tuple[str, str | None]: The final answer and the thinking content.
+    """
+    if "<think>" in output and "</think>" in output:
+        thinking_content = output.split("<think>")[1].split("</think>")[0].strip()
+        content = output.split("</think>")[-1].strip()
+        return content, thinking_content
+
+    return output, None
+
+
+class HFChat(LM):
+
+    def __init__(self, model_name: str, pure: bool = False):
+        super().__init__(model_name=model_name, pure=pure)
+
+        # load the tokenizer and the model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype="auto", device_map="auto"
+        )
+
+    def _gen(self, prompt: str) -> LMAnswer:
+        messages = [
+            {"role": "user", "content": prompt},
+            {"role": "system", "content": self.instruction},
+        ]
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+
+        # conduct text completion
+        generated_ids = self.model.generate(**model_inputs, max_new_tokens=32768)
+        output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
+        output = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
+
+        if output is None:
+            raise NoneResponseError(self.model_name)
+
+        content, thinking_content = extract_thinking_content(output)
+        return LMAnswer(answer=content, reasoning_steps=thinking_content)
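A short sketch of how the new pieces fit together. The tag parser can be exercised without loading any weights; the generation path goes through the private _gen here only because the public entry point of LM is not shown in this diff, and the model name is illustrative:

from tfbench.lm._hf import HFChat, extract_thinking_content

# Tag parsing is model-independent: the answer comes first, the thinking second.
content, thinking = extract_thinking_content(
    "<think>check the type signature</think>map :: (a -> b) -> [a] -> [b]"
)
assert content == "map :: (a -> b) -> [a] -> [b]"
assert thinking == "check the type signature"

# Loads the full checkpoint onto whatever devices are available.
lm = HFChat(model_name="Qwen/Qwen3-0.6B")  # illustrative model name
answer = lm._gen("What is the type of `map` in Haskell?")
print(answer.answer)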

src/tfbench/lm/_vllm.py

Lines changed: 4 additions & 4 deletions
@@ -1,9 +1,9 @@
+import os
 from vllm import LLM
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
 from openai import OpenAI
 
-from ..env import ENV
 from ._types import LM, LMAnswer, NoneResponseError
 
 
@@ -32,9 +32,9 @@ class VLLMOpenAIChatCompletion(LM):
     def __init__(self, model_name: str, pure: bool = False):
         super().__init__(model_name=model_name, pure=pure)
 
-        api_key = ENV.get("VLLM_API_KEY", "")
-        host = ENV.get("VLLM_HOST", "localhost")
-        port = ENV.get("VLLM_PORT", "8000")
+        api_key = os.getenv("VLLM_API_KEY", "")
+        host = os.getenv("VLLM_HOST", "localhost")
+        port = os.getenv("VLLM_PORT", "8000")
 
         url = f"http://{host}:{port}/v1"
         self.client = OpenAI(
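Since the client now reads the process environment directly, pointing it at a non-default server is just a matter of setting variables before construction, whether in .env or in the shell. A sketch (hostname and model name are illustrative):

import os
from tfbench.lm._vllm import VLLMOpenAIChatCompletion

# These would normally live in .env and be injected by load_dotenv(override=True).
os.environ["VLLM_HOST"] = "gpu-box.internal"
os.environ["VLLM_PORT"] = "8001"

# The client resolves http://gpu-box.internal:8001/v1 at construction time.
lm = VLLMOpenAIChatCompletion(model_name="meta-llama/Llama-3.1-8B-Instruct")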

src/tfbench/lm/utils.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 from ._google import GeminiChat, GeminiReasoning, GEMINI_MODELS, GEMINI_TTC_MODELS
 from ._anthropic import ClaudeChat, ClaudeReasoning, CLAUDE_MODELS, CLAUDE_TTC_MODELS
 from ._ollama import OllamaChat, OLLAMA_TTC_MODELS
+from ._hf import HFChat
 
 from ._google import GeminiReasoningEffort
 from ._types import ReasoningEffort
@@ -89,7 +90,7 @@ def router(
     if model_name in OLLAMA_TTC_MODELS:
         return OllamaChat(model_name=model_name, pure=pure)
 
-    return None
+    return HFChat(model_name=model_name, pure=pure)
 
 
 def extract_response(response: ResultE[LMAnswer]) -> str:
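The router's fallthrough now means any model name that misses the OpenAI, Gemini, Claude, and Ollama tables is handed to local transformers generation instead of yielding None. A sketch of the new default (the model name is illustrative and assumed absent from every provider table):

from tfbench.lm.utils import router

# Unrecognized names no longer return None; they load locally via HFChat.
lm = router(model_name="Qwen/Qwen3-0.6B")
print(type(lm).__name__)  # -> "HFChat"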

uv.lock

Lines changed: 26 additions & 0 deletions
Generated lockfile; diff not rendered.
