feat: define new types automatically for pure tasks

EYH0602 · EYH0602 · commit 7032a3e306b7 · 2025-09-08T17:23:54.000-07:00
diff --git a/src/tfbench/ghc.py b/src/tfbench/ghc.py
@@ -50,6 +50,7 @@ def ghc_prove_equiv(code: str) -> Result[None, str]:
 type Char_ = Char
 type Float_ = Float
 type Double_ = Double
+data Natural = Natural
 
 $new_types
 
diff --git a/src/tfbench/hs_parser/__init__.py b/src/tfbench/hs_parser/__init__.py
@@ -6,6 +6,7 @@
     get_type_vars,
     get_type_constraints,
 )
+from .extractor import TypeExtractor
 
 __all__ = [
     "AST",
@@ -16,4 +17,5 @@
     "to_type_node",
     "get_type_vars",
     "get_type_constraints",
+    "TypeExtractor",
 ]
diff --git a/src/tfbench/hs_parser/ast_util.py b/src/tfbench/hs_parser/ast_util.py
@@ -99,19 +99,6 @@ def get_fn_name(self, node: Node) -> Maybe[str]:
                 return Nothing
         return Some(fn_name.strip())
 
-    def get_fn_docstring(self, node: Node) -> Maybe[str]:
-        """
-        Retrieves the docstring associated with a function node.
-
-        Args:
-            node (Node): The AST node representing a function.
-
-        Returns:
-            Maybe[str]: A Maybe containing the docstring if found, or Nothing otherwise.
-        """
-        # todo: implement docstring finder
-        raise NotImplementedError
-
     def func2src(self, func: HaskellFunction) -> tuple[str, str]:
         """
         Converts a `HaskellFunction` object into its corresponding type signature and code source.
diff --git a/src/tfbench/hs_parser/extractor.py b/src/tfbench/hs_parser/extractor.py
@@ -0,0 +1,124 @@
+from collections import defaultdict, Counter
+from dataclasses import dataclass
+from tree_sitter import Node
+from .ast_util import AST
+
+
+class TypeExtractor(AST):
+    """Static analyzer for Haskell type signatures.
+
+    NOTE: this analyzer works on the body of a type signature only,
+    i.e. the part after the `=>` symbol if it has constraints,
+    or otherwise after the `::` symbol.
+    The constraints (if any) are handled in other modules.
+
+    Attributes:
+        constructors: maps the source text found at the head of an application
+            chain to a Counter of `arity -> number of occurrences`.
+        names: source text of every visited `name` node whose parent is not
+            a `constructor` node.
+    """
+
+    def __init__(self, code: str):
+        super().__init__(code)
+        self.constructors: dict[str, Counter] = defaultdict(Counter)
+        self.names: set[str] = set()
+
+        self._analysis_types()
+
+    @property
+    def type_constructors(self) -> dict[str, int]:
+        """Get a mapping of type constructor names to their maximum observed arity (i.e. number of parameters)."""
+        return {name: max(arities) for name, arities in self.constructors.items()}
+
+    def _analysis_types(self):
+        """Analyze the first signature to fill out self.constructors and self.names.
+
+        Only the first `function` node found under the first `signature` node
+        is visited; when either is missing nothing is recorded.
+        """
+        sigs = self.get_all_nodes_of_type(self.root, "signature")
+        if not sigs:
+            # No signature in the source: nothing to analyze (previously an IndexError).
+            return
+        functions = self.get_all_nodes_of_type(sigs[0], "function")
+        if functions:
+            self._visit(functions[0])
+
+    @staticmethod
+    def _is_apply_head(n: Node) -> bool:
+        """Tell whether `n` fills the `constructor` field of a parent `apply` node."""
+        parent = n.parent
+        # Nodes must be compared with `==`: the tree-sitter bindings hand out a
+        # new wrapper object on each access, so an identity (`is`) check
+        # against `n` would never match.
+        return (
+            parent is not None
+            and parent.type == "apply"
+            and parent.child_by_field_name("constructor") == n
+        )
+
+    def _collect_from_tuple(self, node: Node):
+        """Visit every element of a tuple node; the tuple arity itself is not recorded."""
+        for ch in node.named_children:
+            self._visit(ch)
+
+    def _visit(self, n: Node):
+        """Recursively walk `n`, recording constructor arities and plain names."""
+        t = n.type
+
+        if t == "apply":
+            # Count this application chain once, at the top-most 'apply' only.
+            if not self._is_apply_head(n):
+                apply_chain = _peel_apply_chain(n)
+                ctor_name = self.get_src_from_node(apply_chain.constructor)
+                self.constructors[ctor_name][apply_chain.arity] += 1
+            # Recurse into children so we also catch nested names/applications.
+            for ch in n.named_children:
+                self._visit(ch)
+            return
+
+        if t == "constructor":
+            # Zero-arity constructor occurrence (e.g., `Int`) not part of an apply
+            if not self._is_apply_head(n):
+                name_node = n.child_by_field_name("name") or (
+                    n.named_children[0] if n.named_children else None
+                )
+                if name_node:
+                    constructor_name = self.get_src_from_node(name_node)
+                    self.constructors[constructor_name][0] += 1
+            # still walk inside
+            for ch in n.named_children:
+                self._visit(ch)
+            return
+
+        if t == "tuple":
+            self._collect_from_tuple(n)
+            return
+
+        if t == "name":
+            # Treat as a plain type variable/name when not under a constructor role.
+            p = n.parent
+            # If its parent is 'constructor', it's part of a constructor; skip here.
+            if p is None or p.type != "constructor":
+                self.names.add(self.get_src_from_node(n))
+            return
+
+        # default: recurse
+        for ch in n.named_children:
+            self._visit(ch)
+
+
+@dataclass
+class TypeApplyChain:
+    """Result of flattening a chain of nested `apply` nodes (built by `_peel_apply_chain`)."""
+
+    # node found at the head (left-most end) of the application chain
+    constructor: Node
+    # number of `apply` levels that were walked
+    arity: int
+    # nodes taken from the `argument` field of each `apply` level
+    arguments: list[Node]
+
+
+def _peel_apply_chain(node: Node) -> TypeApplyChain:
+    """
+    Given an (apply ...) subtree, walk left through nested apply nodes to
+    find the head of the chain and count how many arguments were applied.
+
+    Args:
+        node (Node): the top-most `apply` node of the chain.
+
+    Returns:
+        TypeApplyChain: the head node, the number of `apply` levels walked,
+        and the `argument` nodes in source order (first applied argument first).
+        If an `apply` level has no `constructor` field, the walk stops there
+        and that `apply` node itself is reported as the head.
+    """
+    args = []
+    arity = 0
+    cur = node
+    while cur.type == "apply":
+        arity += 1
+        arg = cur.child_by_field_name("argument")
+        if arg is not None:
+            args.append(arg)
+        # could be 'constructor' or another 'apply'
+        next_level = cur.child_by_field_name("constructor")
+        if next_level is None:
+            break
+        cur = next_level
+
+    # The walk starts at the outermost apply, which holds the LAST argument,
+    # so flip the list to present the arguments as written in the source.
+    args.reverse()
+    return TypeApplyChain(constructor=cur, arity=arity, arguments=args)
diff --git a/src/tfbench/type_def.py b/src/tfbench/type_def.py
@@ -2,6 +2,7 @@
 
 from .common import BenchmarkTask
 from .hs_parser import AST, get_type_constraints
+from .hs_parser.extractor import TypeExtractor
 
 
 def _is_type(code: str, type_name: str) -> bool:
@@ -49,8 +50,6 @@ def is_type_defined(type_name: str, type_defs: list[str]) -> bool:
 def get_type_defs(task: BenchmarkTask) -> list[str]:
     """Get Haskell type definitions from a BenchmarkTask"""
     existing_defs = lfilter(is_type_def, task.dependencies)
-    ast = AST(task.signature)
-    sig = ast.get_all_nodes_of_type(ast.root, "signature")[0]
 
     if "=>" in task.signature:
         constrains = get_type_constraints(task.signature)
@@ -60,14 +59,16 @@ def get_type_defs(task: BenchmarkTask) -> list[str]:
                 continue
             existing_defs.append(def_new_type_class(ty_class, ty_vars))
 
-    for node in ast.get_all_nodes_of_type(sig, "name"):
-        ty = ast.get_src_from_node(node)
-        if is_type_defined(ty, existing_defs):
+    extractor = TypeExtractor(task.signature)
+    for ctor_name, arity in extractor.type_constructors.items():
+        if is_type_defined(ctor_name, existing_defs):
             continue
+        type_vars = [f"t{i}" for i in range(arity)]
+        existing_defs.append(def_new_type_constructor(ctor_name, type_vars))
 
-        np = node.parent
-        assert np is not None
-        if np.type == "function":  # data type
-            existing_defs.append(def_new_type(ty))
+    for type_name in extractor.names:
+        if is_type_defined(type_name, existing_defs):
+            continue
+        existing_defs.append(def_new_type(type_name))
 
     return list(existing_defs)
diff --git a/tests/test_eval_diff.py b/tests/test_eval_diff.py
@@ -0,0 +1,71 @@
+from os.path import abspath, dirname, basename, join as pjoin
+import os
+from itertools import starmap
+from multiprocessing import Pool
+
+import pytest
+import fire
+from orjsonl import orjsonl
+from tqdm import tqdm
+from tfbench import (
+    analysis_multi_runs,
+    load_tfb_from_hf,
+    load_gen_results_jsonl,
+    prover_evaluate,
+)
+from tfbench.ghc import get_prover
+from tfbench.evaluation import evaluate_one_task, prove_one_task
+from tfbench.common import task2md
+from tfbench.type_def import get_type_defs
+from tfbench.postprocessing import postprocess, TASK_STRATEGIES, RESPONSE_STRATEGIES
+
+
+def diff_one_file(file_path: str, split: str):
+    """Check one recorded result file for answers the new evaluation rejects.
+
+    Every answer in the file is evaluated with both `evaluate_one_task` (old)
+    and `prove_one_task` (new). If the old evaluation accepts an answer that
+    the new one rejects, the task, and the prover program built for it, are
+    printed and an AssertionError is raised.
+
+    Args:
+        file_path (str): path to a `.jsonl` file of generated answers.
+        split (str): benchmark split name passed to `load_tfb_from_hf`.
+
+    Raises:
+        AssertionError: on the first answer accepted by the old evaluation
+            but rejected by the new one.
+    """
+    tasks = load_tfb_from_hf(split)
+    answers = load_gen_results_jsonl(abspath(file_path))
+
+    # lazy iterator: the old evaluation runs while the results are zipped below
+    old_eval = starmap(evaluate_one_task, zip(tasks, answers))
+    with Pool() as pool:
+        new_eval = pool.starmap(
+            prove_one_task, zip(tasks, answers, [split == "pure"] * len(tasks))
+        )
+
+    for t, a, o, n in zip(tasks, answers, old_eval, new_eval):
+        if a is None:
+            continue
+        if o and not n:
+            print(task2md(t))
+            defs = get_type_defs(t)
+
+            predicted_body = postprocess(a.answer, RESPONSE_STRATEGIES).strip()
+            predicted = f"f :: {predicted_body}"
+            print(get_prover(t.signature, predicted, defs).unwrap())
+            # Explicit raise instead of `assert False`: it is not stripped
+            # under `python -O` and it says what went wrong.
+            raise AssertionError(
+                f"answer in {file_path} accepted by the old evaluation "
+                "but rejected by the new prover evaluation"
+            )
+
+
+def test_diff_recorded():
+    """Diff the old and the new evaluation on every recorded result file.
+
+    Since the new prover evaluation fixes the false negative issue,
+    any answer judged correct by the old evaluation is expected to be
+    judged correct by the new evaluation as well.
+    The test is skipped when no `results` directory exists.
+    """
+    result_path = abspath("results")
+    if not os.path.exists(result_path):
+        pytest.skip("No recorded results found, skip the test.")
+
+    for dirpath, _, filenames in os.walk(result_path):
+        # the name of the containing directory is used as the split name
+        split = basename(dirpath)
+        for filename in filenames:
+            if not filename.endswith(".jsonl"):
+                continue
+            file_path = pjoin(dirpath, filename)
+            print(f"Diffing {file_path} ...")
+            diff_one_file(file_path, split)
+
+
+# Script entry point: expose `test_diff_recorded` as a command line through fire.
+if __name__ == "__main__":
+    fire.Fire(test_diff_recorded)
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
@@ -0,0 +1,27 @@
+from tfbench.hs_parser import TypeExtractor
+
+
+def test_real_cases():
+    """Check the extracted constructor arities and names on sample signatures."""
+    # (signature, expected type_constructors, expected names or None when unchecked)
+    cases = [
+        ("f:: T1 t1 => t1", {}, None),
+        ("f:: T1 t1 => T2 -> t1", {}, {"T2"}),
+        ("f:: T1 t1 => T2 T3 -> t1", {"T2": 1}, {"T2", "T3"}),
+        (
+            "f:: T1 -> T2 T3 -> Either T1 T3 -> (T1, T3, T2 T3)",
+            {"T2": 1, "Either": 2},
+            {"T1", "T2", "T3", "Either"},
+        ),
+        (
+            "g:: Ord a  => Int -> Either String a -> T3 T1 T2 T4",
+            {"Either": 2, "T3": 3},
+            {"Int", "String", "T1", "T2", "T3", "T4", "Either"},
+        ),
+    ]
+
+    for code, expected_ctors, expected_names in cases:
+        et = TypeExtractor(code)
+        assert et.type_constructors == expected_ctors
+        if expected_names is not None:
+            assert et.names == expected_names

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,7 @@`
`6`	`6`	`get_type_vars,`
`7`	`7`	`get_type_constraints,`
`8`	`8`	`)`
	`9`	`+from .extractor import TypeExtractor`
`9`	`10`
`10`	`11`	`__all__ = [`
`11`	`12`	`"AST",`
`@@ -16,4 +17,5 @@`
`16`	`17`	`"to_type_node",`
`17`	`18`	`"get_type_vars",`
`18`	`19`	`"get_type_constraints",`
	`20`	`+ "TypeExtractor",`
`19`	`21`	`]`