fix: groundtruth error (#63)

EYH0602 · web-flow · commit a0c4f9eb0937 · 2025-09-03T16:28:15.000-07:00
* fix: missing type class and typevar in benchmark

* fix: order of tasks in tfb

* fix: allow load_gen_results_jsonl to load error records (entries without an "answer" key are returned as None)

* remove unused imports from scripts/error_cls.py
diff --git a/benchmark/task_19.hs.md b/benchmark/task_19.hs.md
@@ -7,7 +7,7 @@ Ad-hoc
 
 # signature
 ```haskell
-quot :: Integral => a -> a -> a
+quot :: Integral a => a -> a -> a
 ```   
 
 # code
diff --git a/benchmark/task_59.hs.md b/benchmark/task_59.hs.md
@@ -7,7 +7,7 @@ Parametric
 
 # signature
 ```haskell
-foldl :: (b -> a -> b) -> b -> t a -> b
+foldl :: Foldable t => (b -> a -> b) -> b -> t a -> b
 ```   
 
 # code
diff --git a/scripts/error_cls.py b/scripts/error_cls.py
@@ -1,19 +1,17 @@
-from os.path import abspath, dirname, basename, join as pjoin
+from os.path import abspath, basename, join as pjoin
 import os
 
 import orjson
 from pydantic import BaseModel
 from openai import OpenAI
 import fire
+from tqdm import tqdm
+
 from tfbench import (
-    analysis_multi_runs,
     load_tfb_from_hf,
     load_gen_results_jsonl,
-    evaluate,
     LMAnswer,
 )
-from tqdm import tqdm
-
 from tfbench.evaluation import get_incorrect
 from tfbench.common import get_prompt as get_task_prompt, BenchmarkTask
 
diff --git a/scripts/preprocess_benchmark.py b/scripts/preprocess_benchmark.py
@@ -14,7 +14,9 @@ def main(input_raw_benchmark_path: str = "benchmark", output_path: str = "tfb.js
 
     # read in all files ending with .md in the input_raw_benchmark_path
     tasks: list[BenchmarkTask] = []
-    for file in os.listdir(input_raw_benchmark_path):
+    files = os.listdir(input_raw_benchmark_path)
+    files_w_order = sorted(files)
+    for file in files_w_order:
         if not file.endswith(".hs.md"):
             continue
         with open(os.path.join(input_raw_benchmark_path, file), "r") as f:
diff --git a/src/tfbench/load.py b/src/tfbench/load.py
@@ -35,5 +35,5 @@ def load_tfb_from_hf(split: str = "base") -> list[BenchmarkTask]:
 
 def load_gen_results_jsonl(result_file: str) -> list[LMAnswer | None]:
     """load generation results from a jsonl file"""
-    objs: list[dict[str, str | None]] = orjsonl.load(result_file)  # type: ignore
-    return [from_dict(LMAnswer, obj) for obj in objs]
+    objs: list[dict[str, str]] = orjsonl.load(result_file)  # type: ignore
+    return [from_dict(LMAnswer, obj) if "answer" in obj else None for obj in objs]