11import json
2+ import os
3+ import re
4+ import tempfile
5+ import logging
6+ from multiprocessing import cpu_count
7+ from typing import Iterable
8+
29import fire
310from funcy import func_partial
411from funcy_chain import Chain
5- from functools import reduce
612from dacite import from_dict
7- from returns .result import Result , Success , Failure
8- import os
9- import re
10- import tempfile
13+ from returns .result import Success , Failure
1114from tqdm import tqdm
1215import pandas as pd
13- import logging
14- from multiprocessing import cpu_count
1516from pathos .multiprocessing import ProcessingPool
16- from returns .result import Result , Success , Failure
17- from typing import Iterable
18- from evaluation .util import CovError , HumanEvalTask
17+
18+ from evaluation .util import HumanEvalTask
1919from evaluation .result_analysis import to_record
2020
2121
def init_humaneval_x_workspace(tmpdir: str = "tmp"):
    """Set up a workspace for compiling and running HumanEval-X programs.

    Copies every regular file matching ``Cargo.*`` from
    ``$FUZZ_AUG_HOME/evaluation/cargo`` into ``tmpdir`` and makes sure a
    ``src`` subdirectory exists there.

    Args:
        tmpdir: workspace directory; it is created if it does not exist yet.

    Raises:
        KeyError: if the ``FUZZ_AUG_HOME`` environment variable is not set.
    """
    import glob  # pylint: disable=import-outside-toplevel
    import shutil  # pylint: disable=import-outside-toplevel

    fuzz_aug_home = os.environ["FUZZ_AUG_HOME"]

    # Create the workspace (and its src/ folder) first so the copies below
    # always land inside a directory, even when tmpdir did not exist before.
    os.makedirs(os.path.join(tmpdir, "src"), exist_ok=True)

    # Expand the pattern in-process instead of through a shell, so paths that
    # contain spaces or shell metacharacters are handled correctly.
    cargo_dir = os.path.join(fuzz_aug_home, "evaluation", "cargo")
    pattern = os.path.join(glob.escape(cargo_dir), "Cargo.*")
    cargo_files = [path for path in sorted(glob.glob(pattern)) if os.path.isfile(path)]

    if not cargo_files:
        # The previous shell-based copy failed silently in this situation;
        # keep going, but leave a trace for whoever debugs the build.
        logging.warning("no Cargo files found matching %s", pattern)

    for cargo_file in cargo_files:
        shutil.copy(cargo_file, tmpdir)
2728
2829
def evaluate_program(program: str, fn_name: str, timeout: int):
    """Write, compile, and run a HumanEval-X program and return its coverage result.

    A fresh temporary workspace is prepared with
    ``init_humaneval_x_workspace``, ``program`` is written to ``src/main.rs``
    inside it, and ``get_coverage`` is invoked on the workspace for the test
    target ``test_<fn_name>``. The workspace is removed when the function
    exits, including when an exception is raised.

    Args:
        program: full source text to write to ``src/main.rs``.
        fn_name: name of the function under test; the test target passed to
            ``get_coverage`` is ``test_<fn_name>``.
        timeout: forwarded to ``get_coverage`` as its ``timeout`` argument.

    Returns:
        Whatever ``get_coverage`` returns for the workspace.
    """
    # Imported at call time, as in the original code.
    from evaluation.coverage import (  # pylint: disable=import-outside-toplevel
        get_coverage,
    )

    with tempfile.TemporaryDirectory() as tmpdir_path:
        init_humaneval_x_workspace(tmpdir=tmpdir_path)
        test_target = f"test_{fn_name}"
        # Explicit encoding: the locale default may not be able to encode
        # every character that appears in the program text.
        with open(f"{tmpdir_path}/src/main.rs", "w", encoding="utf-8") as fp:
            fp.write(program)
        cov = get_coverage(tmpdir_path, test_target, timeout=timeout)

    return cov
4245
4346
4447def evaluate_assertions (solution : HumanEvalTask , timeout : int = 60 ):
48+ """evaluate individual assertions in the generated test function"""
4549 assert solution .fn_name is not None
4650 return [
4751 evaluate_program (p , solution .fn_name , timeout )
@@ -92,17 +96,20 @@ def extract_assertions(test_function: str) -> list[str]:
9296
9397
def concat_assertions_to_test(solution, assertions):
    """Build one test program out of several extracted assertions.

    The assertions are joined into a single body, which is then passed to
    ``solution.assertion_to_program``; that call's result is returned.
    """
    separator = "\n "
    combined_body = separator.join(assertions)
    return solution.assertion_to_program(combined_body)
96101
97102
def generated_assertions(sol: HumanEvalTask) -> list[str]:
    """Return the assertions found in the task's generated test function.

    The task's test prompt header is prepended to its generated test and the
    combined text is handed to ``extract_assertions``.
    """
    generated = sol.generated_test
    assert generated is not None
    full_test_source = sol.test_prompt_header + generated
    return extract_assertions(full_test_source)
101107
102108
103109def evaluate_whole_function_coverage (solution : HumanEvalTask , timeout : int = 60 ):
104110 """
105- Evaluate the whole function coverage using the combined test function of correct assertions, which is similar to evaluate_assertion
111+ Evaluate the whole function coverage using the combined test function of correct assertions,
112+ which is similar to evaluate_assertion
106113 """
107114 assert solution .fn_name is not None
108115
@@ -120,8 +127,8 @@ def main(
120127 """evaluate coverage on HumanEval-X
121128
122129 Args:
123- input_human_eval_x_path (str, optional): input path. Defaults to "data/humaneval_rust.jsonl".
124- output_result_file (str, optional): write to write results. Defaults to "humaneval_rust_coverage.jsonl".
130+ input_human_eval_x_path (str, optional): input path.
131+ output_result_file (str, optional): path to write results.
125132 nproc (int, optional): number of proc to use. Defaults to cpu_count().
126133 timeout (int, optional): allowance time for each exec, in seconds. Defaults to 60.
127134 whole_function_coverage (bool, optional): whether or not to test whole function coverage
0 commit comments