|
| 1 | +import os |
| 2 | +import re |
| 3 | +import copy |
| 4 | +import json |
| 5 | +import docker |
| 6 | +import platform |
| 7 | +import resource |
| 8 | +import traceback |
| 9 | +from tqdm import tqdm |
| 10 | +from typing import Any |
| 11 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
| 12 | + |
| 13 | +from swebench.harness.constants import LATEST |
| 14 | +from swebench.harness.docker_build import build_env_images |
| 15 | +from swebench.harness.test_spec.python import get_test_directives |
| 16 | +from swebench.harness.run_evaluation import get_dataset_from_preds, run_instance |
| 17 | +from swebench.harness.test_spec.test_spec import (TestSpec, make_env_script_list, make_repo_script_list) |
| 18 | +from swebench.harness.constants import (FAIL_TO_PASS, KEY_INSTANCE_ID, MAP_REPO_VERSION_TO_SPECS, PASS_TO_PASS, USE_X86, SWEbenchInstance) |
| 19 | + |
# Value used for both the soft and the hard RLIMIT_NOFILE limit that
# run_tests installs through resource.setrlimit.
OPEN_FILE_LIMIT = 4096

# Git patch that only creates a new file holding a single comment line.
# run_tests submits it in place of the model patch when apply_model_patch
# is False.
NOOP_PATCH = """diff --git a/this_is_invisible.py b/this_is_invisible.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/this_is_invisible.py
@@ -0,0 +1 @@
+# This is a commented out line
"""

# Same shape as NOOP_PATCH but creating a differently named file.
# make_regression_script_list feeds it to `git apply` right before the test
# command is run.
NOOP_PATCH_2 = """diff --git a/this_is_invisible_2.py b/this_is_invisible_2.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/this_is_invisible_2.py
@@ -0,0 +1 @@
+# This is a commented out line
"""
| 39 | + |
def remove_ansi_sequences(input_string):
    """
    Strip ANSI CSI escape sequences (colours, cursor/erase codes) from a string.

    Handles multi-parameter and parameterless sequences such as
    ``\\x1b[1;31m``, ``\\x1b[m`` and ``\\x1b[K`` in addition to the simple
    ``\\x1b[31m`` form.

    :param input_string: Text that may contain ANSI escape sequences.
    :return: The text with every CSI sequence removed.
    """
    # CSI layout: ESC '[' + parameter bytes (0x30-0x3F, i.e. digits ; : < = > ?)
    # + intermediate bytes (0x20-0x2F) + exactly one final byte (0x40-0x7E).
    ansi_escape_pattern = r"\x1b\[[0-?]*[ -/]*[@-~]"
    return re.sub(ansi_escape_pattern, "", input_string)
| 44 | + |
def txt_file_contains_string(path_to_txt, expected_output, other_patterns=None):
    """
    Check if the given text file contains the specified string.

    ANSI escape sequences are removed from the file content (through
    remove_ansi_sequences) before any matching is done.

    :param path_to_txt: Path to the text file.
    :param expected_output: The string to search for in the text file.
    :param other_patterns: Optional iterable of strings that veto a match: if
        any of them occurs in the file the result is False, whether or not
        expected_output is present. None (the default) means no veto patterns.
    :return: True if the string is found in the text file and no veto pattern
        is, otherwise False. A missing or unreadable file also yields False.
    """
    # Only the read itself can raise the I/O errors handled here, so keep the
    # try body limited to it.
    try:
        with open(path_to_txt, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        # A missing file is an expected situation and is reported silently.
        return False
    except IOError:
        print(f"An error occurred while reading the file at {path_to_txt}.")
        return False

    filtered_content = remove_ansi_sequences(content)
    if any(pattern in filtered_content for pattern in other_patterns or ()):
        return False
    return expected_output in filtered_content
| 65 | + |
def create_instance_test_dict(jsonl_file_path):
    """
    Map each instance id in a JSONL file to its test patch.

    Every non-blank line must be a JSON object with "instance_id" and
    "test_patch" keys. Blank lines (for example a trailing empty line) are
    skipped. If an instance id occurs more than once, the last record wins.

    :param jsonl_file_path: Path to the UTF-8 encoded JSONL file.
    :return: Dict of instance_id -> test_patch.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    :raises KeyError: If a record lacks "instance_id" or "test_patch".
    """
    instance_test_dict = {}
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            json_obj = json.loads(line)
            instance_test_dict[json_obj["instance_id"]] = json_obj["test_patch"]
    return instance_test_dict
| 75 | + |
| 76 | + |
def extract_resolved_info(directory_path):
    """
    Collect the "resolved" flag from every <subdir>/report.json under a directory.

    For each immediate subdirectory that contains a report.json, the value
    stored at report[<subdir name>]["resolved"] is recorded. Reports that are
    not valid JSON, or that do not have that dict-in-dict shape, are recorded
    as False rather than raising. Subdirectories without a report.json are
    left out of the result.

    :param directory_path: Directory whose subdirectories are named after
        instance ids.
    :return: Dict of subdirectory name -> resolved value; empty if
        directory_path is not an existing directory.
    """
    # isdir() is already False for paths that do not exist.
    if not os.path.isdir(directory_path):
        return {}

    result = {}
    for subdir in os.listdir(directory_path):
        subdir_path = os.path.join(directory_path, subdir)
        if not os.path.isdir(subdir_path):
            continue
        report_path = os.path.join(subdir_path, "report.json")
        if not os.path.isfile(report_path):
            continue
        try:
            with open(report_path, "r", encoding="utf-8") as report_file:
                data = json.load(report_file)
        except json.JSONDecodeError:
            result[subdir] = False
            continue
        # Valid JSON can still be a list, a scalar, or hold a null entry;
        # treat any such shape as "not resolved".
        entry = data.get(subdir) if isinstance(data, dict) else None
        result[subdir] = entry.get("resolved", False) if isinstance(entry, dict) else False
    return result
| 95 | + |
def make_regression_spec(instance: SWEbenchInstance) -> TestSpec:
    """
    Build the TestSpec used to run regression tests for one instance.

    An argument that is already a TestSpec is handed back untouched. Otherwise
    the repo, env and eval script lists are assembled from the specs registered
    in MAP_REPO_VERSION_TO_SPECS for the instance's repo and version, with the
    eval script coming from make_regression_script_list.

    :param instance: Instance mapping (or an existing TestSpec).
    :return: The TestSpec for the instance.
    """
    if isinstance(instance, TestSpec):
        return instance

    def _decoded(field: str) -> Any:
        """Return instance[field], JSON-decoding it when it is stored as a string."""
        raw = instance[field]
        return json.loads(raw) if isinstance(raw, str) else raw

    instance_id = instance[KEY_INSTANCE_ID]
    repo = instance["repo"]
    version = instance["version"]
    base_commit = instance["base_commit"]

    pass_to_pass = _decoded(PASS_TO_PASS)
    fail_to_pass = _decoded(FAIL_TO_PASS)

    env_name = "testbed"
    repo_directory = f"/{env_name}"
    specs = MAP_REPO_VERSION_TO_SPECS[repo][version]

    repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name)
    env_script_list = make_env_script_list(instance, specs, env_name)
    eval_script_list = make_regression_script_list(instance, specs, env_name, repo_directory, base_commit)

    # arm64 is only chosen on an ARM host, and only for instances that are not
    # listed in USE_X86; everything else gets x86_64.
    on_arm_host = platform.machine() in {"aarch64", "arm64"}
    arch = "arm64" if on_arm_host and instance_id not in USE_X86 else "x86_64"

    return TestSpec(
        instance_id=instance_id,
        repo=repo,
        env_script_list=env_script_list,
        repo_script_list=repo_script_list,
        eval_script_list=eval_script_list,
        version=version,
        arch=arch,
        FAIL_TO_PASS=fail_to_pass,
        PASS_TO_PASS=pass_to_pass,
        language=specs.get("language", "python"),
        docker_specs=specs,
        namespace=specs.get("namespace", "swebench"),
    )
| 134 | + |
| 135 | + |
def make_regression_script_list(instance, specs, env_name, repo_directory, base_commit):
    """
    Assemble the shell commands of the regression evaluation script.

    The script activates the conda environment, enters the repository, runs any
    extra specs["eval_commands"], prints git state, re-activates the
    environment, runs the optional specs["install"] command, applies
    NOOP_PATCH_2 through a heredoc and finally runs the test command.

    The test command is read from MAP_REPO_VERSION_TO_SPECS for the instance's
    repo and version (not from the specs argument) and is followed by the
    directives returned by get_test_directives(instance).

    :return: List of shell command strings, in execution order.
    """
    heredoc_tag = "EOF_114329324912"
    apply_noop_patch = f"git apply -v - <<'{heredoc_tag}'\n{NOOP_PATCH_2}\n{heredoc_tag}"

    registered_specs = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]
    run_tests_cmd = " ".join([registered_specs["test_cmd"], *get_test_directives(instance)])

    activate_env = ["source /opt/miniconda3/bin/activate", f"conda activate {env_name}"]
    enter_repo = f"cd {repo_directory}"

    script = [*activate_env, enter_repo]
    if "eval_commands" in specs:
        script.extend(specs["eval_commands"])
    script.extend(
        [
            f"git config --global --add safe.directory {repo_directory}",
            enter_repo,
            "git status",
            "git show",
            f"git diff {base_commit}",
            *activate_env,
        ]
    )
    if "install" in specs:
        script.append(specs["install"])
    script.extend([apply_noop_patch, run_tests_cmd])
    return script
| 156 | + |
| 157 | + |
def rearrange_patches(test_specs, slow_instance_ids=("sympy__sympy-11870",)):
    """
    Move known-slow instances to the front so they start running first.

    Starting the slowest instances first shortens the overall wall-clock time
    when instances are evaluated in parallel. Relative order is preserved
    within the slow group and within the remaining group.

    :param test_specs: List of objects exposing an ``instance_id`` attribute.
    :param slow_instance_ids: Iterable of instance ids to schedule first.
        Defaults to the single instance that was previously hard-coded.
    :return: A new list with the slow specs first, or ``test_specs`` itself
        (same object) when none of its entries is in slow_instance_ids.
    """
    slow_ids = set(slow_instance_ids)
    slow_specs = [spec for spec in test_specs if spec.instance_id in slow_ids]
    if not slow_specs:
        return test_specs

    print(f"rearrange patches such that {[x.instance_id for x in slow_specs]} are evaluated first")
    remaining_specs = [spec for spec in test_specs if spec.instance_id not in slow_ids]
    return slow_specs + remaining_specs
| 174 | + |
| 175 | + |
def run_tests(location_data: list, instance_ids: list, model_patches: list, max_workers: int, run_id: str, timeout: int, apply_model_patch=True, dataset_name="princeton-nlp/SWE-bench_Verified"):
    """
    Run the regression tests of the given instances in Docker and report which pass.

    Each instance is evaluated with FAIL_TO_PASS emptied and PASS_TO_PASS
    replaced by the tests listed for it in location_data. Instances that
    already have a report under logs/run_evaluation/<run_id>/test are not run
    again; their stored result is reused.

    :param location_data: Iterable of mappings with an "original_id" key (the
        instance id) and a "tests_passing_in_original_repo" key (stored as that
        instance's PASS_TO_PASS).
    :param instance_ids: Instance ids to evaluate.
    :param model_patches: One patch per entry of instance_ids, same order.
    :param max_workers: Worker count for image building and for the thread pool.
    :param run_id: Identifier of this run; also part of the report directory.
    :param timeout: Forwarded to run_instance.
    :param apply_model_patch: When False, NOOP_PATCH is submitted instead of
        the model patch for every instance.
    :param dataset_name: Dataset passed to get_dataset_from_preds.
    :return: Dict of instance_id -> resolved value.
    :raises AssertionError: If instance_ids and model_patches differ in length.
    :raises KeyError: If a selected instance has no entry in location_data.
    """
    assert len(instance_ids) == len(model_patches), "There must be the same number of instance_ids as model patches"
    resource.setrlimit(resource.RLIMIT_NOFILE, (OPEN_FILE_LIMIT, OPEN_FILE_LIMIT))
    print(f"Using run_id: {run_id}")
    split = "test"
    client = docker.from_env()
    force_rebuild = False

    predictions = {}
    for idx, one_instance_id in enumerate(instance_ids):
        patch_to_apply = model_patches[idx] if apply_model_patch else NOOP_PATCH
        predictions[one_instance_id] = {"model_name_or_path": "test", "model_patch": patch_to_apply, "instance_id": one_instance_id}

    # NOTE(review): the trailing False/True are presumably rewrite_reports and
    # exclude_completed -- confirm against the installed swebench version.
    instances = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id, False, True)
    print(f"Running {len(instances)} unevaluated instances...")

    instance_test_dict = {line["original_id"]: line["tests_passing_in_original_repo"] for line in location_data}

    # The instances are modified in place: no fail-to-pass tests, and the
    # originally passing tests become the pass-to-pass set.
    no_f2p_instances = []
    for instance in instances:
        instance["FAIL_TO_PASS"] = "[]"
        instance["PASS_TO_PASS"] = instance_test_dict[instance["instance_id"]]
        no_f2p_instances.append(instance)

    build_env_images(client, no_f2p_instances, force_rebuild=force_rebuild, max_workers=max_workers, namespace="swebench", instance_image_tag=LATEST, env_image_tag=LATEST)

    test_specs = list(map(make_regression_spec, no_f2p_instances))
    test_specs = rearrange_patches(test_specs)

    instance_image_ids = {x.instance_image_key for x in test_specs}
    existing_images = {tag for i in client.images.list(all=True) for tag in i.tags if tag in instance_image_ids}
    print(f"Found {len(existing_images)} existing instance images. Will reuse them.")

    resolved_dict = extract_resolved_info(os.path.join("logs", "run_evaluation", run_id, "test"))

    # Select the specs without a stored report in one pass, using dict
    # membership. This must happen before the empty-patch entries are added
    # below so that the selection depends on stored reports only.
    pending_specs = [test_spec for test_spec in test_specs if test_spec.instance_id not in resolved_dict]

    # An empty model patch can never resolve anything.
    for index, patch in enumerate(model_patches):
        if patch == "":
            resolved_dict[instance_ids[index]] = False

    with tqdm(total=len(pending_specs), smoothing=0, colour="MAGENTA") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # NOTE(review): the positional False is presumably run_instance's
            # rm_image flag -- confirm against the installed swebench version.
            futures = {
                executor.submit(run_instance, test_spec, predictions[test_spec.instance_id], False, force_rebuild, client, run_id, timeout): None
                for test_spec in pending_specs
            }
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    result = future.result()
                except Exception as e:
                    traceback.print_exc()
                    print(f"Error getting future result: {e}")
                    continue
                if result and isinstance(result, tuple) and len(result) == 2:
                    # (instance_id, report) pair: read the flag from the report.
                    instance_id = result[0]
                    resolved_dict[instance_id] = result[1][instance_id]["resolved"]
                elif result and isinstance(result, dict):
                    # A bare dict result is recorded as unresolved for each of its keys.
                    for instance_id in result:
                        resolved_dict[instance_id] = False
                else:
                    print(f"Warning: Unexpected result type: {type(result)}")
    print("All instances run.")
    return resolved_dict
0 commit comments