Skip to content

Commit 00e56cb

Browse files
authored
Create run_tests.py
1 parent dfa7452 commit 00e56cb

1 file changed

Lines changed: 235 additions & 0 deletions

File tree

sweagent/run/run_tests.py

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
import os
2+
import re
3+
import copy
4+
import json
5+
import docker
6+
import platform
7+
import resource
8+
import traceback
9+
from tqdm import tqdm
10+
from typing import Any
11+
from concurrent.futures import ThreadPoolExecutor, as_completed
12+
13+
from swebench.harness.constants import LATEST
14+
from swebench.harness.docker_build import build_env_images
15+
from swebench.harness.test_spec.python import get_test_directives
16+
from swebench.harness.run_evaluation import get_dataset_from_preds, run_instance
17+
from swebench.harness.test_spec.test_spec import (TestSpec, make_env_script_list, make_repo_script_list)
18+
from swebench.harness.constants import (FAIL_TO_PASS, KEY_INSTANCE_ID, MAP_REPO_VERSION_TO_SPECS, PASS_TO_PASS, USE_X86, SWEbenchInstance)
19+
20+
# Value passed as both the soft and the hard RLIMIT_NOFILE limit in run_tests.
OPEN_FILE_LIMIT = 4096

# Git patch that only adds a new one-line file containing a comment.
# run_tests substitutes it for the model patch when apply_model_patch is False.
NOOP_PATCH = (
    "diff --git a/this_is_invisible.py b/this_is_invisible.py\n"
    "new file mode 100644\n"
    "index 0000000..e69de29\n"
    "--- /dev/null\n"
    "+++ b/this_is_invisible.py\n"
    "@@ -0,0 +1 @@\n"
    "+# This is a commented out line\n"
)

# Second placeholder patch (different file name). make_regression_script_list
# applies it in the slot where a real test patch would otherwise be applied.
NOOP_PATCH_2 = (
    "diff --git a/this_is_invisible_2.py b/this_is_invisible_2.py\n"
    "new file mode 100644\n"
    "index 0000000..e69de29\n"
    "--- /dev/null\n"
    "+++ b/this_is_invisible_2.py\n"
    "@@ -0,0 +1 @@\n"
    "+# This is a commented out line\n"
)
39+
40+
# SGR (colour/style) escape sequences: ESC "[" <zero or more digits / ";"> "m".
# Compiled once at import time so repeated calls do not re-resolve the pattern.
_ANSI_SGR_PATTERN = re.compile(r"\x1b\[[\d;]*m")


def remove_ansi_sequences(input_string):
    """Return *input_string* with ANSI colour/style escape sequences removed.

    Handles single-parameter codes (ESC[31m), multi-parameter codes
    (ESC[1;31m) and the parameterless reset (ESC[m). Other kinds of escape
    sequences (cursor movement, erase-line, ...) are left untouched.

    :param input_string: Text that may contain SGR escape sequences.
    :return: The text with every SGR sequence stripped.
    """
    return _ANSI_SGR_PATTERN.sub("", input_string)
44+
45+
def txt_file_contains_string(path_to_txt, expected_output, other_patterns=()):
    """
    Check whether a text file contains a string and none of some other strings.

    ANSI escape sequences are stripped (via remove_ansi_sequences) before any
    matching is done.

    :param path_to_txt: Path to the text file.
    :param expected_output: The string to search for in the text file.
    :param other_patterns: Strings that must NOT occur in the file; if any of
        them is present the result is False regardless of expected_output.
    :return: True if expected_output is found and no entry of other_patterns
        is; False otherwise, including when the file is missing or unreadable.
    """
    # Only the read can raise the I/O errors handled here, so keep the try
    # body limited to it.
    try:
        with open(path_to_txt, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        # A missing file is treated as "string not found", without any message.
        return False
    except IOError:
        print(f"An error occurred while reading the file at {path_to_txt}.")
        return False

    filtered_content = remove_ansi_sequences(content)
    if any(pattern in filtered_content for pattern in other_patterns):
        return False
    return expected_output in filtered_content
65+
66+
def create_instance_test_dict(jsonl_file_path):
    """
    Build a mapping from instance id to test patch out of a JSONL file.

    Each non-blank line must be a JSON object with "instance_id" and
    "test_patch" keys. Blank or whitespace-only lines are skipped. If an
    instance id occurs more than once, the last occurrence wins.

    :param jsonl_file_path: Path to the JSONL file.
    :return: Dict mapping each "instance_id" to its "test_patch".
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    :raises KeyError: If a record lacks "instance_id" or "test_patch".
    """
    instance_test_dict = {}
    # Explicit UTF-8, matching the other file reads in this module.
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank separator / trailing lines.
                continue
            json_obj = json.loads(line)
            instance_test_dict[json_obj["instance_id"]] = json_obj["test_patch"]
    return instance_test_dict
75+
76+
77+
def extract_resolved_info(directory_path):
    """
    Collect the "resolved" flag from every <subdir>/report.json in a directory.

    For each immediate subdirectory that contains a report.json, the report is
    expected to look like {<subdir name>: {"resolved": <value>, ...}}.
    Subdirectories without a report.json are omitted from the result.

    :param directory_path: Directory whose subdirectories are scanned.
    :return: Dict mapping subdirectory name to its "resolved" value. The value
        is False when the report cannot be decoded, is not a JSON object, or
        has no usable entry for the subdirectory. An empty dict is returned
        if directory_path is not an existing directory.
    """
    # isdir() is already False for non-existent paths.
    if not os.path.isdir(directory_path):
        return {}

    result = {}
    for subdir in os.listdir(directory_path):
        report_path = os.path.join(directory_path, subdir, "report.json")
        # isfile() is False if subdir is not a directory, so this also
        # filters out plain files sitting next to the per-instance folders.
        if not os.path.isfile(report_path):
            continue
        try:
            with open(report_path, "r", encoding="utf-8") as report_file:
                data = json.load(report_file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            result[subdir] = False
            continue
        # A report can be valid JSON and still have the wrong shape (e.g. a
        # list or null); treat that as unresolved instead of raising.
        entry = data.get(subdir) if isinstance(data, dict) else None
        result[subdir] = entry.get("resolved", False) if isinstance(entry, dict) else False
    return result
95+
96+
def make_regression_spec(instance: SWEbenchInstance) -> TestSpec:
    """
    Build the TestSpec used for a regression run of one dataset instance.

    An argument that is already a TestSpec is returned unchanged. Otherwise
    the repo and env scripts come from the swebench helpers, while the eval
    script comes from make_regression_script_list.

    :param instance: Dataset record providing the instance id, "repo",
        "version", "base_commit" and the PASS_TO_PASS / FAIL_TO_PASS fields
        (each either a JSON string or an already-decoded value).
    :return: The TestSpec describing how to build and evaluate the instance.
    """
    if isinstance(instance, TestSpec):
        return instance

    def _decode(key: str) -> Any:
        """Return instance[key], JSON-decoding it first when it is a string."""
        raw = instance[key]
        return json.loads(raw) if isinstance(raw, str) else raw

    instance_id = instance[KEY_INSTANCE_ID]
    repo, version, base_commit = instance["repo"], instance["version"], instance["base_commit"]

    pass_to_pass = _decode(PASS_TO_PASS)
    fail_to_pass = _decode(FAIL_TO_PASS)

    env_name = "testbed"
    repo_directory = f"/{env_name}"
    specs = MAP_REPO_VERSION_TO_SPECS[repo][version]

    repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name)
    env_script_list = make_env_script_list(instance, specs, env_name)
    eval_script_list = make_regression_script_list(instance, specs, env_name, repo_directory, base_commit)

    # arm64 is used only on ARM hosts, and only for instances not listed in
    # USE_X86; everything else is x86_64.
    on_arm_host = platform.machine() in {"aarch64", "arm64"}
    arch = "arm64" if on_arm_host and instance_id not in USE_X86 else "x86_64"

    return TestSpec(
        instance_id=instance_id,
        repo=repo,
        env_script_list=env_script_list,
        repo_script_list=repo_script_list,
        eval_script_list=eval_script_list,
        version=version,
        arch=arch,
        FAIL_TO_PASS=fail_to_pass,
        PASS_TO_PASS=pass_to_pass,
        language=specs.get("language", "python"),
        docker_specs=specs,
        namespace=specs.get("namespace", "swebench"),
    )
134+
135+
136+
def make_regression_script_list(instance, specs, env_name, repo_directory, base_commit):
    """
    Assemble the shell commands that run an instance's tests for regression.

    The placeholder NOOP_PATCH_2 is applied just before the test command.
    The test command itself is looked up in MAP_REPO_VERSION_TO_SPECS using
    the instance's repo and version, not read from the ``specs`` argument.

    :param instance: Dataset record; its "repo" and "version" select the test
        command, and it is passed to get_test_directives.
    :param specs: Spec mapping; the optional "eval_commands" (list) and
        "install" (single command) entries are spliced into the script.
    :param env_name: Conda environment to activate.
    :param repo_directory: Path of the repository checkout.
    :param base_commit: Commit the working tree is diffed against.
    :return: List of shell command strings, in execution order.
    """
    heredoc_tag = "EOF_114329324912"
    noop_test_patch_command = f"git apply -v - <<'{heredoc_tag}'\n{NOOP_PATCH_2}\n{heredoc_tag}"

    repo_version_specs = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]
    test_command = " ".join([repo_version_specs["test_cmd"], *get_test_directives(instance)])

    script = [
        "source /opt/miniconda3/bin/activate",
        f"conda activate {env_name}",
        f"cd {repo_directory}",
    ]
    if "eval_commands" in specs:
        script.extend(specs["eval_commands"])

    script.extend(
        [
            f"git config --global --add safe.directory {repo_directory}",
            f"cd {repo_directory}",
            "git status",
            "git show",
            f"git diff {base_commit}",
            "source /opt/miniconda3/bin/activate",
            f"conda activate {env_name}",
        ]
    )
    if "install" in specs:
        script.append(specs["install"])

    script.append(noop_test_patch_command)
    script.append(test_command)
    return script
156+
157+
158+
def rearrange_patches(test_specs):
    """
    Move known-slow instances to the front of the evaluation order.

    Starting the slow instances first lets the remaining work overlap with
    them. When no known-slow instance is present, the input object itself is
    returned; otherwise a new list is returned, with the relative order kept
    inside both the slow and the remaining group.
    """
    slow_instance_ids = ["sympy__sympy-11870"]

    slow_first = [spec for spec in test_specs if spec.instance_id in slow_instance_ids]
    if not slow_first:
        return test_specs

    print(f"rearrange patches such that {[x.instance_id for x in slow_first]} are evaluated first")
    remaining = [spec for spec in test_specs if spec.instance_id not in slow_instance_ids]
    slow_first.extend(remaining)
    return slow_first
174+
175+
176+
def run_tests(
    location_data: list,
    instance_ids: list,
    model_patches: list,
    max_workers: int,
    run_id: str,
    timeout: int,
    apply_model_patch=True,
    dataset_name="princeton-nlp/SWE-bench_Verified",
):
    """
    Run regression tests for a set of instances and report which ones resolve.

    Every instance has FAIL_TO_PASS emptied and PASS_TO_PASS replaced by the
    tests listed for it in ``location_data`` before it is evaluated.

    :param location_data: Records with "original_id" (an instance id) and
        "tests_passing_in_original_repo" (stored as that instance's
        PASS_TO_PASS). Every evaluated instance must have a record.
    :param instance_ids: Instance ids, parallel to ``model_patches``.
    :param model_patches: One patch per instance id.
    :param max_workers: Worker count for the env-image build and the pool
        that runs the instances.
    :param run_id: Run identifier; existing reports are read from
        logs/run_evaluation/<run_id>/test.
    :param timeout: Timeout forwarded to run_instance.
    :param apply_model_patch: If False, NOOP_PATCH is evaluated in place of
        each model patch.
    :param dataset_name: Dataset name forwarded to get_dataset_from_preds.
    :return: Dict mapping instance id to its "resolved" value. Instances whose
        run raised or returned an unexpected value get no entry from that run.
    :raises AssertionError: If instance_ids and model_patches differ in length.
    """
    assert len(instance_ids) == len(model_patches), "There must be the same number of instance_ids as model patches"
    # NOTE(review): this sets the hard limit as well as the soft limit;
    # confirm that is acceptable where the process's hard limit differs.
    resource.setrlimit(resource.RLIMIT_NOFILE, (OPEN_FILE_LIMIT, OPEN_FILE_LIMIT))
    print(f"Using run_id: {run_id}")
    split = "test"
    client = docker.from_env()
    force_rebuild = False

    predictions = {
        one_instance_id: {
            "model_name_or_path": "test",
            "model_patch": model_patch if apply_model_patch else NOOP_PATCH,
            "instance_id": one_instance_id,
        }
        for one_instance_id, model_patch in zip(instance_ids, model_patches)
    }
    instances = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id, False, True)
    print(f"Running {len(instances)} unevaluated instances...")

    instance_test_dict = {
        line["original_id"]: line["tests_passing_in_original_repo"] for line in location_data
    }

    # The dataset records are modified in place: no fail-to-pass tests, and
    # the pass-to-pass set comes from location_data.
    no_f2p_instances = []
    for instance in instances:
        instance["FAIL_TO_PASS"] = "[]"
        instance["PASS_TO_PASS"] = instance_test_dict[instance["instance_id"]]
        no_f2p_instances.append(instance)

    build_env_images(client, no_f2p_instances, force_rebuild=force_rebuild, max_workers=max_workers, namespace="swebench", instance_image_tag=LATEST, env_image_tag=LATEST)
    test_specs = list(map(make_regression_spec, no_f2p_instances))
    test_specs = rearrange_patches(test_specs)

    instance_image_ids = {x.instance_image_key for x in test_specs}
    existing_images = {tag for i in client.images.list(all=True) for tag in i.tags if tag in instance_image_ids}
    print(f"Found {len(existing_images)} existing instance images. Will reuse them.")

    resolved_dict = extract_resolved_info(os.path.join("logs", "run_evaluation", run_id, "test"))
    # Only instances without an existing report are run. This must be decided
    # before the empty-patch entries below are added. Direct dict membership
    # avoids rebuilding a key list for every spec.
    pending_specs = [test_spec for test_spec in test_specs if test_spec.instance_id not in resolved_dict]

    # An empty model patch is recorded as unresolved up front; a later run
    # result for the same instance overwrites this entry.
    for one_instance_id, patch in zip(instance_ids, model_patches):
        if patch == "":
            resolved_dict[one_instance_id] = False

    with tqdm(total=len(pending_specs), smoothing=0, colour="MAGENTA") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(run_instance, test_spec, predictions[test_spec.instance_id], False, force_rebuild, client, run_id, timeout)
                for test_spec in pending_specs
            ]
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    result = future.result()
                except Exception as e:
                    traceback.print_exc()
                    print(f"Error getting future result: {e}")
                    continue
                if result and isinstance(result, tuple) and len(result) == 2:
                    # Expected shape: (instance_id, {instance_id: {"resolved": ...}}).
                    instance_id = result[0]
                    resolved_dict[instance_id] = result[1][instance_id]["resolved"]
                elif result and isinstance(result, dict):
                    for instance_id in result:
                        resolved_dict[instance_id] = False
                else:
                    print(f"Warning: Unexpected result type: {type(result)}")
    print("All instances run.")
    return resolved_dict

0 commit comments

Comments
 (0)