@@ -95,18 +95,6 @@ def filter_instances_by_ids(instances: list[BatchInstance], target_ids: set[str]
9595 return filtered_instances
9696
9797def deduplicate_patches (sample_paths : list [Path ], output_file : Path , logger , min_ratio : float = 0.5 ) -> dict :
98- """
99- 从多个sample的patch文件中去重
100- 去重规则:
101- - 如果file_path相同且current_code的最长连续匹配行数 >= 较短代码长度的min_ratio,认为是重复
102- Args:
103- sample_paths: 样本目录列表
104- output_file: 输出的jsonl文件路径
105- logger: 日志记录器
106- min_ratio: 最小匹配比例(默认0.5,即二分之一)
107- Returns:
108- 去重统计信息
109- """
11098 all_modifications = []
11199 unique_modifications = []
112100 total_count = 0
@@ -185,14 +173,6 @@ def are_modifications_similar(mod1: dict, mod2: dict, min_ratio: float = 0.5) ->
185173 return is_similar , details
186174
187175def find_longest_consecutive_match (code1 : str , code2 : str ) -> tuple [int , int ]:
188- """
189- 找到两段代码中最长的连续匹配行数
190- Args:
191- code1: 第一段代码
192- code2: 第二段代码
193- Returns:
194- (最长连续匹配的行数, code1的总行数)
195- """
196176 lines1 = get_code_lines (code1 )
197177 lines2 = get_code_lines (code2 )
198178 if not lines1 or not lines2 :
@@ -209,14 +189,6 @@ def get_code_lines(code: str) -> list[str]:
209189 return [line .strip () for line in code .strip ().split ('\n ' ) if line .strip ()]
210190
211191def deduplicate_patches_by_first_path_and_count (sample_paths : list [Path ], output_file : Path , logger , min_unique : int = 2 ) -> dict :
212- """
213- 直接从 sample1-sampleN 读取 .patch,按 (第一个 file_path, files_to_modify 数量) 去重。
214- 仅当去重后 unique >= min_unique 才写出 output_file,否则跳过不生成文件。
215- 输出 jsonl,每行:
216- {"files_to_modify": [...], "source_sample": "xxx-sampleK"}
217- Returns:
218- 统计信息 dict
219- """
220192 all_patches = []
221193 total_read = 0
222194 invalid_count = 0
@@ -1074,4 +1046,4 @@ def run_from_cli(args: list[str] | None = None):
10741046 run_from_config (BasicCLI (RunBatchConfig , help_text = help_text ).get_config (args ))
10751047
10761048if __name__ == "__main__" :
1077- run_from_cli ()
1049+ run_from_cli ()
0 commit comments