translation

joyguoguo · joyguoguo · commit 1686058f9a44 · 2025-08-18T02:33:14.000Z
diff --git a/fuzz/ast_utils.py b/fuzz/ast_utils.py
@@ -5,25 +5,26 @@
 import os
 import re
 
+
 class TestFunctionTransformer(ast.NodeTransformer):
     """AST transformer for test function conversion"""
 
     def visit_FunctionDef(self, node):
-        # 首先处理 main 函数（移除）
+        # First, process main function (remove it)
         if node.name == "main":
             return None
 
-        # 处理 TestInput/TestOneInput 函数
+        # Process TestInput/TestOneInput functions
         if node.name in ["TestInput", "TestOneInput"]:
-            # a. 记录参数名称（假设只有一个参数）
+            # a. Record parameter name (assume only one parameter)
             param_name = None
             if node.args.args:
                 param_name = node.args.args[0].arg
 
-            # b. 将函数名改为 test_
+            # b. Rename function to test_
             node.name = "test_"
 
-            # c. 移除参数（将参数列表设为空）
+            # c. Remove parameters (set argument list to empty)
             node.args = ast.arguments(
                 posonlyargs=[],
                 args=[],
@@ -34,48 +35,47 @@ def visit_FunctionDef(self, node):
                 defaults=[],
             )
 
-            # d. 在函数体开头插入 原参数名 = b""
+            # d. Insert param_name = b"" at the beginning of the function body
             if param_name:
                 self.add_param_assignment(node, param_name)
 
-        # 确保继续遍历子节点
+        # Keep traversing so child nodes are visited too
         self.generic_visit(node)
         return node
 
     def add_param_assignment(self, node, param_name):
         """Add param_name = b"..." at the beginning of the function body with an inline comment"""
-        # 创建包含赋值和注释的复合值
+        # Create a compound value containing assignment and comment
         value_with_comment = ast.JoinedStr(
             values=[
                 ast.FormattedValue(value=ast.Constant(value=b""), conversion=-1),
-                ast.Constant(value="  # This is a test template")
+                ast.Constant(value="  # This is a test template"),
             ]
         )
-        
-        # 创建赋值节点
+
+        # Create an assignment node
         assign_node = ast.Assign(
-            targets=[ast.Name(id=param_name, ctx=ast.Store())],
-            value=value_with_comment
+            targets=[ast.Name(id=param_name, ctx=ast.Store())], value=value_with_comment
         )
-        
-        # 如果有文档字符串，插入在文档字符串之后
+
+        # If there is a docstring, insert after the docstring
         if (
             node.body
             and isinstance(node.body[0], ast.Expr)
             and isinstance(node.body[0].value, ast.Constant)
             and isinstance(node.body[0].value.value, str)
         ):
-            # 插入在文档字符串后面
+            # Insert right after the docstring
             node.body.insert(1, assign_node)
         else:
-            # 插入在函数开头
+            # Insert at the beginning of the function
             node.body.insert(0, assign_node)
 
     def remove_print_param(self, node, param_name):
         """Remove print statements for the specific parameter"""
         new_body = []
         for stmt in node.body:
-            # 跳过 print(param_name) 调用
+            # Skip print(param_name) calls
             if (
                 isinstance(stmt, ast.Expr)
                 and isinstance(stmt.value, ast.Call)
@@ -92,7 +92,7 @@ def remove_print_param(self, node, param_name):
 
     def visit_If(self, node):
         """Remove if __name__ == '__main__' blocks"""
-        # 检查是否是主函数保护
+        # Check if this is the main function guard
         if (
             isinstance(node.test, ast.Compare)
             and isinstance(node.test.left, ast.Name)
@@ -102,10 +102,10 @@ def visit_If(self, node):
             and node.test.comparators[0].value == "__main__"
         ):
 
-            # 移除整个 if 块
+            # Remove the entire if block
             return None
 
-        # 确保继续遍历子节点
+        # Keep traversing so child nodes are visited too
         self.generic_visit(node)
         return node
 
@@ -119,22 +119,22 @@ def __init__(self, idx, fuzz_input):
     def visit_FunctionDef(self, node):
         if node.name == "test_":
             self.found_test_function = True
-            
-            # 1. 修改函数名
+
+            # 1. Modify function name
             node.name = f"test_{self.idx}"
-            
-            # 2. 查找并替换包含特定注释的赋值语句
+
+            # 2. Find the first assignment carrying the template marker and replace its value
             for i, stmt in enumerate(node.body):
-                # 检查是否是赋值语句
+                # Check if it's an assignment statement
                 if isinstance(stmt, ast.Assign):
-                    # 检查赋值语句的值是否是带有注释的复合值
+                    # Check if the value is a compound value with a comment
                     if (
                         isinstance(stmt.value, ast.JoinedStr)
                         and len(stmt.value.values) >= 2
                         and isinstance(stmt.value.values[1], ast.Constant)
                         and stmt.value.values[1].value == "  # This is a test template"
                     ):
-                        # 替换为新的输入值
+                        # Replace with new fuzz input
                         stmt.value = ast.Constant(value=self.fuzz_input)
                         break
         return node
@@ -145,9 +145,9 @@ def generate_test_template(target_name: str, repo_path: str):
     Generate Python test template using AST for more precise code transformations
     """
     src_file = os.path.join(repo_path, target_name)
-    logging.info(f"Generating test template for {src_file}")    
+    logging.info(f"Generating test template for {src_file}")
     if not src_file.endswith(".py"):
-        src_file += ".py"    
+        src_file += ".py"
     if not os.path.exists(src_file):
         logging.error(f"Source target file not found: {src_file}")
         return None
@@ -191,11 +191,11 @@ def generate_test_template(target_name: str, repo_path: str):
         with open(init_path, "w", encoding="utf-8") as f:
             f.write("")
 
-    # 使用目标名称的基础部分（移除扩展名）作为输出文件名
+    # Use the base part of target_name (remove extension) as the output file name
     base_target_name = os.path.splitext(target_name)[0]
     template_path = os.path.join(template_dir, f"{base_target_name}.py")
     with open(template_path, "w", encoding="utf-8") as f:
         f.write(shebang + cleaned_code.strip() + "\n")
 
     logging.info(f"Generated cleaned template: {template_path}")
-    return template_path
+    return template_path
diff --git a/fuzz/collect_fuzz_python.py b/fuzz/collect_fuzz_python.py
@@ -19,9 +19,15 @@
 from difflib import SequenceMatcher
 from itertools import islice
 from datetime import datetime
+import re
+
+# Import AST-related functionality
+from ast_utils import (
+    TestFunctionTransformer,
+    TestGenTransformer,
+    generate_test_template,
+)
 
-# 导入AST相关的功能
-from ast_utils import TestFunctionTransformer, TestGenTransformer, generate_test_template
 
 def build_image(repos: list[str], jobs: int):
     """
@@ -200,22 +206,23 @@ def _transform_repo(repo: str):
         project_name = os.path.basename(repo)
         oss_fuzz_dir = Path(repo).parent.parent
         raw_targets = discover_targets(project_name, oss_fuzz_dir)
-        
-        # 只需移除目标名称中的 "_print1"，不要添加任何新后缀
+
+        # Simply remove "_print1" from target names, don't add any new suffix
         transformed_targets = [t.replace("_print1", "") for t in raw_targets]
-        
-        # 去重
+
+        # Remove duplicates
         targets = list(set(transformed_targets))
-        
-        # 传递给 generate_test_template 的是简单目标名称
+
+        # Pass simple target names to generate_test_template
         return [generate_test_template(t, repo) for t in targets]
 
     with ProcessingPool(jobs) as p:
         return list(p.map(_transform_repo, repos))
 
+
 def substitute_one_repo(
     repo: str,
-    targets: list[tuple],  # 每个元素是 (transformed_target, raw_target)
+    targets: list[tuple],  # Each element is (transformed_target, raw_target)
     n_fuzz: int,
     strategy: str,
     max_len: int,
@@ -225,57 +232,58 @@ def substitute_one_repo(
     Copy files from fuzz target template and generate multiple testgen files based on fuzz inputs
     using AST transformations
     """
+
     input_dir = pjoin(repo, "fuzz_inputs")
     template_dir = pjoin(repo, "tests-gen")
     os.makedirs(template_dir, exist_ok=True)
 
     for transformed_target, raw_target in targets:
-        # 使用转换后的目标名称构建模板文件路径
+        # Build template file path using transformed target name
         source_file = pjoin(template_dir, transformed_target + ".py")
-        
-        # 使用原始目标名称构建输入文件路径
+
+        # Build input file path using raw target name
         input_path = pjoin(input_dir, raw_target)
-        
-        # 确保源文件存在
+
+        # Ensure source file exists
         if not os.path.exists(source_file):
             logging.warning(f"Source file not found: {source_file}")
             continue
         if not os.path.exists(input_path):
             logging.warning(f"Input file not found: {input_path}")
             continue
-        
-        # 读取所有有效的输入数据
+
+        # Read all valid input data
         valid_inputs = []
         with open(input_path, "rb") as f_input:
             lines = f_input.readlines()
-            # 文件已关闭，现在处理数据
+            # All lines have been read into memory; now process them
             for line in lines:
-                # 使用 errors='replace' 确保解码不会失败
+                # Use errors='replace' to ensure decoding doesn't fail
                 decoded = line.decode("utf-8", errors="replace")
-                
-                # 只处理以 b' 或 b" 开头的行
+
+                # Only process lines starting with b' or b"
                 if decoded.startswith(("b'", 'b"')):
                     if decoded.startswith("b'") and decoded.endswith("'\n"):
                         byte_data = line[2:-2]
                     elif decoded.startswith('b"') and decoded.endswith('"\n'):
                         byte_data = line[2:-2]
                     else:
                         continue
-                    
+
                     if 0 < len(byte_data) <= max_len:
                         valid_inputs.append(byte_data)
-                # 对于其他行，如果长度在范围内且不是以 b' 或 b" 开头，也考虑加入
+                # Lines that don't start with b' or b" are kept unchanged if their length is within range
                 elif 0 < len(line) <= max_len:
                     valid_inputs.append(line)
 
         if not valid_inputs:
-            # 使用 transformed_target 而不是 target_name
+            # Use transformed_target instead of target_name
             logging.warning(f"No valid inputs found for {transformed_target}")
             continue
 
-        # 使用 transformed_target 而不是 target_name
+        # Use transformed_target instead of target_name
         logging.info(f"Loaded {len(valid_inputs)} inputs for {transformed_target}")
-        # 策略选择输入
+        # Strategy for selecting inputs
         if strategy == "shuffle":
             random.shuffle(valid_inputs)
             inputs = valid_inputs[:n_fuzz]
@@ -284,44 +292,47 @@ def substitute_one_repo(
         else:
             inputs = valid_inputs[:n_fuzz]
 
-        # 每个 fuzz input 生成一个单独的文件（使用 AST）
+        # Generate a separate file for each fuzz input (using AST)
         for idx, fuzz_input in enumerate(inputs, start=1):
             with open(source_file, "r") as f_src:
                 code = f_src.read()
 
             try:
-                # 解析为 AST
+                # Parse into AST
                 tree = ast.parse(code)
 
-                # 应用转换器
+                # Apply transformer
                 transformer = TestGenTransformer(idx, fuzz_input)
                 new_tree = transformer.visit(tree)
                 ast.fix_missing_locations(new_tree)
 
-                # 确保找到并处理了测试函数
+                # Ensure test function was found and processed
                 if not transformer.found_test_function:
                     logging.warning(f"No test_ function found in {source_file}")
                     continue
 
-                # 生成新代码
+                # Generate new code
                 new_code = astunparse.unparse(new_tree)
 
-                # 使用 transformed_target 而不是 target_name
+                # Use transformed_target instead of target_name
                 out_path = pjoin(template_dir, f"{transformed_target}.testgen_{idx}.py")
                 with open(out_path, "w") as f_out:
                     f_out.write(new_code)
 
-                # 格式化代码
+                # Format code
                 try:
                     subprocess.run(["black", out_path], check=False)
                 except FileNotFoundError:
                     logging.warning("Black formatter not found, skipping formatting")
-                
+
             except SyntaxError as e:
                 logging.error(f"Syntax error when processing {source_file}: {e}")
             except Exception as e:
-                # 使用 transformed_target 而不是 target_name
-                logging.error(f"Error generating test case for {transformed_target}: {e}")
+                # Use transformed_target instead of target_name
+                logging.error(
+                    f"Error generating test case for {transformed_target}: {e}"
+                )
+
 
 def testgen_repos(
     repos: list[str],
@@ -348,28 +359,29 @@ def testgen_repos(
         project_name = os.path.basename(repo)
         oss_fuzz_dir = Path(repo).parent.parent
         raw_targets = discover_targets(project_name, oss_fuzz_dir)
-        
-        # 保存原始目标名称和转换后的目标名称
+
+        # Save original target names and transformed target names
         transformed_targets = [t.replace("_print1", "") for t in raw_targets]
-        targets = list(zip(transformed_targets, raw_targets))  # (转换后, 原始)
+        targets = list(zip(transformed_targets, raw_targets))  # (transformed, raw)
         target_map[repo] = targets
 
     # Process each repository in parallel
     with ProcessingPool(jobs) as p:
         list(
             p.map(
                 lambda item: substitute_one_repo(
-                    item[0],         # repo path
-                    item[1],         # list of (transformed, raw) targets
-                    n_fuzz, 
-                    strategy, 
-                    max_len, 
-                    sim_thresh
+                    item[0],  # repo path
+                    item[1],  # list of (transformed, raw) targets
+                    n_fuzz,
+                    strategy,
+                    max_len,
+                    sim_thresh,
                 ),
                 target_map.items(),
             )
         )
 
+
 def main(
     repo_id: str = "data/valid_projects.txt",
     repo_root: str = "fuzz/oss-fuzz/projects/",
@@ -431,4 +443,4 @@ def main(
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
-    fire.Fire(main)
+    fire.Fire(main)