diff --git a/.gitmodules b/.gitmodules index 5b761d3..82926c7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ [submodule "UniTSyn"] path = UniTSyn url = https://github.com/SecurityLab-UCD/UniTSyn.git +[submodule "fuzz/oss-fuzz"] + path = fuzz/oss-fuzz + url = https://github.com/joyguoguo/oss-fuzz.git + branch = main diff --git a/UniTSyn b/UniTSyn index 45c7bd1..0d9e0df 160000 --- a/UniTSyn +++ b/UniTSyn @@ -1 +1 @@ -Subproject commit 45c7bd1152ce420781d4b5ce6d4bf8b1e6c7b3ca +Subproject commit 0d9e0df455655773eaf0acabd9008aa34f0e3f73 diff --git a/data/valid_projects.txt b/data/valid_projects.txt new file mode 100644 index 0000000..94fdcd5 --- /dev/null +++ b/data/valid_projects.txt @@ -0,0 +1,234 @@ +abseil-py +adal +aiohttp +aniso8601 +ansible +argcomplete +arrow-py +asn1crypto +asteval +astroid +asttokens +attrs +autoflake +autopep8 +azure-sdk-for-python +babel +black +botocore +bottleneck +bz2file +cachetools +cffi +chardet +charset_normalizer +click +cloud-custodian +configparser +connexion +coveragepy +croniter +cryptography +cssselect +dask +decorator +defusedxml +digest +dill +distlib +dnspython +docutils +ecdsa-python +et-xmlfile +face +filelock +filesystem_spec +flask +flask-jwt-extended +flask-restx +flask-wtf +fonttools +ftfy +g-api-auth-httplib2 +g-api-auth-library-python +g-api-pubsub +g-api-py-api-common-protos +g-api-py-oauthlib +g-api-python-bigquery-storage +g-api-python-client +g-api-python-cloud-core +g-api-python-firestore +g-api-python-tasks +g-api-resource-manager +g-api-resumable-media-python +g-api-secret-manager +g-apis-py-api-core +gast +gc-iam +gcloud-error-py +g-cloud-logging-py +gcp-python-cloud-storage +genshi +gitdb +glom +gprof2dot +g-py-bigquery +g-py-crc32c +grpc-py +gunicorn +h11 +h5py +hiredis-py +html2text +html5lib-python +httpcore +httpretty +httpx +idna +ijson +importlib_metadata +iniconfig +ipaddress +ipykernel +ipython +isodate +itsdangerous +jedi +jinja2 +jmespathpy +joblib +jsmin +jupyter-nbconvert +jupyter_server +kafka +keras +kiwisolver +lark-parser +libcst +looker-sdk +lxml +mako +markupsafe +matplotlib +mccabe +mdit-py-plugins +mdurl +more-itertools +mrab-regex +msal +msgpack-python +multidict +mutagen +nbclassic +nbformat +netaddr-py +networkx +ntlm2 +ntlm-auth +numexpr +numpy +oauth2 +oauthlib +olefile +openapi-schema-validator +opencensus-python +openpyxl +opt_einsum +oracle-py-cx +orjson +oscrypto +packaging +pandas +paramiko +parse +parsimonious +pasta +pathlib2 +pdoc +pem +pendulum +pip +ply +protobuf-python +proto-plus-python +psqlparse +psutil +psycopg2 +pyasn1 +pyasn1-modules +pycparser +pycrypto +pydantic +pydateutil +pygments +pyjson5 +pyjwt +pymysql +pynacl +pyodbc +pyparsing +pyrsistent +py-serde +pytables +pytest-py +python3-openid +python-ecdsa +python-email-validator +python-fastjsonschema +python-future +python-graphviz +python-hyperlink +python-jose +python-lz4 +python-markdown +python-markdownify +python-nameparser +python-nvd3 +python-pathspec +python-prompt-toolkit +python-pypdf +python-rison +python-rsa +python-tabulate +pytz +pyxdg +pyyaml +pyzmq +redis-py +requests +retry +rfc3967 +rich +sacremoses +scikit-learn +scipy +setuptools +sigstore-python +simplejson +six +smart_open +soupsieve +sqlalchemy_jsonfield +sqlalchemy-utils +sqlparse +stack_data +tensorflow-addons +tinycss2 +toml +tomlkit +toolbelt +toolz +tqdm +typing_extensions +underscore +uritemplate +urlextract +urllib3 +validators +w3lib +websocket-client +wheel +wtforms +xlrd +yarl +zipp diff --git a/fuzz/collect_fuzz.py b/fuzz/collect_fuzz.py index bcc4b5e..95bba86 100644 --- a/fuzz/collect_fuzz.py +++ b/fuzz/collect_fuzz.py @@ -1,5 +1,4 @@ """script for rust fuzzing and transforming test_template""" - import logging from typing import Optional import fire diff --git a/fuzz/collect_fuzz_python.py b/fuzz/collect_fuzz_python.py new file mode 100644 index 0000000..16ae813 --- /dev/null +++ b/fuzz/collect_fuzz_python.py @@ -0,0 +1,511 @@ +""" +Script for Python project fuzzing and test template conversion +usage: PYTHONPATH=. python3 fuzz/collect_fuzz_python.py --pipeline all +""" +from pathlib import Path +import logging +from typing import Optional, List, Tuple +import fire +import os +from UniTSyn.frontend.util import wrap_repo, parallel_subprocess +import subprocess +from os.path import join as pjoin, abspath +from tqdm import tqdm +from pathos.multiprocessing import ProcessingPool +import random +from difflib import SequenceMatcher +from itertools import islice +from datetime import datetime +import re + +def build_image(repos: list[str], jobs: int): + """ + Build Docker images for OSS-Fuzz projects corresponding to each repository + + Args: + repos (list[str]): List of repository paths + jobs (int): Number of parallel tasks + """ + logging.info(f"Building Docker images for {len(repos)} OSS-Fuzz projects") + log_dir = os.path.abspath("fuzz_pipeline_log") + os.makedirs(log_dir, exist_ok=True) + + def _build_cmd(path: str): + project_name = os.path.basename(path.rstrip("/")) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_file = os.path.join(log_dir, f"{project_name}_{timestamp}.log") + + logging.info(f"Start building {project_name}, logging to {log_file}") + return subprocess.Popen( + f"yes | python3 infra/helper.py build_image {project_name}", + cwd=os.path.abspath(os.path.join(path, "../../")), + stdout=open(log_file, "w"), + stderr=subprocess.STDOUT, + shell=True, + ) + + _ = parallel_subprocess(repos, jobs, _build_cmd, on_exit=None) + +def build_fuzzer(repos: list[str], jobs: int): + """ + Build fuzzers in parallel for successfully built projects + + Args: + repos (list[str]): List of repository paths + jobs (int): Number of parallel tasks + """ + logging.info(f"Building fuzzers for {len(repos)} OSS-Fuzz projects") + log_dir = os.path.abspath("fuzz_pipeline_log") + os.makedirs(log_dir, exist_ok=True) + + def _build_cmd(path: str): + project_name = os.path.basename(path.rstrip("/")) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_file = os.path.join(log_dir, f"{project_name}_fuzzer_{timestamp}.log") + + logging.info(f"Start building fuzzers for {project_name}, logging to {log_file}") + return subprocess.Popen( + f"python3 infra/helper.py build_fuzzers --sanitizer address {project_name}", + cwd=os.path.abspath(os.path.join(path, "../../")), + stdout=open(log_file, "w"), + stderr=subprocess.STDOUT, + shell=True, + ) + + _ = parallel_subprocess(repos, jobs, _build_cmd, on_exit=None) + +def discover_targets(project_name: str, oss_fuzz_dir: Path) -> list[str]: + """ + Discover fuzzing targets + + Args: + project_name (str): Project name + oss_fuzz_dir (Path): OSS-Fuzz root directory + + Returns: + list[str]: List of target names + """ + out_dir = oss_fuzz_dir / "build" / "out" / project_name + targets: list[str] = [] + + if not out_dir.is_dir(): + logging.warning(f"Build output directory for {project_name} does not exist") + return targets + + try: + for f in out_dir.iterdir(): + if (f.is_file() and f.name.startswith("fuzz_") and + '.' not in f.name and f.name.endswith("print1") and + os.access(f, os.X_OK)): + targets.append(f.name) + except Exception as e: + logging.error(f"Error discovering targets: {e}") + + return targets + +def fuzz_one_target(target: tuple[str, str], timeout: int): + """ + Perform fuzzing on a single fuzzing target + + Args: + target (tuple[str, str]): (Repository path, target name) + timeout (int): Timeout duration (seconds) + + Returns: + subprocess.Popen: Subprocess object + """ + repo_path, target_name = target + project_name = os.path.basename(repo_path) + oss_fuzz_root = os.path.dirname(os.path.dirname(repo_path)) + + # Create input file path + input_file_path = pjoin(repo_path, "fuzz_inputs", target_name) + os.makedirs(os.path.dirname(input_file_path), exist_ok=True) + + try: + with open(input_file_path, "w") as input_file: + return subprocess.Popen( + [ + "bash", + "-c", + f"timeout {timeout} python3 infra/helper.py run_fuzzer {project_name} {target_name}" + ], + cwd=oss_fuzz_root, + stdout=input_file, + stderr=subprocess.DEVNULL, + ) + except Exception as e: + logging.error(f"Error starting fuzzer: {e}") + return None + +def fuzz_repos(repos: list[str], jobs: int, timeout: int = 60): + """ + Perform fuzzing on a set of repositories + + Args: + repos (list[str]): List of repository paths + jobs (int): Number of parallel tasks + timeout (int): Timeout duration (seconds) + """ + logging.info("Discovering fuzz targets") + + # Get all targets + targets_list = [] + for repo in repos: + project_name = os.path.basename(repo) + oss_fuzz_dir = Path(repo).parent.parent + targets = discover_targets(project_name, oss_fuzz_dir) + targets_list.append(targets) + + # Create target mapping + target_map = {repo: targets for repo, targets in zip(repos, targets_list)} + all_targets: list[tuple[str, str]] = [ + (k, v) for k, vs in target_map.items() for v in vs + ] + + logging.info(f"Running fuzzing on {len(all_targets)} targets") + + # Create input directory + for repo in repos: + os.makedirs(pjoin(repo, "fuzz_inputs"), exist_ok=True) + + # Execute fuzzing in parallel + parallel_subprocess(all_targets, jobs, lambda p: fuzz_one_target(p, timeout), on_exit=None) + +def generate_test_template(target_name: str, repo_path: str): + """ + Generate Python test template for a single target + + Args: + target_name (str): Target name + repo_path (str): Repository path + + Returns: + str: Template file path + """ + template_dir = pjoin(repo_path, "tests-gen") + os.makedirs(template_dir, exist_ok=True) + + # Ensure __init__.py exists + init_path = pjoin(template_dir, "__init__.py") + if not os.path.exists(init_path): + with open(init_path, "w") as f: + f.write("") + + template_path = pjoin(template_dir, f"{target_name}.py") + + # Python test template with placeholder + template = f"""#!/usr/bin/env python3 +import sys +import os +import unittest + +# Add the parent directory to the Python path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +# Import the function to test +try: + from {target_name} import TestOneInput as TestClass +except ImportError: + from {target_name} import TestInput as TestClass + +class Test{target_name.capitalize()}(unittest.TestCase): + def test_generated(self): + \"\"\"Test generated from fuzzing input\"\"\" + input_data = b"" + result = TestClass(input_data) + +if __name__ == '__main__': + unittest.main() +""" + with open(template_path, "w") as f: + f.write(template) + + return template_path + +def transform_repos(repos: list[str], jobs: int): + """ + Generate test templates for all targets + + Args: + repos (list[str]): List of repository paths + jobs (int): Number of parallel tasks + """ + logging.info("Generating test templates") + + def _transform_repo(repo: str): + project_name = os.path.basename(repo) + oss_fuzz_dir = Path(repo).parent.parent + targets = discover_targets(project_name, oss_fuzz_dir) + return [generate_test_template(t, repo) for t in targets] + + with ProcessingPool(jobs) as p: + return list(p.map(_transform_repo, repos)) + +def escape_special_chars(input_data: str) -> str: + """ + Escape special characters in input data for Python byte strings + + Args: + input_data (str): Raw input data + + Returns: + str: Input data with escaped characters + """ + # For Python, we can use repr() to safely represent byte strings + # This will handle all special characters and non-ASCII bytes + return repr(input_data.encode('latin-1', 'replace')) + +def substitute_input(template: str, input_data: str, idx: int, target_name: str) -> str: + """ + Replace fuzzing input into Python test template + + Args: + template (str): Template content + input_data (str): Input data + idx (int): Test index + target_name (str): Target name + + Returns: + str: Test code after substitution + """ + # Escape special characters for Python + escaped_input = escape_special_chars(input_data) + + # Replace input placeholder + new_template = template.replace( + 'input_data = b""', + f'input_data = {escaped_input}' + ) + + # Replace test method name to avoid duplication + return new_template.replace( + f"def test_generated(self):", + f"def test_{idx}(self):" + ) + +def has_similar(selected: list[str], x: str, thresh: float = 0.8) -> bool: + """ + Check if a string is sufficiently similar to any string in the selected list + + Args: + selected (list[str]): List of selected strings + x (str): String to check + thresh (float): Similarity threshold + + Returns: + bool: Whether they are similar + """ + def similar(a, b): + return SequenceMatcher(None, a, b).ratio() + return any(similar(x, y) > thresh for y in selected) + +import re + +def substitute_one_repo( + repo: str, + targets: list[str], + n_fuzz: int, + strategy: str, + max_len: int, + sim_thresh: float, +): + """ + Process a single repository, replace fuzzing inputs into test templates + and generate {target_name}.inputs.py files. + """ + template_dir = pjoin(repo, "tests-gen") + input_dir = pjoin(repo, "fuzz_inputs") + + # Ensure __init__.py exists + init_path = pjoin(template_dir, "__init__.py") + if not os.path.exists(init_path): + with open(init_path, "w") as f: + f.write("") + + for target_name in targets: + input_path = pjoin(input_dir, target_name) + + try: + if not os.path.exists(input_path): + logging.warning(f"Input file not found: {input_path}") + continue + + with open(input_path, "r") as f_input: + all_inputs = [line.strip() for line in f_input if line.strip()] + + if not all_inputs: + logging.warning(f"No valid inputs found for {target_name}") + continue + + logging.info(f"Loaded {len(all_inputs)} inputs for {target_name}") + + # Input selection + if strategy == "shuffle": + random.shuffle(all_inputs) + inputs = list(islice((x for x in all_inputs if len(x) < max_len), n_fuzz)) + elif strategy == "reverse": + inputs = [] + for x in reversed(all_inputs): + if len(inputs) >= n_fuzz: + break + if len(x) > max_len or has_similar(inputs, x, sim_thresh): + continue + inputs.append(x) + else: + inputs = all_inputs[:n_fuzz] + + # Header + file_header = f"""import sys +import os +import unittest + +# 将项目目录加入 Python 路径,确保能导入上层模块 +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + +try: + from {target_name} import TestOneInput as TestClass +except ImportError: + from {target_name} import TestInput as TestClass + + +class Test{target_name.capitalize()}(unittest.TestCase):""" + + # Method body template + method_body_template = [ + '"""Test generated from fuzzing input"""', + 'input_data = b""', + 'result = TestClass(input_data)', + ] + + # Generate test methods + test_methods = [] + for i, input_data in enumerate(inputs): + escaped_input = escape_special_chars(input_data) + test_func = f" def test_{i}(self):\n" + for line in method_body_template: + replaced_line = line.replace('input_data = b""', f"input_data = {escaped_input}") + test_func += f" {replaced_line}\n" + test_methods.append(test_func) + + if not test_methods: + test_methods = [" def test_placeholder(self):\n self.assertTrue(True)"] + + # Combine full file + final_code = file_header + "\n\n" + "\n\n".join(test_methods) + final_code += "\n\nif __name__ == '__main__':\n unittest.main()\n" + + # Write output file + generated_path = pjoin(template_dir, f"{target_name}.inputs.py") + with open(generated_path, "w") as f: + f.write(final_code) + + # Format with black + try: + subprocess.run(["black", generated_path], check=False) + except FileNotFoundError: + logging.warning("Black formatter not found, skipping formatting") + + except Exception as e: + logging.error(f"Error processing {target_name}: {e}") + + +def testgen_repos( + repos: list[str], + jobs: int, + n_fuzz: int = 100, + strategy: str = "shuffle", + max_len: int = 100, + sim_thresh: float = 0.8, +): + """ + Generate test cases from fuzzing inputs + + Args: + repos (list[str]): List of repository paths + jobs (int): Number of parallel tasks + n_fuzz (int): Number of inputs to use + strategy (str): Selection strategy + max_len (int): Maximum length + sim_thresh (float): Similarity threshold + """ + # First get all targets + targets_list = [] + for repo in repos: + project_name = os.path.basename(repo) + oss_fuzz_dir = Path(repo).parent.parent + targets = discover_targets(project_name, oss_fuzz_dir) + targets_list.append(targets) + + target_map = {repo: targets for repo, targets in zip(repos, targets_list)} + + # Process each repository in parallel + with ProcessingPool(jobs) as p: + list(p.map( + lambda item: substitute_one_repo( + item[0], item[1], n_fuzz, strategy, max_len, sim_thresh + ), + target_map.items() + )) + +def main( + repo_id: str = "data/valid_projects.txt", + repo_root: str = "fuzz/oss-fuzz/projects/", + timeout: int = 60, + jobs: int = 4, + pipeline: str = "all", + n_fuzz: int = 100, + strategy: str = "shuffle", + max_len: int = 100, + sim_thresh: float = 0.8, +): + """ + Main function, controlling the entire fuzzing process + + Args: + repo_id (str): Project ID file path + repo_root (str): Project root directory + timeout (int): Timeout duration + jobs (int): Number of parallel tasks + pipeline (str): Pipeline type + n_fuzz (int): Number of inputs to use + strategy (str): Selection strategy + max_len (int): Maximum length + sim_thresh (float): Similarity threshold + """ + try: + with open(repo_id, "r") as f: + repo_id_list = [line.strip() for line in f if line.strip()] + except FileNotFoundError: + repo_id_list = [repo_id] + + # Collect repository paths + repos = [] + for repo_id in repo_id_list: + repo_path = abspath(os.path.join(repo_root, repo_id)) + if os.path.isdir(repo_path): + repos.append(repo_path) + + # Execute specified pipeline + if pipeline == "build_image": + build_image(repos, jobs) + elif pipeline == "build_fuzzer": + build_fuzzer(repos, jobs) + elif pipeline == "fuzz": + fuzz_repos(repos, jobs, timeout) + elif pipeline == "testgen": + testgen_repos(repos, jobs, n_fuzz, strategy, max_len, sim_thresh) + elif pipeline == "transform": + transform_repos(repos, jobs) + elif pipeline == "all": + build_image(repos, jobs) + build_fuzzer(repos, jobs) + transform_repos(repos, jobs) # Generate test templates + fuzz_repos(repos, jobs, timeout) + testgen_repos(repos, jobs, n_fuzz, strategy, max_len, sim_thresh) + else: + logging.error(f"Unknown pipeline: {pipeline}") + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + fire.Fire(main) \ No newline at end of file diff --git a/fuzz/modify_fuzz_files.py b/fuzz/modify_fuzz_files.py new file mode 100644 index 0000000..de8333f --- /dev/null +++ b/fuzz/modify_fuzz_files.py @@ -0,0 +1,66 @@ +import os +import re + +def add_print_to_testoneinput(file_path): + with open(file_path, 'r') as f: + content = f.read() + + # 正则表达式匹配TestOneInput或TestInput函数定义及其函数体 + pattern = r'(\bdef\s+(TestOneInput|TestInput)\(data\):\s*\n)((?:[ \t]+.*\n|\s*\n)*)' + matches = re.finditer(pattern, content, re.MULTILINE) + + new_content = content + for match in reversed(list(matches)): + function_def = match.group(1) + function_body = match.group(3) + + # 在函数体开头添加print(data)语句 + new_function_body = re.sub( + r'^([ \t]*)(.*\n)', + r'\g<1>\2\g<1>print(data)\n', + function_body, + count=1 + ) + + # 只有在函数体非空且未添加过print时才替换 + if new_function_body != function_body: + new_content = ( + new_content[:match.start(3)] + + new_function_body + + new_content[match.end(3):] + ) + + return new_content + +def main(): + projects_path = "/home/jiayiguo/FuzzAug/fuzz/oss-fuzz/projects" + valid_projects_file = "data/valid_projects.txt" + + with open(valid_projects_file, 'r') as f: + projects = [line.strip() for line in f if line.strip()] + + for project in projects: + project_dir = os.path.join(projects_path, project) + + if not os.path.isdir(project_dir): + continue + + for root, _, files in os.walk(project_dir): + for file in files: + if file.startswith('fuzz_') and file.endswith('.py'): + file_path = os.path.join(root, file) + + try: + new_content = add_print_to_testoneinput(file_path) + + # 保存修改后的文件(添加_print后缀) + new_file_path = file_path.rsplit('.', 1)[0] + '_print1.py' + with open(new_file_path, 'w') as f: + f.write(new_content) + print(f"Processed: {file_path} -> {new_file_path}") + + except Exception as e: + print(f"Error processing {file_path}: {str(e)}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/fuzz/oss-fuzz b/fuzz/oss-fuzz new file mode 160000 index 0000000..8f6559b --- /dev/null +++ b/fuzz/oss-fuzz @@ -0,0 +1 @@ +Subproject commit 8f6559b916e0d7ca6e7f974394ce6f651783c163 diff --git a/image_build_results.json b/image_build_results.json new file mode 100644 index 0000000..72b9fe6 --- /dev/null +++ b/image_build_results.json @@ -0,0 +1,6 @@ +{ + "autoflake": true, + "autopep8": true, + "azure-sdk-for-python": true, + "babel": true +} \ No newline at end of file