diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3bfc2a2bf1..53837c5895 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,10 @@ v35.1.0 (unreleased) license rules used during the scan. https://github.com/aboutcode-org/scancode.io/issues/1657 +- Add a new step to the ``DeployToDevelop`` pipeline, ``map_python``, to match + Cython source files (.pyx) to their compiled binaries. + https://github.com/aboutcode-org/scancode.io/pull/1703 + - Update scancode-toolkit to v32.4.0. See CHANGELOG for updates: https://github.com/aboutcode-org/scancode-toolkit/releases/tag/v32.4.0 Adds a new ``git_sha1`` attribute to the ``CodebaseResource`` model as this diff --git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index 33f699472b..0a3c086063 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -81,6 +81,7 @@ def steps(cls): cls.map_winpe, cls.map_go, cls.map_rust, + cls.map_python, cls.match_directories_to_purldb, cls.match_resources_to_purldb, cls.map_javascript_post_purldb_match, @@ -221,6 +222,14 @@ def map_rust(self): """Map Rust binaries to their sources using symbols.""" d2d.map_rust_binaries_with_symbols(project=self.project, logger=self.log) + @optional_step("Python") + def map_python(self): + """ + Map binaries from Python packages to their sources using dwarf paths and + symbols. 
def map_python_pyx_to_binaries(project, logger=None):
    """
    Map Cython source files to their compiled ELF binaries in ``project``.

    Source candidates are the from/ files whose extension is listed in the
    "Python" ecosystem config ``source_symbol_extensions``. Binary candidates
    are the to/ ELF files that have no relation yet.

    The symbols parsed from each binary are stored in its ``extra_data``. A
    source is then related (``map_type="python_pyx_match"``) to every candidate
    binary whose ``extra_data`` contains the name of at least one top-level
    function definition found in that source.

    ``logger`` is an optional callable taking a single message string. It is
    only used to report binaries whose symbols could not be parsed.
    """
    from source_inspector.symbols_tree_sitter import get_tree_and_language_info

    python_config = d2d_config.get_ecosystem_config(ecosystem="Python")
    from_resources = (
        project.codebaseresources.files()
        .from_codebase()
        .filter(extension__in=python_config.source_symbol_extensions)
    )
    to_resources = (
        project.codebaseresources.files().to_codebase().has_no_relation().elfs()
    )

    # Collect the symbols of each candidate binary and persist them in its
    # extra_data so they can be searched with a database lookup below.
    for resource in to_resources:
        try:
            binary_symbols = collect_and_parse_elf_symbols(resource.location)
            resource.update_extra_data(binary_symbols)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is reported and
            # skipped. ``logger`` is optional, so guard the call.
            if logger:
                logger(
                    f"Error parsing binary symbols at: "
                    f"{resource.location_path!r} {e!r}"
                )

    for resource in from_resources:
        # Parse the source file and keep the names of the function
        # definitions found directly under the root of the syntax tree.
        tree, _ = get_tree_and_language_info(resource.location)
        identifiers = [
            child.text.decode()
            for node in tree.root_node.children
            if node.type == "function_definition"
            for child in node.children
            if child.type == "identifier"
        ]

        # Without any identifier the lookup below would be an empty ``Q()``,
        # which adds no condition to the query: every remaining binary would
        # be returned and wrongly related to this source.
        if not identifiers:
            continue

        # Select the binaries whose extra_data contains at least one of the
        # function names found in this source.
        identifiers_qs = Q()
        for identifier in identifiers:
            identifiers_qs |= Q(extra_data__icontains=identifier)

        for matching_elf in to_resources.filter(identifiers_qs):
            pipes.make_relation(
                from_resource=resource,
                to_resource=matching_elf,
                map_type="python_pyx_match",
            )
self.project1.codebase_path / d2d.TO), + ] + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + scancode.extract_archive(input_file_path, codebase_path) + + scancode.extract_archives(self.project1.codebase_path, recurse=True) + pipes.collect_and_create_codebase_resources(self.project1) + buffer = io.StringIO() + d2d.map_python_pyx_to_binaries(project=self.project1, logger=buffer.write) + pyx_match_relations = CodebaseRelation.objects.filter( + project=self.project1, map_type="python_pyx_match" + ) + self.assertEqual(1, pyx_match_relations.count()) + @skipIf(sys.platform == "darwin", "Test is failing on macOS") def test_scanpipe_pipes_d2d_map_winpe_symbols(self): input_dir = self.project1.input_path diff --git a/setup.cfg b/setup.cfg index faa6916340..b4f8a4dad2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -86,7 +86,7 @@ install_requires = rust-inspector==0.1.0 binary-inspector==0.1.2 python-inspector==0.14.0 - source-inspector==0.6.1; sys_platform != "darwin" and platform_machine != "arm64" + source-inspector==0.7.0; sys_platform != "darwin" and platform_machine != "arm64" aboutcode-toolkit==11.1.1 # Utilities XlsxWriter==3.2.5