def fingerprint_stemmed_codebase_resource(location, with_threading=True, **kwargs):
    """
    Run the stemmed code fingerprint scanner on the resource at `location`
    using the scancode-toolkit direct API.

    Return what `_scan_resource` returns: a dictionary of scan `results` and a
    list of `errors`. Extra `kwargs` are accepted but are not used here.
    """
    stemmed_scanner = Scanner(
        "stemmed_fingerprints", get_stemmed_file_fingerprint_hashes
    )
    return _scan_resource(location, [stemmed_scanner], with_threading=with_threading)


def fingerprint_stemmed_codebase_resources(
    project, resource_qs=None, progress_logger=None, to_codebase_only=False
):
    """
    Compute and save stemmed code fingerprints for resources of `project`.

    These resource fingerprints are used for matching purposes on matchcode.

    When `resource_qs` is not provided, the text resources of the project
    (`is_text=True`) are selected. When `to_codebase_only` is True, the selection
    is further narrowed to the resources of the `to/` codebase.

    Multiprocessing is enabled by default on this pipe, the number of processes
    can be controlled through the SCANCODEIO_PROCESSES setting.
    """
    # `None` means "not provided": an empty queryset passed by the caller is kept
    # as-is and must not fall back to the default selection.
    resources = (
        project.codebaseresources.filter(is_text=True)
        if resource_qs is None
        else resource_qs
    )
    if to_codebase_only:
        resources = resources.to_codebase()

    scan_resources(
        resource_qs=resources,
        scan_func=fingerprint_stemmed_codebase_resource,
        save_func=save_resource_fingerprints,
        progress_logger=progress_logger,
    )
"46828d9d4a64300b1543e4e5a6356ed5", + "position": 12 + }, + { + "snippet": "c0496b020a8d87a3b1bf1a83c67c16d5", + "position": 14 + }, + { + "snippet": "b2ec716c571a0368ea37dbb7821c6945", + "position": 15 + }, + { + "snippet": "8dd2b57022204ecd9ea4a2471f224fd4", + "position": 22 + }, + { + "snippet": "cb9216ce4ad33a5d6feb378dbf0404c8", + "position": 30 + }, + { + "snippet": "034b634f1c726c9c0f7740ea9723637b", + "position": 37 + }, + { + "snippet": "d0bb8a1740512218c8e87bbaa5f5d9a6", + "position": 38 + }, + { + "snippet": "7ae529b13ddb3b0c74421772d78821a7", + "position": 41 + }, + { + "snippet": "b2aad3c6ab2c2c9ba1a95edac417aa09", + "position": 42 + }, + { + "snippet": "be339f1c1670b7789e83f875978c1e06", + "position": 46 + }, + { + "snippet": "a895f0ff2b99352b33392fda0a87a4cf", + "position": 53 + }, + { + "snippet": "6819c7f718a1fa7f2501009d21ee46d7", + "position": 57 + }, + { + "snippet": "97ecd33b1ca08589363df198458d976f", + "position": 61 + }, + { + "snippet": "2c73086d098f182cf8441046b97af434", + "position": 64 + }, + { + "snippet": "3ba6ad01d6f9130be38df14a44633abd", + "position": 67 + } + ], + "line_by_pos": { + "0": 1, + "1": 1, + "2": 1, + "3": 3, + "4": 3, + "5": 3, + "6": 3, + "7": 3, + "8": 4, + "9": 4, + "10": 5, + "11": 5, + "12": 6, + "13": 6, + "14": 6, + "15": 6, + "16": 6, + "17": 6, + "18": 7, + "19": 7, + "20": 7, + "21": 7, + "22": 7, + "23": 7, + "24": 8, + "25": 8, + "26": 8, + "27": 8, + "28": 8, + "29": 8, + "30": 11, + "31": 11, + "32": 11, + "33": 11, + "34": 11, + "35": 11, + "36": 11, + "37": 12, + "38": 12, + "39": 12, + "40": 15, + "41": 15, + "42": 16, + "43": 16, + "44": 16, + "45": 16, + "46": 17, + "47": 17, + "48": 17, + "49": 18, + "50": 18, + "51": 19, + "52": 19, + "53": 19, + "54": 20, + "55": 20, + "56": 20, + "57": 20, + "58": 21, + "59": 21, + "60": 21, + "61": 21, + "62": 21, + "63": 22, + "64": 22, + "65": 22, + "66": 22, + "67": 22, + "68": 23, + "69": 23, + "70": 24, + "71": 24, + "72": 24, + "73": 25, + "74": 25, + "75": 
25, + "76": 27, + "77": 27, + "78": 28, + "79": 28, + "80": 28, + "81": 29, + "82": 29 + }, + "stemmed_halo1": "000000240a64b6c8aae4625491a8aa77ffd9b2a6", + "stemmed_snippets": [ + { + "snippet": "8e5f6fead6d0469a9af967bd3b3c823c", + "position": "0" + }, + { + "snippet": "3b4fb17158ed94e2babd49970af94d06", + "position": 2 + }, + { + "snippet": "b0607c96667235727aa1e4212e907f7b", + "position": 3 + }, + { + "snippet": "65aecd343e17c78db5cfca34a8a4fa02", + "position": 4 + }, + { + "snippet": "89a7bf1c4ead7854f274e6f41b7654da", + "position": 5 + }, + { + "snippet": "8c38b55be87ffec2c0b91d6085f12e69", + "position": 6 + }, + { + "snippet": "5e0ddfbe6eeaa0bbe00f0a3bcb4183a8", + "position": 7 + }, + { + "snippet": "f8a7cabd43fb2d8a40a23d83217e3d8b", + "position": 8 + }, + { + "snippet": "fdc4910fe720d6b9f20196d306e7aedc", + "position": 9 + }, + { + "snippet": "7a5ee56ca82edc1c76e0b0b9322129dd", + "position": 10 + }, + { + "snippet": "6b93bb4ea1623dd6946a21f99418a3fa", + "position": 11 + }, + { + "snippet": "8f2a211b1a10cbd28fb8f1ad21dbf5fb", + "position": 12 + }, + { + "snippet": "c3c82df4de85b1c9dbf69b2b5a45935c", + "position": 13 + }, + { + "snippet": "216e662345dd2969bff90aefdae76672", + "position": 14 + }, + { + "snippet": "24d9e003c332e26e2cae1263d18e0ef6", + "position": 15 + }, + { + "snippet": "7210020de6bfe60b69ca8ec908845a15", + "position": 17 + }, + { + "snippet": "667f800b10c105c2418effd6035e6763", + "position": 18 + }, + { + "snippet": "c18caedb3daf59b210278b2b6d1d0db5", + "position": 19 + }, + { + "snippet": "a19fe989f63161a76526933a34593741", + "position": 20 + }, + { + "snippet": "f782389ac40b56bc81a7c92f40d87a83", + "position": 21 + }, + { + "snippet": "4ed61cd372dcc7d88c95d899271fd138", + "position": 22 + }, + { + "snippet": "e9c74c50192eb95bc4595254fc253427", + "position": 23 + }, + { + "snippet": "5a908af743b549f1f0ef8ab02c9053eb", + "position": 24 + } + ], + "matched_snippets": [ + { + "package": "pkg:github/isaacs/inherits@v2.0.3", + "resource": 
def test_scanpipe_pipes_matchcode_fingerprint_stemmed_codebase_resources(self):
    """
    Fingerprint three resources of `project1` and check that only
    `inherits.js` ends up with stemmed fingerprint data in its `extra_data`.
    """
    project = self.project1
    codebase_path = project.codebase_path

    # Text resource that is expected to end up without a fingerprint
    copy_input(self.data / "aboutcode" / "notice.NOTICE", codebase_path)
    notice_resource = CodebaseResource.objects.create(
        project=project, path="notice.NOTICE", is_text=True
    )

    # Archive resource, not flagged as text, expected to end up without a
    # fingerprint
    copy_input(self.data / "scancode" / "is-npm-1.0.0.tgz", codebase_path)
    archive_resource = CodebaseResource.objects.create(
        project=project, path="is-npm-1.0.0.tgz"
    )

    # JavaScript resource that is expected to be fingerprinted
    copy_input(
        self.data / "matchcode" / "fingerprinting" / "inherits.js", codebase_path
    )
    js_resource = CodebaseResource.objects.create(
        project=project, path="inherits.js", is_text=True
    )

    matchcode.fingerprint_stemmed_codebase_resources(project)
    for resource in (notice_resource, archive_resource, js_resource):
        resource.refresh_from_db()

    # (position, snippet hash) pairs expected for inherits.js
    expected_snippets = [
        (0, "8e5f6fead6d0469a9af967bd3b3c823c"),
        (2, "3b4fb17158ed94e2babd49970af94d06"),
        (3, "b0607c96667235727aa1e4212e907f7b"),
        (4, "65aecd343e17c78db5cfca34a8a4fa02"),
        (5, "89a7bf1c4ead7854f274e6f41b7654da"),
        (6, "8c38b55be87ffec2c0b91d6085f12e69"),
        (7, "5e0ddfbe6eeaa0bbe00f0a3bcb4183a8"),
        (8, "f8a7cabd43fb2d8a40a23d83217e3d8b"),
        (9, "fdc4910fe720d6b9f20196d306e7aedc"),
        (10, "7a5ee56ca82edc1c76e0b0b9322129dd"),
        (11, "6b93bb4ea1623dd6946a21f99418a3fa"),
        (12, "8f2a211b1a10cbd28fb8f1ad21dbf5fb"),
        (13, "c3c82df4de85b1c9dbf69b2b5a45935c"),
        (14, "216e662345dd2969bff90aefdae76672"),
        (15, "24d9e003c332e26e2cae1263d18e0ef6"),
        (17, "7210020de6bfe60b69ca8ec908845a15"),
        (18, "667f800b10c105c2418effd6035e6763"),
        (19, "c18caedb3daf59b210278b2b6d1d0db5"),
        (20, "a19fe989f63161a76526933a34593741"),
        (21, "f782389ac40b56bc81a7c92f40d87a83"),
        (22, "4ed61cd372dcc7d88c95d899271fd138"),
        (23, "e9c74c50192eb95bc4595254fc253427"),
        (24, "5a908af743b549f1f0ef8ab02c9053eb"),
    ]
    expected_extra_data = {
        "stemmed_halo1": "000000240a64b6c8aae4625491a8aa77ffd9b2a6",
        "stemmed_snippets": [
            {"snippet": snippet, "position": position}
            for position, snippet in expected_snippets
        ],
    }

    self.assertEqual(expected_extra_data, js_resource.extra_data)
    self.assertFalse(notice_resource.extra_data)
    self.assertFalse(archive_resource.extra_data)
@staticmethod
def get_matched_snippet_annotations(resource):
    """
    Return a list of ``{"start_line": ..., "end_line": ...}`` annotations built
    from the snippet match data stored in ``resource.extra_data``.

    The ``matched_snippets`` and ``line_by_pos`` entries of ``extra_data`` are
    used. An empty list is returned when either entry is missing or empty.
    Snippets without ``match_detections``, and spans whose start or end position
    has no entry in ``line_by_pos``, are skipped instead of raising, so that
    incomplete match data does not break the resource details view.
    """
    extra_data = resource.extra_data or {}
    matched_snippets = extra_data.get("matched_snippets")
    line_by_pos = extra_data.get("line_by_pos")
    if not matched_snippets or not line_by_pos:
        return []

    annotations = []
    for matched_snippet in matched_snippets:
        match_detections = matched_snippet.get("match_detections")
        if not match_detections:
            continue

        # Convert the list of matched positions (ints) to a Span, then emit one
        # annotation for each of its sub-spans.
        for span in Span(match_detections).subspans():
            # line_by_pos is stored as JSON and keys in JSON are always strings
            start_line = line_by_pos.get(str(span.start))
            end_line = line_by_pos.get(str(span.end))
            if start_line is None or end_line is None:
                continue
            annotations.append({"start_line": start_line, "end_line": end_line})

    return annotations