From e06433d606a732f298d89795b2528ff0847f7f12 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 25 Feb 2025 12:26:03 +0530 Subject: [PATCH 1/9] Add scanner to compute stem code fingerprint Signed-off-by: Keshav Priyadarshi --- scanpipe/pipes/matchcode.py | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/scanpipe/pipes/matchcode.py b/scanpipe/pipes/matchcode.py index 1ad02899dd..5e2da11edc 100644 --- a/scanpipe/pipes/matchcode.py +++ b/scanpipe/pipes/matchcode.py @@ -28,6 +28,7 @@ import requests from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes +from matchcode_toolkit.fingerprinting import get_stem_file_fingerprint_hashes from scancode import Scanner from scanpipe.pipes import codebase @@ -254,6 +255,48 @@ def fingerprint_codebase_resources( ) +def fingerprint_stem_codebase_resource(location, with_threading=True, **kwargs): + """ + Compute stem code fingerprints for the resource at `location` using the + scancode-toolkit direct API. + + Return a dictionary of scan `results` and a list of `errors`. + """ + scanners = [ + Scanner("stem_fingerprints", get_stem_file_fingerprint_hashes), + ] + return _scan_resource(location, scanners, with_threading=with_threading) + + +def fingerprint_setm_codebase_resources( + project, resource_qs=None, progress_logger=None, to_codebase_only=False +): + """ + Compute stem code fingerprints for the resources from `project`. + + These resource fingerprints are used for matching purposes on matchcode. + + Multiprocessing is enabled by default on this pipe, the number of processes can be + controlled through the SCANCODEIO_PROCESSES setting. + + If `to_codebase_only` is True, the only resources from the `to/` codebase + are computed. + """ + # Checking for None to make the distinction with an empty resource_qs queryset + if resource_qs is None: + resource_qs = project.codebaseresources.filter(is_text=True) + + if to_codebase_only: + resource_qs = resource_qs.to_codebase() + + scan_resources( + resource_qs=resource_qs, + scan_func=fingerprint_stem_codebase_resource, + save_func=save_resource_fingerprints, + progress_logger=progress_logger, + ) + + def send_project_json_to_matchcode( project, timeout=DEFAULT_TIMEOUT, api_url=MATCHCODEIO_API_URL ): From 5e8250e85333e00b1c24b5ba977c07a9f552c58d Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 25 Feb 2025 12:31:19 +0530 Subject: [PATCH 2/9] Bump matchcode-toolkit Signed-off-by: Keshav Priyadarshi --- scanpipe/pipes/matchcode.py | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipes/matchcode.py b/scanpipe/pipes/matchcode.py index 5e2da11edc..d2ad916a07 100644 --- a/scanpipe/pipes/matchcode.py +++ b/scanpipe/pipes/matchcode.py @@ -268,7 +268,7 @@ def fingerprint_stem_codebase_resource(location, with_threading=True, **kwargs): return _scan_resource(location, scanners, with_threading=with_threading) -def fingerprint_setm_codebase_resources( +def fingerprint_stem_codebase_resources( project, resource_qs=None, progress_logger=None, to_codebase_only=False ): """ diff --git a/setup.cfg b/setup.cfg index 5f0a37a959..52fd00facb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -97,7 +97,7 @@ install_requires = cyclonedx-python-lib==9.1.0 jsonschema==4.23.0 # MatchCode-toolkit - matchcode-toolkit==7.0.0 + matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@8e0405ecd40db7b0644eacce73b3071ebbe0d9fd # Univers univers==30.12.1 # Markdown From 83fc82376d4d8083c86e7d20d781083f1f9dbe46 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 25 Feb 2025 10:13:11 -0800 Subject: [PATCH 3/9] Update function names Signed-off-by: Jono Yang --- scanpipe/pipes/matchcode.py | 14 +++++++------- setup.cfg | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scanpipe/pipes/matchcode.py b/scanpipe/pipes/matchcode.py index d2ad916a07..09afc1ec76 100644 --- a/scanpipe/pipes/matchcode.py +++ b/scanpipe/pipes/matchcode.py @@ -28,7 +28,7 @@ import requests from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes -from matchcode_toolkit.fingerprinting import get_stem_file_fingerprint_hashes +from matchcode_toolkit.fingerprinting import get_stemmed_file_fingerprint_hashes from scancode import Scanner from scanpipe.pipes import codebase @@ -255,24 +255,24 @@ def fingerprint_codebase_resources( ) -def fingerprint_stem_codebase_resource(location, with_threading=True, **kwargs): +def fingerprint_stemmed_codebase_resource(location, with_threading=True, **kwargs): """ - Compute stem code fingerprints for the resource at `location` using the + Compute stemmed code fingerprints for the resource at `location` using the scancode-toolkit direct API. Return a dictionary of scan `results` and a list of `errors`. """ scanners = [ - Scanner("stem_fingerprints", get_stem_file_fingerprint_hashes), + Scanner("stemmed_fingerprints", get_stemmed_file_fingerprint_hashes), ] return _scan_resource(location, scanners, with_threading=with_threading) -def fingerprint_stem_codebase_resources( +def fingerprint_stemmed_codebase_resources( project, resource_qs=None, progress_logger=None, to_codebase_only=False ): """ - Compute stem code fingerprints for the resources from `project`. + Compute stemmed code fingerprints for the resources from `project`. These resource fingerprints are used for matching purposes on matchcode. @@ -291,7 +291,7 @@ def fingerprint_stem_codebase_resources( scan_resources( resource_qs=resource_qs, - scan_func=fingerprint_stem_codebase_resource, + scan_func=fingerprint_stemmed_codebase_resource, save_func=save_resource_fingerprints, progress_logger=progress_logger, ) diff --git a/setup.cfg b/setup.cfg index 52fd00facb..658547fe61 100644 --- a/setup.cfg +++ b/setup.cfg @@ -97,7 +97,7 @@ install_requires = cyclonedx-python-lib==9.1.0 jsonschema==4.23.0 # MatchCode-toolkit - matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@8e0405ecd40db7b0644eacce73b3071ebbe0d9fd + matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@5201aa53ec222f3de25943fde026b16b19244e7d # Univers univers==30.12.1 # Markdown From 2b4d8110c4bae9ee704d494a4b5be8b6b4058156 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 25 Feb 2025 13:08:28 -0800 Subject: [PATCH 4/9] Update matchcode-toolkit to 7.2.1 Signed-off-by: Jono Yang --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 658547fe61..04f018e388 100644 --- a/setup.cfg +++ b/setup.cfg @@ -97,7 +97,7 @@ install_requires = cyclonedx-python-lib==9.1.0 jsonschema==4.23.0 # MatchCode-toolkit - matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@5201aa53ec222f3de25943fde026b16b19244e7d + matchcode-toolkit==7.2.1 # Univers univers==30.12.1 # Markdown From 76e1daae6cb9edabf2f51175c618bfce9818e408 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 26 Feb 2025 18:22:36 -0800 Subject: [PATCH 5/9] Use matchcode-toolkit branch * there is a bug in the code stemming function Signed-off-by: Jono Yang --- scanpipe/pipes/matchcode.py | 12 ++++++++++++ scanpipe/views.py | 24 ++++++++++++++++++++++++ setup.cfg | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/scanpipe/pipes/matchcode.py b/scanpipe/pipes/matchcode.py index 09afc1ec76..7c5ea9ef3d 100644 --- a/scanpipe/pipes/matchcode.py +++ b/scanpipe/pipes/matchcode.py @@ -28,6 +28,7 @@ import requests from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes +from matchcode_toolkit.fingerprinting import get_line_by_pos from matchcode_toolkit.fingerprinting import get_stemmed_file_fingerprint_hashes from scancode import Scanner @@ -405,3 +406,14 @@ def create_packages_from_match_results(project, match_results): package_data=matched_package, status=flag.MATCHED_TO_PURLDB_PACKAGE, ) + match_resources = match_results.get("files", []) + for match_resource in match_resources: + match_resource_extra_data = match_resource["extra_data"] + if match_resource_extra_data: + resource = project.codebaseresources.get(path=match_resource["path"]) + # compute line_by_pos for displaying matches in CodebaseResource detail view + with open(resource.location) as f: + content = f.read() + line_by_pos = get_line_by_pos(content) + match_resource_extra_data["line_by_pos"] = line_by_pos + resource.update_extra_data(match_resource_extra_data) diff --git a/scanpipe/views.py b/scanpipe/views.py index 69cd592b9f..05f06325f7 100644 --- a/scanpipe/views.py +++ b/scanpipe/views.py @@ -61,6 +61,7 @@ import saneyaml import xlsxwriter from django_filters.views import FilterView +from licensedcode.spans import Span from packageurl.contrib.django.models import PACKAGE_URL_FIELDS from scancodeio.auth import ConditionalLoginRequired @@ -1948,6 +1949,26 @@ def get_license_annotations(self, field_name): return annotations + def get_matched_snippet_annotations(self, resource): + # convert qspan from list of ints to Spans + matched_snippet_annotations = [] + matched_snippets = resource.extra_data.get("matched_snippets") + if matched_snippets: + line_by_pos = resource.extra_data.get("line_by_pos") + for matched_snippet in matched_snippets: + match_detections = matched_snippet["match_detections"] + qspan = Span(match_detections) + for span in qspan.subspans(): + # line_by_pos is stored as JSON and keys in JSON are always + # strings + matched_snippet_annotations.append( + { + "start_line": line_by_pos[str(span.start)], + "end_line": line_by_pos[str(span.end)], + } + ) + return matched_snippet_annotations + def get_context_data(self, **kwargs): context = super().get_context_data(**kwargs) resource = self.object @@ -1964,6 +1985,9 @@ def get_context_data(self, **kwargs): "licenses": license_annotations, } + matched_snippet_annotations = self.get_matched_snippet_annotations(resource) + context["detected_values"]["matched_snippets"] = matched_snippet_annotations + fields = [ ("copyrights", "copyright"), ("holders", "holder"), diff --git a/setup.cfg b/setup.cfg index 04f018e388..f6b2c52531 100644 --- a/setup.cfg +++ b/setup.cfg @@ -97,7 +97,7 @@ install_requires = cyclonedx-python-lib==9.1.0 jsonschema==4.23.0 # MatchCode-toolkit - matchcode-toolkit==7.2.1 + matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@bf9fd9afd6ce1dd25dce4935135ea4118fc80b06 # Univers univers==30.12.1 # Markdown From 66fa777ba220453d7d517d7beed6d2ea6690e61d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 27 Feb 2025 13:16:02 -0800 Subject: [PATCH 6/9] Use matchcode-toolkit 7.2.2 Signed-off-by: Jono Yang --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f6b2c52531..6f9bdff72f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -97,7 +97,7 @@ install_requires = cyclonedx-python-lib==9.1.0 jsonschema==4.23.0 # MatchCode-toolkit - matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@bf9fd9afd6ce1dd25dce4935135ea4118fc80b06 + matchcode-toolkit==7.2.2 # Univers univers==30.12.1 # Markdown From df2219c2d8c73b5f7c2a10321ed5188488b8fb9a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 24 Apr 2025 16:51:05 -0700 Subject: [PATCH 7/9] Add test for fingerprint_stemmed_codebase_resources Signed-off-by: Jono Yang --- .../matchcode/fingerprinting/handleError.js | 12 +++++ scanpipe/tests/pipes/test_matchcode.py | 53 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 scanpipe/tests/data/matchcode/fingerprinting/handleError.js diff --git a/scanpipe/tests/data/matchcode/fingerprinting/handleError.js b/scanpipe/tests/data/matchcode/fingerprinting/handleError.js new file mode 100644 index 0000000000..06494ce0c8 --- /dev/null +++ b/scanpipe/tests/data/matchcode/fingerprinting/handleError.js @@ -0,0 +1,12 @@ +'use strict'; + +var gutil = require('gulp-util'); + +function handleError(error, source) { + var message = error.messageFormatted ? error.messageFormatted : error.message; + console.error(new gutil.PluginError(source || 'metal', message).toString()); + + this.emit('end'); // jshint ignore:line +} + +module.exports = handleError; diff --git a/scanpipe/tests/pipes/test_matchcode.py b/scanpipe/tests/pipes/test_matchcode.py index d4b44bb140..1f5266b764 100644 --- a/scanpipe/tests/pipes/test_matchcode.py +++ b/scanpipe/tests/pipes/test_matchcode.py @@ -366,3 +366,56 @@ def test_scanpipe_pipes_matchcode_fingerprint_codebase_resources(self): } self.assertEqual(expected_extra_data, codebase_resource1.extra_data) self.assertFalse(codebase_resource2.extra_data) + + def test_scanpipe_pipes_matchcode_fingerprint_stemmed_codebase_resources(self): + # This resource should not have a fingerprint + copy_input( + self.data / "aboutcode" / "notice.NOTICE", self.project1.codebase_path + ) + codebase_resource1 = CodebaseResource.objects.create( + project=self.project1, path="notice.NOTICE", is_text=True + ) + + # This resource should not have a fingerprint + copy_input( + self.data / "scancode" / "is-npm-1.0.0.tgz", self.project1.codebase_path + ) + codebase_resource2 = CodebaseResource.objects.create( + project=self.project1, path="is-npm-1.0.0.tgz" + ) + + # This resource should have a fingerprint + copy_input( + self.data / "matchcode" / "fingerprinting" / "handleError.js", + self.project1.codebase_path, + ) + codebase_resource3 = CodebaseResource.objects.create( + project=self.project1, path="handleError.js", is_text=True + ) + + matchcode.fingerprint_stemmed_codebase_resources(self.project1) + codebase_resource1.refresh_from_db() + codebase_resource2.refresh_from_db() + codebase_resource3.refresh_from_db() + + expected_extra_data = { + "stemmed_halo1": "0000001ebf495b2fde7beb419238f8a4e8427b41", + "stemmed_snippets": [ + {"snippet": "7089085d2b66fc610e31a54edf2ddc76", "position": 0}, + {"snippet": "accf246732a0ea80d8c59af1a69dc074", "position": 2}, + {"snippet": "a163d9edfaa1f6daf2c1e92fcd4b8b8a", "position": 3}, + {"snippet": "7ebfad556997dc224a75499ee4411169", "position": 4}, + {"snippet": "a77f64bd3bfef4323bd6cbc3c93aab4f", "position": 7}, + {"snippet": "6a2bcde13a7f15492c3e2e4436c4217e", "position": 8}, + {"snippet": "2c988df1972a487121338ec1b947df1a", "position": 9}, + {"snippet": "bebb16613133c76d2c260474fc82ab34", "position": 10}, + {"snippet": "979167ee18b8e80590c2c083ed9e1a8a", "position": 11}, + {"snippet": "d7a3167b8a401f9147ce5ed773fab894", "position": 12}, + {"snippet": "251fb1d28cc5d7ae002ff82b87377233", "position": 13}, + {"snippet": "ed139c8a1f4764c33cdc3432097a2dc6", "position": 15}, + {"snippet": "6c37ff7b040d2c75a0b94597d73d42da", "position": 18}, + ], + } + self.assertEqual(expected_extra_data, codebase_resource3.extra_data) + self.assertFalse(codebase_resource1.extra_data) + self.assertFalse(codebase_resource2.extra_data) From f856080ccfa37832eabbdd52c2b3ee794abc5a94 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 24 Apr 2025 18:23:36 -0700 Subject: [PATCH 8/9] Create test for get_matched_snippet_annotations * Update test js file Signed-off-by: Jono Yang --- .../matchcode/fingerprinting/extra_data.json | 278 ++++++++++++++++++ .../matchcode/fingerprinting/handleError.js | 12 - .../data/matchcode/fingerprinting/inherits.js | 29 ++ scanpipe/tests/pipes/test_matchcode.py | 42 ++- scanpipe/tests/test_views.py | 15 + scanpipe/views.py | 9 +- 6 files changed, 353 insertions(+), 32 deletions(-) create mode 100644 scanpipe/tests/data/matchcode/fingerprinting/extra_data.json delete mode 100644 scanpipe/tests/data/matchcode/fingerprinting/handleError.js create mode 100644 scanpipe/tests/data/matchcode/fingerprinting/inherits.js diff --git a/scanpipe/tests/data/matchcode/fingerprinting/extra_data.json b/scanpipe/tests/data/matchcode/fingerprinting/extra_data.json new file mode 100644 index 0000000000..8aaa224363 --- /dev/null +++ b/scanpipe/tests/data/matchcode/fingerprinting/extra_data.json @@ -0,0 +1,278 @@ +{ + "halo1": "0000004f5cc2ec9a5ebdaa44336f53be569d6829", + "snippets": [ + { + "snippet": "24a1651c51468fb8cf1ac6c38a2c4add", + "position": "0" + }, + { + "snippet": "7b1cbef763885c6856df8b15fa4e57a5", + "position": 5 + }, + { + "snippet": "46828d9d4a64300b1543e4e5a6356ed5", + "position": 12 + }, + { + "snippet": "c0496b020a8d87a3b1bf1a83c67c16d5", + "position": 14 + }, + { + "snippet": "b2ec716c571a0368ea37dbb7821c6945", + "position": 15 + }, + { + "snippet": "8dd2b57022204ecd9ea4a2471f224fd4", + "position": 22 + }, + { + "snippet": "cb9216ce4ad33a5d6feb378dbf0404c8", + "position": 30 + }, + { + "snippet": "034b634f1c726c9c0f7740ea9723637b", + "position": 37 + }, + { + "snippet": "d0bb8a1740512218c8e87bbaa5f5d9a6", + "position": 38 + }, + { + "snippet": "7ae529b13ddb3b0c74421772d78821a7", + "position": 41 + }, + { + "snippet": "b2aad3c6ab2c2c9ba1a95edac417aa09", + "position": 42 + }, + { + "snippet": "be339f1c1670b7789e83f875978c1e06", + "position": 46 + }, + { + "snippet": "a895f0ff2b99352b33392fda0a87a4cf", + "position": 53 + }, + { + "snippet": "6819c7f718a1fa7f2501009d21ee46d7", + "position": 57 + }, + { + "snippet": "97ecd33b1ca08589363df198458d976f", + "position": 61 + }, + { + "snippet": "2c73086d098f182cf8441046b97af434", + "position": 64 + }, + { + "snippet": "3ba6ad01d6f9130be38df14a44633abd", + "position": 67 + } + ], + "line_by_pos": { + "0": 1, + "1": 1, + "2": 1, + "3": 3, + "4": 3, + "5": 3, + "6": 3, + "7": 3, + "8": 4, + "9": 4, + "10": 5, + "11": 5, + "12": 6, + "13": 6, + "14": 6, + "15": 6, + "16": 6, + "17": 6, + "18": 7, + "19": 7, + "20": 7, + "21": 7, + "22": 7, + "23": 7, + "24": 8, + "25": 8, + "26": 8, + "27": 8, + "28": 8, + "29": 8, + "30": 11, + "31": 11, + "32": 11, + "33": 11, + "34": 11, + "35": 11, + "36": 11, + "37": 12, + "38": 12, + "39": 12, + "40": 15, + "41": 15, + "42": 16, + "43": 16, + "44": 16, + "45": 16, + "46": 17, + "47": 17, + "48": 17, + "49": 18, + "50": 18, + "51": 19, + "52": 19, + "53": 19, + "54": 20, + "55": 20, + "56": 20, + "57": 20, + "58": 21, + "59": 21, + "60": 21, + "61": 21, + "62": 21, + "63": 22, + "64": 22, + "65": 22, + "66": 22, + "67": 22, + "68": 23, + "69": 23, + "70": 24, + "71": 24, + "72": 24, + "73": 25, + "74": 25, + "75": 25, + "76": 27, + "77": 27, + "78": 28, + "79": 28, + "80": 28, + "81": 29, + "82": 29 + }, + "stemmed_halo1": "000000240a64b6c8aae4625491a8aa77ffd9b2a6", + "stemmed_snippets": [ + { + "snippet": "8e5f6fead6d0469a9af967bd3b3c823c", + "position": "0" + }, + { + "snippet": "3b4fb17158ed94e2babd49970af94d06", + "position": 2 + }, + { + "snippet": "b0607c96667235727aa1e4212e907f7b", + "position": 3 + }, + { + "snippet": "65aecd343e17c78db5cfca34a8a4fa02", + "position": 4 + }, + { + "snippet": "89a7bf1c4ead7854f274e6f41b7654da", + "position": 5 + }, + { + "snippet": "8c38b55be87ffec2c0b91d6085f12e69", + "position": 6 + }, + { + "snippet": "5e0ddfbe6eeaa0bbe00f0a3bcb4183a8", + "position": 7 + }, + { + "snippet": "f8a7cabd43fb2d8a40a23d83217e3d8b", + "position": 8 + }, + { + "snippet": "fdc4910fe720d6b9f20196d306e7aedc", + "position": 9 + }, + { + "snippet": "7a5ee56ca82edc1c76e0b0b9322129dd", + "position": 10 + }, + { + "snippet": "6b93bb4ea1623dd6946a21f99418a3fa", + "position": 11 + }, + { + "snippet": "8f2a211b1a10cbd28fb8f1ad21dbf5fb", + "position": 12 + }, + { + "snippet": "c3c82df4de85b1c9dbf69b2b5a45935c", + "position": 13 + }, + { + "snippet": "216e662345dd2969bff90aefdae76672", + "position": 14 + }, + { + "snippet": "24d9e003c332e26e2cae1263d18e0ef6", + "position": 15 + }, + { + "snippet": "7210020de6bfe60b69ca8ec908845a15", + "position": 17 + }, + { + "snippet": "667f800b10c105c2418effd6035e6763", + "position": 18 + }, + { + "snippet": "c18caedb3daf59b210278b2b6d1d0db5", + "position": 19 + }, + { + "snippet": "a19fe989f63161a76526933a34593741", + "position": 20 + }, + { + "snippet": "f782389ac40b56bc81a7c92f40d87a83", + "position": 21 + }, + { + "snippet": "4ed61cd372dcc7d88c95d899271fd138", + "position": 22 + }, + { + "snippet": "e9c74c50192eb95bc4595254fc253427", + "position": 23 + }, + { + "snippet": "5a908af743b549f1f0ef8ab02c9053eb", + "position": 24 + } + ], + "matched_snippets": [ + { + "package": "pkg:github/isaacs/inherits@v2.0.3", + "resource": "inherits-2.0.3/inherits.js", + "similarity": "1.0", + "match_detections": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + } + ] +} \ No newline at end of file diff --git a/scanpipe/tests/data/matchcode/fingerprinting/handleError.js b/scanpipe/tests/data/matchcode/fingerprinting/handleError.js deleted file mode 100644 index 06494ce0c8..0000000000 --- a/scanpipe/tests/data/matchcode/fingerprinting/handleError.js +++ /dev/null @@ -1,12 +0,0 @@ -'use strict'; - -var gutil = require('gulp-util'); - -function handleError(error, source) { - var message = error.messageFormatted ? error.messageFormatted : error.message; - console.error(new gutil.PluginError(source || 'metal', message).toString()); - - this.emit('end'); // jshint ignore:line -} - -module.exports = handleError; diff --git a/scanpipe/tests/data/matchcode/fingerprinting/inherits.js b/scanpipe/tests/data/matchcode/fingerprinting/inherits.js new file mode 100644 index 0000000000..061b396207 --- /dev/null +++ b/scanpipe/tests/data/matchcode/fingerprinting/inherits.js @@ -0,0 +1,29 @@ +module.exports = inherits + +function inherits (c, p, proto) { + proto = proto || {} + var e = {} + ;[c.prototype, proto].forEach(function (s) { + Object.getOwnPropertyNames(s).forEach(function (k) { + e[k] = Object.getOwnPropertyDescriptor(s, k) + }) + }) + c.prototype = Object.create(p.prototype, e) + c.super = p +} + +//function Child () { +// Child.super.call(this) +// console.error([this +// ,this.constructor +// ,this.constructor === Child +// ,this.constructor.super === Parent +// ,Object.getPrototypeOf(this) === Child.prototype +// ,Object.getPrototypeOf(Object.getPrototypeOf(this)) +// === Parent.prototype +// ,this instanceof Child +// ,this instanceof Parent]) +//} +//function Parent () {} +//inherits(Child, Parent) +//new Child diff --git a/scanpipe/tests/pipes/test_matchcode.py b/scanpipe/tests/pipes/test_matchcode.py index 1f5266b764..8b9a390ab8 100644 --- a/scanpipe/tests/pipes/test_matchcode.py +++ b/scanpipe/tests/pipes/test_matchcode.py @@ -386,11 +386,11 @@ def test_scanpipe_pipes_matchcode_fingerprint_stemmed_codebase_resources(self): # This resource should have a fingerprint copy_input( - self.data / "matchcode" / "fingerprinting" / "handleError.js", + self.data / "matchcode" / "fingerprinting" / "inherits.js", self.project1.codebase_path, ) codebase_resource3 = CodebaseResource.objects.create( - project=self.project1, path="handleError.js", is_text=True + project=self.project1, path="inherits.js", is_text=True ) matchcode.fingerprint_stemmed_codebase_resources(self.project1) @@ -399,21 +399,31 @@ def test_scanpipe_pipes_matchcode_fingerprint_stemmed_codebase_resources(self): codebase_resource3.refresh_from_db() expected_extra_data = { - "stemmed_halo1": "0000001ebf495b2fde7beb419238f8a4e8427b41", + "stemmed_halo1": "000000240a64b6c8aae4625491a8aa77ffd9b2a6", "stemmed_snippets": [ - {"snippet": "7089085d2b66fc610e31a54edf2ddc76", "position": 0}, - {"snippet": "accf246732a0ea80d8c59af1a69dc074", "position": 2}, - {"snippet": "a163d9edfaa1f6daf2c1e92fcd4b8b8a", "position": 3}, - {"snippet": "7ebfad556997dc224a75499ee4411169", "position": 4}, - {"snippet": "a77f64bd3bfef4323bd6cbc3c93aab4f", "position": 7}, - {"snippet": "6a2bcde13a7f15492c3e2e4436c4217e", "position": 8}, - {"snippet": "2c988df1972a487121338ec1b947df1a", "position": 9}, - {"snippet": "bebb16613133c76d2c260474fc82ab34", "position": 10}, - {"snippet": "979167ee18b8e80590c2c083ed9e1a8a", "position": 11}, - {"snippet": "d7a3167b8a401f9147ce5ed773fab894", "position": 12}, - {"snippet": "251fb1d28cc5d7ae002ff82b87377233", "position": 13}, - {"snippet": "ed139c8a1f4764c33cdc3432097a2dc6", "position": 15}, - {"snippet": "6c37ff7b040d2c75a0b94597d73d42da", "position": 18}, + {"snippet": "8e5f6fead6d0469a9af967bd3b3c823c", "position": 0}, + {"snippet": "3b4fb17158ed94e2babd49970af94d06", "position": 2}, + {"snippet": "b0607c96667235727aa1e4212e907f7b", "position": 3}, + {"snippet": "65aecd343e17c78db5cfca34a8a4fa02", "position": 4}, + {"snippet": "89a7bf1c4ead7854f274e6f41b7654da", "position": 5}, + {"snippet": "8c38b55be87ffec2c0b91d6085f12e69", "position": 6}, + {"snippet": "5e0ddfbe6eeaa0bbe00f0a3bcb4183a8", "position": 7}, + {"snippet": "f8a7cabd43fb2d8a40a23d83217e3d8b", "position": 8}, + {"snippet": "fdc4910fe720d6b9f20196d306e7aedc", "position": 9}, + {"snippet": "7a5ee56ca82edc1c76e0b0b9322129dd", "position": 10}, + {"snippet": "6b93bb4ea1623dd6946a21f99418a3fa", "position": 11}, + {"snippet": "8f2a211b1a10cbd28fb8f1ad21dbf5fb", "position": 12}, + {"snippet": "c3c82df4de85b1c9dbf69b2b5a45935c", "position": 13}, + {"snippet": "216e662345dd2969bff90aefdae76672", "position": 14}, + {"snippet": "24d9e003c332e26e2cae1263d18e0ef6", "position": 15}, + {"snippet": "7210020de6bfe60b69ca8ec908845a15", "position": 17}, + {"snippet": "667f800b10c105c2418effd6035e6763", "position": 18}, + {"snippet": "c18caedb3daf59b210278b2b6d1d0db5", "position": 19}, + {"snippet": "a19fe989f63161a76526933a34593741", "position": 20}, + {"snippet": "f782389ac40b56bc81a7c92f40d87a83", "position": 21}, + {"snippet": "4ed61cd372dcc7d88c95d899271fd138", "position": 22}, + {"snippet": "e9c74c50192eb95bc4595254fc253427", "position": 23}, + {"snippet": "5a908af743b549f1f0ef8ab02c9053eb", "position": 24}, ], } self.assertEqual(expected_extra_data, codebase_resource3.extra_data) diff --git a/scanpipe/tests/test_views.py b/scanpipe/tests/test_views.py index eb380c250d..2db25414ec 100644 --- a/scanpipe/tests/test_views.py +++ b/scanpipe/tests/test_views.py @@ -56,6 +56,7 @@ from scanpipe.tests import make_resource_file from scanpipe.tests import package_data1 from scanpipe.tests import package_data2 +from scanpipe.views import CodebaseResourceDetailsView from scanpipe.views import ProjectActionView from scanpipe.views import ProjectCodebaseView from scanpipe.views import ProjectDetailView @@ -1325,3 +1326,17 @@ def test_scanpipe_policies_broken_policies_project_details(self): response = self.client.get(url) self.assertEqual(200, response.status_code) self.assertContains(response, "Policies file format error") + + def test_scanpipe_views_codebase_resource_details_get_matched_snippet_annotations( + self, + ): + resource1 = make_resource_file(self.project1, "inherits.js") + extra_data_loc = self.data / "matchcode" / "fingerprinting" / "extra_data.json" + with open(extra_data_loc) as f: + extra_data = json.load(f) + resource1.extra_data.update(extra_data) + resource1.save() + resource1.refresh_from_db() + results = CodebaseResourceDetailsView.get_matched_snippet_annotations(resource1) + expected_results = [{"start_line": 1, "end_line": 6}] + self.assertEqual(expected_results, results) diff --git a/scanpipe/views.py b/scanpipe/views.py index 05f06325f7..43d12e38e0 100644 --- a/scanpipe/views.py +++ b/scanpipe/views.py @@ -1949,7 +1949,8 @@ def get_license_annotations(self, field_name): return annotations - def get_matched_snippet_annotations(self, resource): + @staticmethod + def get_matched_snippet_annotations(resource): # convert qspan from list of ints to Spans matched_snippet_annotations = [] matched_snippets = resource.extra_data.get("matched_snippets") @@ -1985,9 +1986,6 @@ def get_context_data(self, **kwargs): "licenses": license_annotations, } - matched_snippet_annotations = self.get_matched_snippet_annotations(resource) - context["detected_values"]["matched_snippets"] = matched_snippet_annotations - fields = [ ("copyrights", "copyright"), ("holders", "holder"), @@ -1999,6 +1997,9 @@ def get_context_data(self, **kwargs): annotations = self.get_annotations(getattr(resource, field_name), value_key) context["detected_values"][field_name] = annotations + matched_snippet_annotations = self.get_matched_snippet_annotations(resource) + context["detected_values"]["matched snippets"] = matched_snippet_annotations + return context From d9793b7769d5371966942a2bee00a44ad721692d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 25 Apr 2025 11:44:09 -0700 Subject: [PATCH 9/9] Update CHANGELOG.rst Signed-off-by: Jono Yang --- CHANGELOG.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9935d14adf..17b6a9d6c6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,11 @@ v34.10.2 (unreleased) Use the UUID for the DiscoveredDependency spdx_id for better SPDX compatibility. https://github.com/aboutcode-org/scancode.io/issues/1651 +- Add MatchCode-specific functions to compute fingerprints from stemmed code + files. Update CodebaseResource file content view to display snippet matches, + if available, when the codebase has been sent for matching to MatchCode. + https://github.com/aboutcode-org/scancode.io/pull/1656 + v34.10.1 (2025-03-26) ---------------------