Skip to content

Commit 6abf6ae

Browse files
committed
Improve SPDX import by generating generic PURLs when missing and add inferred identity metadata
Signed-off-by: dikshaa2909 <dikshadeware@gmail.com>
1 parent d2084e6 commit 6abf6ae

File tree

6 files changed

+219
-108
lines changed

6 files changed

+219
-108
lines changed

scanpipe/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3827,7 +3827,7 @@ def create_from_data(cls, project, package_data):
38273827
return
38283828

38293829
if not package_data.get("type"):
3830-
package_data["type"] = "unknown"
3830+
package_data["type"] = "generic"
38313831

38323832
qualifiers = package_data.get("qualifiers")
38333833
if qualifiers:

scanpipe/pipes/benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def compare_purls(project, expected_purls):
7070
- Lines starting with '+' are unexpected in the project.
7171
"""
7272
sorted_project_purls = get_unique_project_purls(project)
73-
diff_result = difflib.ndiff(sorted_project_purls, expected_purls)
73+
diff_result = difflib.ndiff(sorted_project_purls, sorted(expected_purls))
7474

7575
# Keep only lines that are diffs (- or +)
7676
filtered_diff = [line for line in diff_result if line.startswith(("-", "+"))]

scanpipe/pipes/resolve.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -311,27 +311,77 @@ def convert_spdx_expression(license_expression_spdx):
311311
return get_license_detections_and_expression(license_expression_spdx)[1]
312312

313313

314+
def build_spdx_purl(spdx_package):
    """
    Return a ``(package_url_dict, inferred)`` tuple for the SPDX package.

    ``package_url_dict`` is the ``PackageURL.to_dict(encode=True)`` mapping, or
    an empty dict when no PURL can be determined. ``inferred`` is True when the
    PURL was generated or re-typed by this function instead of being used
    exactly as declared.

    Resolution order:
    1. Use the first declared "purl" external reference as-is, unless its
       type is "unknown".
    2. Re-type a declared "unknown" PURL as "generic", keeping its namespace,
       name, version, qualifiers, and subpath.
    3. Fallback to a deterministic generic PURL built from the SPDX package
       name and version.
    """
    for ref in spdx_package.external_refs:
        if ref.type != "purl" or not ref.locator:
            continue

        declared = PackageURL.from_string(ref.locator)

        # A meaningful declared type is trusted and returned untouched.
        if declared.type and declared.type != "unknown":
            return declared.to_dict(encode=True), False

        if declared.name:
            # Only the type is replaced: dropping the namespace, qualifiers or
            # subpath would make distinct declared packages collapse into the
            # same generic PURL.
            generic = PackageURL(
                type="generic",
                namespace=declared.namespace,
                name=declared.name,
                version=declared.version,
                qualifiers=declared.qualifiers,
                subpath=declared.subpath,
            )
            return generic.to_dict(encode=True), True

    # No usable declared PURL: derive one from the SPDX package fields.
    name = (spdx_package.name or "").strip()
    version = (spdx_package.version or "").strip()

    if name:
        generic = PackageURL(
            type="generic",
            name=name,
            # An empty version is passed as None so it is omitted from the PURL.
            version=version or None,
        )
        return generic.to_dict(encode=True), True

    return {}, False
355+
356+
314357
def spdx_package_to_package_data(spdx_package):
315358
"""Convert the provided spdx_package into package_data."""
316-
package_url_dict = {}
317-
# Store the original "SPDXID" as package_uid for dependencies resolution.
318359
package_uid = spdx_package.spdx_id
319360

320-
for ref in spdx_package.external_refs:
321-
if ref.type == "purl":
322-
purl = ref.locator
323-
package_url_dict = PackageURL.from_string(purl).to_dict(encode=True)
361+
# Resolve declared or fallback PURL
362+
package_url_dict, inferred = build_spdx_purl(spdx_package)
324363

364+
# Collect checksums
325365
checksum_data = {
326366
checksum.algorithm.lower(): checksum.value
327367
for checksum in spdx_package.checksums
328368
}
329369

370+
# License handling
330371
declared_license_expression_spdx = spdx_package.license_concluded
331372
declared_expression = ""
332373
if declared_license_expression_spdx:
333374
declared_expression = convert_spdx_expression(declared_license_expression_spdx)
334375

376+
# Structured identity metadata
377+
identity = {
378+
"source": "inferred" if inferred else "declared",
379+
"origin": {
380+
"download_location": spdx_package.download_location,
381+
"homepage": spdx_package.homepage,
382+
},
383+
}
384+
335385
package_data = {
336386
"package_uid": package_uid,
337387
"name": spdx_package.name,
@@ -345,6 +395,9 @@ def spdx_package_to_package_data(spdx_package):
345395
"filename": spdx_package.filename,
346396
"description": spdx_package.description,
347397
"release_date": spdx_package.release_date,
398+
"extra_data": {
399+
"identity": identity,
400+
},
348401
**package_url_dict,
349402
**checksum_data,
350403
}

scanpipe/tests/pipes/test_resolve.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,15 @@ def test_scanpipe_pipes_resolve_spdx_package_to_package_data(self):
224224
"qualifiers": "arch=all",
225225
"md5": "76cf50f29e47676962645632737365a7",
226226
}
227+
expected["extra_data"] = {
228+
"identity": {
229+
"source": "declared",
230+
"origin": {
231+
"download_location": "https://download.url/package.zip",
232+
"homepage": "https://packages.debian.org",
233+
},
234+
}
235+
}
227236
self.assertEqual(expected, package_data)
228237

229238
def test_scanpipe_pipes_spdx_relationship_to_dependency_data(self):
@@ -250,6 +259,55 @@ def test_scanpipe_pipes_resolve_spdx_packages(self):
250259
packages_data = resolve.resolve_spdx_packages(input_location)
251260
self.assertEqual(4, len(packages_data))
252261

262+
def test_scanpipe_resolve_spdx_package_generates_generic_purl_when_missing(self):
    """
    SPDX package without externalRefs should generate
    a deterministic generic PURL and mark it as inferred.
    """
    # Imported locally so the test does not depend on the module's import block.
    import tempfile
    from pathlib import Path

    spdx_content = {
        "spdxVersion": "SPDX-2.3",
        "SPDXID": "SPDXRef-DOCUMENT",
        "name": "test-doc",
        "dataLicense": "CC0-1.0",
        "documentNamespace": "http://example.com/spdx/test",
        "creationInfo": {
            "created": "2024-01-01T00:00:00Z",
            "creators": ["Tool: pytest"],
        },
        "packages": [
            {
                "name": "examplepkg",
                "SPDXID": "SPDXRef-Package-examplepkg",
                "versionInfo": "1.0.0",
                "downloadLocation": "NOASSERTION",
                "licenseConcluded": "MIT",
                "licenseDeclared": "MIT",
                "copyrightText": "NOASSERTION",
            }
        ],
        "relationships": [],
    }

    # The fixture is written to a throwaway directory instead of the shared
    # test data tree, so an interrupted or parallel run cannot leave or
    # clobber files there. The directory is removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        test_file = Path(temp_dir) / "temp_test.spdx.json"
        test_file.write_text(json.dumps(spdx_content))
        packages = resolve.resolve_spdx_packages(test_file)

    self.assertEqual(1, len(packages))
    package = packages[0]

    self.assertEqual("generic", package.get("type"))
    self.assertEqual("examplepkg", package.get("name"))
    self.assertEqual("1.0.0", package.get("version"))

    self.assertEqual(
        "inferred",
        package.get("extra_data", {}).get("identity", {}).get("source"),
    )
310+
253311
def test_scanpipe_pipes_resolve_spdx_dependencies(self):
254312
input_location = self.data / "spdx" / "SPDXJSONExample-v2.3.spdx.json"
255313
dependencies_data = resolve.resolve_spdx_dependencies(input_location)

scanpipe/tests/test_integrations_ort.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
"dependencies": 25,
4444
"vulnerabilities": 10,
4545
"purls": [
46-
"pkg:unknown/alpine@3.17.0",
46+
"pkg:generic/alpine@3.17.0",
4747
],
4848
},
4949
@@ -141,7 +141,7 @@
141141
"pkg:maven/pkg4-grp/pkg4@0.0.1",
142142
"pkg:maven/pkg6-grp/pkg6@0.0.1",
143143
"pkg:maven/pkg7-grp/pkg7@0.0.1",
144-
"pkg:unknown/proj1@0.0.1",
144+
"pkg:generic/proj1@0.0.1",
145145
],
146146
},
147147
"ort-reporter-spdx-2.3/synthetic-scan-result-expected-output.spdx.json": {
@@ -158,7 +158,7 @@
158158
"pkg:maven/pkg4-grp/pkg4@0.0.1",
159159
"pkg:maven/pkg6-grp/pkg6@0.0.1",
160160
"pkg:maven/pkg7-grp/pkg7@0.0.1",
161-
"pkg:unknown/proj1@0.0.1",
161+
"pkg:generic/proj1@0.0.1",
162162
],
163163
},
164164
}

0 commit comments

Comments
 (0)