Skip to content

Commit a4988f5

Browse files
Created importers for detailed scan results
Signed-off-by: Prem <gowthamattada595@gmail.com>
1 parent df61c28 commit a4988f5

File tree

6 files changed

+438
-1
lines changed

6 files changed

+438
-1
lines changed

scanpipe/pipelines/load_inventory.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ class LoadInventory(Pipeline):
3333
Supported formats are ScanCode-toolkit JSON scan results, ScanCode.io JSON output,
3434
and ScanCode.io XLSX output.
3535
36+
Additionally supports importing scan results from integrated tools:
37+
- VulnerableCode: Vulnerability data export
38+
- PurlDB: Package enrichment data export
39+
- MatchCode.io: Matching results export
40+
3641
An inventory is composed of packages, dependencies, resources, and relations.
3742
"""
3843

@@ -78,4 +83,32 @@ def build_inventory_from_scans(self):
7883
)
7984

8085
else:
81-
raise Exception(f"Input not supported: {str(input_path)} ")
86+
integrated_tool = input.get_integrated_tool_name(scan_data)
87+
88+
if integrated_tool == "vulnerablecode":
89+
updated_count = input.load_vulnerabilities_from_vulnerablecode(
90+
self.project, scan_data
91+
)
92+
self.log(
93+
f"Loaded vulnerability data for {updated_count} packages "
94+
f"from {input_path.name}"
95+
)
96+
97+
elif integrated_tool == "purldb":
98+
result = input.load_enrichment_from_purldb(self.project, scan_data)
99+
self.log(
100+
f"PurlDB import: {result['created']} packages created, "
101+
f"{result['updated']} packages updated from {input_path.name}"
102+
)
103+
104+
elif integrated_tool == "matchcodeio":
105+
created_count = input.load_matches_from_matchcode(
106+
self.project, scan_data
107+
)
108+
self.log(
109+
f"MatchCode.io import: {created_count} packages created "
110+
f"from {input_path.name}"
111+
)
112+
113+
else:
114+
raise Exception(f"Input not supported: {str(input_path)} ")

scanpipe/pipes/input.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,201 @@ def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None):
237237
if extra_data_prefix:
238238
extra_data = {extra_data_prefix: extra_data}
239239
project.update_extra_data(extra_data)
240+
241+
242+
def get_integrated_tool_name(scan_data):
    """
    Detect and return the integrated tool name from the ``scan_data`` structure.

    Supported tools:
    - vulnerablecode: VulnerableCode vulnerability data export
    - purldb: PurlDB package enrichment data export
    - matchcodeio: MatchCode.io matching results export

    Detection rules, checked in this order:
    - dict with a "vulnerabilities" key, or list whose first item is a dict
      with an "affected_by_vulnerabilities" key: "vulnerablecode"
    - dict with "files" and "packages" keys where at least one file entry has
      a "for_packages" key and at least one file entry carries matching keys
      in its "extra_data": "matchcodeio"
    - dict with a "packages" key, or list whose first item is a dict with a
      "purl" key and at least one PurlDB specific key: "purldb"

    Returns None if the tool cannot be identified. This includes any
    ``scan_data`` that is neither a dict nor a list, so that callers can report
    an unsupported input instead of getting a TypeError from this function.
    """
    matchcode_keys = ("matched_to", "path_score", "matched_fingerprints")
    purldb_keys = ("repository_homepage_url", "api_data_url", "package_content")

    if isinstance(scan_data, dict):
        if "vulnerabilities" in scan_data:
            return "vulnerablecode"

        if "packages" not in scan_data:
            return None

        if "files" in scan_data:
            files = scan_data.get("files")
            # A null or non-list "files" value cannot carry matching results.
            if not isinstance(files, (list, tuple)):
                files = []
            file_entries = [entry for entry in files if isinstance(entry, dict)]

            if any("for_packages" in entry for entry in file_entries):
                for entry in file_entries:
                    # "extra_data" may be missing or null in the export.
                    extra_data = entry.get("extra_data")
                    if isinstance(extra_data, dict) and any(
                        key in extra_data for key in matchcode_keys
                    ):
                        return "matchcodeio"

        # Any remaining dict with a "packages" key is handled as PurlDB data.
        return "purldb"

    if isinstance(scan_data, list) and scan_data:
        first_entry = scan_data[0]
        if not isinstance(first_entry, dict):
            return None

        if "affected_by_vulnerabilities" in first_entry:
            return "vulnerablecode"

        if "purl" in first_entry and any(key in first_entry for key in purldb_keys):
            return "purldb"

    return None
285+
286+
287+
def load_vulnerabilities_from_vulnerablecode(project, scan_data):
    """
    Load vulnerability data from VulnerableCode export and update project packages.

    The ``scan_data`` entries are matched to the existing packages of the
    ``project`` by their PURL. Entries without a "purl", without a non-empty
    "affected_by_vulnerabilities" value, or whose PURL is not found in the
    project are skipped. No package is created by this function.

    Expected format:
    - List of package dicts with 'purl' and 'affected_by_vulnerabilities' keys
    - Or dict holding such a list under a 'packages' key, or under a 'results'
      key when 'packages' is absent

    Return the number of distinct packages that were updated.
    """
    if isinstance(scan_data, list):
        vulnerability_data_list = scan_data
    elif isinstance(scan_data, dict):
        # "packages" takes precedence over "results", even when it is empty.
        list_key = "packages" if "packages" in scan_data else "results"
        vulnerability_data_list = scan_data.get(list_key) or []
    else:
        vulnerability_data_list = []

    # Several packages of a project may share the same PURL: keep all of them
    # so each one receives the vulnerability data.
    packages_by_purl = {}
    for package in project.discoveredpackages.all():
        if package.package_url:
            packages_by_purl.setdefault(package.package_url, []).append(package)

    # Keyed by object identity so a PURL repeated in the export does not count
    # the same package twice nor send it twice to ``bulk_update``.
    updated_by_id = {}
    for vuln_data in vulnerability_data_list:
        if not isinstance(vuln_data, dict):
            continue

        purl = vuln_data.get("purl")
        affected_by = vuln_data.get("affected_by_vulnerabilities")
        if not purl or not affected_by:
            continue

        for package in packages_by_purl.get(purl, []):
            package.affected_by_vulnerabilities = affected_by
            updated_by_id[id(package)] = package

    updated_packages = list(updated_by_id.values())
    if updated_packages:
        DiscoveredPackage.objects.bulk_update(
            objs=updated_packages,
            fields=["affected_by_vulnerabilities"],
            batch_size=1000,
        )

    return len(updated_packages)
335+
336+
337+
def load_enrichment_from_purldb(project, scan_data):
    """
    Load package enrichment data from PurlDB export and update/create packages.

    Each entry of the ``scan_data`` with a "purl" value is either used to
    enrich the existing project package with the same PURL, or handed to
    ``pipes.update_or_create_package`` when no such package exists yet.
    The names of the fields changed on an existing package are stored in its
    extra data under the "enriched_from_purldb" key.

    Expected format:
    - List of package dicts with package data fields
    - Or dict holding such a list under a 'packages' key, or under a 'results'
      key when 'packages' is absent

    Return a dict with the "created" and "updated" package counts. A package
    is counted as updated only when at least one of its fields changed.
    """
    if isinstance(scan_data, list):
        package_data_list = scan_data
    elif isinstance(scan_data, dict):
        # "packages" takes precedence over "results", even when it is empty.
        list_key = "packages" if "packages" in scan_data else "results"
        package_data_list = scan_data.get(list_key) or []
    else:
        package_data_list = []

    # Index the project packages by PURL once, same as the VulnerableCode
    # loader, instead of running one database lookup per entry. The PURL is
    # read from the ``package_url`` attribute of each package instance.
    packages_by_purl = {}
    for package in project.discoveredpackages.all():
        if package.package_url:
            packages_by_purl.setdefault(package.package_url, package)

    created_count = 0
    updated_count = 0

    for package_data in package_data_list:
        if not isinstance(package_data, dict):
            continue

        purl = package_data.get("purl")
        if not purl:
            continue

        existing_package = packages_by_purl.get(purl)
        if existing_package is None:
            new_package = pipes.update_or_create_package(project, package_data)
            created_count += 1
            # Index the new package so a PURL repeated later in the same
            # export enriches it rather than being counted as created again.
            if new_package is not None:
                packages_by_purl[purl] = new_package
            continue

        updated_fields = existing_package.update_from_data(package_data)
        if updated_fields:
            existing_package.update_extra_data(
                {"enriched_from_purldb": updated_fields}
            )
            updated_count += 1

    return {"created": created_count, "updated": updated_count}
381+
382+
383+
def load_matches_from_matchcode(project, scan_data):
    """
    Load matching results from MatchCode.io export and create packages/associations.

    For each package entry with a "package_uid", the package is created or
    updated through ``pipes.update_or_create_package`` and associated to the
    project resources whose path is listed for that "package_uid" in the
    'files' section. The "extra_data" of those file entries is then stored on
    the matching project resources.

    Expected format:
    - Dict with 'files' and 'packages' keys
    - 'files' contains resource data with 'for_packages' associations
    - 'packages' contains matched package data

    Return the number of packages that did not exist in the project before
    the import. Return 0 when ``scan_data`` is not a dict.
    """
    from collections import defaultdict

    if not isinstance(scan_data, dict):
        return 0

    # Missing, null, or malformed sections are treated as empty.
    files_data = [
        entry for entry in (scan_data.get("files") or []) if isinstance(entry, dict)
    ]
    packages_data = [
        entry for entry in (scan_data.get("packages") or []) if isinstance(entry, dict)
    ]

    resource_paths_by_package_uid = defaultdict(list)
    extra_data_by_path = {}
    for file_data in files_data:
        file_path = file_data.get("path")
        if not file_path:
            continue

        for package_uid in file_data.get("for_packages") or []:
            resource_paths_by_package_uid[package_uid].append(file_path)

        extra_data = file_data.get("extra_data")
        if extra_data:
            extra_data_by_path[file_path] = extra_data

    created_packages = 0
    # Paths associated to at least one imported package, in first-seen order.
    matched_paths = {}

    for package_data in packages_data:
        package_uid = package_data.get("package_uid")
        if not package_uid:
            continue

        resource_paths = resource_paths_by_package_uid.get(package_uid, [])
        resources = project.codebaseresources.filter(path__in=resource_paths)

        # NOTE(review): ``update_or_create_package`` is expected to return the
        # package instance only, not a ``(package, created)`` pair, so the
        # "created" state is determined before calling it. Confirm against
        # ``scanpipe.pipes``.
        already_exists = project.discoveredpackages.filter(
            package_uid=package_uid
        ).exists()
        package = pipes.update_or_create_package(project, package_data)
        if package and not already_exists:
            created_packages += 1

        if package and resources.exists():
            package.add_resources(resources)

        matched_paths.update(dict.fromkeys(resource_paths))

    # Store the matching details once per resource, rather than rescanning
    # every file entry for every package.
    for file_path in matched_paths:
        extra_data = extra_data_by_path.get(file_path)
        if not extra_data:
            continue

        resource = project.codebaseresources.filter(path=file_path).first()
        if resource:
            resource.update_extra_data(extra_data)

    return created_packages
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"files": [
3+
{
4+
"path": "src/utils.js",
5+
"type": "file",
6+
"for_packages": [
7+
"pkg:npm/lodash@4.17.21?uuid=test-uuid-1234"
8+
],
9+
"extra_data": {
10+
"matched_to": "lodash",
11+
"path_score": 100,
12+
"matched_fingerprints": [
13+
"abc123def456"
14+
]
15+
}
16+
},
17+
{
18+
"path": "src/helper.js",
19+
"type": "file",
20+
"for_packages": [
21+
"pkg:npm/lodash@4.17.21?uuid=test-uuid-1234"
22+
],
23+
"extra_data": {
24+
"matched_to": "lodash",
25+
"path_score": 95
26+
}
27+
}
28+
],
29+
"packages": [
30+
{
31+
"purl": "pkg:npm/lodash@4.17.21",
32+
"package_uid": "pkg:npm/lodash@4.17.21?uuid=test-uuid-1234",
33+
"type": "npm",
34+
"name": "lodash",
35+
"version": "4.17.21",
36+
"description": "Lodash modular utilities",
37+
"declared_license_expression": "mit"
38+
}
39+
]
40+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"packages": [
3+
{
4+
"purl": "pkg:npm/lodash@4.17.21",
5+
"type": "npm",
6+
"namespace": "",
7+
"name": "lodash",
8+
"version": "4.17.21",
9+
"description": "Lodash modular utilities",
10+
"homepage_url": "https://lodash.com/",
11+
"download_url": "https://registry.npmjs.com/lodash/-/lodash-4.17.21.tgz",
12+
"repository_homepage_url": "https://www.npmjs.com/package/lodash",
13+
"declared_license_expression": "mit",
14+
"declared_license_expression_spdx": "MIT",
15+
"copyright": "Copyright OpenJS Foundation and other contributors",
16+
"primary_language": "JavaScript"
17+
},
18+
{
19+
"purl": "pkg:pypi/requests@2.28.0",
20+
"type": "pypi",
21+
"namespace": "",
22+
"name": "requests",
23+
"version": "2.28.0",
24+
"description": "Python HTTP for Humans",
25+
"homepage_url": "https://requests.readthedocs.io",
26+
"repository_homepage_url": "https://pypi.org/project/requests/",
27+
"declared_license_expression": "apache-2.0",
28+
"declared_license_expression_spdx": "Apache-2.0"
29+
}
30+
]
31+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
[
2+
{
3+
"purl": "pkg:pypi/django@5.0",
4+
"affected_by_vulnerabilities": [
5+
{
6+
"vulnerability_id": "VCID-3gge-bre2-aaac",
7+
"summary": "CVE-2024-24680 vulnerability",
8+
"aliases": [
9+
"CVE-2024-24680",
10+
"GHSA-xxj9-f6rv-m3x4"
11+
]
12+
}
13+
]
14+
},
15+
{
16+
"purl": "pkg:pypi/requests@2.28.0",
17+
"affected_by_vulnerabilities": [
18+
{
19+
"vulnerability_id": "VCID-test-vuln-aaaa",
20+
"summary": "Test vulnerability",
21+
"aliases": [
22+
"CVE-2023-12345"
23+
]
24+
}
25+
]
26+
}
27+
]

0 commit comments

Comments
 (0)