Skip to content

Commit 33df277

Browse files
authored
[SP-4275] feat: extract and add cpe's to spdx output (#204)
* [SP-4275] feat: extract and add cpe's to spdx output * [SP-4275] chore: normalize cpe to lowercase
1 parent bcd043f commit 33df277

5 files changed

Lines changed: 257 additions & 9 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,4 @@ docs/build
3434
.DS_Store
3535
!scanoss.json
3636
examples/output/
37+
!spdx-*.json

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
## [Unreleased]
99

1010

11+
## [1.52.1] - 2026-04-14
12+
### Fixed
13+
- Fixed CPE identifiers missing from SPDX Lite output (`--format spdxlite`)
14+
- CPEs are now emitted as SPDX 2.2 `externalRefs` with `referenceCategory: SECURITY`
15+
- CPE 2.3 strings use `referenceType: cpe23Type`; legacy `cpe:/...` and `cpe:2.2:...` use `cpe22Type`
16+
- Multiple CPEs per component are preserved and deduplicated
17+
1118
## [1.52.0] - 2026-04-09
1219
### Added
1320
- Added `status` subcommand query to `component` command to retrieve development life-cycle status:
@@ -877,3 +884,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
877884
[1.51.0]: https://github.com/scanoss/scanoss.py/compare/v1.50.1...v1.51.0
878885
[1.51.1]: https://github.com/scanoss/scanoss.py/compare/v1.51.0...v1.51.1
879886
[1.52.0]: https://github.com/scanoss/scanoss.py/compare/v1.51.1...v1.52.0
887+
[1.52.1]: https://github.com/scanoss/scanoss.py/compare/v1.52.0...v1.52.1

src/scanoss/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@
2222
THE SOFTWARE.
2323
"""
2424

25-
__version__ = '1.52.0'
25+
__version__ = '1.52.1'

src/scanoss/spdxlite.py

Lines changed: 85 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,41 @@ def _create_file_summary(self, entry: dict) -> dict:
219219
for field in fields:
220220
summary[field] = entry.get(field)
221221
summary['licenses'] = self._process_licenses(entry.get('licenses'))
222+
summary['cpes'] = self._extract_cpes(entry.get('vulnerabilities'))
222223
return summary
223224

225+
def _extract_cpes(self, vulnerabilities: list) -> list:
226+
"""
227+
Extract CPE identifiers from a file entry's vulnerabilities array.
228+
229+
Raw scan results deliver CPEs embedded as vulnerability IDs prefixed with "CPE:"
230+
(case-insensitive). Everything else in the array is a real vulnerability record
231+
(CVE/GHSA) and must be ignored here.
232+
233+
Args:
234+
vulnerabilities (list): The 'vulnerabilities' list from a file match entry.
235+
May be None or empty.
236+
237+
Returns:
238+
list: Deduplicated list of CPE strings in source order (e.g.
239+
['cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*']).
240+
Returns an empty list when there are no CPE entries.
241+
"""
242+
if not vulnerabilities:
243+
return []
244+
cpes = []
245+
seen = set()
246+
for vuln in vulnerabilities:
247+
vuln_id = vuln.get('ID') or vuln.get('id') or ''
248+
if not vuln_id.upper().startswith('CPE:'):
249+
continue
250+
normalized = vuln_id.upper()
251+
if normalized in seen:
252+
continue
253+
seen.add(normalized)
254+
cpes.append(vuln_id)
255+
return cpes
256+
224257
def _process_licenses(self, licenses: list) -> list:
225258
"""
226259
Process license information and remove duplicates.
@@ -426,6 +459,15 @@ def _create_package_info(self, purl: str, comp: dict, lic_refs: set) -> dict:
426459
purl_ver = f'{purl}@{comp_ver}'
427460
purl_hash = hashlib.md5(purl_ver.encode('utf-8')).hexdigest()
428461

462+
external_refs = [
463+
{
464+
'referenceCategory': 'PACKAGE-MANAGER',
465+
'referenceLocator': PackageURL.from_string(purl_ver).to_string(),
466+
'referenceType': 'purl'
467+
}
468+
]
469+
external_refs.extend(self._create_cpe_external_refs(comp.get('cpes', [])))
470+
429471
return {
430472
'name': comp.get('component'),
431473
'SPDXID': f'SPDXRef-{purl_hash}',
@@ -437,13 +479,7 @@ def _create_package_info(self, purl: str, comp: dict, lic_refs: set) -> dict:
437479
'filesAnalyzed': False,
438480
'copyrightText': 'NOASSERTION',
439481
'supplier': f'Organization: {comp.get("vendor", "NOASSERTION")}',
440-
'externalRefs': [
441-
{
442-
'referenceCategory': 'PACKAGE-MANAGER',
443-
'referenceLocator': PackageURL.from_string(purl_ver).to_string(),
444-
'referenceType': 'purl'
445-
}
446-
],
482+
'externalRefs': external_refs,
447483
'checksums': [
448484
{
449485
'algorithm': 'MD5',
@@ -452,6 +488,48 @@ def _create_package_info(self, purl: str, comp: dict, lic_refs: set) -> dict:
452488
],
453489
}
454490

491+
def _create_cpe_external_refs(self, cpes: list) -> list:
492+
"""
493+
Build SPDX externalRefs entries for a component's CPE identifiers.
494+
495+
SPDX 2.2 models CPEs under the SECURITY reference category. Each CPE string
496+
must be emitted as its own externalRef dict with the shape:
497+
498+
{
499+
'referenceCategory': 'SECURITY',
500+
'referenceType': 'cpe23Type' | 'cpe22Type',
501+
'referenceLocator': '<cpe string>',
502+
}
503+
504+
Args:
505+
cpes (list): CPE strings extracted from the raw scan results. The list is
506+
already deduplicated by `_extract_cpes`. Values look like
507+
'cpe:2.3:a:vendor:product:version:...' (CPE 2.3) or
508+
'cpe:/a:vendor:product:version' (legacy CPE 2.2). May be empty.
509+
510+
Returns:
511+
list: A list of SPDX externalRef dicts ready to be appended to a package's
512+
`externalRefs`. Return an empty list when `cpes` is empty.
513+
"""
514+
if not cpes:
515+
return []
516+
refs = []
517+
for cpe in cpes:
518+
normalized = cpe.lower()
519+
if normalized.startswith('cpe:2.3:'):
520+
ref_type = 'cpe23Type'
521+
elif normalized.startswith('cpe:/') or normalized.startswith('cpe:2.2:'):
522+
ref_type = 'cpe22Type'
523+
else:
524+
self.print_debug(f'Warning: Unrecognized CPE format, defaulting to cpe23Type: {cpe}')
525+
ref_type = 'cpe23Type'
526+
refs.append({
527+
'referenceCategory': 'SECURITY',
528+
'referenceType': ref_type,
529+
'referenceLocator': cpe,
530+
})
531+
return refs
532+
455533
def _process_package_licenses(self, licenses: list, lic_refs: set) -> str:
456534
"""
457535
Process licenses and return license text formatted for SPDX.

tests/test_spdxlite.py

Lines changed: 162 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,165 @@ def testSpdxLite(self):
6767
self.assertEqual(len(checksum.get("checksumValue")), md5_length) #Check checksum length value be 32
6868

6969

70-
os.remove(spdx_lite_output) #Removes tmp spdxlite.json file
70+
os.remove(spdx_lite_output) #Removes tmp spdxlite.json file
71+
72+
73+
class SpdxLiteCpeTests(unittest.TestCase):
74+
"""
75+
Exercise CPE extraction and SPDX externalRefs emission.
76+
"""
77+
78+
@staticmethod
79+
def _build_raw(vulnerabilities, purl='pkg:github/postgres/postgres'):
80+
return {
81+
'src/main.c': [{
82+
'id': 'file',
83+
'component': 'postgresql',
84+
'vendor': 'postgresql',
85+
'version': '17.0',
86+
'latest': '17.0',
87+
'url': 'https://www.postgresql.org',
88+
'url_hash': 'abc123',
89+
'download_url': 'https://example.com/pg.tar.gz',
90+
'purl': [purl],
91+
'licenses': [{'name': 'PostgreSQL', 'source': 'component_declared'}],
92+
'vulnerabilities': vulnerabilities,
93+
}]
94+
}
95+
96+
def _run(self, raw):
97+
fd, out_path = tempfile.mkstemp(prefix='spdxlite_cpe_', suffix='.json')
98+
os.close(fd) # SpdxLite re-opens the path itself for writing
99+
try:
100+
spdx = SpdxLite(debug=False, output_file=out_path)
101+
spdx.produce_from_json(raw)
102+
with open(out_path, 'r') as f:
103+
return json.load(f)
104+
finally:
105+
if os.path.exists(out_path):
106+
os.remove(out_path)
107+
108+
def _security_refs(self, doc):
109+
refs = doc['packages'][0]['externalRefs']
110+
return [r for r in refs if r['referenceCategory'] == 'SECURITY']
111+
112+
def test_cpe23_emits_cpe23Type(self):
113+
cpe = 'cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
114+
doc = self._run(self._build_raw([{'ID': cpe, 'source': 'nvd'}]))
115+
refs = self._security_refs(doc)
116+
self.assertEqual(len(refs), 1)
117+
self.assertEqual(refs[0]['referenceType'], 'cpe23Type')
118+
self.assertEqual(refs[0]['referenceLocator'], cpe)
119+
120+
def test_legacy_cpe22_slash_emits_cpe22Type(self):
121+
cpe = 'cpe:/a:postgresql:postgresql:17.0'
122+
doc = self._run(self._build_raw([{'ID': cpe, 'source': 'nvd'}]))
123+
refs = self._security_refs(doc)
124+
self.assertEqual(len(refs), 1)
125+
self.assertEqual(refs[0]['referenceType'], 'cpe22Type')
126+
self.assertEqual(refs[0]['referenceLocator'], cpe)
127+
128+
def test_explicit_cpe22_prefix_emits_cpe22Type(self):
129+
cpe = 'cpe:2.2:a:postgresql:postgresql:17.0'
130+
doc = self._run(self._build_raw([{'ID': cpe, 'source': 'nvd'}]))
131+
refs = self._security_refs(doc)
132+
self.assertEqual(len(refs), 1)
133+
self.assertEqual(refs[0]['referenceType'], 'cpe22Type')
134+
135+
def test_case_insensitive_prefix_detection(self):
136+
cpe = 'CPE:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
137+
doc = self._run(self._build_raw([{'ID': cpe, 'source': 'nvd'}]))
138+
refs = self._security_refs(doc)
139+
self.assertEqual(len(refs), 1)
140+
self.assertEqual(refs[0]['referenceType'], 'cpe23Type')
141+
self.assertEqual(refs[0]['referenceLocator'], cpe) # casing preserved in locator
142+
143+
def test_duplicate_cpes_are_deduplicated(self):
144+
cpe = 'cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
145+
doc = self._run(self._build_raw([
146+
{'ID': cpe, 'source': 'nvd'},
147+
{'ID': cpe, 'source': 'nvd'},
148+
{'ID': cpe, 'source': 'nvd'},
149+
]))
150+
refs = self._security_refs(doc)
151+
self.assertEqual(len(refs), 1)
152+
153+
def test_dedup_is_case_insensitive_and_preserves_first_locator(self):
154+
lower = 'cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
155+
upper = 'CPE:2.3:A:POSTGRESQL:POSTGRESQL:17.0:*:*:*:*:*:*:*'
156+
doc = self._run(self._build_raw([
157+
{'ID': lower, 'source': 'nvd'},
158+
{'ID': upper, 'source': 'nvd'},
159+
]))
160+
refs = self._security_refs(doc)
161+
self.assertEqual(len(refs), 1)
162+
self.assertEqual(refs[0]['referenceLocator'], lower) # first-seen wins
163+
164+
def test_cve_entries_are_ignored(self):
165+
doc = self._run(self._build_raw([
166+
{'ID': 'CVE-2024-12345', 'CVE': 'CVE-2024-12345',
167+
'source': 'nvd', 'severity': 'high'},
168+
{'ID': 'GHSA-xxxx-yyyy-zzzz', 'source': 'github'},
169+
]))
170+
refs = self._security_refs(doc)
171+
self.assertEqual(refs, [])
172+
173+
def test_mixed_cpe_versions_in_same_component(self):
174+
cpe23 = 'cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
175+
cpe22 = 'cpe:/a:postgresql:postgresql:17.0'
176+
doc = self._run(self._build_raw([
177+
{'ID': cpe23, 'source': 'nvd'},
178+
{'ID': cpe22, 'source': 'nvd'},
179+
]))
180+
refs = self._security_refs(doc)
181+
self.assertEqual(len(refs), 2)
182+
types = {r['referenceType']: r['referenceLocator'] for r in refs}
183+
self.assertEqual(types['cpe23Type'], cpe23)
184+
self.assertEqual(types['cpe22Type'], cpe22)
185+
186+
def test_unknown_cpe_format_falls_back_to_cpe23Type(self):
187+
odd_cpe = 'cpe:weird-format:postgresql:17.0'
188+
doc = self._run(self._build_raw([{'ID': odd_cpe, 'source': 'nvd'}]))
189+
refs = self._security_refs(doc)
190+
self.assertEqual(len(refs), 1)
191+
self.assertEqual(refs[0]['referenceType'], 'cpe23Type')
192+
self.assertEqual(refs[0]['referenceLocator'], odd_cpe)
193+
194+
def test_no_vulnerabilities_field_produces_no_security_refs(self):
195+
raw = self._build_raw([])
196+
# Drop the key entirely to simulate entries without a vulnerabilities block
197+
del raw['src/main.c'][0]['vulnerabilities']
198+
doc = self._run(raw)
199+
self.assertEqual(self._security_refs(doc), [])
200+
# PURL externalRef must still be present
201+
refs = doc['packages'][0]['externalRefs']
202+
self.assertEqual(len(refs), 1)
203+
self.assertEqual(refs[0]['referenceType'], 'purl')
204+
205+
def test_empty_vulnerabilities_list_produces_no_security_refs(self):
206+
doc = self._run(self._build_raw([]))
207+
self.assertEqual(self._security_refs(doc), [])
208+
209+
def test_dependency_entries_do_not_emit_cpes(self):
210+
raw = {
211+
'package.json': [{
212+
'id': 'dependency',
213+
'dependencies': [{
214+
'purl': 'pkg:npm/left-pad',
215+
'component': 'left-pad',
216+
'version': '1.3.0',
217+
'url': 'https://npmjs.com/package/left-pad',
218+
'licenses': [{'name': 'MIT', 'source': 'component_declared'}],
219+
}]
220+
}]
221+
}
222+
doc = self._run(raw)
223+
self.assertEqual(self._security_refs(doc), [])
224+
225+
def test_lowercase_id_key_is_also_supported(self):
226+
cpe = 'cpe:2.3:a:postgresql:postgresql:17.0:*:*:*:*:*:*:*'
227+
# Raw scan output has been known to use 'id' (lowercase) occasionally
228+
doc = self._run(self._build_raw([{'id': cpe, 'source': 'nvd'}]))
229+
refs = self._security_refs(doc)
230+
self.assertEqual(len(refs), 1)
231+
self.assertEqual(refs[0]['referenceType'], 'cpe23Type')

0 commit comments

Comments
 (0)