Skip to content

Commit 1831881

Browse files
committed
chore(hfh):SP-4188 include license info into hfh results
1 parent 367e4e6 commit 1831881

3 files changed

Lines changed: 201 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
- Added `--format raw` option to `folder-scan` command to export HFH results in snippet-scanner JSON format
1111
- Expands directory-level HFH results into per-file entries keyed by relative file path
1212
- Assigns each file to the most specific matching `path_id` (deepest directory match wins)
13+
- Added license decoration to folder hash scan results via the dependency service
14+
- Each component version in HFH results is now decorated with license information
15+
- CycloneDX output uses pre-decorated licenses instead of making a separate dependency API call
1316

1417
## [1.50.1] - 2026-03-23
1518
### Fixed

src/scanoss/scanners/scanner_hfh.py

Lines changed: 122 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -116,17 +116,108 @@ def __init__( # noqa: PLR0913
116116

117117
def _execute_grpc_scan(self, hfh_request: Dict) -> None:
    """
    Execute folder hash scan and decorate results with license information.

    The scan and the license decoration are guarded separately. Decoration is
    a best-effort enrichment, so an error raised while decorating is reported
    as a warning and the scan results obtained so far are kept.

    Args:
        hfh_request: Request dictionary for the gRPC call
    """
    try:
        self.scan_results = self.client.folder_hash_scan(hfh_request, self.use_grpc)
    except Exception as e:
        self.base.print_stderr(f'Error during folder hash scan: {e}')
        self.scan_results = None
        return

    try:
        self._decorate_with_licenses()
    except Exception as e:
        # Do not discard a successful scan because license enrichment failed.
        self.base.print_stderr(f'Warning: Failed to decorate results with license data: {e}')
129130

131+
def _decorate_with_licenses(self) -> None:
    """
    Attach license information to the component versions in the scan results.

    Licenses are requested from the client's dependency service. When there is
    nothing to look up, the request fails, or the response carries no 'files'
    key, the scan results are left untouched.
    """
    if not (self.scan_results and self.client):
        return

    scan_entries = self.scan_results.get('results', [])
    if not scan_entries:
        return

    requested_files = self._collect_dep_files(scan_entries)
    if not requested_files:
        return

    try:
        response = self.client.get_dependencies({'files': requested_files})
    except Exception as e:
        # A failed license lookup is only a warning; the scan itself succeeded.
        self.base.print_stderr(f'Warning: Failed to fetch license data: {e}')
        return

    if response and 'files' in response:
        lookup = self._build_license_map(response)
        self._inject_licenses(scan_entries, lookup)
157+
158+
@staticmethod
def _collect_dep_files(results: List[Dict]) -> List[Dict]:
    """Collect dependency file entries for all component versions in the results.

    Each returned entry has 'file' set to the result's 'path_id' and a
    single-item 'purls' list holding the component purl with the version as
    its 'requirement'. Components without a purl and versions without a
    version string are skipped.

    A given (purl, version) pair is emitted only once, under the first
    'path_id' where it is seen: the license lookup built from the response is
    keyed by purl and requirement only, so repeating the pair for other paths
    would just enlarge the request.

    Args:
        results (List[Dict]): The 'results' list from the HFH scan response.

    Returns:
        List[Dict]: Entries suitable for the 'files' list of a dependency request.
    """
    dep_files = []
    seen = set()
    for result in results:
        path_id = result.get('path_id', '')
        # 'or []' tolerates keys that are present but set to None.
        for component in result.get('components') or []:
            purl = component.get('purl', '')
            if not purl:
                continue
            for version_entry in component.get('versions') or []:
                version = version_entry.get('version', '')
                if not version or (purl, version) in seen:
                    continue
                seen.add((purl, version))
                dep_files.append(
                    {
                        'file': path_id,
                        'purls': [{'purl': purl, 'requirement': version}],
                    }
                )
    return dep_files
177+
178+
@staticmethod
def _build_license_map(decorated: Dict) -> Dict[str, List]:
    """Build a purl@requirement -> licenses lookup from the dependency service response.

    Dependencies without a purl or without any licenses are ignored. When a
    dependency carries no 'requirement', its 'version' is used for the key
    instead, but never in place of an entry already stored for that key.

    Args:
        decorated (Dict): The response from the dependency service containing
            decorated files with license information.

    Returns:
        Dict[str, List]: A mapping of 'purl@requirement' keys to their
            corresponding list of license dictionaries.
    """
    license_map: Dict[str, List] = {}
    # 'or []' tolerates keys that are present but set to None.
    for dep_file in decorated.get('files') or []:
        for dep in dep_file.get('dependencies') or []:
            dep_purl = dep.get('purl', '')
            licenses = dep.get('licenses') or []
            if not (dep_purl and licenses):
                continue
            # Use 'requirement' instead of 'version' as the key because the service
            # may resolve a different version, but the requirement always matches what was sent.
            dep_requirement = dep.get('requirement', '')
            if dep_requirement:
                license_map[f'{dep_purl}@{dep_requirement}'] = licenses
            else:
                # Without a requirement the key would be 'purl@' and never match;
                # fall back to the version without overriding an existing entry.
                fallback_key = f"{dep_purl}@{dep.get('version', '')}"
                license_map.setdefault(fallback_key, licenses)
    return license_map
201+
202+
@staticmethod
def _inject_licenses(results: List[Dict], license_map: Dict[str, List]) -> None:
    """Inject licenses from the lookup map into each component version entry.

    Every version entry ends up with a 'licenses' list. A match in
    ``license_map`` takes precedence; without a match, licenses already
    present on the entry are kept, and an empty list is used otherwise.

    Args:
        results (List[Dict]): The 'results' list from the HFH scan response.
            Each result contains components with version entries that will
            be mutated in place to include license data.
        license_map (Dict[str, List]): A mapping of 'purl@requirement' keys
            (the requirement being the version string that was requested) to
            their corresponding list of license dictionaries, as built by
            ``_build_license_map``.
    """
    for result in results:
        # 'or []' tolerates keys that are present but set to None.
        for component in result.get('components') or []:
            purl = component.get('purl', '')
            for version_entry in component.get('versions') or []:
                version = version_entry.get('version', '')
                matched = license_map.get(f'{purl}@{version}')
                # Do not wipe licenses the entry already carries when there is no match.
                version_entry['licenses'] = matched or version_entry.get('licenses') or []
220+
130221
def scan(self) -> Optional[Dict]:
131222
"""
132223
Scan the provided directory using the folder hashing algorithm.
@@ -215,30 +306,34 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911
215306
if not best_match_component.get('versions'):
216307
self.base.print_stderr('ERROR: No versions found for best match component')
217308
return ''
218-
219309
best_match_version = best_match_component['versions'][0]
220310
purl = best_match_component['purl']
311+
version = best_match_version['version']
312+
licenses = best_match_version.get('licenses', [])
221313

222-
get_dependencies_json_request = {
223-
'files': [
314+
# Build scan_results from already-decorated HFH data
315+
scan_results = {
316+
f'{best_match_component["name"]}:{version}': [
224317
{
225-
'file': f'{best_match_component["name"]}:{best_match_version["version"]}',
226-
'purls': [{'purl': purl, 'requirement': best_match_version['version']}],
318+
'id': 'dependency',
319+
'dependencies': [
320+
{
321+
'purl': purl,
322+
'component': best_match_component.get('name', ''),
323+
'version': version,
324+
'licenses': licenses,
325+
}
326+
],
227327
}
228328
]
229329
}
230330

231331
get_vulnerabilities_json_request = {
232-
'components': [{'purl': purl, 'requirement': best_match_version['version']}],
332+
'components': [{'purl': purl, 'requirement': version}],
233333
}
234-
235-
decorated_scan_results = self.scanner.client.get_dependencies(get_dependencies_json_request)
236334
vulnerabilities = self.scanner.client.get_vulnerabilities_json(get_vulnerabilities_json_request)
237335

238336
cdx = CycloneDx(self.base.debug)
239-
scan_results = {}
240-
for f in decorated_scan_results['files']:
241-
scan_results[f['file']] = [f]
242337
success, cdx_output = cdx.produce_from_json(scan_results)
243338
if not success:
244339
error_msg = 'ERROR: Failed to produce CycloneDX output'
@@ -250,7 +345,7 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911
250345

251346
return json.dumps(cdx_output, indent=2)
252347
except Exception as e:
253-
self.base.print_stderr(f'ERROR: Failed to get license information: {e}')
348+
self.base.print_stderr(f'ERROR: Failed to produce CycloneDX output: {e}')
254349
return None
255350

256351
def _format_spdxlite_output(self) -> str:
@@ -411,6 +506,19 @@ def _build_file_match_entry(
411506
"""
412507
purl = component.get('purl', '')
413508
version = best_version.get('version', '')
509+
licenses = [
510+
{
511+
'name': lic.get('spdx_id') or lic.get('name', ''),
512+
'patent_hints': '',
513+
'copyleft': '',
514+
'checklist_url': '',
515+
'incompatible_with': '',
516+
'osadl_updated': '',
517+
'source': 'component_declared',
518+
'url': f"https://spdx.org/licenses/{lic['spdx_id']}.html" if lic.get('spdx_id') else '',
519+
}
520+
for lic in best_version.get('licenses', [])
521+
]
414522

415523
url = purl2url.get_repo_url(purl) if purl else ''
416524
return {
@@ -428,7 +536,7 @@ def _build_file_match_entry(
428536
'source_hash': file_hash,
429537
'url_hash': '',
430538
'release_date': '',
431-
'licenses': [],
539+
'licenses': licenses,
432540
'lines': 'all',
433541
'oss_lines': 'all',
434542
'status': 'pending',

tests/test_scanner_hfh.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,13 @@ class TestBuildFileMatchEntry(unittest.TestCase):
212212
def test_basic_entry(self, mock_purl2url):
213213
mock_purl2url.get_repo_url.return_value = 'https://github.com/vendor/comp'
214214
component = {'purl': 'pkg:github/vendor/comp', 'name': 'comp', 'vendor': 'vendor'}
215-
best_version = {'version': '1.0.0', 'licenses': [{'name': 'MIT'}]}
215+
# HFH API license format
216+
best_version = {
217+
'version': '1.0.0',
218+
'licenses': [
219+
{'name': 'MIT License', 'spdx_id': 'MIT', 'is_spdx_approved': True, 'url': 'https://spdx.org/licenses/MIT.html'},
220+
],
221+
}
216222

217223
entry = ScannerHFHPresenter._build_file_match_entry(
218224
component, best_version, 'src/file.py', 'abc123', 'https://api.example.com'
@@ -232,7 +238,17 @@ def test_basic_entry(self, mock_purl2url):
232238
self.assertEqual(entry['source_hash'], 'abc123')
233239
self.assertEqual(entry['url_hash'], '')
234240
self.assertEqual(entry['release_date'], '')
235-
self.assertEqual(entry['licenses'], [{'name': 'MIT'}])
241+
# License should be transformed from HFH format to snippet-scanner format
242+
self.assertEqual(len(entry['licenses']), 1)
243+
lic = entry['licenses'][0]
244+
self.assertEqual(lic['name'], 'MIT')
245+
self.assertEqual(lic['source'], 'component_declared')
246+
self.assertEqual(lic['url'], 'https://spdx.org/licenses/MIT.html')
247+
self.assertEqual(lic['patent_hints'], '')
248+
self.assertEqual(lic['copyleft'], '')
249+
self.assertEqual(lic['checklist_url'], '')
250+
self.assertEqual(lic['incompatible_with'], '')
251+
self.assertEqual(lic['osadl_updated'], '')
236252
self.assertEqual(entry['lines'], 'all')
237253
self.assertEqual(entry['oss_lines'], 'all')
238254
self.assertEqual(entry['status'], 'pending')
@@ -266,6 +282,64 @@ def test_missing_fields_use_defaults(self, mock_purl2url):
266282
self.assertEqual(entry['version'], '')
267283
self.assertEqual(entry['licenses'], [])
268284

285+
@patch('scanoss.scanners.scanner_hfh.purl2url')
def test_license_uses_spdx_id_as_name(self, mock_purl2url):
    """The SPDX identifier, not the long license name, is used as the license name."""
    mock_purl2url.get_repo_url.return_value = ''
    gpl_license = {
        'name': 'GNU General Public License v2.0 only',
        'spdx_id': 'GPL-2.0-only',
        'is_spdx_approved': True,
        'url': 'https://spdx.org/licenses/GPL-2.0-only.html',
    }

    entry = ScannerHFHPresenter._build_file_match_entry(
        {'purl': 'pkg:github/v/c', 'name': 'c', 'vendor': 'v'},
        {'version': '1.0', 'licenses': [gpl_license]},
        'file.py',
        'hash',
        'https://api.example.com',
    )

    first_license = entry['licenses'][0]
    self.assertEqual(first_license['name'], 'GPL-2.0-only')
    self.assertEqual(first_license['url'], 'https://spdx.org/licenses/GPL-2.0-only.html')
303+
304+
@patch('scanoss.scanners.scanner_hfh.purl2url')
def test_license_without_spdx_id_falls_back_to_name(self, mock_purl2url):
    """A license lacking 'spdx_id' keeps its plain name and gets an empty URL."""
    mock_purl2url.get_repo_url.return_value = ''
    custom_license = {'name': 'Some Custom License'}

    entry = ScannerHFHPresenter._build_file_match_entry(
        {'purl': 'pkg:github/v/c', 'name': 'c', 'vendor': 'v'},
        {'version': '1.0', 'licenses': [custom_license]},
        'file.py',
        'hash',
        'https://api.example.com',
    )

    first_license = entry['licenses'][0]
    self.assertEqual(first_license['name'], 'Some Custom License')
    self.assertEqual(first_license['url'], '')
320+
321+
@patch('scanoss.scanners.scanner_hfh.purl2url')
def test_multiple_licenses_transformed(self, mock_purl2url):
    """Each license of the best version is transformed, preserving input order."""
    mock_purl2url.get_repo_url.return_value = ''
    hfh_licenses = [
        {
            'name': 'MIT License',
            'spdx_id': 'MIT',
            'is_spdx_approved': True,
            'url': 'https://spdx.org/licenses/MIT.html',
        },
        {
            'name': 'Apache License 2.0',
            'spdx_id': 'Apache-2.0',
            'is_spdx_approved': True,
            'url': 'https://spdx.org/licenses/Apache-2.0.html',
        },
    ]

    entry = ScannerHFHPresenter._build_file_match_entry(
        {'purl': 'pkg:github/v/c', 'name': 'c', 'vendor': 'v'},
        {'version': '1.0', 'licenses': hfh_licenses},
        'file.py',
        'hash',
        'https://api.example.com',
    )

    expected = [
        ('MIT', 'https://spdx.org/licenses/MIT.html'),
        ('Apache-2.0', 'https://spdx.org/licenses/Apache-2.0.html'),
    ]
    self.assertEqual(len(entry['licenses']), 2)
    for produced, (name, url) in zip(entry['licenses'], expected):
        self.assertEqual(produced['name'], name)
        self.assertEqual(produced['url'], url)
342+
269343
@patch('scanoss.scanners.scanner_hfh.purl2url')
270344
def test_purl2url_returns_none(self, mock_purl2url):
271345
mock_purl2url.get_repo_url.return_value = None

0 commit comments

Comments
 (0)