Skip to content

Commit 367e4e6

Browse files
committed
feat(hfh):SP-4181 implement raw output format for folder hashing
1 parent 8d75044 commit 367e4e6

4 files changed

Lines changed: 474 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88
## [Unreleased]
9+
### Added
10+
- Added `--format raw` option to `folder-scan` command to export HFH results in snippet-scanner JSON format
11+
- Expands directory-level HFH results into per-file entries keyed by relative file path
12+
- Assigns each file to the most specific matching `path_id` (deepest directory match wins)
913

1014
## [1.50.1] - 2026-03-23
1115
### Fixed

src/scanoss/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
988988
'--format',
989989
'-f',
990990
type=str,
991-
choices=['json', 'cyclonedx'],
991+
choices=['json', 'cyclonedx', 'raw'],
992992
default='json',
993993
help='Result output format (optional - default: json)',
994994
)

src/scanoss/scanners/scanner_hfh.py

Lines changed: 185 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,15 @@
2222
THE SOFTWARE.
2323
"""
2424

25+
import hashlib
2526
import json
27+
import os
2628
import threading
2729
import time
28-
from typing import Dict, Optional
30+
from pathlib import Path
31+
from typing import Dict, List, Optional, Tuple
2932

33+
from packageurl.contrib import purl2url
3034
from progress.spinner import Spinner
3135

3236
from scanoss.constants import (
@@ -163,6 +167,13 @@ class ScannerHFHPresenter(AbstractPresenter):
163167
"""
164168

165169
def __init__(self, scanner: ScannerHFH, **kwargs):
170+
"""
171+
Initialize the presenter.
172+
173+
Args:
174+
scanner (ScannerHFH): The HFH scanner instance containing scan results and file filters.
175+
**kwargs: Additional arguments passed to AbstractPresenter (debug, trace, quiet, etc.).
176+
"""
166177
super().__init__(**kwargs)
167178
self.scanner = scanner
168179

@@ -249,4 +260,176 @@ def _format_csv_output(self) -> str:
249260
raise NotImplementedError('CSV output is not implemented')
250261

251262
def _format_raw_output(self) -> str:
    """
    Convert HFH scan results into snippet-scanner JSON format.

    Expands directory-level HFH results into per-file entries keyed by file path.
    Every file returned by the scanner's file filters is assigned to exactly one
    path_id: the most specific (deepest) one that contains it. For each assigned
    file the MD5 hash is computed and an entry is built with the API base URL
    taken from ``self.scanner.client.orig_url``.

    Returns:
        str: A JSON string (indent=2) mapping file path -> [entry], or '{}' when
        there are no results, no qualifying components, or no matching files.
    """
    scan_results = self.scanner.scan_results
    if not scan_results or 'results' not in scan_results:
        return '{}'

    hfh_results = scan_results.get('results', [])
    if not hfh_results:
        return '{}'

    # Collect best-match component info per path_id
    path_components = self._extract_best_components(hfh_results)
    if not path_components:
        return '{}'

    # Materialise the file list once: it is walked once per path_id, so a one-shot
    # iterable would otherwise be exhausted after the first pass. dict.fromkeys drops
    # duplicates while preserving the order the filter returned them in.
    # NOTE(review): paths are expected to be relative to scan_dir - confirm against FileFilters.
    filtered = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir)
    unclaimed = list(dict.fromkeys(filtered or []))

    # Sort path_ids deepest first so the most specific directory claims its files first.
    # The root '.' gets the lowest key and therefore always comes last; the others are
    # ordered by separator count, then by length.
    # Example: ['.', 'external', 'project-1.0', 'project-1.0/src/lib']
    #       -> ['project-1.0/src/lib', 'project-1.0', 'external', '.']
    sorted_path_ids = sorted(
        path_components,
        key=lambda p: (-1, 0) if p == '.' else (p.count(os.sep), len(p)),
        reverse=True,
    )

    output = {}
    scan_dir = Path(self.scanner.scan_dir).resolve()
    # Loop-invariant: the base URL is the same for every entry.
    api_url = self.scanner.client.orig_url or ''

    for path_id in sorted_path_ids:
        if not unclaimed:
            break
        component, best_version = path_components[path_id]
        still_unclaimed = []
        for file_path in unclaimed:
            if not self._file_matches_path_id(file_path, path_id):
                still_unclaimed.append(file_path)
                continue
            # Path.__truediv__ (/) joins paths using the correct OS separator
            file_hash = self._compute_file_md5(scan_dir / file_path)
            entry = self._build_file_match_entry(component, best_version, file_path, file_hash, api_url)
            output[file_path] = [entry]
        # Only files nobody has claimed yet are offered to shallower path_ids.
        unclaimed = still_unclaimed

    return json.dumps(output, indent=2)
323+
324+
@staticmethod
def _extract_best_components(hfh_results: List[Dict]) -> Dict[str, Tuple[Dict, Dict]]:
    """
    Extract the best-match component and version for each path_id from HFH results.

    The best match is the first component whose 'order' equals 1; its first
    version entry is taken as the best version. Results without such a component,
    or whose component has no versions, are skipped. A missing 'path_id' is
    treated as the root '.'. If the same path_id appears more than once, the
    last qualifying result wins.

    Args:
        hfh_results (List[Dict]): The 'results' list from the HFH API response.

    Returns:
        Dict[str, Tuple[Dict, Dict]]: A dict mapping path_id to (component, best_version).
    """
    path_components = {}
    for result in hfh_results or []:
        path_id = result.get('path_id', '.')
        # `or []` covers an explicit JSON null, which .get()'s default does not.
        components = result.get('components') or []
        component = next(
            (c for c in components if isinstance(c, dict) and c.get('order') == 1),
            None,
        )
        if component is None:
            continue
        versions = component.get('versions') or []
        if not versions:
            continue
        path_components[path_id] = (component, versions[0])
    return path_components
351+
352+
@staticmethod
def _file_matches_path_id(file_path: str, path_id: str) -> bool:
    """
    Check if a file path belongs under a given path_id directory.

    Both arguments are compared after converting the platform separator
    (os.sep) to '/', so a '/'-separated path_id still matches files listed
    with the native separator. A trailing separator on path_id is ignored.
    A path_id of '.' (or './') matches every file.

    Args:
        file_path (str): Relative file path from the scan root.
        path_id (str): Relative directory path from the HFH result.

    Returns:
        bool: True if the file is path_id itself or lies beneath it.
    """
    # No-op on POSIX; on Windows this maps '\\' to '/' so both sides agree.
    directory = path_id.replace(os.sep, '/').rstrip('/')
    if directory == '.':
        return True
    candidate = file_path.replace(os.sep, '/')
    # The explicit '/' suffix prevents 'src' from matching 'src2/...'.
    return candidate == directory or candidate.startswith(directory + '/')
371+
372+
def _compute_file_md5(self, file_path: Path) -> str:
    """
    Compute the MD5 hash of a file's contents.

    The file is read in fixed-size binary chunks so large files are never held
    in memory in full; the resulting digest is identical to hashing the whole
    content at once.

    Args:
        file_path (Path): Path to the file to hash.

    Returns:
        str: The MD5 hex digest, or an empty string if the file cannot be read
        (a warning is printed to stderr in that case).
    """
    digest = hashlib.md5()
    try:
        with file_path.open('rb') as handle:
            # iter(callable, sentinel) stops at the first empty read, i.e. EOF.
            for chunk in iter(lambda: handle.read(65536), b''):
                digest.update(chunk)
    except OSError as e:  # IOError is an alias of OSError on Python 3
        self.base.print_stderr(f'Warning: Failed to compute MD5 for {file_path}: {e}')
        return ''
    return digest.hexdigest()
390+
391+
@staticmethod
def _build_file_match_entry(
    component: Dict, best_version: Dict, file_path: str, file_hash: str, base_url: str,
) -> Dict:
    """
    Build a snippet-scanner-compatible result entry from an HFH component.

    Maps HFH component fields onto the result-entry keys. Values HFH does not
    provide here (url_hash, release_date, licenses) are emitted empty so the
    keys are always present.

    Args:
        component (Dict): The HFH component; 'purl', 'name' and 'vendor' are read.
        best_version (Dict): The top version entry; 'version' is read.
        file_path (str): Relative file path from the scan root directory.
        file_hash (str): Pre-computed MD5 hash of the local file ('' if unreadable).
        base_url (str): API base URL used to construct the file_url field.

    Returns:
        Dict: A result entry compatible with the snippet-scanner JSON format.
    """
    purl = component.get('purl', '')
    version = best_version.get('version', '')

    url = ''
    if purl:
        try:
            url = purl2url.get_repo_url(purl) or ''
        except ValueError:
            # A malformed purl must not abort the whole export; leave the URL empty.
            url = ''

    # Without a hash there is nothing to point at, so avoid a dangling
    # '<base>/file_contents/' URL. Stripping the trailing '/' prevents '//'.
    file_url = f"{(base_url or '').rstrip('/')}/file_contents/{file_hash}" if file_hash else ''

    return {
        'id': 'file',
        'matched': '100%',
        'purl': [purl],
        'component': component.get('name', ''),
        'vendor': component.get('vendor', ''),
        'version': version,
        'latest': version,
        'url': url,
        'file': file_path,
        'file_hash': file_hash,
        'file_url': file_url,
        'source_hash': file_hash,
        'url_hash': '',
        'release_date': '',
        'licenses': [],
        'lines': 'all',
        'oss_lines': 'all',
        'status': 'pending',
    }

0 commit comments

Comments
 (0)