Skip to content

Commit ed33fe3

Browse files
committed
feat(hfh):SP-4181 implement raw output format for folder hashing
1 parent 7974777 commit ed33fe3

2 files changed

Lines changed: 160 additions & 3 deletions

File tree

src/scanoss/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
988988
'--format',
989989
'-f',
990990
type=str,
991-
choices=['json', 'cyclonedx'],
991+
choices=['json', 'cyclonedx', 'raw'],
992992
default='json',
993993
help='Result output format (optional - default: json)',
994994
)

src/scanoss/scanners/scanner_hfh.py

Lines changed: 159 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,15 @@
2222
THE SOFTWARE.
2323
"""
2424

25+
import hashlib
2526
import json
27+
import os
2628
import threading
2729
import time
28-
from typing import Dict, Optional
30+
from pathlib import Path
31+
from typing import Dict, List, Optional, Tuple
32+
33+
from packageurl.contrib import purl2url
2934

3035
from progress.spinner import Spinner
3136

@@ -249,4 +254,156 @@ def _format_csv_output(self) -> str:
249254
raise NotImplementedError('CSV output is not implemented')
250255

251256
def _format_raw_output(self) -> str:
    """
    Convert HFH scan results into snippet-scanner JSON format.

    Expands directory-level HFH results into per-file entries keyed by
    file path. Each file is assigned to the most specific (deepest)
    path_id that contains it; the root path_id ('.') is always considered
    last so that it only picks up files no sub-directory result claimed.

    Returns:
        str: A JSON object (indent=2) mapping each matched file path to a
        single-element list holding its result entry, or '{}' when there
        are no usable HFH results.
    """
    scan_results = self.scanner.scan_results
    if not scan_results:
        return '{}'

    hfh_results = scan_results.get('results')
    if not hfh_results:
        return '{}'

    # Collect best-match component info per path_id
    path_components = self._extract_best_components(hfh_results)
    if not path_components:
        return '{}'

    # Fetch the filtered files once; materialise them because the collection
    # is iterated once per path_id below.
    # NOTE(review): entries are assumed to be paths relative to scan_dir - confirm.
    all_files = list(self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir))

    # Deepest path_id first so the most specific match wins. Counting path
    # components (rather than separators) ranks the root '.' at 0, strictly
    # below every real directory, so it can never pre-empt a sub-directory
    # match just because it appeared first in the results.
    path_components.sort(key=lambda item: len(Path(item[0]).parts), reverse=True)

    output = {}
    scan_dir = Path(self.scanner.scan_dir).resolve()

    for path_id, component, best_version in path_components:
        for file_path in all_files:
            # A file already present in the output was claimed by a deeper path_id
            if file_path in output:
                continue
            if not self._file_matches_path_id(file_path, path_id):
                continue

            # Path.__truediv__ (/) joins paths using the correct OS separator
            file_hash = self._compute_file_md5(scan_dir / file_path)
            output[file_path] = [self._build_snippet_entry(component, best_version, file_hash)]

    return json.dumps(output, indent=2)
299+
300+
@staticmethod
def _extract_best_components(hfh_results: List[Dict]) -> List[Tuple[str, Dict, Dict]]:
    """
    Extract the best-match component and version for each path_id from HFH results.

    Selects the first component with order == 1 (best match) and takes its
    first version. Results without a qualifying component or without versions
    are skipped. A missing or null 'path_id' is treated as the root ('.');
    null 'components' or 'versions' values are treated as empty.

    Args:
        hfh_results (List[Dict]): The 'results' list from the HFH API response.

    Returns:
        List[Tuple[str, Dict, Dict]]: A list of (path_id, component, best_version) tuples,
        in the same order as the input results.
    """
    path_components = []
    for result in hfh_results or []:
        path_id = result.get('path_id')
        if path_id is None:
            path_id = '.'

        # dict.get's default only covers a missing key; an explicit JSON null
        # would otherwise reach the loop below as None and raise TypeError.
        components = result.get('components') or []
        component = next((c for c in components if c.get('order') == 1), None)
        if component is None:
            continue

        versions = component.get('versions') or []
        if not versions:
            continue

        path_components.append((path_id, component, versions[0]))
    return path_components
327+
328+
@staticmethod
def _file_matches_path_id(file_path: str, path_id: str) -> bool:
    """
    Check if a file path belongs under a given path_id directory.

    Both file_path and path_id are expected to be relative to the scan root
    directory. Both are normalised before comparison, so equivalent spellings
    of the same directory ('src', 'src/', './src') all match. A path_id that
    normalises to '.' matches all files (root directory). An empty path_id
    matches nothing.

    Args:
        file_path (str): Relative file path from the scan root.
        path_id (str): Relative directory path from the HFH result.

    Returns:
        bool: True if the file is under (or equal to) the given path_id directory.
    """
    if not path_id:
        return False

    # normpath strips trailing separators, collapses './' and duplicate
    # separators, and on Windows converts '/' to the native separator.
    directory = os.path.normpath(path_id)
    if directory == os.curdir:
        return True

    candidate = os.path.normpath(file_path)
    # Append the separator so 'src' does not match a sibling such as 'srcx'
    return candidate == directory or candidate.startswith(directory + os.sep)
347+
348+
@staticmethod
def _compute_file_md5(file_path: Path) -> str:
    """
    Return the MD5 hex digest of a file's full contents.

    The digest is used as the 'file_hash' value of a raw-output entry.
    NOTE(review): intended to match the file_hash produced by the snippet
    scanner (winnowing.py) - confirm against that implementation.

    Args:
        file_path (Path): Path of the file to hash.

    Returns:
        str: The MD5 hex digest, or an empty string if the file cannot be read.
    """
    try:
        contents = file_path.read_bytes()
    except OSError:
        # Best effort: an unreadable file still gets an entry, just without a hash
        return ''
    return hashlib.md5(contents).hexdigest()
366+
367+
@staticmethod
def _build_snippet_entry(component: Dict, best_version: Dict, file_hash: str = '') -> Dict:
    """
    Create one snippet-scanner-style result entry for an HFH component.

    Values come from the component ('purl', 'name', 'vendor') and from the
    selected version ('version', 'score'). The repository URL is derived from
    the purl when one is present. The remaining keys (file, file_url,
    source_hash, url_hash, release_date, licenses) are emitted with empty
    values; per the original author, downstream validators require them.

    Args:
        component (Dict): The HFH component with purl, name, vendor fields.
        best_version (Dict): The top version entry with version and score fields.
        file_hash (str): Pre-computed MD5 hash of the local file.

    Returns:
        Dict: A result entry with 'id' set to 'file', 'matched' as a whole
        percentage string, and 'status' set to 'pending'.
    """
    purl = component.get('purl', '')
    score = best_version.get('score', 0)
    version = best_version.get('version', '')

    # Only consult purl2url when there is a purl; a lookup with no result becomes ''
    repo_url = ''
    if purl:
        repo_url = purl2url.get_repo_url(purl) or ''

    matched_percent = round(score * 100)
    name = component.get('name', '')
    vendor = component.get('vendor', '')

    return {
        'id': 'file',
        'matched': f'{matched_percent}%',
        'purl': [purl],
        'component': name,
        'vendor': vendor,
        'version': version,
        # HFH reports a single version; it is used for 'latest' as well
        'latest': version,
        'url': repo_url,
        'file': '',
        'file_hash': file_hash,
        'file_url': '',
        'source_hash': '',
        'url_hash': '',
        'release_date': '',
        'licenses': [],
        'lines': 'all',
        'oss_lines': 'all',
        'status': 'pending',
    }

0 commit comments

Comments
 (0)