Skip to content

Commit 23636aa

Browse files
committed
feat(hfh):SP-4181 implement raw output format for folder hashing
1 parent 7974777 commit 23636aa

3 files changed

Lines changed: 164 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88
## [Unreleased]
9+
### Added
10+
- Added `--format raw` option to `folder-scan` command to export HFH results in snippet-scanner JSON format
11+
- Expands directory-level HFH results into per-file entries keyed by relative file path
12+
- Computes MD5 `file_hash` for each file from disk
13+
- Assigns each file to the most specific matching `path_id` (deepest directory match wins)
914

1015
## [1.50.0] - 2026-03-17
1116
### Fixed

src/scanoss/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
988988
'--format',
989989
'-f',
990990
type=str,
991-
choices=['json', 'cyclonedx'],
991+
choices=['json', 'cyclonedx', 'raw'],
992992
default='json',
993993
help='Result output format (optional - default: json)',
994994
)

src/scanoss/scanners/scanner_hfh.py

Lines changed: 158 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,15 @@
2222
THE SOFTWARE.
2323
"""
2424

25+
import hashlib
2526
import json
27+
import os
2628
import threading
2729
import time
28-
from typing import Dict, Optional
30+
from pathlib import Path
31+
from typing import Dict, List, Optional, Tuple
2932

33+
from packageurl.contrib import purl2url
3034
from progress.spinner import Spinner
3135

3236
from scanoss.constants import (
@@ -249,4 +253,156 @@ def _format_csv_output(self) -> str:
249253
raise NotImplementedError('CSV output is not implemented')
250254

251255
def _format_raw_output(self) -> str:
252-
raise NotImplementedError('Raw output is not implemented')
256+
"""
257+
Convert HFH scan results into snippet-scanner JSON format.
258+
259+
Expands directory-level HFH results into per-file entries keyed by
260+
relative file path, matching the structure returned by the snippet scanner.
261+
"""
262+
if not self.scanner.scan_results or 'results' not in self.scanner.scan_results:
263+
return '{}'
264+
265+
hfh_results = self.scanner.scan_results.get('results', [])
266+
if not hfh_results:
267+
return '{}'
268+
269+
# Collect best-match component info per path_id
270+
path_components = self._extract_best_components(hfh_results)
271+
if not path_components:
272+
return '{}'
273+
274+
# Get all filtered files once (relative paths to scan_dir)
275+
all_files = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir)
276+
277+
# Sort path_ids by depth (deepest first) so most-specific match wins
278+
path_components.sort(key=lambda x: x[0].count(os.sep), reverse=True)
279+
280+
output = {}
281+
claimed_files = set()
282+
scan_dir = Path(self.scanner.scan_dir).resolve()
283+
284+
for path_id, component, best_version in path_components:
285+
for file_path in all_files:
286+
if file_path in claimed_files:
287+
continue
288+
if not self._file_matches_path_id(file_path, path_id):
289+
continue
290+
291+
claimed_files.add(file_path)
292+
# Path.__truediv__ (/) joins paths using the correct OS separator
293+
file_hash = self._compute_file_md5(scan_dir / file_path)
294+
entry = self._build_snippet_entry(component, best_version, file_hash)
295+
output[file_path] = [entry]
296+
297+
return json.dumps(output, indent=2)
298+
299+
@staticmethod
300+
def _extract_best_components(hfh_results: List[Dict]) -> List[Tuple[str, Dict, Dict]]:
301+
"""
302+
Extract the best-match component and version for each path_id from HFH results.
303+
304+
Filters for components with order == 1 (best match) and takes their first version.
305+
Results without a qualifying component or without versions are skipped.
306+
307+
Args:
308+
hfh_results (List[Dict]): The 'results' list from the HFH API response.
309+
310+
Returns:
311+
List[Tuple[str, Dict, Dict]]: A list of (path_id, component, best_version) tuples.
312+
"""
313+
path_components = []
314+
for result in hfh_results:
315+
path_id = result.get('path_id', '.')
316+
components = result.get('components', [])
317+
best = [c for c in components if c.get('order') == 1]
318+
if not best:
319+
continue
320+
component = best[0]
321+
versions = component.get('versions', [])
322+
if not versions:
323+
continue
324+
path_components.append((path_id, component, versions[0]))
325+
return path_components
326+
327+
@staticmethod
328+
def _file_matches_path_id(file_path: str, path_id: str) -> bool:
329+
"""
330+
Check if a file path belongs under a given path_id directory.
331+
332+
Both file_path and path_id are relative to the scan root directory.
333+
A path_id of '.' matches all files (root directory).
334+
335+
Args:
336+
file_path (str): Relative file path from the scan root.
337+
path_id (str): Relative directory path from the HFH result.
338+
339+
Returns:
340+
bool: True if the file is under the given path_id directory.
341+
"""
342+
if path_id == '.':
343+
return True
344+
# file_path and path_id are both relative to scan_dir
345+
return file_path == path_id or file_path.startswith(path_id + os.sep)
346+
347+
@staticmethod
348+
def _compute_file_md5(file_path: Path) -> str:
349+
"""
350+
Compute the MD5 hash of a file's contents.
351+
352+
Uses the same approach as the snippet scanner (winnowing.py) to ensure
353+
consistent file_hash values across scan types.
354+
355+
Args:
356+
file_path (Path): Absolute path to the file.
357+
358+
Returns:
359+
str: The MD5 hex digest, or an empty string if the file cannot be read.
360+
"""
361+
try:
362+
return hashlib.md5(file_path.read_bytes()).hexdigest()
363+
except (OSError, IOError):
364+
return ''
365+
366+
@staticmethod
367+
def _build_snippet_entry(component: Dict, best_version: Dict, file_hash: str = '') -> Dict:
368+
"""
369+
Build a snippet-scanner-compatible result entry from an HFH component.
370+
371+
Maps HFH component fields to the standard scan result format. Fields not
372+
available from HFH (file_url, source_hash, url_hash, release_date, licenses)
373+
are included as empty values since downstream validators require them.
374+
375+
Args:
376+
component (Dict): The HFH component with purl, name, vendor fields.
377+
best_version (Dict): The top version entry with version and score fields.
378+
file_hash (str): Pre-computed MD5 hash of the local file.
379+
380+
Returns:
381+
Dict: A result entry compatible with the snippet-scanner JSON format.
382+
"""
383+
purl = component.get('purl', '')
384+
score = best_version.get('score', 0)
385+
version = best_version.get('version', '')
386+
387+
url = purl2url.get_repo_url(purl) if purl else ''
388+
389+
return {
390+
'id': 'file',
391+
'matched': f'{round(score * 100)}%',
392+
'purl': [purl],
393+
'component': component.get('name', ''),
394+
'vendor': component.get('vendor', ''),
395+
'version': version,
396+
'latest': version,
397+
'url': url or '',
398+
'file': '',
399+
'file_hash': file_hash,
400+
'file_url': '',
401+
'source_hash': '',
402+
'url_hash': '',
403+
'release_date': '',
404+
'licenses': [],
405+
'lines': 'all',
406+
'oss_lines': 'all',
407+
'status': 'pending',
408+
}

0 commit comments

Comments
 (0)