feat(hfh): SP-4181 implement raw output format for folder hashing

agustingroh · agustingroh · commit ed33fe38efc7 · 2026-03-24T12:26:18.000-03:00
diff --git a/src/scanoss/cli.py b/src/scanoss/cli.py
@@ -988,7 +988,7 @@ def setup_args() -> None:  # noqa: PLR0912, PLR0915
         '--format',
         '-f',
         type=str,
-        choices=['json', 'cyclonedx'],
+        choices=['json', 'cyclonedx', 'raw'],
         default='json',
         help='Result output format (optional - default: json)',
     )
diff --git a/src/scanoss/scanners/scanner_hfh.py b/src/scanoss/scanners/scanner_hfh.py
@@ -22,10 +22,15 @@
   THE SOFTWARE.
 """
 
+import hashlib
 import json
+import os
 import threading
 import time
-from typing import Dict, Optional
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+from packageurl.contrib import purl2url
 
 from progress.spinner import Spinner
 
@@ -249,4 +254,156 @@ def _format_csv_output(self) -> str:
         raise NotImplementedError('CSV output is not implemented')
 
     def _format_raw_output(self) -> str:
-        raise NotImplementedError('Raw output is not implemented')
+        """
+        Convert HFH scan results into snippet-scanner JSON format.
+
+        Expands directory-level HFH results into per-file entries keyed by
+        relative file path, matching the structure returned by the snippet scanner.
+        Every file is attributed to the deepest path_id that contains it; the root
+        path_id ('.') only receives files that no sub-directory result claimed.
+
+        Returns:
+            str: Indented JSON object mapping each file path to a one-entry result
+                list, or '{}' when there are no usable results.
+        """
+        hfh_results = (self.scanner.scan_results or {}).get('results')
+        if not hfh_results:
+            return '{}'
+
+        # Collect best-match component info per path_id
+        path_components = self._extract_best_components(hfh_results)
+        if not path_components:
+            return '{}'
+
+        # Get all filtered files once (relative paths to scan_dir)
+        all_files = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir)
+
+        def specificity(item: Tuple[str, Dict, Dict]) -> int:
+            # '.' contains no separator, exactly like a top-level directory, yet it
+            # matches every file; rank it below all real directories so it cannot
+            # claim their files first.
+            path_id = item[0]
+            return -1 if path_id == '.' else path_id.count(os.sep)
+
+        # Most specific (deepest) path_ids first so they claim their files first
+        path_components.sort(key=specificity, reverse=True)
+
+        output = {}
+        scan_dir = Path(self.scanner.scan_dir).resolve()
+
+        for path_id, component, best_version in path_components:
+            for file_path in all_files:
+                # A file already present in output was claimed by a more specific path_id
+                if file_path in output or not self._file_matches_path_id(file_path, path_id):
+                    continue
+                # Path.__truediv__ (/) joins paths using the correct OS separator
+                file_hash = self._compute_file_md5(scan_dir / file_path)
+                output[file_path] = [self._build_snippet_entry(component, best_version, file_hash)]
+
+        return json.dumps(output, indent=2)
+
+    @staticmethod
+    def _extract_best_components(hfh_results: List[Dict]) -> List[Tuple[str, Dict, Dict]]:
+        """
+        Extract the best-match component and version for each path_id from HFH results.
+
+        Picks the first component with order == 1 (best match) and takes its first version.
+        Results without a qualifying component or without versions (key missing, null
+        or empty) are skipped. A result without a path_id is attributed to the root ('.').
+
+        Args:
+            hfh_results (List[Dict]): The 'results' list from the HFH API response.
+
+        Returns:
+            List[Tuple[str, Dict, Dict]]: A list of (path_id, component, best_version) tuples,
+                in the same order as the input results.
+        """
+        path_components = []
+        for result in hfh_results or []:
+            # `or []` also covers keys that are present but null in the decoded JSON,
+            # where dict.get's default would not apply
+            components = result.get('components') or []
+            component = next((c for c in components if c.get('order') == 1), None)
+            if component is None:
+                continue
+            versions = component.get('versions') or []
+            if not versions:
+                continue
+            path_components.append((result.get('path_id', '.'), component, versions[0]))
+        return path_components
+
+    @staticmethod
+    def _file_matches_path_id(file_path: str, path_id: str) -> bool:
+        """
+        Check if a file path belongs under a given path_id directory.
+
+        Both file_path and path_id are relative to the scan root directory. They are
+        normalised before being compared, so a trailing separator or a leading './'
+        on either of them does not prevent a match.
+        A path_id of '.' (or an empty path_id) matches all files (root directory).
+
+        Args:
+            file_path (str): Relative file path from the scan root.
+            path_id (str): Relative directory path from the HFH result.
+
+        Returns:
+            bool: True if the file is under the given path_id directory.
+        """
+        directory = os.path.normpath(path_id)
+        if directory == '.':
+            return True
+        candidate = os.path.normpath(file_path)
+        # Compare against directory + separator so 'src' does not match 'src2/a.py'
+        return candidate == directory or candidate.startswith(directory + os.sep)
+
+    @staticmethod
+    def _compute_file_md5(file_path: Path) -> str:
+        """
+        Compute the MD5 hash of a file's contents.
+
+        The file is streamed in 64 KiB chunks so large files are never held in
+        memory as a whole.
+        NOTE(review): the digest is meant to equal the file_hash produced by the
+        snippet scanner (winnowing.py) - confirm that module hashes the raw bytes too.
+
+        Args:
+            file_path (Path): Absolute path to the file.
+
+        Returns:
+            str: The MD5 hex digest, or an empty string if the file cannot be read.
+        """
+        digest = hashlib.md5()
+        try:
+            with file_path.open('rb') as handle:
+                # iter(callable, sentinel) keeps reading until read() returns b'' (EOF)
+                for chunk in iter(lambda: handle.read(65536), b''):
+                    digest.update(chunk)
+        except OSError:  # IOError is just an alias of OSError on Python 3
+            return ''
+        return digest.hexdigest()
+
+    @staticmethod
+    def _build_snippet_entry(component: Dict, best_version: Dict, file_hash: str = '') -> Dict:
+        """
+        Build a snippet-scanner-compatible result entry from an HFH component.
+
+        Maps HFH component fields to the standard scan result format. Fields not
+        available from HFH (file_url, source_hash, url_hash, release_date, licenses)
+        are included as empty values since downstream validators require them.
+        A missing or null score is reported as '0%', and a purl that cannot be
+        turned into a repository URL yields an empty 'url'.
+
+        Args:
+            component (Dict): The HFH component with purl, name, vendor fields.
+            best_version (Dict): The top version entry with version and score fields.
+            file_hash (str): Pre-computed MD5 hash of the local file.
+
+        Returns:
+            Dict: A result entry compatible with the snippet-scanner JSON format.
+        """
+        purl = component.get('purl', '')
+        # `or 0` also covers a score that is present but null, which would
+        # otherwise make the multiplication below raise TypeError
+        score = best_version.get('score') or 0
+        version = best_version.get('version', '')
+
+        url = ''
+        if purl:
+            try:
+                url = purl2url.get_repo_url(purl) or ''
+            except ValueError:
+                # A malformed purl must not abort the whole report; leave url empty
+                url = ''
+
+        return {
+            'id': 'file',
+            'matched': f'{round(score * 100)}%',
+            'purl': [purl],
+            'component': component.get('name', ''),
+            'vendor': component.get('vendor', ''),
+            'version': version,
+            'latest': version,
+            'url': url,
+            'file': '',
+            'file_hash': file_hash,
+            'file_url': '',
+            'source_hash': '',
+            'url_hash': '',
+            'release_date': '',
+            'licenses': [],
+            'lines': 'all',
+            'oss_lines': 'all',
+            'status': 'pending',
+        }

Original file line number	Diff line number	Diff line change
`@@ -988,7 +988,7 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915`
`988`	`988`	`'--format',`
`989`	`989`	`'-f',`
`990`	`990`	`type=str,`
`991`		`- choices=['json', 'cyclonedx'],`
	`991`	`+ choices=['json', 'cyclonedx', 'raw'],`
`992`	`992`	`default='json',`
`993`	`993`	`help='Result output format (optional - default: json)',`
`994`	`994`	`)`