|
22 | 22 | THE SOFTWARE. |
23 | 23 | """ |
24 | 24 |
|
| 25 | +import hashlib |
25 | 26 | import json |
| 27 | +import os |
26 | 28 | import threading |
27 | 29 | import time |
28 | | -from typing import Dict, Optional |
| 30 | +from pathlib import Path |
| 31 | +from typing import Dict, List, Optional, Tuple |
| 32 | + |
| 33 | +from packageurl.contrib import purl2url |
29 | 34 |
|
30 | 35 | from progress.spinner import Spinner |
31 | 36 |
|
@@ -249,4 +254,156 @@ def _format_csv_output(self) -> str: |
249 | 254 | raise NotImplementedError('CSV output is not implemented') |
250 | 255 |
|
251 | 256 | def _format_raw_output(self) -> str: |
252 | | - raise NotImplementedError('Raw output is not implemented') |
| 257 | + """ |
| 258 | + Convert HFH scan results into snippet-scanner JSON format. |
| 259 | +
|
| 260 | + Expands directory-level HFH results into per-file entries keyed by |
| 261 | + relative file path, matching the structure returned by the snippet scanner. |
| 262 | + """ |
| 263 | + if not self.scanner.scan_results or 'results' not in self.scanner.scan_results: |
| 264 | + return '{}' |
| 265 | + |
| 266 | + hfh_results = self.scanner.scan_results.get('results', []) |
| 267 | + if not hfh_results: |
| 268 | + return '{}' |
| 269 | + |
| 270 | + # Collect best-match component info per path_id |
| 271 | + path_components = self._extract_best_components(hfh_results) |
| 272 | + if not path_components: |
| 273 | + return '{}' |
| 274 | + |
| 275 | + # Get all filtered files once (relative paths to scan_dir) |
| 276 | + all_files = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir) |
| 277 | + |
| 278 | + # Sort path_ids by depth (deepest first) so most-specific match wins |
| 279 | + path_components.sort(key=lambda x: x[0].count(os.sep), reverse=True) |
| 280 | + |
| 281 | + output = {} |
| 282 | + claimed_files = set() |
| 283 | + scan_dir = Path(self.scanner.scan_dir).resolve() |
| 284 | + |
| 285 | + for path_id, component, best_version in path_components: |
| 286 | + for file_path in all_files: |
| 287 | + if file_path in claimed_files: |
| 288 | + continue |
| 289 | + if not self._file_matches_path_id(file_path, path_id): |
| 290 | + continue |
| 291 | + |
| 292 | + claimed_files.add(file_path) |
| 293 | + # Path.__truediv__ (/) joins paths using the correct OS separator |
| 294 | + file_hash = self._compute_file_md5(scan_dir / file_path) |
| 295 | + entry = self._build_snippet_entry(component, best_version, file_hash) |
| 296 | + output[file_path] = [entry] |
| 297 | + |
| 298 | + return json.dumps(output, indent=2) |
| 299 | + |
| 300 | + @staticmethod |
| 301 | + def _extract_best_components(hfh_results: List[Dict]) -> List[Tuple[str, Dict, Dict]]: |
| 302 | + """ |
| 303 | + Extract the best-match component and version for each path_id from HFH results. |
| 304 | +
|
| 305 | + Filters for components with order == 1 (best match) and takes their first version. |
| 306 | + Results without a qualifying component or without versions are skipped. |
| 307 | +
|
| 308 | + Args: |
| 309 | + hfh_results (List[Dict]): The 'results' list from the HFH API response. |
| 310 | +
|
| 311 | + Returns: |
| 312 | + List[Tuple[str, Dict, Dict]]: A list of (path_id, component, best_version) tuples. |
| 313 | + """ |
| 314 | + path_components = [] |
| 315 | + for result in hfh_results: |
| 316 | + path_id = result.get('path_id', '.') |
| 317 | + components = result.get('components', []) |
| 318 | + best = [c for c in components if c.get('order') == 1] |
| 319 | + if not best: |
| 320 | + continue |
| 321 | + component = best[0] |
| 322 | + versions = component.get('versions', []) |
| 323 | + if not versions: |
| 324 | + continue |
| 325 | + path_components.append((path_id, component, versions[0])) |
| 326 | + return path_components |
| 327 | + |
| 328 | + @staticmethod |
| 329 | + def _file_matches_path_id(file_path: str, path_id: str) -> bool: |
| 330 | + """ |
| 331 | + Check if a file path belongs under a given path_id directory. |
| 332 | +
|
| 333 | + Both file_path and path_id are relative to the scan root directory. |
| 334 | + A path_id of '.' matches all files (root directory). |
| 335 | +
|
| 336 | + Args: |
| 337 | + file_path (str): Relative file path from the scan root. |
| 338 | + path_id (str): Relative directory path from the HFH result. |
| 339 | +
|
| 340 | + Returns: |
| 341 | + bool: True if the file is under the given path_id directory. |
| 342 | + """ |
| 343 | + if path_id == '.': |
| 344 | + return True |
| 345 | + # file_path and path_id are both relative to scan_dir |
| 346 | + return file_path == path_id or file_path.startswith(path_id + os.sep) |
| 347 | + |
| 348 | + @staticmethod |
| 349 | + def _compute_file_md5(file_path: Path) -> str: |
| 350 | + """ |
| 351 | + Compute the MD5 hash of a file's contents. |
| 352 | +
|
| 353 | + Uses the same approach as the snippet scanner (winnowing.py) to ensure |
| 354 | + consistent file_hash values across scan types. |
| 355 | +
|
| 356 | + Args: |
| 357 | + file_path (Path): Absolute path to the file. |
| 358 | +
|
| 359 | + Returns: |
| 360 | + str: The MD5 hex digest, or an empty string if the file cannot be read. |
| 361 | + """ |
| 362 | + try: |
| 363 | + return hashlib.md5(file_path.read_bytes()).hexdigest() |
| 364 | + except (OSError, IOError): |
| 365 | + return '' |
| 366 | + |
| 367 | + @staticmethod |
| 368 | + def _build_snippet_entry(component: Dict, best_version: Dict, file_hash: str = '') -> Dict: |
| 369 | + """ |
| 370 | + Build a snippet-scanner-compatible result entry from an HFH component. |
| 371 | +
|
| 372 | + Maps HFH component fields to the standard scan result format. Fields not |
| 373 | + available from HFH (file_url, source_hash, url_hash, release_date, licenses) |
| 374 | + are included as empty values since downstream validators require them. |
| 375 | +
|
| 376 | + Args: |
| 377 | + component (Dict): The HFH component with purl, name, vendor fields. |
| 378 | + best_version (Dict): The top version entry with version and score fields. |
| 379 | + file_hash (str): Pre-computed MD5 hash of the local file. |
| 380 | +
|
| 381 | + Returns: |
| 382 | + Dict: A result entry compatible with the snippet-scanner JSON format. |
| 383 | + """ |
| 384 | + purl = component.get('purl', '') |
| 385 | + score = best_version.get('score', 0) |
| 386 | + version = best_version.get('version', '') |
| 387 | + |
| 388 | + url = purl2url.get_repo_url(purl) if purl else '' |
| 389 | + |
| 390 | + return { |
| 391 | + 'id': 'file', |
| 392 | + 'matched': f'{round(score * 100)}%', |
| 393 | + 'purl': [purl], |
| 394 | + 'component': component.get('name', ''), |
| 395 | + 'vendor': component.get('vendor', ''), |
| 396 | + 'version': version, |
| 397 | + 'latest': version, |
| 398 | + 'url': url or '', |
| 399 | + 'file': '', |
| 400 | + 'file_hash': file_hash, |
| 401 | + 'file_url': '', |
| 402 | + 'source_hash': '', |
| 403 | + 'url_hash': '', |
| 404 | + 'release_date': '', |
| 405 | + 'licenses': [], |
| 406 | + 'lines': 'all', |
| 407 | + 'oss_lines': 'all', |
| 408 | + 'status': 'pending', |
| 409 | + } |
0 commit comments