|
22 | 22 | THE SOFTWARE. |
23 | 23 | """ |
24 | 24 |
|
| 25 | +import hashlib |
25 | 26 | import json |
| 27 | +import os |
26 | 28 | import threading |
27 | 29 | import time |
28 | | -from typing import Dict, Optional |
| 30 | +from pathlib import Path |
| 31 | +from typing import Dict, List, Optional, Tuple |
29 | 32 |
|
| 33 | +from packageurl.contrib import purl2url |
30 | 34 | from progress.spinner import Spinner |
31 | 35 |
|
32 | 36 | from scanoss.constants import ( |
@@ -163,6 +167,13 @@ class ScannerHFHPresenter(AbstractPresenter): |
163 | 167 | """ |
164 | 168 |
|
def __init__(self, scanner: ScannerHFH, **kwargs):
    """
    Create a presenter bound to an HFH scanner.

    Args:
        scanner (ScannerHFH): Scanner whose state (scan results, file filters,
            scan directory, client) is read when output is formatted.
        **kwargs: Forwarded unchanged to the parent presenter's initializer.
    """
    super().__init__(**kwargs)
    # Keep a reference only; nothing is read from the scanner until formatting time.
    self.scanner = scanner
168 | 179 |
|
@@ -249,4 +260,176 @@ def _format_csv_output(self) -> str: |
249 | 260 | raise NotImplementedError('CSV output is not implemented') |
250 | 261 |
|
251 | 262 | def _format_raw_output(self) -> str: |
252 | | - raise NotImplementedError('Raw output is not implemented') |
| 263 | + """ |
| 264 | + Convert HFH scan results into snippet-scanner JSON format. |
| 265 | +
|
| 266 | + Expands directory-level HFH results into per-file entries keyed by |
| 267 | + relative file path, matching the structure returned by the snippet scanner. |
| 268 | + For each file, computes the MD5 hash and constructs the file_url using |
| 269 | + the API base URL from the scanner config. |
| 270 | +
|
| 271 | + Returns: |
| 272 | + str: A JSON string with the snippet-scanner format, or '{}' if no results. |
| 273 | + """ |
| 274 | + if not self.scanner.scan_results or 'results' not in self.scanner.scan_results: |
| 275 | + return '{}' |
| 276 | + |
| 277 | + hfh_results = self.scanner.scan_results.get('results', []) |
| 278 | + if not hfh_results: |
| 279 | + return '{}' |
| 280 | + |
| 281 | + # Collect best-match component info per path_id |
| 282 | + path_components = self._extract_best_components(hfh_results) |
| 283 | + if not path_components: |
| 284 | + return '{}' |
| 285 | + |
| 286 | + # Get all filtered files once (relative paths to scan_dir) |
| 287 | + all_files = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir) |
| 288 | + |
| 289 | + # Sort path_ids by depth (deepest first) so most-specific match wins. |
| 290 | + # Root path '.' is always last (-1), others sort by separator count then path length. |
| 291 | + # Example with path_ids: ['.', 'external', 'project-1.0', 'project-1.0/src/lib'] |
| 292 | + # Sorted result: ['project-1.0/src/lib', 'project-1.0', 'external', '.'] |
| 293 | + # - 'project-1.0/src/lib' (depth 2) claims its files first |
| 294 | + # - 'project-1.0' (depth 0, len 11) claims remaining files under it |
| 295 | + # - 'external' (depth 0, len 8) claims external/ files |
| 296 | + # - '.' (root, always last) picks up everything else |
| 297 | + sorted_path_ids = sorted( |
| 298 | + path_components.keys(), |
| 299 | + key=lambda p: (-1, 0) if p == '.' else (p.count(os.sep), len(p)), |
| 300 | + reverse=True, |
| 301 | + ) |
| 302 | + |
| 303 | + output = {} |
| 304 | + claimed_files = set() |
| 305 | + scan_dir = Path(self.scanner.scan_dir).resolve() |
| 306 | + |
| 307 | + for path_id in sorted_path_ids: |
| 308 | + component, best_version = path_components[path_id] |
| 309 | + for file_path in all_files: |
| 310 | + if file_path in claimed_files: |
| 311 | + continue |
| 312 | + if not self._file_matches_path_id(file_path, path_id): |
| 313 | + continue |
| 314 | + |
| 315 | + claimed_files.add(file_path) |
| 316 | + # Path.__truediv__ (/) joins paths using the correct OS separator |
| 317 | + file_hash = self._compute_file_md5(scan_dir / file_path) |
| 318 | + api_url = self.scanner.client.orig_url or '' |
| 319 | + entry = self._build_file_match_entry(component, best_version, file_path, file_hash, api_url) |
| 320 | + output[file_path] = [entry] |
| 321 | + |
| 322 | + return json.dumps(output, indent=2) |
| 323 | + |
| 324 | + @staticmethod |
| 325 | + def _extract_best_components(hfh_results: List[Dict]) -> Dict[str, Tuple[Dict, Dict]]: |
| 326 | + """ |
| 327 | + Extract the best-match component and version for each path_id from HFH results. |
| 328 | +
|
| 329 | + Filters for components with order == 1 (best match) and takes their first version. |
| 330 | + Results without a qualifying component or without versions are skipped. |
| 331 | +
|
| 332 | + Args: |
| 333 | + hfh_results (List[Dict]): The 'results' list from the HFH API response. |
| 334 | +
|
| 335 | + Returns: |
| 336 | + Dict[str, Tuple[Dict, Dict]]: A dict mapping path_id to (component, best_version). |
| 337 | + """ |
| 338 | + path_components = {} |
| 339 | + for result in hfh_results: |
| 340 | + path_id = result.get('path_id', '.') |
| 341 | + components = result.get('components', []) |
| 342 | + best = [c for c in components if c.get('order') == 1] |
| 343 | + if not best: |
| 344 | + continue |
| 345 | + component = best[0] |
| 346 | + versions = component.get('versions', []) |
| 347 | + if not versions: |
| 348 | + continue |
| 349 | + path_components[path_id] = (component, versions[0]) |
| 350 | + return path_components |
| 351 | + |
| 352 | + @staticmethod |
| 353 | + def _file_matches_path_id(file_path: str, path_id: str) -> bool: |
| 354 | + """ |
| 355 | + Check if a file path belongs under a given path_id directory. |
| 356 | +
|
| 357 | + Both file_path and path_id are relative to the scan root directory. |
| 358 | + A path_id of '.' matches all files (root directory). |
| 359 | +
|
| 360 | + Args: |
| 361 | + file_path (str): Relative file path from the scan root. |
| 362 | + path_id (str): Relative directory path from the HFH result. |
| 363 | +
|
| 364 | + Returns: |
| 365 | + bool: True if the file is under the given path_id directory. |
| 366 | + """ |
| 367 | + if path_id == '.': |
| 368 | + return True |
| 369 | + # file_path and path_id are both relative to scan_dir |
| 370 | + return file_path == path_id or file_path.startswith(path_id + os.sep) |
| 371 | + |
| 372 | + def _compute_file_md5(self, file_path: Path) -> str: |
| 373 | + """ |
| 374 | + Compute the MD5 hash of a file's contents. |
| 375 | +
|
| 376 | + Uses the same approach as the snippet scanner (winnowing.py) to ensure |
| 377 | + consistent file_hash values across scan types. |
| 378 | +
|
| 379 | + Args: |
| 380 | + file_path (Path): Absolute path to the file. |
| 381 | +
|
| 382 | + Returns: |
| 383 | + str: The MD5 hex digest, or an empty string if the file cannot be read. |
| 384 | + """ |
| 385 | + try: |
| 386 | + return hashlib.md5(file_path.read_bytes()).hexdigest() |
| 387 | + except (OSError, IOError) as e: |
| 388 | + self.base.print_stderr(f'Warning: Failed to compute MD5 for {file_path}: {e}') |
| 389 | + return '' |
| 390 | + |
| 391 | + @staticmethod |
| 392 | + def _build_file_match_entry( |
| 393 | + component: Dict, best_version: Dict, file_path: str, file_hash: str, base_url: str, |
| 394 | + ) -> Dict: |
| 395 | + """ |
| 396 | + Build a snippet-scanner-compatible result entry from an HFH component. |
| 397 | +
|
| 398 | + Maps HFH component fields to the standard scan result format. Fields not |
| 399 | + available from HFH (url_hash, release_date, licenses) are included as empty |
| 400 | + values since downstream validators require them. |
| 401 | +
|
| 402 | + Args: |
| 403 | + component (Dict): The HFH component with purl, name, vendor fields. |
| 404 | + best_version (Dict): The top version entry with version and score fields. |
| 405 | + file_path (str): Relative file path from the scan root directory. |
| 406 | + file_hash (str): Pre-computed MD5 hash of the local file. |
| 407 | + base_url (str): API base URL used to construct the file_url field. |
| 408 | +
|
| 409 | + Returns: |
| 410 | + Dict: A result entry compatible with the snippet-scanner JSON format. |
| 411 | + """ |
| 412 | + purl = component.get('purl', '') |
| 413 | + version = best_version.get('version', '') |
| 414 | + |
| 415 | + url = purl2url.get_repo_url(purl) if purl else '' |
| 416 | + return { |
| 417 | + 'id': 'file', |
| 418 | + 'matched': '100%', |
| 419 | + 'purl': [purl], |
| 420 | + 'component': component.get('name', ''), |
| 421 | + 'vendor': component.get('vendor', ''), |
| 422 | + 'version': version, |
| 423 | + 'latest': version, |
| 424 | + 'url': url or '', |
| 425 | + 'file': file_path, |
| 426 | + 'file_hash': file_hash, |
| 427 | + 'file_url': f'{base_url}/file_contents/{file_hash}', |
| 428 | + 'source_hash': file_hash, |
| 429 | + 'url_hash': '', |
| 430 | + 'release_date': '', |
| 431 | + 'licenses': [], |
| 432 | + 'lines': 'all', |
| 433 | + 'oss_lines': 'all', |
| 434 | + 'status': 'pending', |
| 435 | + } |
0 commit comments