|
11 | 11 | class LicenseService(BaseService): |
12 | 12 | """Detects SPDX license from a cloned repository using the licensee gem.""" |
13 | 13 |
|
14 | | - async def detect(self, repo_path: str) -> str | None: |
15 | | - """Run licensee against repo_path and return the SPDX identifier, or None.""" |
| 14 | + async def detect(self, repo_path: str) -> list[str]: |
| 15 | + """Run licensee against repo_path and return a list of SPDX identifiers. |
| 16 | +
|
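| 17 | + Returns [] when licensee is unavailable, fails, or times out, when its output cannot be parsed, or when it reports no licenses. |
| 18 | + Returns ['NOASSERTION'] when licenses are reported but none can be kept: every entry is below the confidence threshold, is already NOASSERTION, or has no SPDX id. |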
| 19 | + """ |
16 | 20 | try: |
17 | 21 | output = await run_shell_command( |
18 | 22 | ["licensee", "detect", "--json", repo_path], timeout=60 |
19 | 23 | ) |
20 | 24 | except CommandExecutionError: |
21 | 25 | self.logger.info(f"licensee found no license in {repo_path}") |
22 | | - return None |
| 26 | + return [] |
23 | 27 | except CommandTimeoutError as e: |
24 | 28 | self.logger.warning(f"licensee timed out: {repr(e)}") |
25 | | - return None |
| 29 | + return [] |
26 | 30 | except FileNotFoundError as e: |
27 | 31 | self.logger.warning(f"licensee binary not found in PATH: {repr(e)}") |
28 | | - return None |
| 32 | + return [] |
29 | 33 | except Exception as e: |
30 | 34 | self.logger.warning(f"licensee failed: {repr(e)}") |
31 | | - return None |
| 35 | + return [] |
32 | 36 |
|
33 | 37 | try: |
34 | 38 | data = json.loads(output) |
35 | 39 | licenses = data.get("licenses") or [] |
36 | 40 | matched_files = data.get("matched_files") or [] |
37 | | - spdx_id = licenses[0].get("spdx_id") if licenses else None |
38 | | - confidence = ( |
39 | | - (matched_files[0].get("matcher") or {}).get("confidence") |
40 | | - if matched_files |
41 | | - else None |
42 | | - ) |
| 41 | + |
| 42 | + # Build a map from spdx_id to its best confidence across matched files. |
| 43 | + # licensee puts per-file confidence inside each matched_file's matcher object. |
| 44 | + confidence_by_spdx: dict[str, float] = {} |
| 45 | + for mf in matched_files: |
| 46 | + ml = mf.get("matched_license") |
| 47 | + # licensee JSON emits matched_license as a plain string (SPDX id), not a dict |
| 48 | + spdx = ml if isinstance(ml, str) else ((ml or {}).get("spdx_id") or "") |
| 49 | + conf = (mf.get("matcher") or {}).get("confidence") |
| 50 | + if spdx and conf is not None: |
| 51 | + confidence_by_spdx[spdx] = max(confidence_by_spdx.get(spdx, 0), conf) |
43 | 52 |
|
44 | 53 | # Mirror GitHub's threshold — below LICENSE_CONFIDENCE_THRESHOLD the match is unreliable. |
45 | | - # Downgrade low-confidence matches to NOASSERTION so the distinction is clean: |
46 | | - # NULL = licensee didn't run, timed out, or found no license file |
47 | | - # NOASSERTION = found a license file but couldn't reliably identify it |
48 | | - # The UI should display NOASSERTION as "Other". |
49 | | - if ( |
50 | | - spdx_id |
51 | | - and spdx_id != "NOASSERTION" |
52 | | - and confidence is not None |
53 | | - and confidence < LICENSE_CONFIDENCE_THRESHOLD |
54 | | - ): |
55 | | - self.logger.info( |
56 | | - f"License downgraded to NOASSERTION: confidence {confidence}% below threshold in {repo_path}" |
57 | | - ) |
58 | | - return "NOASSERTION" |
| 54 | + # Drop low-confidence entries; if nothing passes, use NOASSERTION: |
| 55 | + # [] = licensee didn't run, timed out, or found no license file |
| 56 | + # ['NOASSERTION'] = found a license file but couldn't reliably identify it |
| 57 | + result: list[str] = [] |
| 58 | + seen: set[str] = set() |
| 59 | + for entry in licenses: |
| 60 | + spdx_id = entry.get("spdx_id") |
| 61 | + if not spdx_id or spdx_id in seen: |
| 62 | + continue |
| 63 | + if spdx_id == "NOASSERTION": |
| 64 | + continue |
| 65 | + confidence = confidence_by_spdx.get(spdx_id) |
| 66 | + if confidence is not None and confidence < LICENSE_CONFIDENCE_THRESHOLD: |
| 67 | + self.logger.info( |
| 68 | + f"License {spdx_id} dropped: confidence {confidence}% below threshold in {repo_path}" |
| 69 | + ) |
| 70 | + continue |
| 71 | + result.append(spdx_id) |
| 72 | + seen.add(spdx_id) |
59 | 73 |
|
60 | | - if spdx_id: |
| 74 | + if not result and licenses: |
61 | 75 | self.logger.info( |
62 | | - f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}" |
| 76 | + f"All licenses below threshold, storing NOASSERTION in {repo_path}" |
63 | 77 | ) |
| 78 | + return ["NOASSERTION"] |
| 79 | + |
| 80 | + if result: |
| 81 | + self.logger.info(f"Licenses detected: {result} in {repo_path}") |
64 | 82 | else: |
65 | 83 | self.logger.info(f"No SPDX license matched in {repo_path}") |
66 | | - return spdx_id |
| 84 | + return result |
67 | 85 | except Exception as e: |
68 | 86 | self.logger.warning(f"Failed to parse licensee output: {repr(e)}") |
69 | | - return None |
| 87 | + return [] |
0 commit comments