Skip to content

Commit 443176c

Browse files
authored
feat: store multiple repo licenses as array IN-1099 (#4105)
Signed-off-by: Gašper Grom <gasper.grom@gmail.com>
1 parent 0af8dc1 commit 443176c

10 files changed

Lines changed: 73 additions & 45 deletions

File tree

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ALTER TABLE public.repositories DROP COLUMN licenses;
2+
ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255);
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ALTER TABLE public.repositories DROP COLUMN license;
2+
ALTER TABLE public.repositories ADD COLUMN licenses VARCHAR(255)[];

services/apps/git_integration/src/crowdgit/database/crud.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,15 +283,15 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s
283283
return str(result)
284284

285285

286-
async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
286+
async def update_repository_licenses(repository_id: str, licenses: list[str]) -> None:
287287
sql_query = """
288288
UPDATE public.repositories
289-
SET license = $1::varchar,
289+
SET licenses = $1::varchar[],
290290
"updatedAt" = NOW()
291291
WHERE id = $2
292-
AND license IS DISTINCT FROM $1::varchar
292+
AND licenses IS DISTINCT FROM $1::varchar[]
293293
"""
294-
await execute(sql_query, (license_spdx, repository_id))
294+
await execute(sql_query, (licenses, repository_id))
295295

296296

297297
async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState):

services/apps/git_integration/src/crowdgit/services/license/license_service.py

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,59 +11,77 @@
1111
class LicenseService(BaseService):
1212
"""Detects SPDX license from a cloned repository using the licensee gem."""
1313

14-
async def detect(self, repo_path: str) -> str | None:
15-
"""Run licensee against repo_path and return the SPDX identifier, or None."""
14+
async def detect(self, repo_path: str) -> list[str]:
15+
"""Run licensee against repo_path and return a list of SPDX identifiers.
16+
17+
Returns [] when licensee is unavailable, times out, fails, produces unparseable output, or reports no licenses.
18+
Returns ['NOASSERTION'] when licensee reports licenses but none can be kept (below the confidence threshold, already NOASSERTION, or missing an SPDX id).
19+
"""
1620
try:
1721
output = await run_shell_command(
1822
["licensee", "detect", "--json", repo_path], timeout=60
1923
)
2024
except CommandExecutionError:
2125
self.logger.info(f"licensee found no license in {repo_path}")
22-
return None
26+
return []
2327
except CommandTimeoutError as e:
2428
self.logger.warning(f"licensee timed out: {repr(e)}")
25-
return None
29+
return []
2630
except FileNotFoundError as e:
2731
self.logger.warning(f"licensee binary not found in PATH: {repr(e)}")
28-
return None
32+
return []
2933
except Exception as e:
3034
self.logger.warning(f"licensee failed: {repr(e)}")
31-
return None
35+
return []
3236

3337
try:
3438
data = json.loads(output)
3539
licenses = data.get("licenses") or []
3640
matched_files = data.get("matched_files") or []
37-
spdx_id = licenses[0].get("spdx_id") if licenses else None
38-
confidence = (
39-
(matched_files[0].get("matcher") or {}).get("confidence")
40-
if matched_files
41-
else None
42-
)
41+
42+
# Build a map from spdx_id to its best confidence across matched files.
43+
# licensee puts per-file confidence inside each matched_file's matcher object.
44+
confidence_by_spdx: dict[str, float] = {}
45+
for mf in matched_files:
46+
ml = mf.get("matched_license")
47+
# licensee JSON emits matched_license as a plain string (SPDX id), not a dict
48+
spdx = ml if isinstance(ml, str) else ((ml or {}).get("spdx_id") or "")
49+
conf = (mf.get("matcher") or {}).get("confidence")
50+
if spdx and conf is not None:
51+
confidence_by_spdx[spdx] = max(confidence_by_spdx.get(spdx, 0), conf)
4352

4453
# Mirror GitHub's threshold — below LICENSE_CONFIDENCE_THRESHOLD the match is unreliable.
45-
# Downgrade low-confidence matches to NOASSERTION so the distinction is clean:
46-
# NULL = licensee didn't run, timed out, or found no license file
47-
# NOASSERTION = found a license file but couldn't reliably identify it
48-
# The UI should display NOASSERTION as "Other".
49-
if (
50-
spdx_id
51-
and spdx_id != "NOASSERTION"
52-
and confidence is not None
53-
and confidence < LICENSE_CONFIDENCE_THRESHOLD
54-
):
55-
self.logger.info(
56-
f"License downgraded to NOASSERTION: confidence {confidence}% below threshold in {repo_path}"
57-
)
58-
return "NOASSERTION"
54+
# Drop low-confidence entries; if nothing passes, use NOASSERTION:
55+
# [] = licensee didn't run, timed out, or found no license file
56+
# ['NOASSERTION'] = found a license file but couldn't reliably identify it
57+
result: list[str] = []
58+
seen: set[str] = set()
59+
for entry in licenses:
60+
spdx_id = entry.get("spdx_id")
61+
if not spdx_id or spdx_id in seen:
62+
continue
63+
if spdx_id == "NOASSERTION":
64+
continue
65+
confidence = confidence_by_spdx.get(spdx_id)
66+
if confidence is not None and confidence < LICENSE_CONFIDENCE_THRESHOLD:
67+
self.logger.info(
68+
f"License {spdx_id} dropped: confidence {confidence}% below threshold in {repo_path}"
69+
)
70+
continue
71+
result.append(spdx_id)
72+
seen.add(spdx_id)
5973

60-
if spdx_id:
74+
if not result and licenses:
6175
self.logger.info(
62-
f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}"
76+
f"All licenses below threshold, storing NOASSERTION in {repo_path}"
6377
)
78+
return ["NOASSERTION"]
79+
80+
if result:
81+
self.logger.info(f"Licenses detected: {result} in {repo_path}")
6482
else:
6583
self.logger.info(f"No SPDX license matched in {repo_path}")
66-
return spdx_id
84+
return result
6785
except Exception as e:
6886
self.logger.warning(f"Failed to parse licensee output: {repr(e)}")
69-
return None
87+
return []

services/apps/git_integration/src/crowdgit/worker/repository_worker.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
mark_repo_as_processed,
88
release_repo,
99
update_last_processed_commit,
10-
update_repository_license,
10+
update_repository_licenses,
1111
)
1212
from crowdgit.enums import RepositoryState
1313
from crowdgit.errors import (
@@ -242,8 +242,8 @@ async def _process_single_repository(self, repository: Repository):
242242
repository.id, batch_info.repo_path, repository.url
243243
)
244244
await self.maintainer_service.process_maintainers(repository, batch_info)
245-
license_spdx = await self.license_service.detect(batch_info.repo_path)
246-
await update_repository_license(repository.id, license_spdx)
245+
licenses = await self.license_service.detect(batch_info.repo_path)
246+
await update_repository_licenses(repository.id, licenses)
247247
await self.commit_service.process_single_batch_commits(
248248
repository,
249249
batch_info,

services/libs/data-access-layer/src/repositories/index.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ export interface IRepository {
2121
updatedAt: string
2222
deletedAt: string | null
2323
lastArchivedCheckAt: string | null
24-
license: string | null
24+
licenses: string[] | null
2525
}
2626

2727
export interface ICreateRepository {
@@ -150,7 +150,7 @@ export async function getRepositoriesBySourceIntegrationId(
150150
"updatedAt",
151151
"deletedAt",
152152
"lastArchivedCheckAt",
153-
license
153+
licenses
154154
FROM public.repositories
155155
WHERE "sourceIntegrationId" = $(sourceIntegrationId)
156156
AND "deletedAt" IS NULL
@@ -193,7 +193,7 @@ export async function getRepositoriesByUrl(
193193
"updatedAt",
194194
"deletedAt",
195195
"lastArchivedCheckAt",
196-
license
196+
licenses
197197
FROM public.repositories
198198
WHERE url IN ($(repoUrls:csv))
199199
${deletedFilter}

services/libs/tinybird/datasources/insights_projects_populated_ds.datasource

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ SCHEMA >
6565
`communityLanguages` Array(String),
6666
`status` String,
6767
`maturity` LowCardinality(String),
68-
`lastVulnerabilityScanStatus` Nullable(String)
68+
`lastVulnerabilityScanStatus` Nullable(String),
69+
`repoLicenses` Array(Tuple(String, String))
6970

7071
ENGINE MergeTree
7172
ENGINE_PARTITION_KEY toYear(createdAt)

services/libs/tinybird/datasources/repositories.datasource

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ SCHEMA >
3131
`createdAt` DateTime64(3) `json:$.record.createdAt`,
3232
`updatedAt` DateTime64(3) `json:$.record.updatedAt`,
3333
`deletedAt` Nullable(DateTime64(3)) `json:$.record.deletedAt`,
34-
`lastArchivedCheckAt` Nullable(DateTime64(3)) `json:$.record.lastArchivedCheckAt`
34+
`lastArchivedCheckAt` Nullable(DateTime64(3)) `json:$.record.lastArchivedCheckAt`,
35+
`licenses` Array(String) `json:$.record.licenses` DEFAULT []
3536

3637
ENGINE ReplacingMergeTree
3738
ENGINE_PARTITION_KEY toYear(createdAt)

services/libs/tinybird/pipes/insightsProjects_filtered.pipe

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ SQL >
3535
insights_projects_populated_ds.communityLanguages,
3636
insights_projects_populated_ds.status,
3737
insights_projects_populated_ds.maturity,
38-
insights_projects_populated_ds.lastVulnerabilityScanStatus
38+
insights_projects_populated_ds.lastVulnerabilityScanStatus,
39+
insights_projects_populated_ds.repoLicenses
3940
FROM insights_projects_populated_ds
4041
where
4142
insights_projects_populated_ds.enabled = 1

services/libs/tinybird/pipes/insights_projects_populated_copy.pipe

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ SQL >
6666

6767
NODE insights_projects_populated_copy_flatten_projects
6868
SQL >
69-
SELECT i.id, r.url AS repository
69+
SELECT i.id, r.url AS repository, r.licenses AS licenses
7070
FROM insightsProjects i FINAL
7171
JOIN
7272
repositories r FINAL ON r.insightsProjectId = i.id AND isNull (r.deletedAt) AND r.enabled = true
@@ -77,6 +77,7 @@ SQL >
7777
SELECT
7878
insights_projects_populated_copy_flatten_projects.id as id,
7979
insights_projects_populated_copy_flatten_projects.repository as repository,
80+
insights_projects_populated_copy_flatten_projects.licenses as licenses,
8081
insights_projects_populated_copy_criticality_scores_deduplicated.score as score,
8182
insights_projects_populated_copy_criticality_scores_deduplicated.rank as rank
8283
FROM insights_projects_populated_copy_flatten_projects
@@ -92,7 +93,8 @@ SQL >
9293
max(score) as projectScore,
9394
argMax(rank, score) AS projectRank,
9495
groupArray((repository, score, rank)) as repoData,
95-
groupArray(repository) as repositories
96+
groupArray(repository) as repositories,
97+
arrayFlatten(groupArray(arrayMap(l -> tuple(repository, l), licenses))) as repoLicenses
9698
FROM insights_projects_populated_copy_repository_criticality
9799
GROUP BY id
98100

@@ -209,6 +211,7 @@ SQL >
209211
any (insightsProjects.twitter) as twitter,
210212
any (insightsProjects.widgets) as widgets,
211213
any (insights_projects_populated_copy_project_repo_data.repositories) as repositories,
214+
any (insights_projects_populated_copy_project_repo_data.repoLicenses) as repoLicenses,
212215
any (insightsProjects.enabled) as enabled,
213216
any (insightsProjects.isLF) as isLF,
214217
any (insightsProjects.keywords) as keywords,

0 commit comments

Comments
 (0)