Skip to content

Commit 0f11470

Browse files
committed
feat: detect and store repo license via licensee IN-1105
Signed-off-by: Gašper Grom <gasper.grom@gmail.com>
1 parent 68aeb10 commit 0f11470

9 files changed

Lines changed: 73 additions & 3 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Removes the license column from public.repositories.
-- NOTE(review): presumably the rollback of the migration that adds the column; confirm against the migration file names.
ALTER TABLE public.repositories DROP COLUMN license;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Adds a nullable license column (at most 255 characters) to public.repositories.
-- The git integration writes the detected SPDX identifier into this column.
ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255);

scripts/services/docker/Dockerfile.git_integration

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,13 @@ RUN apt-get update && apt-get install -y \
8383
ca-certificates \
8484
git \
8585
ripgrep \
86+
ruby \
87+
ruby-dev \
88+
build-essential \
8689
--no-install-recommends \
87-
&& rm -rf /var/lib/apt/lists/* \
88-
&& apt-get clean \
89-
&& apt-get autoremove -y
90+
&& gem install licensee -v '10.0.0' --no-document \
91+
&& apt-get remove --autoremove -y ruby-dev build-essential \
92+
&& rm -rf /var/lib/apt/lists/*
9093

9194
ENV PYTHONUNBUFFERED=1 \
9295
PYTHONDONTWRITEBYTECODE=1 \

services/apps/git_integration/src/crowdgit/database/crud.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,17 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s
283283
return str(result)
284284

285285

286+
async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
287+
sql_query = """
288+
UPDATE public.repositories
289+
SET license = $1,
290+
"updatedAt" = NOW()
291+
WHERE id = $2
292+
AND ($1 IS NOT NULL OR license IS NULL)
293+
"""
294+
await execute(sql_query, (license_spdx, repository_id))
295+
296+
286297
async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState):
287298
sql_query = """
288299
UPDATE git."repositoryProcessing"

services/apps/git_integration/src/crowdgit/server.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from crowdgit.services import (
99
CloneService,
1010
CommitService,
11+
LicenseService,
1112
MaintainerService,
1213
QueueService,
1314
SoftwareValueService,
@@ -28,6 +29,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
2829
software_value_service = SoftwareValueService()
2930
vulnerability_scanner_service = VulnerabilityScannerService()
3031
maintainer_service = MaintainerService()
32+
license_service = LicenseService()
3133

3234
worker_task = None
3335
worker = RepositoryWorker(
@@ -36,6 +38,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
3638
software_value_service=software_value_service,
3739
vulnerability_scanner_service=vulnerability_scanner_service,
3840
maintainer_service=maintainer_service,
41+
license_service=license_service,
3942
queue_service=queue_service,
4043
)
4144
logger.info("Repo worker initialized")

services/apps/git_integration/src/crowdgit/services/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from crowdgit.services.base.base_service import BaseService
22
from crowdgit.services.clone.clone_service import CloneService
33
from crowdgit.services.commit.commit_service import CommitService
4+
from crowdgit.services.license.license_service import LicenseService
45
from crowdgit.services.maintainer.maintainer_service import MaintainerService
56
from crowdgit.services.queue.queue_service import QueueService
67
from crowdgit.services.software_value.software_value_service import SoftwareValueService
@@ -12,6 +13,7 @@
1213
"BaseService",
1314
"CloneService",
1415
"CommitService",
16+
"LicenseService",
1517
"SoftwareValueService",
1618
"VulnerabilityScannerService",
1719
"MaintainerService",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from crowdgit.services.license.license_service import LicenseService
2+
3+
__all__ = ["LicenseService"]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import json
2+
3+
from crowdgit.errors import CommandExecutionError, CommandTimeoutError
4+
from crowdgit.services.base.base_service import BaseService
5+
from crowdgit.services.utils import run_shell_command
6+
7+
8+
class LicenseService(BaseService):
    """Detects SPDX license from a cloned repository using the licensee gem."""

    async def detect(self, repo_path: str) -> str | None:
        """Run licensee against repo_path and return the SPDX identifier, or None.

        Detection is best-effort: any failure to run ``licensee`` or to
        interpret its output is logged and reported as ``None`` rather than
        raised, so a licensing problem never aborts repository processing.

        Args:
            repo_path: Filesystem path of the cloned repository to inspect.

        Returns:
            The SPDX identifier reported by licensee, or ``None`` when no
            single license could be determined.
        """
        try:
            output = await run_shell_command(["licensee", "detect", "--json", repo_path])
        except CommandExecutionError:
            self.logger.info(f"licensee found no license in {repo_path}")
            return None
        except CommandTimeoutError as e:
            self.logger.warning(f"licensee timed out: {repr(e)}")
            return None
        except FileNotFoundError as e:
            self.logger.warning(f"licensee binary not found in PATH: {repr(e)}")
            return None
        except Exception as e:
            self.logger.warning(f"licensee failed: {repr(e)}")
            return None

        try:
            data = json.loads(output)
            spdx_id, confidence = self._extract_match(data)
            if spdx_id:
                self.logger.info(f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}")
            else:
                self.logger.info(f"No SPDX license matched in {repo_path}")
            return spdx_id
        except Exception as e:
            self.logger.warning(f"Failed to parse licensee output: {repr(e)}")
            return None

    @staticmethod
    def _extract_match(data: object) -> tuple[str | None, object]:
        """Return ``(spdx_id, confidence)`` extracted from parsed licensee JSON.

        Two layouts are understood:

        * a top-level ``matched_license`` object carrying ``spdx_id`` and
          ``confidence`` (the layout this service originally read), and
        * top-level ``licenses`` / ``matched_files`` lists, where each matched
          file names its license in ``matched_license`` and its score in
          ``matcher.confidence``.

        NOTE(review): the second layout is believed to be what
        ``licensee detect --json`` emits; confirm against the pinned gem version.

        With the list layout a license is reported only when exactly one was
        detected, so an ambiguous (multi-license) project yields ``None``.
        """
        if not isinstance(data, dict):
            return None, None

        matched = data.get("matched_license")
        if isinstance(matched, dict):
            legacy_id = matched.get("spdx_id")
            if isinstance(legacy_id, str) and legacy_id:
                return legacy_id, matched.get("confidence")

        licenses = data.get("licenses")
        if not isinstance(licenses, list) or len(licenses) != 1:
            return None, None
        only_license = licenses[0]
        if not isinstance(only_license, dict):
            return None, None
        spdx_id = only_license.get("spdx_id")
        if not isinstance(spdx_id, str) or not spdx_id:
            return None, None

        # Confidence lives on the file that produced the match, not on the license.
        confidence = None
        matched_files = data.get("matched_files")
        if isinstance(matched_files, list):
            for matched_file in matched_files:
                if not isinstance(matched_file, dict):
                    continue
                if matched_file.get("matched_license") != spdx_id:
                    continue
                matcher = matched_file.get("matcher")
                if isinstance(matcher, dict):
                    confidence = matcher.get("confidence")
                    break
        return spdx_id, confidence

services/apps/git_integration/src/crowdgit/worker/repository_worker.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
mark_repo_as_processed,
88
release_repo,
99
update_last_processed_commit,
10+
update_repository_license,
1011
)
1112
from crowdgit.enums import RepositoryState
1213
from crowdgit.errors import (
@@ -22,6 +23,7 @@
2223
from crowdgit.services import (
2324
CloneService,
2425
CommitService,
26+
LicenseService,
2527
MaintainerService,
2628
QueueService,
2729
SoftwareValueService,
@@ -46,13 +48,15 @@ def __init__(
4648
software_value_service: SoftwareValueService,
4749
vulnerability_scanner_service: VulnerabilityScannerService,
4850
maintainer_service: MaintainerService,
51+
license_service: LicenseService,
4952
queue_service: QueueService,
5053
):
5154
self.clone_service = clone_service
5255
self.commit_service = commit_service
5356
self.software_value_service = software_value_service
5457
self.vulnerability_scanner_service = vulnerability_scanner_service
5558
self.maintainer_service = maintainer_service
59+
self.license_service = license_service
5660
self.queue_service = queue_service
5761
self._shutdown = False
5862

@@ -236,6 +240,8 @@ async def _process_single_repository(self, repository: Repository):
236240
repository.id, batch_info.repo_path, repository.url
237241
)
238242
await self.maintainer_service.process_maintainers(repository, batch_info)
243+
license_spdx = await self.license_service.detect(batch_info.repo_path)
244+
await update_repository_license(repository.id, license_spdx)
239245
await self.commit_service.process_single_batch_commits(
240246
repository,
241247
batch_info,

0 commit comments

Comments
 (0)