Skip to content

Commit d244ab4

Browse files
authored
Merge branch 'main' into chore/add-type-to-identities-public-api
2 parents 0e79e6f + 2537232 commit d244ab4

10 files changed

Lines changed: 93 additions & 5 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
ALTER TABLE public.repositories DROP COLUMN license;
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255);

scripts/services/docker/Dockerfile.git_integration

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -83,10 +83,17 @@ RUN apt-get update && apt-get install -y \
8383
ca-certificates \
8484
git \
8585
ripgrep \
86+
ruby \
87+
libgit2-1.1 \
88+
ruby-dev \
89+
build-essential \
90+
libgit2-dev \
91+
cmake \
92+
pkg-config \
8693
--no-install-recommends \
87-
&& rm -rf /var/lib/apt/lists/* \
88-
&& apt-get clean \
89-
&& apt-get autoremove -y
94+
&& gem install licensee -v '9.15.3' --no-document \
95+
&& apt-get remove --autoremove -y ruby-dev build-essential libgit2-dev cmake pkg-config \
96+
&& rm -rf /var/lib/apt/lists/*
9097

9198
ENV PYTHONUNBUFFERED=1 \
9299
PYTHONDONTWRITEBYTECODE=1 \

services/apps/git_integration/src/crowdgit/database/crud.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -283,6 +283,17 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s
283283
return str(result)
284284

285285

286+
async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
    """Store the detected license identifier on a repository row.

    Writes ``license_spdx`` into ``public.repositories.license`` for the row
    whose ``id`` equals ``repository_id`` and bumps ``"updatedAt"``. The
    ``IS DISTINCT FROM`` predicate makes the statement match no row when the
    stored value already equals the new one (NULL compared NULL-safely), so an
    unchanged license does not touch ``"updatedAt"``.

    Args:
        repository_id: Primary key of the repository to update.
        license_spdx: Identifier to store; ``None`` writes NULL.
    """
    # $1 is the license value (used in both SET and WHERE), $2 the repository id.
    params = (license_spdx, repository_id)
    statement = """
    UPDATE public.repositories
    SET license = $1::varchar,
        "updatedAt" = NOW()
    WHERE id = $2
      AND license IS DISTINCT FROM $1::varchar
    """
    await execute(statement, params)
295+
296+
286297
async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState):
287298
sql_query = """
288299
UPDATE git."repositoryProcessing"

services/apps/git_integration/src/crowdgit/server.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from crowdgit.services import (
99
CloneService,
1010
CommitService,
11+
LicenseService,
1112
MaintainerService,
1213
QueueService,
1314
SoftwareValueService,
@@ -28,6 +29,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
2829
software_value_service = SoftwareValueService()
2930
vulnerability_scanner_service = VulnerabilityScannerService()
3031
maintainer_service = MaintainerService()
32+
license_service = LicenseService()
3133

3234
worker_task = None
3335
worker = RepositoryWorker(
@@ -36,6 +38,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
3638
software_value_service=software_value_service,
3739
vulnerability_scanner_service=vulnerability_scanner_service,
3840
maintainer_service=maintainer_service,
41+
license_service=license_service,
3942
queue_service=queue_service,
4043
)
4144
logger.info("Repo worker initialized")

services/apps/git_integration/src/crowdgit/services/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from crowdgit.services.base.base_service import BaseService
22
from crowdgit.services.clone.clone_service import CloneService
33
from crowdgit.services.commit.commit_service import CommitService
4+
from crowdgit.services.license.license_service import LicenseService
45
from crowdgit.services.maintainer.maintainer_service import MaintainerService
56
from crowdgit.services.queue.queue_service import QueueService
67
from crowdgit.services.software_value.software_value_service import SoftwareValueService
@@ -12,6 +13,7 @@
1213
"BaseService",
1314
"CloneService",
1415
"CommitService",
16+
"LicenseService",
1517
"SoftwareValueService",
1618
"VulnerabilityScannerService",
1719
"MaintainerService",
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
from crowdgit.services.license.license_service import LicenseService
2+
3+
__all__ = ["LicenseService"]
Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
import json
2+
3+
from crowdgit.errors import CommandExecutionError, CommandTimeoutError
4+
from crowdgit.services.base.base_service import BaseService
5+
from crowdgit.services.utils import run_shell_command
6+
7+
8+
class LicenseService(BaseService):
    """Detect the license of a cloned repository by invoking the ``licensee`` CLI."""

    async def detect(self, repo_path: str) -> str | None:
        """Return the SPDX identifier that licensee reports for ``repo_path``.

        Runs ``licensee detect --json <repo_path>`` with a 60 second timeout and
        reads the first entry of the ``licenses`` array in the JSON output. The
        confidence of the first ``matched_files`` entry is only used for logging.

        This method never raises: every failure is logged and mapped to ``None``.
        NOTE(review): because of that, callers cannot tell "no license found"
        apart from "detection failed" (timeout, missing binary, bad JSON);
        confirm that is acceptable wherever the result is persisted.

        Args:
            repo_path: Filesystem path of the repository checkout to inspect.

        Returns:
            The ``spdx_id`` of the first detected license, or ``None`` when the
            command fails, nothing is matched, or the output cannot be parsed.
        """
        command = ["licensee", "detect", "--json", repo_path]

        # Clause order is significant: the specific handlers must stay ahead of
        # the catch-all so each failure mode keeps its own log message.
        try:
            raw_output = await run_shell_command(command, timeout=60)
        except CommandExecutionError:
            # Presumably licensee exits non-zero when nothing matches; other
            # command failures end up here too -- TODO confirm.
            self.logger.info(f"licensee found no license in {repo_path}")
            return None
        except CommandTimeoutError as e:
            self.logger.warning(f"licensee timed out: {repr(e)}")
            return None
        except FileNotFoundError as e:
            self.logger.warning(f"licensee binary not found in PATH: {repr(e)}")
            return None
        except Exception as e:
            self.logger.warning(f"licensee failed: {repr(e)}")
            return None

        try:
            payload = json.loads(raw_output)
            # Missing or null arrays are normalised to empty lists.
            detected_licenses = payload.get("licenses") or []
            file_matches = payload.get("matched_files") or []

            spdx_id = None
            if detected_licenses:
                spdx_id = detected_licenses[0].get("spdx_id")

            confidence = None
            if file_matches:
                matcher_info = file_matches[0].get("matcher") or {}
                confidence = matcher_info.get("confidence")

            if not spdx_id:
                self.logger.info(f"No SPDX license matched in {repo_path}")
                return spdx_id

            self.logger.info(
                f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}"
            )
            return spdx_id
        except Exception as e:
            # Covers invalid JSON as well as unexpected shapes (e.g. a non-dict
            # top level or non-dict list entries).
            self.logger.warning(f"Failed to parse licensee output: {repr(e)}")
            return None

services/apps/git_integration/src/crowdgit/worker/repository_worker.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
mark_repo_as_processed,
88
release_repo,
99
update_last_processed_commit,
10+
update_repository_license,
1011
)
1112
from crowdgit.enums import RepositoryState
1213
from crowdgit.errors import (
@@ -22,6 +23,7 @@
2223
from crowdgit.services import (
2324
CloneService,
2425
CommitService,
26+
LicenseService,
2527
MaintainerService,
2628
QueueService,
2729
SoftwareValueService,
@@ -46,13 +48,15 @@ def __init__(
4648
software_value_service: SoftwareValueService,
4749
vulnerability_scanner_service: VulnerabilityScannerService,
4850
maintainer_service: MaintainerService,
51+
license_service: LicenseService,
4952
queue_service: QueueService,
5053
):
5154
self.clone_service = clone_service
5255
self.commit_service = commit_service
5356
self.software_value_service = software_value_service
5457
self.vulnerability_scanner_service = vulnerability_scanner_service
5558
self.maintainer_service = maintainer_service
59+
self.license_service = license_service
5660
self.queue_service = queue_service
5761
self._shutdown = False
5862

@@ -159,6 +163,7 @@ def _bind_repository_context(self, repository: Repository, repo_name: str) -> No
159163
(self.maintainer_service, "maintainer_processing"),
160164
(self.software_value_service, "software_value_processing"),
161165
(self.vulnerability_scanner_service, "vulnerability_scan_processing"),
166+
(self.license_service, "license_detection"),
162167
(self.queue_service, "queue_service"),
163168
]
164169

@@ -174,6 +179,7 @@ def _reset_all_contexts(self) -> None:
174179
self.maintainer_service,
175180
self.software_value_service,
176181
self.vulnerability_scanner_service,
182+
self.license_service,
177183
self.queue_service,
178184
]
179185

@@ -236,6 +242,8 @@ async def _process_single_repository(self, repository: Repository):
236242
repository.id, batch_info.repo_path, repository.url
237243
)
238244
await self.maintainer_service.process_maintainers(repository, batch_info)
245+
license_spdx = await self.license_service.detect(batch_info.repo_path)
246+
await update_repository_license(repository.id, license_spdx)
239247
await self.commit_service.process_single_batch_commits(
240248
repository,
241249
batch_info,

services/libs/data-access-layer/src/repositories/index.ts

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ export interface IRepository {
2121
updatedAt: string
2222
deletedAt: string | null
2323
lastArchivedCheckAt: string | null
24+
license: string | null
2425
}
2526

2627
export interface ICreateRepository {
@@ -148,7 +149,8 @@ export async function getRepositoriesBySourceIntegrationId(
148149
"createdAt",
149150
"updatedAt",
150151
"deletedAt",
151-
"lastArchivedCheckAt"
152+
"lastArchivedCheckAt",
153+
license
152154
FROM public.repositories
153155
WHERE "sourceIntegrationId" = $(sourceIntegrationId)
154156
AND "deletedAt" IS NULL
@@ -190,7 +192,8 @@ export async function getRepositoriesByUrl(
190192
"createdAt",
191193
"updatedAt",
192194
"deletedAt",
193-
"lastArchivedCheckAt"
195+
"lastArchivedCheckAt",
196+
license
194197
FROM public.repositories
195198
WHERE url IN ($(repoUrls:csv))
196199
${deletedFilter}

0 commit comments

Comments
 (0)