Skip to content

Commit d13a22a

Browse files
committed
Merge branch 'main' into 1538-memory-limit-exceeded-in-batch-process-dataset-prod-function
2 parents c2ada2d + a848501 commit d13a22a

20 files changed

Lines changed: 593 additions & 237 deletions

File tree

api/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,5 @@ fastapi-filter[sqlalchemy]==1.0.0
4848
PyJWT
4949
shapely
5050
google-cloud-pubsub
51-
pycountry
51+
pycountry
52+
pytz

api/src/scripts/populate_db_gtfs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def process_redirects(self, session: "Session"):
168168
continue
169169
feed = self.query_feed_by_stable_id(session, stable_id, None)
170170
raw_comments = row.get("redirect.comment", None)
171-
comments = raw_comments.split("|") if raw_comments is not None else []
171+
comments = str(raw_comments).split("|") if raw_comments is not None else []
172172
if len(redirects_ids) != len(comments) and len(comments) > 0:
173173
self.logger.warning(f"Number of redirect ids and redirect comments differ for feed {stable_id}")
174174
for redirect_id in redirects_ids:

api/src/shared/common/license_utils.py

Lines changed: 71 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ class MatchingLicense:
6666
(re.compile(r"opensource\.org/licenses/MIT/?", re.I), "MIT"),
6767
(re.compile(r"choosealicense\.com/licenses/mit/?", re.I), "MIT"),
6868
(re.compile(r"choosealicense\.com/licenses/apache-2\.0/?", re.I), "Apache-2.0"),
69-
# add Etalab / Québec, etc., once verified
7069
]
7170

7271

@@ -281,6 +280,44 @@ def resolve_fuzzy_match(
281280
return results
282281

283282

283+
def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Look up the License row whose URL equals the given normalized URL.

    Args:
        url_normalized: License URL to look for; it is passed through
            ``normalize_url_str`` before being compared.
        db_session: Session used for the lookup. When falsy, no lookup is done.

    Returns:
        The first License whose normalized, trimmed, lower-cased ``url``
        equals the normalized input, or ``None`` when there is no session or
        no matching row.
    """
    if not db_session:
        return None

    license_query = db_session.query(License)
    # Left side is normalized in Python; right side is normalized by SQL
    # expressions applied to the License.url column.
    wanted_url = normalize_url_str(url_normalized)
    stored_url = func.lower(func.trim(normalize_url(License.url)))
    return license_query.filter(wanted_url == stored_url).first()
293+
294+
295+
def extract_spdx_id_from_url(url_normalized: str) -> Optional[str]:
    """Extract an SPDX license ID from an SPDX catalog URL, if present.

    Recognizes URLs of the form used on spdx.org, for example::

        https://spdx.org/licenses/ODbL-1.0.html
        http://spdx.org/licenses/MIT

    The function is conservative: the host must be ``spdx.org`` (or one of
    its subdomains, e.g. ``www.spdx.org``) and the path must be exactly
    ``/licenses/<ID>`` with an optional ``.html`` suffix and an optional
    trailing slash. Lookalike hosts such as ``notspdx.org`` and URLs that
    merely contain ``spdx.org/licenses/`` somewhere in their path are
    rejected, as are URLs carrying a query string or fragment.

    Args:
        url_normalized: The URL to inspect. The scheme is optional, so both
            ``https://spdx.org/...`` and ``spdx.org/...`` are accepted.

    Returns:
        The identifier exactly as it appears in the URL (its case is not
        changed), or ``None`` when the URL is empty or is not an SPDX
        catalog URL.
    """
    if not url_normalized:
        return None

    # Anchor at the start of the string so the host really is spdx.org:
    #   optional scheme (or protocol-relative '//'), optional subdomain labels,
    #   then 'spdx.org/licenses/<token>' up to an optional '.html' and '/'.
    match = re.match(
        r"(?:(?:[a-z][a-z0-9+.-]*:)?//)?(?:[a-z0-9-]+\.)*spdx\.org/licenses/([^/?#]+?)(?:\.html)?/?$",
        url_normalized.strip(),
        re.I,
    )
    if not match:
        return None

    spdx_id = match.group(1)
    # Basic sanity check: SPDX IDs consist of alphanumerics plus '-', '.' and
    # '+' (e.g. 'CC-BY-4.0', 'GPL-2.0+').
    if not re.fullmatch(r"[A-Za-z0-9.+-]+", spdx_id):
        return None

    return spdx_id
319+
320+
284321
def resolve_license(
285322
license_url: str,
286323
allow_fuzzy: bool = True,
@@ -290,11 +327,12 @@ def resolve_license(
290327
"""Resolve a license URL to one or more SPDX candidates using multiple strategies.
291328
292329
Strategies (in order of precedence):
293-
1) Exact match in DB(db.license) -> return [exact]
294-
2) Creative Commons resolver(cc-resolver) -> return [cc]
295-
3) Generic heuristics(pattern-heuristics) -> return [heuristic]
296-
4) Fuzzy (same host candidates) -> return [fuzzy...]
297-
5) No match -> return [none]
330+
1) Exact match in DB (``db.license``) -> return [exact]
331+
2) Creative Commons resolver (``cc-resolver``) -> return [cc]
332+
3) SPDX catalog URL resolver (``spdx.org/licenses``) -> return [spdx]
333+
4) Generic heuristics (pattern-based) -> return [heuristic]
334+
5) Fuzzy (same-host candidates) -> return [fuzzy...]
335+
6) No match -> return []
298336
299337
Args:
300338
license_url (str): The license URL to resolve.
@@ -350,7 +388,31 @@ def resolve_license(
350388
)
351389
]
352390

353-
# 3) Generic heuristics
391+
# 3) SPDX catalog URL (spdx.org/licenses/<ID>[.html])
392+
spdx_id = extract_spdx_id_from_url(url_normalized)
393+
if spdx_id:
394+
# Try to enrich from DB if a matching License row exists
395+
db_lic: License | None = (
396+
db_session.query(License).filter(func.lower(License.id) == func.lower(spdx_id)).one_or_none()
397+
)
398+
if db_lic is not None:
399+
return [
400+
MatchingLicense(
401+
license_id=db_lic.id,
402+
license_url=url_str,
403+
normalized_url=url_normalized,
404+
spdx_id=spdx_id,
405+
match_type="heuristic",
406+
confidence=0.98,
407+
matched_name=db_lic.name,
408+
matched_catalog_url=db_lic.url,
409+
matched_source="spdx-resolver",
410+
)
411+
]
412+
else:
413+
logging.warning("SPDX ID %s resolved from URL but not found in DB", spdx_id)
414+
415+
# 4) Generic heuristics
354416
heuristic_match = heuristic_spdx(url_str)
355417
if heuristic_match:
356418
return [
@@ -366,7 +428,7 @@ def resolve_license(
366428
)
367429
]
368430

369-
# 4) Fuzzy (same host candidates only)
431+
# 5) Fuzzy (same host candidates only)
370432
if allow_fuzzy and url_host and db_session is not None:
371433
fuzzy_results = resolve_fuzzy_match(
372434
url_str=url_str,
@@ -378,17 +440,5 @@ def resolve_license(
378440
if fuzzy_results:
379441
return fuzzy_results
380442

381-
# 5) No match
443+
# 6) No match
382444
return []
383-
384-
385-
def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Look up the License row whose URL equals the given normalized URL.

    Args:
        url_normalized: License URL to look for; it is passed through
            ``normalize_url_str`` before being compared.
        db_session: Session used for the lookup. When falsy, no lookup is done.

    Returns:
        The first License whose normalized, trimmed, lower-cased ``url``
        equals the normalized input, or ``None`` when there is no session or
        no matching row.
    """
    if not db_session:
        return None

    license_query = db_session.query(License)
    # Left side is normalized in Python; right side is normalized by SQL
    # expressions applied to the License.url column.
    wanted_url = normalize_url_str(url_normalized)
    stored_url = func.lower(func.trim(normalize_url(License.url)))
    return license_query.filter(wanted_url == stored_url).first()

api/tests/utils/test_license_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,39 @@ def test_resolve_license_creative_commons(self, _mock_find):
155155
self.assertEqual(results[0].spdx_id, "CC-BY-4.0")
156156
self.assertEqual(results[0].match_type, "heuristic")
157157

158+
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_spdx_catalog_url_db_hit(self, _mock_find):
    """An spdx.org/licenses/<ID> URL resolves via the SPDX branch when the license exists in the DB."""
    catalog_url = "https://spdx.org/licenses/ODbL-1.0.html"
    db_license = self._make_license("odbl-1.0", catalog_url, "ODbL 1.0")
    # Make session.query(...).filter(...).one_or_none() hand back our license
    id_lookup = self.session.query.return_value.filter.return_value
    id_lookup.one_or_none.return_value = db_license

    matches = resolve_license(catalog_url, db_session=self.session)

    self.assertEqual(len(matches), 1)
    hit = matches[0]
    expected_fields = (
        # The SPDX ID is expected in lower case (as extracted from the URL by the resolver)
        ("spdx_id", "odbl-1.0"),
        ("license_id", "odbl-1.0"),
        ("match_type", "heuristic"),
        ("matched_source", "spdx-resolver"),
        ("matched_name", "ODbL 1.0"),
        ("matched_catalog_url", "https://spdx.org/licenses/ODbL-1.0.html"),
    )
    for field_name, expected_value in expected_fields:
        self.assertEqual(getattr(hit, field_name), expected_value)
177+
178+
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
179+
def test_resolve_license_spdx_catalog_url_db_miss(self, _mock_find):
180+
"""When SPDX ID is parsed from URL but not present in DB,
181+
resolver should log and return no SPDX-based result."""
182+
spdx_url = "https://spdx.org/licenses/ODbL-1.0.html"
183+
# Simulate no matching License in DB
184+
self.session.query.return_value.filter.return_value.one_or_none.return_value = None
185+
186+
results = resolve_license(spdx_url, db_session=self.session)
187+
188+
# Current behavior: we only log a warning and return an empty list when SPDX ID is not found in DB.
189+
self.assertEqual(results, [])
190+
158191
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
159192
def test_resolve_license_generic_heuristic(self, _mock_find):
160193
# Provide URL that matches heuristic patterns

docker-compose.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
services:
22
postgres:
33
container_name: database
4-
image: postgis/postgis:13-3.5
4+
image: postgis/postgis:14-3.3
55
# Fix warning message on arm64 machines
66
platform: linux/amd64
77
healthcheck:
@@ -25,7 +25,7 @@ services:
2525
- local
2626
postgres-test:
2727
container_name: database_test
28-
image: postgis/postgis:13-3.5
28+
image: postgis/postgis:14-3.3
2929
# Fix warning message on arm64 machines
3030
platform: linux/amd64
3131
healthcheck:

0 commit comments

Comments
 (0)