Skip to content

Commit dcb6920

Browse files
authored
feat: add auto assign license when a feed is added (#1651)
1 parent 32d2b91 commit dcb6920

File tree

8 files changed

+278
-3
lines changed

8 files changed

+278
-3
lines changed

api/src/scripts/populate_db_gbfs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from scripts.gbfs_utils.fetching import fetch_data, get_data_content
1515
from scripts.gbfs_utils.license import get_license_url
1616
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
17+
from shared.common.license_utils import assign_license_by_url
1718
from shared.database.database import generate_unique_id, configure_polymorphic_mappers
1819
from shared.database_gen.sqlacodegen_models import Gbfsfeed, Location, Externalid
1920

@@ -126,6 +127,8 @@ def populate_db(self, session, fetch_url=True):
126127
gbfs_feed.locations = [location]
127128

128129
session.flush()
130+
if is_new_feed and gbfs_feed.license_url:
131+
assign_license_by_url(gbfs_feed, session)
129132
if is_new_feed:
130133
self.added_feeds.append(
131134
{

api/src/scripts/populate_db_gtfs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from scripts.load_dataset_on_create import publish_all
99
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
10+
from shared.common.license_utils import assign_license_by_url
1011
from shared.database.database import generate_unique_id
1112
from shared.database_gen.sqlacodegen_models import (
1213
Entitytype,
@@ -212,6 +213,7 @@ def populate_db(self, session: "Session", fetch_url: bool = True):
212213
stable_id = self.get_stable_id(row)
213214
is_official_from_csv = self.get_safe_boolean_value(row, "is_official", None)
214215
feed = self.query_feed_by_stable_id(session, stable_id, data_type)
216+
is_new_feed = feed is None
215217
if feed:
216218
self.logger.debug(f"Updating {feed.__class__.__name__}: {stable_id}")
217219
# Always set the deprecated status if found in the csv
@@ -264,6 +266,8 @@ def populate_db(self, session: "Session", fetch_url: bool = True):
264266

265267
session.add(feed)
266268
session.flush()
269+
if is_new_feed and feed.license_url:
270+
assign_license_by_url(feed, session)
267271
# This needs to be done after all feeds are added to the session to avoid FK violation
268272
self.process_feed_references(session)
269273
self.process_redirects(session)

api/src/shared/common/license_utils.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import List, Tuple, Optional
88

99
from shared.common.db_utils import normalize_url, normalize_url_str
10-
from shared.database_gen.sqlacodegen_models import License
10+
from shared.database_gen.sqlacodegen_models import License, FeedLicenseChange
1111

1212

1313
@dataclass
@@ -442,3 +442,95 @@ def resolve_license(
442442

443443
# 6) No match
444444
return []
445+
446+
447+
# Confidence threshold at or above which an auto-assigned license is considered verified
448+
# without requiring human review. Covers exact, CC resolver, SPDX, and pattern heuristic matches.
449+
_AUTO_VERIFY_THRESHOLD = 0.95
450+
451+
452+
def assign_license_by_url(
    feed,
    db_session: Session,
    *,
    only_if_single: bool = True,
) -> Optional[MatchingLicense]:
    """Resolve ``feed.license_url`` and attach the matching license to the feed.

    The URL is passed to ``resolve_license`` and the outcome is handled as
    follows:

    - Falsy ``feed.license_url``: nothing is resolved; returns None.
    - No candidates: an info message is logged; returns None.
    - More than one candidate while ``only_if_single`` is True: a warning is
      logged and the feed is left untouched; returns None.
    - Otherwise the first candidate is used. If its ``license_id`` already
      equals ``feed.license_id`` the feed is left untouched and the candidate
      is returned. If it differs, ``feed.license_id`` and
      ``feed.license_notes`` are overwritten and a ``FeedLicenseChange`` row
      is appended to ``feed.feed_license_changes``.

    The audit row's ``verified`` flag is True when the match type is
    ``"exact"`` or when the confidence is greater than or equal to
    ``_AUTO_VERIFY_THRESHOLD``; it is False in every other case.

    NOTE(review): a ``"fuzzy"`` match whose confidence reaches the threshold is
    therefore also marked verified -- confirm that this is intended.

    This function only mutates the ORM objects; it does not flush or commit
    the session itself.

    Args:
        feed: Feed-like object exposing ``license_url``, ``license_id``,
            ``license_notes``, ``stable_id``, ``id`` and
            ``feed_license_changes``.
        db_session: Session forwarded to ``resolve_license``.
        only_if_single: When True (default), refuse to pick a license if
            several candidates are returned. When False, the first candidate
            is used (presumably the best-ranked one -- verify against
            ``resolve_license``).

    Returns:
        The selected MatchingLicense (also when the feed already carried that
        license), or None when nothing was selected.
    """
    license_url = feed.license_url
    if not license_url:
        return None

    candidates = resolve_license(license_url, db_session=db_session)

    if not candidates:
        logging.info(
            "No license match found for feed %s (url: %s)",
            feed.stable_id,
            feed.license_url,
        )
        return None

    candidate_count = len(candidates)
    if candidate_count > 1 and only_if_single:
        logging.warning(
            "Skipping auto-assignment for feed %s: %d license candidates found — manual review required",
            feed.stable_id,
            candidate_count,
        )
        return None

    chosen = candidates[0]

    # Guard against writing a redundant audit row when nothing would change.
    if feed.license_id == chosen.license_id:
        logging.info("Feed %s license unchanged: %s", feed.stable_id, chosen.license_id)
        return chosen

    if chosen.match_type == "exact":
        verified = True
    else:
        verified = chosen.confidence >= _AUTO_VERIFY_THRESHOLD

    logging.info(
        "Assigning license %s to feed %s (match_type=%s, confidence=%.2f, verified=%s)",
        chosen.license_id,
        feed.stable_id,
        chosen.match_type,
        chosen.confidence,
        verified,
    )

    feed.license_id = chosen.license_id
    feed.license_notes = chosen.notes

    audit_row = FeedLicenseChange(
        feed_id=feed.id,
        changed_at=None,  # set by DB default
        feed_license_url=feed.license_url,
        matched_license_id=chosen.license_id,
        confidence=chosen.confidence,
        match_type=chosen.match_type,
        matched_name=chosen.matched_name,
        matched_catalog_url=chosen.matched_catalog_url,
        matched_source=chosen.matched_source,
        notes=chosen.notes,
        regional_id=chosen.regional_id,
        verified=verified,
    )
    feed.feed_license_changes.append(audit_row)

    return chosen

api/tests/utils/test_license_utils.py

Lines changed: 152 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
resolve_fuzzy_match,
1111
resolve_license,
1212
find_exact_match_license_url,
13+
assign_license_by_url,
1314
MatchingLicense,
1415
)
1516
from shared.database_gen.sqlacodegen_models import License
@@ -246,5 +247,154 @@ def test_matching_license_dataclass(self):
246247
self.assertEqual(ml.confidence, 1.0)
247248

248249

249-
if __name__ == "__main__":
250-
unittest.main()
250+
class TestAssignLicenseByUrl(unittest.TestCase):
    """Unit tests covering every outcome of assign_license_by_url."""

    # Dotted path of the resolver that the tests replace with a mock.
    _RESOLVER_PATH = "shared.common.license_utils.resolve_license"

    def _make_match(self, license_id="MIT", match_type="exact", confidence=1.0):
        """Build a MatchingLicense whose id, match type and confidence vary."""
        catalog_url = "http://example.com/license"
        return MatchingLicense(
            license_id=license_id,
            license_url=catalog_url,
            normalized_url="example.com/license",
            match_type=match_type,
            confidence=confidence,
            matched_name="MIT License",
            matched_catalog_url=catalog_url,
            matched_source="db.license",
        )

    def _make_feed(self, license_url="http://example.com/license", license_id=None):
        """Build a mock feed with an empty audit trail."""
        feed = MagicMock()
        feed.configure_mock(
            stable_id="test-feed-1",
            id="feed-id-1",
            license_url=license_url,
            license_id=license_id,
            license_notes=None,
            feed_license_changes=[],
        )
        return feed

    def _assign(self, feed, candidates, **options):
        """Run assign_license_by_url with resolve_license returning *candidates*."""
        with patch(self._RESOLVER_PATH) as resolver:
            resolver.return_value = candidates
            return assign_license_by_url(feed, MagicMock(), **options)

    # --- No license_url ---

    def test_no_license_url_returns_none(self):
        """A feed whose license_url is None is left untouched."""
        feed = self._make_feed(license_url=None)

        outcome = assign_license_by_url(feed, MagicMock())

        self.assertIsNone(outcome)
        self.assertIsNone(feed.license_id)

    def test_empty_license_url_returns_none(self):
        """An empty license_url is treated like a missing one."""
        feed = self._make_feed(license_url="")

        self.assertIsNone(assign_license_by_url(feed, MagicMock()))

    # --- No match ---

    def test_no_match_returns_none(self):
        """Without candidates nothing is assigned and no audit row is written."""
        feed = self._make_feed()

        outcome = self._assign(feed, [])

        self.assertIsNone(outcome)
        self.assertIsNone(feed.license_id)
        self.assertEqual(feed.feed_license_changes, [])

    # --- Multiple matches ---

    def test_multiple_matches_skips_assignment(self):
        """Several candidates with the default only_if_single skip assignment."""
        candidates = [
            self._make_match("MIT", "fuzzy", 0.96),
            self._make_match("Apache-2.0", "fuzzy", 0.94),
        ]
        feed = self._make_feed()

        outcome = self._assign(feed, candidates)

        self.assertIsNone(outcome)
        self.assertIsNone(feed.license_id)
        self.assertEqual(feed.feed_license_changes, [])

    # --- Single exact match — auto-verified ---

    def test_exact_match_assigns_and_marks_verified(self):
        """A single exact match is assigned and its audit row is verified."""
        expected = self._make_match("MIT", "exact", 1.0)
        feed = self._make_feed()

        outcome = self._assign(feed, [expected])

        self.assertEqual(outcome, expected)
        self.assertEqual(feed.license_id, "MIT")
        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertTrue(feed.feed_license_changes[0].verified)

    def test_heuristic_high_confidence_assigns_and_marks_verified(self):
        """A heuristic match above the threshold is assigned and verified."""
        expected = self._make_match("CC-BY-4.0", "heuristic", 0.99)
        feed = self._make_feed()

        outcome = self._assign(feed, [expected])

        self.assertEqual(outcome, expected)
        self.assertEqual(feed.license_id, "CC-BY-4.0")
        self.assertTrue(feed.feed_license_changes[0].verified)

    def test_threshold_boundary_095_marks_verified(self):
        """A confidence of exactly 0.95 counts as verified."""
        feed = self._make_feed()

        self._assign(feed, [self._make_match("ODbL-1.0", "heuristic", 0.95)])

        self.assertTrue(feed.feed_license_changes[0].verified)

    # --- Fuzzy / low-confidence match — unverified ---

    def test_fuzzy_match_assigns_but_unverified(self):
        """A fuzzy match below the threshold is assigned but not verified."""
        expected = self._make_match("MIT", "fuzzy", 0.94)
        feed = self._make_feed()

        outcome = self._assign(feed, [expected])

        self.assertEqual(outcome, expected)
        self.assertEqual(feed.license_id, "MIT")
        self.assertFalse(feed.feed_license_changes[0].verified)

    def test_below_threshold_unverified(self):
        """A heuristic match below the threshold is not verified."""
        feed = self._make_feed()

        self._assign(feed, [self._make_match("MIT", "heuristic", 0.80)])

        self.assertFalse(feed.feed_license_changes[0].verified)

    # --- Duplicate assignment guard ---

    def test_same_license_id_no_new_audit_row(self):
        """Re-resolving the license a feed already has adds no audit row."""
        expected = self._make_match("MIT", "exact", 1.0)
        feed = self._make_feed(license_id="MIT")  # already assigned

        outcome = self._assign(feed, [expected])

        self.assertEqual(outcome, expected)
        self.assertEqual(feed.license_id, "MIT")
        self.assertEqual(feed.feed_license_changes, [])  # no new audit row

    # --- only_if_single=False allows multiple matches ---

    def test_only_if_single_false_assigns_best_match(self):
        """With only_if_single=False the first candidate is assigned."""
        top = self._make_match("MIT", "fuzzy", 0.97)
        runner_up = self._make_match("Apache-2.0", "fuzzy", 0.94)
        feed = self._make_feed()

        outcome = self._assign(feed, [top, runner_up], only_if_single=False)

        self.assertEqual(outcome, top)
        self.assertEqual(feed.license_id, "MIT")

functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
Feed,
5757
Gtfsrealtimefeed,
5858
)
59+
from shared.common.license_utils import assign_license_by_url
5960
from shared.common.gcp_utils import create_web_revalidation_task
6061
from shared.helpers.pub_sub import get_execution_id, trigger_dataset_download
6162
from shared.helpers.query_helper import (
@@ -406,6 +407,9 @@ async def create_gtfs_feed(
406407
status_code=500,
407408
detail=f"Failed to create GTFS feed with URL: {new_feed.producer_url}",
408409
)
410+
if created_feed.license_url:
411+
assign_license_by_url(created_feed, db_session)
412+
db_session.commit()
409413
try:
410414
trigger_dataset_download(
411415
created_feed,
@@ -447,6 +451,9 @@ async def create_gtfs_rt_feed(
447451
db_session.add(new_feed)
448452
db_session.commit()
449453
created_feed = db_session.get(Gtfsrealtimefeed, new_feed.id)
454+
if created_feed and created_feed.license_url:
455+
assign_license_by_url(created_feed, db_session)
456+
db_session.commit()
450457
logging.info("Created new GTFS-RT feed with ID: %s", new_feed.stable_id)
451458
refreshed = refresh_materialized_view(db_session, t_feedsearch.name)
452459
logging.info("Materialized view %s refreshed: %s", t_feedsearch.name, refreshed)

functions-python/tasks_executor/src/tasks/licenses/license_matcher.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
3636
feed.stable_id,
3737
license_match.license_id,
3838
)
39+
from shared.common.license_utils import _AUTO_VERIFY_THRESHOLD
40+
41+
is_verified = (
42+
license_match.match_type == "exact"
43+
or license_match.confidence >= _AUTO_VERIFY_THRESHOLD
44+
)
3945
feed.license_id = license_match.license_id
4046
feed.license_notes = license_match.notes
4147
feed_license_change: FeedLicenseChange = FeedLicenseChange(
@@ -50,6 +56,7 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
5056
matched_source=license_match.matched_source,
5157
notes=license_match.notes,
5258
regional_id=license_match.regional_id,
59+
verified=is_verified,
5360
)
5461
feed.feed_license_changes.append(feed_license_change)
5562
else:

liquibase/changelog.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@
101101
<include file="changes/feat_1542.sql" relativeToChangelogFile="true"/>
102102
<!-- Add license_tag table and license_license_tags join table for tag classification of licenses. -->
103103
<include file="changes/feat_1565.sql" relativeToChangelogFile="true"/>
104+
<!-- Add verified column to feed_license_change to track human review of auto-assigned licenses. -->
105+
<include file="changes/feat_1568.sql" relativeToChangelogFile="true"/>
104106
<!-- Centralized materialized view definitions.
105107
Views are rebuilt from source SQL files using runOnChange. -->
106108
<!-- Keep this at the very end to ensure all table and schema changes

liquibase/changes/feat_1568.sql

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Add verified column to feed_license_change to track human review status.
2+
-- Auto-assigned licenses start as unverified (false = needs review);
3+
-- manually confirmed or high-confidence assignments are marked true.
4+
ALTER TABLE feed_license_change ADD COLUMN IF NOT EXISTS verified BOOLEAN NOT NULL DEFAULT false;
5+
6+
-- Backfill all pre-existing rows as verified — prior assignments are considered trusted.
7+
UPDATE feed_license_change SET verified = true;
8+
9+
-- Index for efficient filtering of unverified assignments.
10+
CREATE INDEX IF NOT EXISTS ix_flc_verified ON feed_license_change (verified);

0 commit comments

Comments
 (0)