From f30b852d5405f9abb894a5420175001b7f387808 Mon Sep 17 00:00:00 2001 From: David Gamez Diaz <1192523+davidgamez@users.noreply.github.com> Date: Tue, 7 Apr 2026 17:14:09 -0400 Subject: [PATCH 1/2] add auto assign calls when a feed is added --- api/src/scripts/populate_db_gbfs.py | 3 + api/src/scripts/populate_db_gtfs.py | 4 + api/src/shared/common/license_utils.py | 94 ++++++++++- api/tests/utils/test_license_utils.py | 154 +++++++++++++++++- .../impl/feeds_operations_impl.py | 7 + .../src/tasks/licenses/license_matcher.py | 7 + liquibase/changelog.xml | 2 + 7 files changed, 268 insertions(+), 3 deletions(-) diff --git a/api/src/scripts/populate_db_gbfs.py b/api/src/scripts/populate_db_gbfs.py index 23b5a3844..785f4e418 100644 --- a/api/src/scripts/populate_db_gbfs.py +++ b/api/src/scripts/populate_db_gbfs.py @@ -14,6 +14,7 @@ from scripts.gbfs_utils.fetching import fetch_data, get_data_content from scripts.gbfs_utils.license import get_license_url from scripts.populate_db import DatabasePopulateHelper, set_up_configs +from shared.common.license_utils import assign_license_by_url from shared.database.database import generate_unique_id, configure_polymorphic_mappers from shared.database_gen.sqlacodegen_models import Gbfsfeed, Location, Externalid @@ -126,6 +127,8 @@ def populate_db(self, session, fetch_url=True): gbfs_feed.locations = [location] session.flush() + if is_new_feed and gbfs_feed.license_url: + assign_license_by_url(gbfs_feed, session) if is_new_feed: self.added_feeds.append( { diff --git a/api/src/scripts/populate_db_gtfs.py b/api/src/scripts/populate_db_gtfs.py index a4633164c..924298660 100644 --- a/api/src/scripts/populate_db_gtfs.py +++ b/api/src/scripts/populate_db_gtfs.py @@ -7,6 +7,7 @@ from scripts.load_dataset_on_create import publish_all from scripts.populate_db import DatabasePopulateHelper, set_up_configs +from shared.common.license_utils import assign_license_by_url from shared.database.database import generate_unique_id from 
shared.database_gen.sqlacodegen_models import ( Entitytype, @@ -212,6 +213,7 @@ def populate_db(self, session: "Session", fetch_url: bool = True): stable_id = self.get_stable_id(row) is_official_from_csv = self.get_safe_boolean_value(row, "is_official", None) feed = self.query_feed_by_stable_id(session, stable_id, data_type) + is_new_feed = feed is None if feed: self.logger.debug(f"Updating {feed.__class__.__name__}: {stable_id}") # Always set the deprecated status if found in the csv @@ -264,6 +266,8 @@ def populate_db(self, session: "Session", fetch_url: bool = True): session.add(feed) session.flush() + if is_new_feed and feed.license_url: + assign_license_by_url(feed, session) # This need to be done after all feeds are added to the session to avoid FK violation self.process_feed_references(session) self.process_redirects(session) diff --git a/api/src/shared/common/license_utils.py b/api/src/shared/common/license_utils.py index ec60dd79c..73036d870 100644 --- a/api/src/shared/common/license_utils.py +++ b/api/src/shared/common/license_utils.py @@ -7,7 +7,7 @@ from typing import List, Tuple, Optional from shared.common.db_utils import normalize_url, normalize_url_str -from shared.database_gen.sqlacodegen_models import License +from shared.database_gen.sqlacodegen_models import License, FeedLicenseChange @dataclass @@ -442,3 +442,95 @@ def resolve_license( # 6) No match return [] + + +# Confidence threshold above which an auto-assigned license is considered verified +# without requiring human review. Covers exact, CC resolver, SPDX, and pattern heuristic matches. +_AUTO_VERIFY_THRESHOLD = 0.95 + + +def assign_license_by_url( + feed, + db_session: Session, + *, + only_if_single: bool = True, +) -> Optional[MatchingLicense]: + """Resolve feed.license_url and auto-assign a license if exactly one match is found. + + Behavior: + - 0 matches: logs info, returns None (no change). 
+ - >1 matches: logs a warning and returns None when only_if_single=True; + the feed retains its current license_id for manual review. + - 1 match: assigns feed.license_id / feed.license_notes and appends a + FeedLicenseChange audit row. verified is set based on confidence: + - True if match_type == 'exact' or confidence >= _AUTO_VERIFY_THRESHOLD + (covers exact DB matches, CC resolver, SPDX, pattern heuristics) + - False if match_type == 'fuzzy' (needs human confirmation) + + Args: + feed: Any Feed ORM instance (Gtfsfeed, Gtfsrealtimefeed, Gbfsfeed). + db_session: Active SQLAlchemy session; required for DB-backed resolution. + only_if_single: When True (default), skip assignment if multiple candidates + are returned, requiring a human to choose. + + Returns: + The assigned MatchingLicense, or None if no assignment was made. + """ + if not feed.license_url: + return None + + matches = resolve_license(feed.license_url, db_session=db_session) + + if not matches: + logging.info( + "No license match found for feed %s (url: %s)", + feed.stable_id, + feed.license_url, + ) + return None + + if only_if_single and len(matches) > 1: + logging.warning( + "Skipping auto-assignment for feed %s: %d license candidates found — manual review required", + feed.stable_id, + len(matches), + ) + return None + + best = matches[0] + + if best.license_id == feed.license_id: + logging.info("Feed %s license unchanged: %s", feed.stable_id, best.license_id) + return best + + is_verified = best.match_type == "exact" or best.confidence >= _AUTO_VERIFY_THRESHOLD + + logging.info( + "Assigning license %s to feed %s (match_type=%s, confidence=%.2f, verified=%s)", + best.license_id, + feed.stable_id, + best.match_type, + best.confidence, + is_verified, + ) + + feed.license_id = best.license_id + feed.license_notes = best.notes + feed.feed_license_changes.append( + FeedLicenseChange( + feed_id=feed.id, + changed_at=None, # set by DB default + feed_license_url=feed.license_url, + 
matched_license_id=best.license_id, + confidence=best.confidence, + match_type=best.match_type, + matched_name=best.matched_name, + matched_catalog_url=best.matched_catalog_url, + matched_source=best.matched_source, + notes=best.notes, + regional_id=best.regional_id, + verified=is_verified, + ) + ) + + return best diff --git a/api/tests/utils/test_license_utils.py b/api/tests/utils/test_license_utils.py index d6877596a..980009c4f 100644 --- a/api/tests/utils/test_license_utils.py +++ b/api/tests/utils/test_license_utils.py @@ -10,6 +10,7 @@ resolve_fuzzy_match, resolve_license, find_exact_match_license_url, + assign_license_by_url, MatchingLicense, ) from shared.database_gen.sqlacodegen_models import License @@ -246,5 +247,154 @@ def test_matching_license_dataclass(self): self.assertEqual(ml.confidence, 1.0) -if __name__ == "__main__": - unittest.main() +class TestAssignLicenseByUrl(unittest.TestCase): + """Unit tests for assign_license_by_url.""" + + def _make_match(self, license_id="MIT", match_type="exact", confidence=1.0): + return MatchingLicense( + license_id=license_id, + license_url="http://example.com/license", + normalized_url="example.com/license", + match_type=match_type, + confidence=confidence, + matched_name="MIT License", + matched_catalog_url="http://example.com/license", + matched_source="db.license", + ) + + def _make_feed(self, license_url="http://example.com/license", license_id=None): + feed = MagicMock() + feed.stable_id = "test-feed-1" + feed.id = "feed-id-1" + feed.license_url = license_url + feed.license_id = license_id + feed.license_notes = None + feed.feed_license_changes = [] + return feed + + # --- No license_url --- + + def test_no_license_url_returns_none(self): + feed = self._make_feed(license_url=None) + result = assign_license_by_url(feed, MagicMock()) + self.assertIsNone(result) + self.assertIsNone(feed.license_id) + + def test_empty_license_url_returns_none(self): + feed = self._make_feed(license_url="") + result = 
assign_license_by_url(feed, MagicMock()) + self.assertIsNone(result) + + # --- No match --- + + @patch("shared.common.license_utils.resolve_license") + def test_no_match_returns_none(self, mock_resolve): + mock_resolve.return_value = [] + feed = self._make_feed() + result = assign_license_by_url(feed, MagicMock()) + self.assertIsNone(result) + self.assertIsNone(feed.license_id) + self.assertEqual(feed.feed_license_changes, []) + + # --- Multiple matches --- + + @patch("shared.common.license_utils.resolve_license") + def test_multiple_matches_skips_assignment(self, mock_resolve): + mock_resolve.return_value = [ + self._make_match("MIT", "fuzzy", 0.96), + self._make_match("Apache-2.0", "fuzzy", 0.94), + ] + feed = self._make_feed() + result = assign_license_by_url(feed, MagicMock()) + self.assertIsNone(result) + self.assertIsNone(feed.license_id) + self.assertEqual(feed.feed_license_changes, []) + + # --- Single exact match — auto-verified --- + + @patch("shared.common.license_utils.resolve_license") + def test_exact_match_assigns_and_marks_verified(self, mock_resolve): + match = self._make_match("MIT", "exact", 1.0) + mock_resolve.return_value = [match] + feed = self._make_feed() + + result = assign_license_by_url(feed, MagicMock()) + + self.assertEqual(result, match) + self.assertEqual(feed.license_id, "MIT") + self.assertEqual(len(feed.feed_license_changes), 1) + self.assertTrue(feed.feed_license_changes[0].verified) + + @patch("shared.common.license_utils.resolve_license") + def test_heuristic_high_confidence_assigns_and_marks_verified(self, mock_resolve): + match = self._make_match("CC-BY-4.0", "heuristic", 0.99) + mock_resolve.return_value = [match] + feed = self._make_feed() + + result = assign_license_by_url(feed, MagicMock()) + + self.assertEqual(result, match) + self.assertEqual(feed.license_id, "CC-BY-4.0") + self.assertTrue(feed.feed_license_changes[0].verified) + + @patch("shared.common.license_utils.resolve_license") + def 
test_threshold_boundary_095_marks_verified(self, mock_resolve): + match = self._make_match("ODbL-1.0", "heuristic", 0.95) + mock_resolve.return_value = [match] + feed = self._make_feed() + + assign_license_by_url(feed, MagicMock()) + + self.assertTrue(feed.feed_license_changes[0].verified) + + # --- Fuzzy / low-confidence match — unverified --- + + @patch("shared.common.license_utils.resolve_license") + def test_fuzzy_match_assigns_but_unverified(self, mock_resolve): + match = self._make_match("MIT", "fuzzy", 0.94) + mock_resolve.return_value = [match] + feed = self._make_feed() + + result = assign_license_by_url(feed, MagicMock()) + + self.assertEqual(result, match) + self.assertEqual(feed.license_id, "MIT") + self.assertFalse(feed.feed_license_changes[0].verified) + + @patch("shared.common.license_utils.resolve_license") + def test_below_threshold_unverified(self, mock_resolve): + match = self._make_match("MIT", "heuristic", 0.80) + mock_resolve.return_value = [match] + feed = self._make_feed() + + assign_license_by_url(feed, MagicMock()) + + self.assertFalse(feed.feed_license_changes[0].verified) + + # --- Duplicate assignment guard --- + + @patch("shared.common.license_utils.resolve_license") + def test_same_license_id_no_new_audit_row(self, mock_resolve): + match = self._make_match("MIT", "exact", 1.0) + mock_resolve.return_value = [match] + feed = self._make_feed(license_id="MIT") # already assigned + + result = assign_license_by_url(feed, MagicMock()) + + self.assertEqual(result, match) + self.assertEqual(feed.license_id, "MIT") + self.assertEqual(feed.feed_license_changes, []) # no new audit row + + # --- only_if_single=False allows multiple matches --- + + @patch("shared.common.license_utils.resolve_license") + def test_only_if_single_false_assigns_best_match(self, mock_resolve): + best = self._make_match("MIT", "fuzzy", 0.97) + second = self._make_match("Apache-2.0", "fuzzy", 0.94) + mock_resolve.return_value = [best, second] + feed = self._make_feed() + 
+ result = assign_license_by_url(feed, MagicMock(), only_if_single=False) + + self.assertEqual(result, best) + self.assertEqual(feed.license_id, "MIT") diff --git a/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py b/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py index 3f471e5f0..3cae855c7 100644 --- a/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py +++ b/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py @@ -56,6 +56,7 @@ Feed, Gtfsrealtimefeed, ) +from shared.common.license_utils import assign_license_by_url from shared.helpers.pub_sub import get_execution_id, trigger_dataset_download from shared.helpers.query_helper import ( query_feed_by_stable_id, @@ -397,6 +398,9 @@ async def create_gtfs_feed( status_code=500, detail=f"Failed to create GTFS feed with URL: {new_feed.producer_url}", ) + if created_feed.license_url: + assign_license_by_url(created_feed, db_session) + db_session.commit() try: trigger_dataset_download( created_feed, @@ -438,6 +442,9 @@ async def create_gtfs_rt_feed( db_session.add(new_feed) db_session.commit() created_feed = db_session.get(Gtfsrealtimefeed, new_feed.id) + if created_feed and created_feed.license_url: + assign_license_by_url(created_feed, db_session) + db_session.commit() logging.info("Created new GTFS-RT feed with ID: %s", new_feed.stable_id) refreshed = refresh_materialized_view(db_session, t_feedsearch.name) logging.info("Materialized view %s refreshed: %s", t_feedsearch.name, refreshed) diff --git a/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py b/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py index b5afcdab9..c8d97dcd6 100644 --- a/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py +++ b/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py @@ -36,6 +36,12 @@ def assign_feed_license(feed: Feed, license_match: 
MatchingLicense): feed.stable_id, license_match.license_id, ) + from shared.common.license_utils import _AUTO_VERIFY_THRESHOLD + + is_verified = ( + license_match.match_type == "exact" + or license_match.confidence >= _AUTO_VERIFY_THRESHOLD + ) feed.license_id = license_match.license_id feed.license_notes = license_match.notes feed_license_change: FeedLicenseChange = FeedLicenseChange( @@ -50,6 +56,7 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense): matched_source=license_match.matched_source, notes=license_match.notes, regional_id=license_match.regional_id, + verified=is_verified, ) feed.feed_license_changes.append(feed_license_change) else: diff --git a/liquibase/changelog.xml b/liquibase/changelog.xml index 69d47cc33..6b97e1cbf 100644 --- a/liquibase/changelog.xml +++ b/liquibase/changelog.xml @@ -101,6 +101,8 @@ + +