From f30b852d5405f9abb894a5420175001b7f387808 Mon Sep 17 00:00:00 2001
From: David Gamez Diaz <1192523+davidgamez@users.noreply.github.com>
Date: Tue, 7 Apr 2026 17:14:09 -0400
Subject: [PATCH 1/2] add auto-assign license calls when a feed is added
---
api/src/scripts/populate_db_gbfs.py | 3 +
api/src/scripts/populate_db_gtfs.py | 4 +
api/src/shared/common/license_utils.py | 94 ++++++++++-
api/tests/utils/test_license_utils.py | 154 +++++++++++++++++-
.../impl/feeds_operations_impl.py | 7 +
.../src/tasks/licenses/license_matcher.py | 7 +
liquibase/changelog.xml | 2 +
7 files changed, 268 insertions(+), 3 deletions(-)
diff --git a/api/src/scripts/populate_db_gbfs.py b/api/src/scripts/populate_db_gbfs.py
index 23b5a3844..785f4e418 100644
--- a/api/src/scripts/populate_db_gbfs.py
+++ b/api/src/scripts/populate_db_gbfs.py
@@ -14,6 +14,7 @@
from scripts.gbfs_utils.fetching import fetch_data, get_data_content
from scripts.gbfs_utils.license import get_license_url
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
+from shared.common.license_utils import assign_license_by_url
from shared.database.database import generate_unique_id, configure_polymorphic_mappers
from shared.database_gen.sqlacodegen_models import Gbfsfeed, Location, Externalid
@@ -126,6 +127,8 @@ def populate_db(self, session, fetch_url=True):
gbfs_feed.locations = [location]
session.flush()
+ if is_new_feed and gbfs_feed.license_url:
+ assign_license_by_url(gbfs_feed, session)
if is_new_feed:
self.added_feeds.append(
{
diff --git a/api/src/scripts/populate_db_gtfs.py b/api/src/scripts/populate_db_gtfs.py
index a4633164c..924298660 100644
--- a/api/src/scripts/populate_db_gtfs.py
+++ b/api/src/scripts/populate_db_gtfs.py
@@ -7,6 +7,7 @@
from scripts.load_dataset_on_create import publish_all
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
+from shared.common.license_utils import assign_license_by_url
from shared.database.database import generate_unique_id
from shared.database_gen.sqlacodegen_models import (
Entitytype,
@@ -212,6 +213,7 @@ def populate_db(self, session: "Session", fetch_url: bool = True):
stable_id = self.get_stable_id(row)
is_official_from_csv = self.get_safe_boolean_value(row, "is_official", None)
feed = self.query_feed_by_stable_id(session, stable_id, data_type)
+ is_new_feed = feed is None
if feed:
self.logger.debug(f"Updating {feed.__class__.__name__}: {stable_id}")
# Always set the deprecated status if found in the csv
@@ -264,6 +266,8 @@ def populate_db(self, session: "Session", fetch_url: bool = True):
session.add(feed)
session.flush()
+ if is_new_feed and feed.license_url:
+ assign_license_by_url(feed, session)
# This need to be done after all feeds are added to the session to avoid FK violation
self.process_feed_references(session)
self.process_redirects(session)
diff --git a/api/src/shared/common/license_utils.py b/api/src/shared/common/license_utils.py
index ec60dd79c..73036d870 100644
--- a/api/src/shared/common/license_utils.py
+++ b/api/src/shared/common/license_utils.py
@@ -7,7 +7,7 @@
from typing import List, Tuple, Optional
from shared.common.db_utils import normalize_url, normalize_url_str
-from shared.database_gen.sqlacodegen_models import License
+from shared.database_gen.sqlacodegen_models import License, FeedLicenseChange
@dataclass
@@ -442,3 +442,95 @@ def resolve_license(
# 6) No match
return []
+
+
+# Confidence threshold at or above which (>=) an auto-assigned license is considered verified
+# without requiring human review. Covers exact, CC resolver, SPDX, and pattern heuristic matches.
+_AUTO_VERIFY_THRESHOLD = 0.95
+
+
+def assign_license_by_url(
+ feed,
+ db_session: Session,
+ *,
+ only_if_single: bool = True,
+) -> Optional[MatchingLicense]:
+ """Resolve feed.license_url and auto-assign the matching license to the feed.
+
+ Behavior:
+ - 0 matches: logs info, returns None (no change).
+ - >1 matches: logs a warning and returns None when only_if_single=True (the feed
+ keeps its current license_id); when False, the first candidate is used as below.
+ - 1 match: assigns feed.license_id / feed.license_notes and appends a
+ FeedLicenseChange audit row, unless the feed already has that license_id.
+ verified is True if match_type == 'exact' or confidence >= _AUTO_VERIFY_THRESHOLD,
+ and False otherwise -- including 'fuzzy' matches that fall below the threshold.
+ Session changes are not committed here; the caller is responsible for that.
+
+ Args:
+ feed: Any Feed ORM instance (Gtfsfeed, Gtfsrealtimefeed, Gbfsfeed).
+ db_session: Active SQLAlchemy session; required for DB-backed resolution.
+ only_if_single: When True (default), skip assignment if multiple candidates
+ are returned, requiring a human to choose.
+
+ Returns:
+ The selected MatchingLicense (also when already assigned), or None if none was selected.
+ """
+ if not feed.license_url:
+ return None
+
+ matches = resolve_license(feed.license_url, db_session=db_session)
+
+ if not matches:
+ logging.info(
+ "No license match found for feed %s (url: %s)",
+ feed.stable_id,
+ feed.license_url,
+ )
+ return None
+
+ if only_if_single and len(matches) > 1:
+ logging.warning(
+ "Skipping auto-assignment for feed %s: %d license candidates found — manual review required",
+ feed.stable_id,
+ len(matches),
+ )
+ return None
+
+ best = matches[0]
+
+ if best.license_id == feed.license_id:
+ logging.info("Feed %s license unchanged: %s", feed.stable_id, best.license_id)
+ return best
+
+ is_verified = best.match_type == "exact" or best.confidence >= _AUTO_VERIFY_THRESHOLD
+
+ logging.info(
+ "Assigning license %s to feed %s (match_type=%s, confidence=%.2f, verified=%s)",
+ best.license_id,
+ feed.stable_id,
+ best.match_type,
+ best.confidence,
+ is_verified,
+ )
+
+ feed.license_id = best.license_id
+ feed.license_notes = best.notes
+ feed.feed_license_changes.append(
+ FeedLicenseChange(
+ feed_id=feed.id,
+ changed_at=None, # set by DB default
+ feed_license_url=feed.license_url,
+ matched_license_id=best.license_id,
+ confidence=best.confidence,
+ match_type=best.match_type,
+ matched_name=best.matched_name,
+ matched_catalog_url=best.matched_catalog_url,
+ matched_source=best.matched_source,
+ notes=best.notes,
+ regional_id=best.regional_id,
+ verified=is_verified,
+ )
+ )
+
+ return best
diff --git a/api/tests/utils/test_license_utils.py b/api/tests/utils/test_license_utils.py
index d6877596a..980009c4f 100644
--- a/api/tests/utils/test_license_utils.py
+++ b/api/tests/utils/test_license_utils.py
@@ -10,6 +10,7 @@
resolve_fuzzy_match,
resolve_license,
find_exact_match_license_url,
+ assign_license_by_url,
MatchingLicense,
)
from shared.database_gen.sqlacodegen_models import License
@@ -246,5 +247,154 @@ def test_matching_license_dataclass(self):
self.assertEqual(ml.confidence, 1.0)
-if __name__ == "__main__":
- unittest.main()
+class TestAssignLicenseByUrl(unittest.TestCase):
+ """Unit tests for assign_license_by_url, using mock feeds/sessions and a patched resolve_license."""
+
+ def _make_match(self, license_id="MIT", match_type="exact", confidence=1.0):
+ return MatchingLicense(
+ license_id=license_id,
+ license_url="http://example.com/license",
+ normalized_url="example.com/license",
+ match_type=match_type,
+ confidence=confidence,
+ matched_name="MIT License",
+ matched_catalog_url="http://example.com/license",
+ matched_source="db.license",
+ )
+
+ def _make_feed(self, license_url="http://example.com/license", license_id=None):
+ feed = MagicMock()
+ feed.stable_id = "test-feed-1"
+ feed.id = "feed-id-1"
+ feed.license_url = license_url
+ feed.license_id = license_id
+ feed.license_notes = None
+ feed.feed_license_changes = []
+ return feed
+
+ # --- No license_url ---
+
+ def test_no_license_url_returns_none(self):
+ feed = self._make_feed(license_url=None)
+ result = assign_license_by_url(feed, MagicMock())
+ self.assertIsNone(result)
+ self.assertIsNone(feed.license_id)
+
+ def test_empty_license_url_returns_none(self):
+ feed = self._make_feed(license_url="")
+ result = assign_license_by_url(feed, MagicMock())
+ self.assertIsNone(result)
+
+ # --- No match ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_no_match_returns_none(self, mock_resolve):
+ mock_resolve.return_value = []
+ feed = self._make_feed()
+ result = assign_license_by_url(feed, MagicMock())
+ self.assertIsNone(result)
+ self.assertIsNone(feed.license_id)
+ self.assertEqual(feed.feed_license_changes, [])
+
+ # --- Multiple matches ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_multiple_matches_skips_assignment(self, mock_resolve):
+ mock_resolve.return_value = [
+ self._make_match("MIT", "fuzzy", 0.96),
+ self._make_match("Apache-2.0", "fuzzy", 0.94),
+ ]
+ feed = self._make_feed()
+ result = assign_license_by_url(feed, MagicMock())
+ self.assertIsNone(result)
+ self.assertIsNone(feed.license_id)
+ self.assertEqual(feed.feed_license_changes, [])
+
+ # --- Single exact or high-confidence (>= 0.95) match — auto-verified ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_exact_match_assigns_and_marks_verified(self, mock_resolve):
+ match = self._make_match("MIT", "exact", 1.0)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed()
+
+ result = assign_license_by_url(feed, MagicMock())
+
+ self.assertEqual(result, match)
+ self.assertEqual(feed.license_id, "MIT")
+ self.assertEqual(len(feed.feed_license_changes), 1)
+ self.assertTrue(feed.feed_license_changes[0].verified)
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_heuristic_high_confidence_assigns_and_marks_verified(self, mock_resolve):
+ match = self._make_match("CC-BY-4.0", "heuristic", 0.99)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed()
+
+ result = assign_license_by_url(feed, MagicMock())
+
+ self.assertEqual(result, match)
+ self.assertEqual(feed.license_id, "CC-BY-4.0")
+ self.assertTrue(feed.feed_license_changes[0].verified)
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_threshold_boundary_095_marks_verified(self, mock_resolve):
+ match = self._make_match("ODbL-1.0", "heuristic", 0.95)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed()
+
+ assign_license_by_url(feed, MagicMock())
+
+ self.assertTrue(feed.feed_license_changes[0].verified)
+
+ # --- Single match below the 0.95 threshold (fuzzy or heuristic) — assigned but unverified ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_fuzzy_match_assigns_but_unverified(self, mock_resolve):
+ match = self._make_match("MIT", "fuzzy", 0.94)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed()
+
+ result = assign_license_by_url(feed, MagicMock())
+
+ self.assertEqual(result, match)
+ self.assertEqual(feed.license_id, "MIT")
+ self.assertFalse(feed.feed_license_changes[0].verified)
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_below_threshold_unverified(self, mock_resolve):
+ match = self._make_match("MIT", "heuristic", 0.80)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed()
+
+ assign_license_by_url(feed, MagicMock())
+
+ self.assertFalse(feed.feed_license_changes[0].verified)
+
+ # --- Duplicate assignment guard ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_same_license_id_no_new_audit_row(self, mock_resolve):
+ match = self._make_match("MIT", "exact", 1.0)
+ mock_resolve.return_value = [match]
+ feed = self._make_feed(license_id="MIT") # already assigned
+
+ result = assign_license_by_url(feed, MagicMock())
+
+ self.assertEqual(result, match)
+ self.assertEqual(feed.license_id, "MIT")
+ self.assertEqual(feed.feed_license_changes, []) # no new audit row
+
+ # --- only_if_single=False assigns the first candidate even when several match ---
+
+ @patch("shared.common.license_utils.resolve_license")
+ def test_only_if_single_false_assigns_best_match(self, mock_resolve):
+ best = self._make_match("MIT", "fuzzy", 0.97)
+ second = self._make_match("Apache-2.0", "fuzzy", 0.94)
+ mock_resolve.return_value = [best, second]
+ feed = self._make_feed()
+
+ result = assign_license_by_url(feed, MagicMock(), only_if_single=False)
+
+ self.assertEqual(result, best)
+ self.assertEqual(feed.license_id, "MIT")
diff --git a/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py b/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py
index 3f471e5f0..3cae855c7 100644
--- a/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py
+++ b/functions-python/operations_api/src/feeds_operations/impl/feeds_operations_impl.py
@@ -56,6 +56,7 @@
Feed,
Gtfsrealtimefeed,
)
+from shared.common.license_utils import assign_license_by_url
from shared.helpers.pub_sub import get_execution_id, trigger_dataset_download
from shared.helpers.query_helper import (
query_feed_by_stable_id,
@@ -397,6 +398,9 @@ async def create_gtfs_feed(
status_code=500,
detail=f"Failed to create GTFS feed with URL: {new_feed.producer_url}",
)
+ if created_feed.license_url:
+ assign_license_by_url(created_feed, db_session)
+ db_session.commit()
try:
trigger_dataset_download(
created_feed,
@@ -438,6 +442,9 @@ async def create_gtfs_rt_feed(
db_session.add(new_feed)
db_session.commit()
created_feed = db_session.get(Gtfsrealtimefeed, new_feed.id)
+ if created_feed and created_feed.license_url:
+ assign_license_by_url(created_feed, db_session)
+ db_session.commit()
logging.info("Created new GTFS-RT feed with ID: %s", new_feed.stable_id)
refreshed = refresh_materialized_view(db_session, t_feedsearch.name)
logging.info("Materialized view %s refreshed: %s", t_feedsearch.name, refreshed)
diff --git a/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py b/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py
index b5afcdab9..c8d97dcd6 100644
--- a/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py
+++ b/functions-python/tasks_executor/src/tasks/licenses/license_matcher.py
@@ -36,6 +36,12 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
feed.stable_id,
license_match.license_id,
)
+ from shared.common.license_utils import _AUTO_VERIFY_THRESHOLD
+
+ is_verified = (
+ license_match.match_type == "exact"
+ or license_match.confidence >= _AUTO_VERIFY_THRESHOLD
+ )
feed.license_id = license_match.license_id
feed.license_notes = license_match.notes
feed_license_change: FeedLicenseChange = FeedLicenseChange(
@@ -50,6 +56,7 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
matched_source=license_match.matched_source,
notes=license_match.notes,
regional_id=license_match.regional_id,
+ verified=is_verified,
)
feed.feed_license_changes.append(feed_license_change)
else:
diff --git a/liquibase/changelog.xml b/liquibase/changelog.xml
index 69d47cc33..6b97e1cbf 100644
--- a/liquibase/changelog.xml
+++ b/liquibase/changelog.xml
@@ -101,6 +101,8 @@
+
+