Skip to content

Commit fce2e1a

Browse files
Copilot and davidgamez authored
feat: import license tags from licenses-catalog and associate them with licenses (#1614)
Co-authored-by: davidgamez <1192523+davidgamez@users.noreply.github.com>
1 parent d3e0206 commit fce2e1a

File tree

10 files changed

+543
-85
lines changed

10 files changed

+543
-85
lines changed

functions-python/tasks_executor/README.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,6 @@ To update the geolocation files precision:
6868
}
6969
```
7070

71-
To populate license rules:
72-
73-
```json
74-
{
75-
"task": "populate_license_rules",
76-
"payload": {
77-
"dry_run": true
78-
}
79-
}
80-
```
81-
8271
To populate licenses:
8372

8473
```json

functions-python/tasks_executor/src/main.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,12 @@
4747
)
4848
from tasks.data_import.jbda.import_jbda_feeds import import_jbda_handler
4949

50-
from tasks.licenses.populate_license_rules import (
51-
populate_license_rules_handler,
52-
)
5350

5451
from tasks.licenses.populate_licenses import (
5552
populate_licenses_handler,
5653
)
5754

55+
5856
init_logger()
5957
LIST_COMMAND: Final[str] = "list"
6058
tasks = {
@@ -97,12 +95,9 @@
9795
"description": "Imports JBDA data into the system.",
9896
"handler": import_jbda_handler,
9997
},
100-
"populate_license_rules": {
101-
"description": "Populates license rules in the database from a predefined JSON source.",
102-
"handler": populate_license_rules_handler,
103-
},
10498
"populate_licenses": {
105-
"description": "Populates licenses and license-rules in the database from a predefined JSON source.",
99+
"description": "Populates licenses, license-rules and license-tags "
100+
"in the database from a predefined JSON source.",
106101
"handler": populate_licenses_handler,
107102
},
108103
"match_licenses": {

functions-python/tasks_executor/src/tasks/licenses/populate_license_rules.py

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -26,28 +26,16 @@
2626
)
2727

2828

29-
def populate_license_rules_handler(payload):
30-
"""
31-
Handler for populating license rules.
32-
33-
Args:
34-
payload (dict): Incoming payload data.
35-
36-
"""
37-
(dry_run) = get_parameters(payload)
38-
return populate_license_rules_task(dry_run)
39-
40-
4129
@with_db_session
42-
def populate_license_rules_task(dry_run, db_session):
30+
def populate_license_rules(dry_run, db_session):
4331
"""
4432
Populates license rules in the database. This function is triggered by a Cloud Task.
4533
4634
Args:
4735
dry_run (bool): If True, the function will simulate the operation without making changes.
4836
db_session: Database session for executing queries.
4937
"""
50-
logging.info("Starting populate_license_rules_task with dry_run=%s", dry_run)
38+
logging.info("Starting populate_license_rules with dry_run=%s", dry_run)
5139

5240
try:
5341
logging.info("Downloading rules from %s", RULES_JSON_URL)
@@ -72,9 +60,10 @@ def populate_license_rules_task(dry_run, db_session):
7260
logging.info(
7361
"Loaded %d rules from %d categories.", len(rules_data), len(rules_json)
7462
)
75-
63+
result = ""
7664
if dry_run:
77-
logging.info("Dry run: would insert/update %d rules.", len(rules_data))
65+
result = f"Dry run: would insert/update {len(rules_data)} rules."
66+
logging.info(result)
7867
else:
7968
for rule_data in rules_data:
8069
rule_object = Rule(
@@ -85,9 +74,9 @@ def populate_license_rules_task(dry_run, db_session):
8574
)
8675
db_session.merge(rule_object)
8776

88-
logging.info(
89-
"Successfully upserted %d rules into the database.", len(rules_data)
90-
)
77+
result = f"Successfully upserted {len(rules_data)} rules into the database."
78+
logging.info(result)
79+
return result
9180

9281
except requests.exceptions.RequestException as e:
9382
logging.error("Failed to download rules JSON file: %s", e)
@@ -96,16 +85,3 @@ def populate_license_rules_task(dry_run, db_session):
9685
logging.error("An error occurred while populating license rules: %s", e)
9786
db_session.rollback()
9887
raise
99-
100-
101-
def get_parameters(payload):
102-
"""
103-
Get parameters from the payload and environment variables.
104-
105-
Args:
106-
payload (dict): dictionary containing the payload data.
107-
Returns:
108-
tuple: (dry_run, after_date)
109-
"""
110-
dry_run = payload.get("dry_run", False)
111-
return dry_run
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# This script defines a task to populate the 'license_tag' table in the database from a canonical
2+
# JSON file. It is designed to be triggered as a background task.
3+
#
4+
# The script performs the following steps:
5+
# 1. Fetches the 'tags.json' file from the MobilityData/licenses-catalog GitHub repository.
6+
# 2. The JSON file categorises tags into groups (e.g., 'spdx', 'license', 'domain', 'copyleft').
7+
# Each group contains a '_group' metadata entry and individual tag entries.
8+
# 3. For each group and each tag (skipping the '_group' metadata key) it builds a record with:
9+
# a. id: composite key of the form "group:tag" (e.g., "spdx:osi-approved")
10+
# b. group: the group name (e.g., "spdx")
11+
# c. tag: the tag name (e.g., "osi-approved")
12+
# d. description: human-readable description of the tag (the entry's 'url', when present, is stored too)
13+
# 4. Upserts each group first, then each tag record, using SQLAlchemy's `merge` method,
14+
# with the tag's composite 'id' acting as the primary key.
15+
# 5. Supports a 'dry_run' mode, which simulates the process and logs intended actions
16+
# without committing any changes to the database.
17+
# 6. Includes error handling for network issues and database transactions.
18+
import logging
19+
20+
import requests
21+
from shared.database.database import with_db_session
22+
from shared.database_gen.sqlacodegen_models import LicenseTag, LicenseTagGroup
23+
24+
25+
TAGS_JSON_URL = "https://raw.githubusercontent.com/MobilityData/licenses-catalog/main/data/tags.json"
26+
27+
28+
@with_db_session
29+
def populate_license_tags(dry_run, db_session):
30+
"""
31+
Populates license tags in the database from a canonical JSON source.
32+
33+
Args:
34+
dry_run (bool): If True, simulates the operation without making changes.
35+
db_session: Database session for executing queries.
36+
"""
37+
logging.info("Starting populate_license_tags with dry_run=%s", dry_run)
38+
39+
try:
40+
logging.info("Downloading tags from %s", TAGS_JSON_URL)
41+
response = requests.get(TAGS_JSON_URL, timeout=10)
42+
response.raise_for_status()
43+
tags_json = response.json()
44+
45+
tags_data = []
46+
groups_data = {}
47+
48+
for group_name, group_entries in tags_json.items():
49+
group_meta = group_entries.get("_group", {}) or {}
50+
groups_data[group_name] = {
51+
"id": group_name,
52+
"short_name": group_meta.get("short"),
53+
"description": group_meta.get("description") or group_name,
54+
}
55+
56+
for tag_name, tag_entry in group_entries.items():
57+
if tag_name == "_group":
58+
# Skip the group-level metadata entry
59+
continue
60+
tag_id = f"{group_name}:{tag_name}"
61+
tags_data.append(
62+
{
63+
"id": tag_id,
64+
"group": group_name,
65+
"tag": tag_name,
66+
"description": tag_entry.get("description"),
67+
"url": tag_entry.get("url"),
68+
}
69+
)
70+
71+
logging.info(
72+
"Loaded %d groups and %d tags from tags.json.",
73+
len(groups_data),
74+
len(tags_data),
75+
)
76+
77+
result = ""
78+
if dry_run:
79+
result = f"Dry run: would insert/update {len(groups_data)} groups and {len(tags_data)} tags."
80+
logging.info(result)
81+
else:
82+
# Upsert groups first so FK from license_tag.group is satisfied
83+
for group in groups_data.values():
84+
group_object = LicenseTagGroup(
85+
id=group["id"],
86+
short_name=group["short_name"],
87+
description=group["description"],
88+
)
89+
db_session.merge(group_object)
90+
91+
# Then upsert tags that reference those groups
92+
for tag_data in tags_data:
93+
tag_object = LicenseTag(
94+
id=tag_data["id"],
95+
group=tag_data["group"],
96+
tag=tag_data["tag"],
97+
description=tag_data["description"],
98+
url=tag_data["url"],
99+
)
100+
db_session.merge(tag_object)
101+
result = f"Successfully upserted {len(groups_data)} groups and {len(tags_data)} tags into the database."
102+
logging.info(result)
103+
return result
104+
105+
except requests.exceptions.RequestException as e:
106+
logging.error("Failed to download tags JSON file: %s", e)
107+
raise
108+
except Exception as e:
109+
logging.error("An error occurred while populating license tags: %s", e)
110+
db_session.rollback()
111+
raise

functions-python/tasks_executor/src/tasks/licenses/populate_licenses.py

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
# This script defines a task to populate the 'licenses' table and the 'license_rules'
2-
# association table in the database. It is designed to be triggered as a background task.
1+
# This script defines a task to populate the 'licenses' table, the 'license_rules'
2+
# association table, and the 'license_license_tags' association table in the database.
3+
# It is designed to be triggered as a background task.
34
#
45
# The script performs the following steps:
56
# 1. Fetches a list of license definition files from the MobilityData/licenses-catalog GitHub repository
@@ -15,6 +16,9 @@
1516
# e. Associates the found rules with the license. The SQLAlchemy ORM automatically
1617
# manages the creation of records in the 'license_rules' join table to establish
1718
# the many-to-many relationship.
19+
# f. Extracts the associated tag IDs from the 'tags' list at the top level of the JSON.
20+
# g. Queries the 'license_tag' table to find the corresponding LicenseTag objects.
21+
# h. Associates the found tags with the license via the 'license_license_tags' join table.
1822
# 4. Supports a 'dry_run' mode, which simulates the process and logs intended
1923
# actions without committing any changes to the database.
2024
# 5. Includes error handling for network issues and database transactions.
@@ -23,22 +27,33 @@
2327

2428
import requests
2529
from shared.database.database import with_db_session
26-
from shared.database_gen.sqlacodegen_models import License, Rule
30+
from shared.database_gen.sqlacodegen_models import License, LicenseTag, Rule
31+
from .populate_license_rules import populate_license_rules
32+
from .populate_license_tags import populate_license_tags
2733

2834
LICENSES_API_URL = (
29-
"https://api.github.com/repos/MobilityData/licenses-aas/contents/data/licenses"
35+
"https://api.github.com/repos/MobilityData/licenses-catalog/contents/data/licenses"
3036
)
3137

3238

3339
def populate_licenses_handler(payload):
3440
"""
35-
Handler for populating licenses.
41+
Handler function for the populate licenses task.
42+
This function first runs the license rules and tags population tasks to ensure that the necessary data is available
43+
before populating the licenses.
3644
3745
Args:
3846
payload (dict): Incoming payload data.
3947
"""
4048
dry_run = get_parameters(payload)
41-
return populate_licenses_task(dry_run)
49+
rules_result = populate_license_rules(dry_run)
50+
tags_result = populate_license_tags(dry_run)
51+
license_result = populate_licenses_task(dry_run)
52+
return {
53+
"rules": rules_result,
54+
"tags": tags_result,
55+
"licenses": license_result,
56+
}
4257

4358

4459
@with_db_session
@@ -69,7 +84,8 @@ def populate_licenses_task(dry_run, db_session):
6984
logging.info("Loaded %d licenses.", len(licenses_data))
7085

7186
if dry_run:
72-
logging.info("Dry run: would process %d licenses.", len(licenses_data))
87+
result = f"Dry run: would process {len(licenses_data)} licenses."
88+
logging.info(result)
7389
else:
7490
for license_data in licenses_data:
7591
spdx_data = license_data.get("spdx")
@@ -132,14 +148,33 @@ def populate_licenses_task(dry_run, db_session):
132148
len(rules),
133149
len(all_rule_names),
134150
)
151+
152+
# Clear existing tags and assign updated ones
153+
license_object.tags = []
154+
155+
tag_ids = license_data.get("tags", [])
156+
if tag_ids:
157+
tags = (
158+
db_session.query(LicenseTag)
159+
.filter(LicenseTag.id.in_(tag_ids))
160+
.all()
161+
)
162+
license_object.tags.extend(tags)
163+
if len(tags) != len(tag_ids):
164+
logging.warning(
165+
"License '%s': Found %d of %d tags in the database.",
166+
license_id,
167+
len(tags),
168+
len(tag_ids),
169+
)
170+
135171
# Merge the license object into the session. This handles updating existing licenses (upsert),
136-
# including their rule associations.
172+
# including their rule and tag associations.
137173
if not is_new:
138174
db_session.merge(license_object)
139-
140-
logging.info(
141-
"Successfully upserted licenses into the database.",
142-
)
175+
result = "Successfully upserted licenses into the database."
176+
logging.info(result)
177+
return result
143178

144179
except requests.exceptions.RequestException as e:
145180
logging.error("Failed to download licenses JSON file: %s", e)

functions-python/tasks_executor/tests/tasks/populate_licenses_and_rules/test_populate_license_rules.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from sqlalchemy.exc import SQLAlchemyError
55

66
from tasks.licenses.populate_license_rules import (
7-
populate_license_rules_task,
7+
populate_license_rules,
88
RULES_JSON_URL,
99
)
1010
from shared.database_gen.sqlacodegen_models import Rule
@@ -43,7 +43,7 @@ def test_populate_rules_success(self, mock_requests_get):
4343
mock_db_session = MagicMock()
4444

4545
# Act
46-
populate_license_rules_task(dry_run=False, db_session=mock_db_session)
46+
populate_license_rules(dry_run=False, db_session=mock_db_session)
4747

4848
# Assert
4949
mock_requests_get.assert_called_once_with(RULES_JSON_URL, timeout=10)
@@ -78,7 +78,7 @@ def test_populate_rules_dry_run(self, mock_requests_get):
7878
mock_db_session = MagicMock()
7979

8080
# Act
81-
populate_license_rules_task(dry_run=True, db_session=mock_db_session)
81+
populate_license_rules(dry_run=True, db_session=mock_db_session)
8282

8383
# Assert
8484
mock_requests_get.assert_called_once_with(RULES_JSON_URL, timeout=10)
@@ -96,7 +96,7 @@ def test_request_exception_handling(self, mock_requests_get):
9696

9797
# Act & Assert
9898
with self.assertRaises(requests.exceptions.RequestException):
99-
populate_license_rules_task(dry_run=False, db_session=mock_db_session)
99+
populate_license_rules(dry_run=False, db_session=mock_db_session)
100100

101101
mock_db_session.merge.assert_not_called()
102102
mock_db_session.rollback.assert_not_called()
@@ -115,7 +115,7 @@ def test_database_exception_handling(self, mock_requests_get):
115115

116116
# Act & Assert
117117
with self.assertRaises(SQLAlchemyError):
118-
populate_license_rules_task(dry_run=False, db_session=mock_db_session)
118+
populate_license_rules(dry_run=False, db_session=mock_db_session)
119119

120120
self.assertTrue(mock_db_session.merge.called)
121121
mock_db_session.rollback.assert_called_once()

0 commit comments

Comments
 (0)