|
| 1 | +# This script defines a task to populate the 'license_tag' table in the database from a canonical |
| 2 | +# JSON file. It is designed to be triggered as a background task. |
| 3 | +# |
| 4 | +# The script performs the following steps: |
| 5 | +# 1. Fetches the 'tags.json' file from the MobilityData/licenses-catalog GitHub repository (see TAGS_JSON_URL). |
| 6 | +# 2. The JSON file categorises tags into groups (e.g., 'spdx', 'license', 'domain', 'copyleft'). |
| 7 | +# Each group contains a '_group' metadata entry and individual tag entries. |
| 8 | +# 3. For each group and each tag (skipping the '_group' metadata key) it builds a record with: |
| 9 | +# a. id: composite key of the form "group:tag" (e.g., "spdx:osi-approved") |
| 10 | +# b. group: the group name (e.g., "spdx") |
| 11 | +# c. tag: the tag name (e.g., "osi-approved") |
| 12 | +# d. description: human-readable description of the tag; the tag's 'url' (when present) is stored as well |
| 13 | +# 4. Performs an "upsert" of every group record first, then of every tag record, using SQLAlchemy's `merge` method, |
| 14 | +# with each record's 'id' acting as the primary key. |
| 15 | +# 5. Supports a 'dry_run' mode, which simulates the process and logs intended actions |
| 16 | +# without committing any changes to the database. |
| 17 | +# 6. Includes error handling for network issues and database transactions. |
| 18 | +import logging |
| 19 | + |
| 20 | +import requests |
| 21 | +from shared.database.database import with_db_session |
| 22 | +from shared.database_gen.sqlacodegen_models import LicenseTag, LicenseTagGroup |
| 23 | + |
| 24 | + |
TAGS_JSON_URL = "https://raw.githubusercontent.com/MobilityData/licenses-catalog/main/data/tags.json"

# Key inside each group object that holds group-level metadata instead of a tag.
_GROUP_META_KEY = "_group"


def _parse_tags_json(tags_json):
    """
    Flattens the nested tags.json document into group records and tag records.

    Args:
        tags_json (dict): Mapping of group name -> mapping of entry name -> entry dict.
            The "_group" entry of a group, when present, carries that group's metadata.

    Returns:
        tuple: (groups_data, tags_data) where groups_data maps each group name to a dict
            with keys "id", "short_name", "description", and tags_data is a list of dicts
            with keys "id", "group", "tag", "description", "url".
    """
    groups_data = {}
    tags_data = []

    for group_name, group_entries in tags_json.items():
        # A missing or null "_group" entry is treated as empty metadata.
        group_meta = group_entries.get(_GROUP_META_KEY) or {}
        groups_data[group_name] = {
            "id": group_name,
            "short_name": group_meta.get("short"),
            # Fall back to the group name when no description is provided.
            "description": group_meta.get("description") or group_name,
        }

        # Every entry other than the metadata one is a tag; its id is "group:tag".
        tags_data.extend(
            {
                "id": f"{group_name}:{tag_name}",
                "group": group_name,
                "tag": tag_name,
                "description": tag_entry.get("description"),
                "url": tag_entry.get("url"),
            }
            for tag_name, tag_entry in group_entries.items()
            if tag_name != _GROUP_META_KEY
        )

    return groups_data, tags_data


def _upsert_groups_and_tags(db_session, groups_data, tags_data):
    """
    Merges the parsed group and tag records into the session.

    Args:
        db_session: Database session whose `merge` method is used for each record.
        groups_data (dict): Group records as produced by `_parse_tags_json`.
        tags_data (list): Tag records as produced by `_parse_tags_json`.
    """
    # Upsert groups first so FK from license_tag.group is satisfied
    for group in groups_data.values():
        # Record keys match the model's keyword arguments (id, short_name, description).
        db_session.merge(LicenseTagGroup(**group))

    # Then upsert tags that reference those groups
    for tag_data in tags_data:
        # Record keys match the model's keyword arguments (id, group, tag, description, url).
        db_session.merge(LicenseTag(**tag_data))


@with_db_session
def populate_license_tags(dry_run, db_session):
    """
    Populates license tag groups and license tags in the database from a canonical JSON source.

    Downloads tags.json from TAGS_JSON_URL, flattens it into group and tag records and,
    unless dry_run is set, merges them into the session (groups before tags).
    This function does not commit; NOTE(review): presumably `with_db_session` handles
    the commit - confirm against the decorator.

    Args:
        dry_run (bool): If True, simulates the operation without making changes.
        db_session: Database session for executing queries.

    Returns:
        str: Summary message of what was (or, in dry-run mode, would be) upserted.

    Raises:
        requests.exceptions.RequestException: If the download fails; logged and re-raised.
        Exception: Any other failure; logged, the session is rolled back, then re-raised.
    """
    logging.info("Starting populate_license_tags with dry_run=%s", dry_run)

    try:
        logging.info("Downloading tags from %s", TAGS_JSON_URL)
        response = requests.get(TAGS_JSON_URL, timeout=10)
        response.raise_for_status()

        groups_data, tags_data = _parse_tags_json(response.json())
        logging.info(
            "Loaded %d groups and %d tags from tags.json.",
            len(groups_data),
            len(tags_data),
        )

        if dry_run:
            result = f"Dry run: would insert/update {len(groups_data)} groups and {len(tags_data)} tags."
        else:
            _upsert_groups_and_tags(db_session, groups_data, tags_data)
            result = f"Successfully upserted {len(groups_data)} groups and {len(tags_data)} tags into the database."
        logging.info(result)
        return result

    except requests.exceptions.RequestException as e:
        # Nothing has been written to the session at this point, so no rollback is needed.
        logging.error("Failed to download tags JSON file: %s", e)
        raise
    except Exception as e:
        logging.error("An error occurred while populating license tags: %s", e)
        db_session.rollback()
        raise
0 commit comments