diff --git a/README.md b/README.md index 68cad3d1..db20ae5c 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,15 @@ We recognize the following properties: - **Forks url**: Links to forks made of the project - **Full name**: Name + owner (owner/name) - **Full title**: If the repository is a short name, we will attempt to extract the longer version of the repository name -- **Funding**: Funding information associated with the project. **Note**: Currently, this information is only extracted from existing `codemeta.json` files within the repository. -- **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected. +- **Funding**: Funding information associated with the project. **Note**: This information is extracted from existing `codemeta.json` files within the repository. When using `-e`, the project data is enriched with OpenAIRE, adding: + - `project_code`: Project code + - `project_title`: Project title + - `project_acronym`: Project acronym + - `grant_id`: Call/grant identifier +- **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected. When using `-e`, the following enrichment identifiers are also added: + - `openalex_id`: OpenAlex ID for the software + - `openaire_id`: URL to the OpenAIRE explore page + - `swhid`: Software Heritage identifier (for Zenodo DOIs) - **Images**: Images used to illustrate the software component - **Installation instructions**: A set of instructions that indicate how to install a target repository - **Invocation**: Execution command(s) needed to run a scientific software component @@ -347,11 +354,14 @@ Options: requests and increase execution time -h, --help Show this message and exit. 
- + + -e, --enrich Enrich metadata with external APIs (OpenAlex, OpenAIRE, Zenodo) + Repoository versions [mutually_exclusive] (see section *Repository versions*t): -b, --branch name branch Branch of the repository to analyze. Overrides the default branch. --tag text Tag of the repository to analyze. Cannot be used together with --branch. + ``` ## Usage example: @@ -389,6 +399,14 @@ This includes identifying dependencies, runtime requirements, and development to SOMEF is designed to work primarily with repositories written in English. Repositories in other languages may not be processed as effectively, and results could be incomplete or less accurate. +### Enrichment with `-e` + +The `-e` (or `--enrich`) flag queries external APIs to complete the extracted metadata: +- **OpenAlex**: adds `openalex_id` to DOIs of publications and software. +- **OpenAIRE**: adds `openaire_id` and enriches funding information (project code, title, acronym, grant id). +- **Zenodo**: adds `swhid` (Software Heritage ID) for Zenodo DOIs. + +**Note:** Enrichment makes additional network requests to external services, which may increase the overall execution time. Use this flag only when you need the extra metadata. ## Repository versions: default behavior, branch and tag diff --git a/src/somef/__main__.py b/src/somef/__main__.py index 0b666830..46f4b457 100644 --- a/src/somef/__main__.py +++ b/src/somef/__main__.py @@ -197,6 +197,14 @@ def configure(auto, base_uri): default=None, help="Tag of the repository to analyze. 
Incompatible with --branch" ) +@click.option( + "--enrich", + "-e", + is_flag=True, + default=False, + help="Enrich metadata with external APIs (OpenAlex, OpenAIRE, Zenodo)" +) + def describe(requirements_v, requirements_all, **kwargs): # import so missing packages get installed when appropriate if requirements_v: diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 371489bb..48fb0430 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -11,7 +11,7 @@ from . import header_analysis, regular_expressions, process_repository, configuration, process_files, \ supervised_classification from .process_results import Result -from .utils import constants, markdown_utils +from .utils import constants, markdown_utils, enrichment from .parser import mardown_parser, create_excerpts from .export.turtle_export import DataGraph from .export import json_export @@ -266,7 +266,8 @@ def run_cli(*, requirements_mode="all", reconcile_authors=False, branch=None, - tag=None + tag=None, + enrich=False ): """Function to run all the required components of the cli for a repository""" # check if it is a valid url @@ -308,6 +309,9 @@ def run_cli(*, repo_data = json_export.unify_results(repo_data.results) + if enrich: + repo_data = enrichment.run_enrichment(repo_data) + if output is not None: output = output.replace(".json","") output = output + "_" + encoded_url + ".json" @@ -349,6 +353,8 @@ def run_cli(*, repo_data = repo_data.get_json() repo_data = json_export.unify_results(repo_data.results) + if enrich: + repo_data = enrichment.run_enrichment(repo_data) if output is not None: json_export.save_json_output(repo_data, output, missing, pretty=pretty) diff --git a/src/somef/test/test_data/expected/runtime_platform_repo.yaml b/src/somef/test/test_data/expected/runtime_platform_repo.yaml index d0f08f14..8deaa449 100644 --- a/src/somef/test/test_data/expected/runtime_platform_repo.yaml +++ b/src/somef/test/test_data/expected/runtime_platform_repo.yaml @@ -6,4 +6,4 @@ 
CAT_PROGRAMMING_LANGUAGES: name: Java value: Java version: "1.8" - type: Language + type: Programming_language \ No newline at end of file diff --git a/src/somef/test/test_enrichment.py b/src/somef/test/test_enrichment.py new file mode 100644 index 00000000..c7d0adfc --- /dev/null +++ b/src/somef/test/test_enrichment.py @@ -0,0 +1,67 @@ +import json +import os +import unittest +from pathlib import Path +from .. import somef_cli +from ..utils import constants + +test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep + + +class TestEnrichment(unittest.TestCase): + + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it requires external APIs") + def test_enrichment_integration(self): + """Tests that --enrich adds openalex_id, openaire_id, swhid, + orcid identifier and funding project properties to the output.""" + + somef_cli.run_cli(threshold=0.8, + repo_url="https://github.com/oeg-upm/rsfc", + output=test_data_path + "test-enrich.json", + enrich=True, + pretty=True) + + with open(test_data_path + "test-enrich.json") as f: + data = json.load(f) + + citations = data.get("citation", []) + self.assertTrue(any("openalex_id" in c["result"] for c in citations)) + self.assertTrue(any("openaire_id" in c["result"] for c in citations)) + + identifiers = data.get("identifier", []) + self.assertTrue(any("openalex_id" in i["result"] for i in identifiers)) + self.assertTrue(any("openaire_id" in i["result"] for i in identifiers)) + self.assertTrue(any("swhid" in i["result"] for i in identifiers)) + + authors = data.get("author", []) + self.assertTrue(any( + "identifier" in a["result"] and "orcid" in a["result"].get("identifier", "").lower() + for a in authors + )) + + fundings = data.get("funding", []) + if fundings: + self.assertTrue(any("project_code" in f["result"] for f in fundings)) + self.assertTrue(any("grant_id" in f["result"] for f in fundings)) + + os.remove(test_data_path + "test-enrich.json") + + + @unittest.skipIf(os.getenv("CI") == 
"true", "Skipped in CI") + def test_enrichment_funding(self): + """Tests funding enrichment with a repo that has codemeta.json with funding.""" + + somef_cli.run_cli(threshold=0.8, + repo_url="https://github.com/codemeta/codemeta", + output=test_data_path + "test-enrich-funding.json", + enrich=True, + pretty=True) + + with open(test_data_path + "test-enrich-funding.json") as f: + data = json.load(f) + + fundings = data.get("funding", []) + self.assertTrue(any("project_code" in f["result"] for f in fundings)) + self.assertTrue(any("project_title" in f["result"] for f in fundings)) + + os.remove(test_data_path + "test-enrich-funding.json") \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index bec49c19..87afb2d3 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -587,4 +587,22 @@ class RepositoryType(Enum): CAT_RUNTIME_PLATFORM, CAT_REQUIREMENTS, CAT_INSTALLATION, -} \ No newline at end of file +} + +# Enrichment +OPENALEX_BASE = "https://api.openalex.org" +OPENAIRE_BASE = "https://api.openaire.eu" +OPENAIRE_EXPLORE = "https://explore.openaire.eu" +OPENAIRE_NAMESPACE = "http://namespace.openaire.eu/oaf" +REGEXP_DOI_IN_URL = r'(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)' +REGEXP_FIND_ZENODO = r'zenodo\.(\d+)' +PROP_OPENALEX_ID = "openalex_id" +PROP_OPENAIRE_ID = "openaire_id" +PROP_SWHID = "swhid" +PROP_PROJECT_CODE = "project_code" +PROP_PROJECT_TITLE = "project_title" +PROP_PROJECT_ACRONYM = "project_acronym" +PROP_GRANT_ID = "grant_id" +PROP_FUNDER = "funder" +PROP_START_DATE = "start_date" +PROP_END_DATE = "end_date" \ No newline at end of file diff --git a/src/somef/utils/enrichment.py b/src/somef/utils/enrichment.py new file mode 100644 index 00000000..c0bf6740 --- /dev/null +++ b/src/somef/utils/enrichment.py @@ -0,0 +1,212 @@ + +import requests +import re +import logging +from ..utils import constants +import xml.etree.ElementTree as ET + +def get_openalex_id(doi): + url = 
f"{constants.OPENALEX_BASE}/works/doi:{doi}" + resp = requests.get(url) + if resp.status_code != 200: + return None + return resp.json().get("id") + +def get_openaire_id(doi) -> dict | None: + url = f"{constants.OPENAIRE_BASE}/search/researchProducts?doi={doi}&format=json" + resp = requests.get(url) + if resp.status_code != 200: + return None + data = resp.json() + results = data.get("response", {}).get("results", {}).get("result", []) + if results: + raw_id = results[0].get("header", {}).get("dri:objIdentifier", {}).get("$") + if raw_id: + return f"{constants.OPENAIRE_EXPLORE}/search/software?orpId={raw_id}" + return None + +def get_zenodo_swhid(doi): + """Get SWHID from a Zenodo DOI""" + + match = re.search(constants.REGEXP_FIND_ZENODO, doi) + if not match: + return None + record_id = match.group(1) + + url = f"https://zenodo.org/api/records/{record_id}" + resp = requests.get(url) + if resp.status_code != 200: + return None + + data = resp.json() + + swh = data.get("swh", {}) + if swh: + return swh.get("id") or swh.get(constants.PROP_SWHID) + + for rel_id in data.get("metadata", {}).get("related_identifiers", []): + if rel_id.get("identifier", "").startswith("swh:"): + return rel_id["identifier"] + + return None + + +def extract_doi(result): + """Extract a DOI from a result dict which may contain a DOI in different fields and formats.""" + + doi = result.get("doi") + if doi: + return doi + + for entry in result.get("identifier", []): + if entry.get("type") == "doi": + return entry.get("value") + + for key in ("url", "value"): + doi_url = re.search(constants.REGEXP_DOI_IN_URL, result.get(key, "")) + if doi_url: + return doi_url.group(1) + + return None + + +def search_openalex_author(name): + url = f"{constants.OPENALEX_BASE}/authors?search={requests.utils.quote(name)}" + resp = requests.get(url) + if resp.status_code != 200: + return None + results = resp.json().get("results", []) + if results: + return results[0].get("orcid") + return None + + +def 
collect_existing_orcids(results): + """ + Collect ORCIDs already present in the data. + Looks for ORCIDs in citation authors (url and identifier fields) and in author/contributor entries (identifier and url fields). + """ + orcid_map = {} + + for citation in results.get(constants.CAT_CITATION, []): + for author in citation["result"].get(constants.PROP_AUTHOR, []): + add_orcid_to_map(orcid_map, author.get(constants.PROP_NAME, ""), author.get(constants.PROP_URL, "")) + add_orcid_to_map(orcid_map, author.get(constants.PROP_NAME, ""), author.get(constants.PROP_IDENTIFIER, "")) + + for category in (constants.CAT_AUTHORS, constants.CAT_CONTRIBUTORS): + for entry in results.get(category, []): + result = entry["result"] + add_orcid_to_map(orcid_map, result.get(constants.PROP_NAME, ""), result.get(constants.PROP_IDENTIFIER, "")) + add_orcid_to_map(orcid_map, result.get(constants.PROP_NAME, ""), result.get(constants.PROP_URL, "")) + return orcid_map + + +def add_orcid_to_map(orcid_map, name, value): + """Add an ORCID to the map if value contains an ORCID and name is not empty.""" + if value and "orcid" in value.lower() and name: + orcid_map[name.lower().strip()] = value + + +def clean_name(name): + """Remove newlines and surrounding whitespace from a name string.""" + return name.replace("\n", "").strip() + + +def has_orcid(result): + """Check if a result already has an ORCID.""" + for key in ("identifier", "url"): + val = result.get(key, "") + if "orcid" in val.lower(): + return True + return False + + +def get_openaire_project(identifier): + """Look up a project in OpenAIRE by grant ID or call identifier.""" + url = f"{constants.OPENAIRE_BASE}/search/projects?keywords={requests.utils.quote(identifier)}" + resp = requests.get(url) + if resp.status_code != 200: + return None + + root = ET.fromstring(resp.text) + ns = constants.OPENAIRE_NAMESPACE + + project = root.find(f".//{{{ns}}}project") + if project is None: + return None + + return { + constants.PROP_PROJECT_CODE: 
project.findtext("code"), + constants.PROP_PROJECT_TITLE: project.findtext("title"), + constants.PROP_PROJECT_ACRONYM: project.findtext("acronym"), + constants.PROP_GRANT_ID: project.findtext("callidentifier"), + constants.PROP_FUNDER: project.findtext(".//funder/shortname"), + constants.PROP_START_DATE: project.findtext("startdate"), + constants.PROP_END_DATE: project.findtext("enddate"), + } + +def run_enrichment(results) -> dict: + + logging.info("Enrichment process started.") + + for citation in results.get(constants.CAT_CITATION, []): + doi = extract_doi(citation["result"]) + if doi: + citation["result"][constants.PROP_OPENALEX_ID] = get_openalex_id(doi) + citation["result"][constants.PROP_OPENAIRE_ID] = get_openaire_id(doi) + + for identifier in results.get(constants.PROP_IDENTIFIER, []): + value = identifier["result"].get("value", "") + m = re.search(constants.REGEXP_DOI_IN_URL, value) + if m: + doi = m.group(0) + identifier["result"][constants.PROP_OPENALEX_ID] = get_openalex_id(doi) + identifier["result"][constants.PROP_OPENAIRE_ID] = get_openaire_id(doi) + if "zenodo" in doi.lower(): + identifier["result"][constants.PROP_SWHID] = get_zenodo_swhid(doi) + + orcid_map = collect_existing_orcids(results) + + for category in (constants.CAT_AUTHORS, constants.CAT_CONTRIBUTORS): + for entry in results.get(category, []): + result = entry["result"] + if has_orcid(result): + continue + name = clean_name(result.get(constants.PROP_NAME) or result.get("value", "")) + if not name: + continue + + orcid = orcid_map.get(name.lower()) + + if not orcid: + orcid = search_openalex_author(name) + + if orcid: + result[constants.PROP_IDENTIFIER] = orcid + + for funding in results.get(constants.PROP_FUNDING, []): + result = funding["result"] + identifier = result.get(constants.PROP_FUNDING) + if identifier: + identifier = identifier.split(";")[0].strip() + else: + funder = result.get(constants.PROP_FUNDER) + if isinstance(funder, dict): + identifier = 
funder.get(constants.PROP_NAME) + else: + identifier = funder + + if identifier: + project = get_openaire_project(identifier) + # logging.info(f"Enrichment found project for funding identifier '{identifier}': {project}") + if project: + if project[constants.PROP_PROJECT_CODE]: + result[constants.PROP_PROJECT_CODE] = project[constants.PROP_PROJECT_CODE] + if project[constants.PROP_PROJECT_TITLE]: + result[constants.PROP_PROJECT_TITLE] = project[constants.PROP_PROJECT_TITLE] + if project[constants.PROP_PROJECT_ACRONYM]: + result[constants.PROP_PROJECT_ACRONYM] = project[constants.PROP_PROJECT_ACRONYM] + if project[constants.PROP_GRANT_ID]: + result[constants.PROP_GRANT_ID] = project[constants.PROP_GRANT_ID] + + return results \ No newline at end of file