Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,15 @@ We recognize the following properties:
- **Forks url**: Links to forks made of the project
- **Full name**: Name + owner (owner/name)
- **Full title**: If the repository is a short name, we will attempt to extract the longer version of the repository name
- **Funding**: Funding information associated with the project. **Note**: Currently, this information is only extracted from existing `codemeta.json` files within the repository.
- **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected.
- **Funding**: Funding information associated with the project. **Note**: This information is extracted from existing `codemeta.json` files within the repository. When using `-e`, the project data is enriched with OpenAIRE, adding:
- `project_code`: Project code
- `project_title`: Project title
- `project_acronym`: Project acronym
- `grant_id`: Call/grant identifier
- **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected. When using `-e`, the following enrichment identifiers are also added:
- `openalex_id`: OpenAlex ID for the software
- `openaire_id`: URL to the OpenAIRE explore page
- `swhid`: Software Heritage identifier (for Zenodo DOIs)
- **Images**: Images used to illustrate the software component
- **Installation instructions**: A set of instructions that indicate how to install a target repository
- **Invocation**: Execution command(s) needed to run a scientific software component
Expand Down Expand Up @@ -347,11 +354,14 @@ Options:
requests and increase execution time

-h, --help Show this message and exit.


-e, --enrich Enrich metadata with external APIs (OpenAlex, OpenAIRE, Zenodo)

Repository versions [mutually_exclusive] (see section *Repository versions*):
-b, --branch name branch Branch of the repository to analyze. Overrides the default branch.

--tag text Tag of the repository to analyze. Cannot be used together with --branch.

```

## Usage example:
Expand Down Expand Up @@ -389,6 +399,14 @@ This includes identifying dependencies, runtime requirements, and development to
SOMEF is designed to work primarily with repositories written in English.
Repositories in other languages may not be processed as effectively, and results could be incomplete or less accurate.

### Enrichment with `-e`

The `-e` (or `--enrich`) flag queries external APIs to complete the extracted metadata:
- **OpenAlex**: adds `openalex_id` to DOIs of publications and software.
- **OpenAIRE**: adds `openaire_id` and enriches funding information (project code, title, acronym, grant id).
- **Zenodo**: adds `swhid` (Software Heritage ID) for Zenodo DOIs.

**Note:** Enrichment makes additional network requests to external services, which may slow down the overall execution time. Use this flag only when you need the extra metadata.

## Repository versions: default behavior, branch and tag

Expand Down
8 changes: 8 additions & 0 deletions src/somef/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,14 @@ def configure(auto, base_uri):
default=None,
help="Tag of the repository to analyze. Incompatible with --branch"
)
@click.option(
"--enrich",
"-e",
is_flag=True,
default=False,
help="Enrich metadata with external APIs (OpenAlex, OpenAIRE, Zenodo)"
)

def describe(requirements_v, requirements_all, **kwargs):
# import so missing packages get installed when appropriate
if requirements_v:
Expand Down
10 changes: 8 additions & 2 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import header_analysis, regular_expressions, process_repository, configuration, process_files, \
supervised_classification
from .process_results import Result
from .utils import constants, markdown_utils
from .utils import constants, markdown_utils, enrichment
from .parser import mardown_parser, create_excerpts
from .export.turtle_export import DataGraph
from .export import json_export
Expand Down Expand Up @@ -266,7 +266,8 @@ def run_cli(*,
requirements_mode="all",
reconcile_authors=False,
branch=None,
tag=None
tag=None,
enrich=False
):
"""Function to run all the required components of the cli for a repository"""
# check if it is a valid url
Expand Down Expand Up @@ -308,6 +309,9 @@ def run_cli(*,

repo_data = json_export.unify_results(repo_data.results)

if enrich:
repo_data = enrichment.run_enrichment(repo_data)

if output is not None:
output = output.replace(".json","")
output = output + "_" + encoded_url + ".json"
Expand Down Expand Up @@ -349,6 +353,8 @@ def run_cli(*,
repo_data = repo_data.get_json()

repo_data = json_export.unify_results(repo_data.results)
if enrich:
repo_data = enrichment.run_enrichment(repo_data)

if output is not None:
json_export.save_json_output(repo_data, output, missing, pretty=pretty)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ CAT_PROGRAMMING_LANGUAGES:
name: Java
value: Java
version: "1.8"
type: Language
type: Programming_language
67 changes: 67 additions & 0 deletions src/somef/test/test_enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import json
import os
import unittest
from pathlib import Path
from .. import somef_cli
from ..utils import constants

test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep


class TestEnrichment(unittest.TestCase):
    """Integration tests for the ``enrich=True`` path of ``somef_cli.run_cli``.

    Both tests analyse a real repository and are skipped when the ``CI``
    environment variable is set to ``"true"``.
    """

    @staticmethod
    def _remove_quietly(path):
        """Delete *path*, tolerating the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            # run_cli failed before writing the file; nothing to clean up.
            pass

    def _run_enriched(self, repo_url, filename):
        """Run SOMEF with enrichment on *repo_url* and return the parsed JSON.

        The output file is registered for removal *before* the run, so it is
        deleted even when ``run_cli`` raises or a later assertion fails.
        """
        output_path = test_data_path + filename
        self.addCleanup(self._remove_quietly, output_path)

        somef_cli.run_cli(threshold=0.8,
                          repo_url=repo_url,
                          output=output_path,
                          enrich=True,
                          pretty=True)

        with open(output_path) as json_file:
            return json.load(json_file)

    @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it requires external APIs")
    def test_enrichment_integration(self):
        """Tests that --enrich adds openalex_id, openaire_id, swhid,
        orcid identifier and funding project properties to the output."""
        data = self._run_enriched("https://github.com/oeg-upm/rsfc", "test-enrich.json")

        citations = data.get("citation", [])
        self.assertTrue(any("openalex_id" in c["result"] for c in citations))
        self.assertTrue(any("openaire_id" in c["result"] for c in citations))

        identifiers = data.get("identifier", [])
        self.assertTrue(any("openalex_id" in i["result"] for i in identifiers))
        self.assertTrue(any("openaire_id" in i["result"] for i in identifiers))
        self.assertTrue(any("swhid" in i["result"] for i in identifiers))

        authors = data.get("author", [])
        self.assertTrue(any(
            "identifier" in a["result"] and "orcid" in a["result"].get("identifier", "").lower()
            for a in authors
        ))

        # Funding is only checked when the repository reports any.
        fundings = data.get("funding", [])
        if fundings:
            self.assertTrue(any("project_code" in fund["result"] for fund in fundings))
            self.assertTrue(any("grant_id" in fund["result"] for fund in fundings))

    @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI")
    def test_enrichment_funding(self):
        """Tests funding enrichment with a repo that has codemeta.json with funding."""
        data = self._run_enriched("https://github.com/codemeta/codemeta", "test-enrich-funding.json")

        fundings = data.get("funding", [])
        self.assertTrue(any("project_code" in fund["result"] for fund in fundings))
        self.assertTrue(any("project_title" in fund["result"] for fund in fundings))
20 changes: 19 additions & 1 deletion src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,4 +587,22 @@ class RepositoryType(Enum):
CAT_RUNTIME_PLATFORM,
CAT_REQUIREMENTS,
CAT_INSTALLATION,
}
}

# Enrichment: constants for the optional metadata enrichment step
# (external services: OpenAlex, OpenAIRE, Zenodo).

# Base URLs of the external services.
OPENALEX_BASE = "https://api.openalex.org"
OPENAIRE_BASE = "https://api.openaire.eu"
OPENAIRE_EXPLORE = "https://explore.openaire.eu"
# NOTE(review): presumably the XML namespace of OpenAIRE responses -- confirm against the enrichment module.
OPENAIRE_NAMESPACE = "http://namespace.openaire.eu/oaf"
# Captures a DOI: "10.", 4-9 digits, "/", then one or more of the allowed suffix characters.
REGEXP_DOI_IN_URL = r'(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)'
# Captures the digits that follow "zenodo." (e.g. the numeric part of "zenodo.1234567").
REGEXP_FIND_ZENODO = r'zenodo\.(\d+)'
# Property names for identifiers added by enrichment.
PROP_OPENALEX_ID = "openalex_id"
PROP_OPENAIRE_ID = "openaire_id"
PROP_SWHID = "swhid"
# Property names for funding/project information added by enrichment.
PROP_PROJECT_CODE = "project_code"
PROP_PROJECT_TITLE = "project_title"
PROP_PROJECT_ACRONYM = "project_acronym"
PROP_GRANT_ID = "grant_id"
# NOTE(review): the three names below look funding-related as well; their use is not visible here -- confirm.
PROP_FUNDER = "funder"
PROP_START_DATE = "start_date"
PROP_END_DATE = "end_date"
Loading
Loading