diff --git a/README.md b/README.md index c969bad3..73f323ff 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository -- **Contributors**: Contributors to a software component +- **Contributors**: Contributors to a software component. Note: Contributor metadata is exported from metadata files (e.g., CodeMeta, CONTRIBUTORS, etc.) not from git logs. - **Creation date**: Date when the repository was created - **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available. - **Date updated**: Date of last release. diff --git a/docs/codemetajson.md b/docs/codemetajson.md index 6fb84c89..8a702278 100644 --- a/docs/codemetajson.md +++ b/docs/codemetajson.md @@ -15,6 +15,12 @@ These fields are defined in the [Codemeta specification](https://github.com/code | citation - doi | citation[i].result.doi | referencePublication.identifier | | code_repository | code_repository[i].result.value | codeRepository | | continuous_integration | continuous_integration[i].result.value | contIntegration | +| contributors - value | contributors[i].result.value | contributor.givenName + contributor.familyName or just name if organization | +| contributors - name | contributors[i].result.name | contributor.givenName + contributor.familyName or just name if organization | +| contributors - last_name | contributors[i].result.last_name | contributor.familyName | +| contributors - given_name | contributors[i].result.given_name | contributor.givenName | +| contributors - identifier | contributors[i].result.identifier | contributor.@id | +| contributors - email | contributors[i].result.email | contributor.email | | date_created | 
date_created[i].result.value | dateCreated | | date_updated | date_updated[i].result.value | dateModified | | date_published | date_published[i].result .value | datePublished | diff --git a/docs/index.md b/docs/index.md index bcc1f7df..ac13d86e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,7 +42,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository -- **Contributors**: Contributors to a software component +- **Contributors**: Contributors to a software component. Note: Contributor metadata is exported from metadata files (e.g., CodeMeta, CONTRIBUTORS, etc.) not from git logs. - **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available. - **Creation date**: Date when the repository was created - **Date updated**: Date of last release. diff --git a/docs/output.md b/docs/output.md index 73a71dba..30831499 100644 --- a/docs/output.md +++ b/docs/output.md @@ -74,7 +74,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `contact`: Contact person responsible for maintaining a software component. - `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab. - `contributing guidelines`: Guidelines indicating how to contribute to a software component. -- `contributors`: Contributors to a software component +- `contributors`: Contributors to a software component. Note: Contributor metadata is exported from metadata files (e.g., CodeMeta, CONTRIBUTORS, etc.) not from git logs. - `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available. - `date_created`: Date when the software component was created. 
- `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction). diff --git a/poetry.lock b/poetry.lock index ed414950..be43bff3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1603,14 +1603,14 @@ testing = ["pytest", "setuptools", "twine", "wheel"] [[package]] name = "pygments" -version = "2.19.2" +version = "2.20.0" description = "Pygments is a syntax highlighting package written in Python." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, - {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, + {file = "pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176"}, + {file = "pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f"}, ] [package.extras] diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index 8f1aa7f2..02477d80 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -581,8 +581,10 @@ def format_date(date_string): if runtimes: codemeta_output[constants.CAT_CODEMETA_RUNTIMEPLATFORM] = ", ".join(runtimes) - # if "contributors" in repo_data: - # codemeta_output["contributor"] = data_path(["contributors", "excerpt"]) + if constants.CAT_CONTRIBUTORS in repo_data: + raw_contributors = repo_data[constants.CAT_CONTRIBUTORS] + codemeta_output[constants.CAT_CODEMETA_CONTRIBUTOR] = parse_contributors(raw_contributors) + # A person is expected, and we extract text at the moment if descriptions_text: codemeta_output[constants.CAT_CODEMETA_DESCRIPTION] = descriptions_text @@ -684,6 +686,74 @@ def map_requirement_type(t): # default return constants.SCHEMA_SOFTWARE_APPLICATION +def 
parse_contributors(raw): + contributors = [] + seen = set() + + for entry in raw: + result = entry.get("result", {}) + rtype = result.get("type") + name = result.get("value") + + if not name: + continue + + if rtype == "Agent": + + if name not in seen: + + if re.search(constants.REGEXP_LTD_INC, name, re.IGNORECASE): + type_contributor = "Organization" + else: + type_contributor = "Person" + + contributor = { + "@type": type_contributor, + "name": name + } + if "given_name" in result: + contributor["givenName"] = result["given_name"] + + if "last_name" in result: + contributor["familyName"] = result["last_name"] + + if "email" in result: + contributor["email"] = result["email"] + + if "identifier" in result: + contributor["@id"] = result["identifier"] + + contributors.append(contributor) + seen.add(name) + + if rtype == "File_dump": + for line in result.get("value", "").splitlines(): + line = line.strip() + + if (not line or line.startswith(("#", "##", "|")) or "[" in line): + continue + + # avoid sentences + if len(line.split()) > 4: + continue + + if line in seen: + continue + + if re.search(constants.REGEXP_LTD_INC, line, re.IGNORECASE): + type_contributor = "Organization" + else: + type_contributor = "Person" + + contributors.append({ + "@type": type_contributor, + "name": line + }) + + seen.add(line) + + + return contributors """ diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py index 6c6098a8..86ebad36 100644 --- a/src/somef/parser/codemeta_parser.py +++ b/src/somef/parser/codemeta_parser.py @@ -234,6 +234,82 @@ def parse_programming_language(language_data): return None +def parse_contributors(contributors_data): + """ + Parse contributors from codemeta.json + + Parameters + ---------- + contributors_data: list, dict + Contributor data from codemeta.json + + Returns + ------- + list + List of contributor dictionaries + """ + contributors_list = [] + + if isinstance(contributors_data, dict): + contributors_data = 
[contributors_data] + + if not isinstance(contributors_data, list): + return contributors_list + + for contributor in contributors_data: + + if isinstance(contributor, dict): + + given = contributor.get("givenName") + family = contributor.get("familyName") + name = contributor.get("name") + + if given and family: + full_name = f"{given} {family}" + elif name: + full_name = name + else: + continue + + contributor_info = { + "value": full_name, + "name": full_name, + "type": constants.AGENT + } + + if given: + contributor_info["given_name"] = given + + if family: + contributor_info["last_name"] = family + + if "email" in contributor: + contributor_info["email"] = contributor["email"] + + affil = contributor.get("affiliation") + if affil: + if isinstance(affil, dict) and affil.get("name"): + contributor_info["affiliation"] = affil["name"] + elif isinstance(affil, str): + contributor_info["affiliation"] = affil + + identifier = contributor.get("identifier") or contributor.get("@id") + if identifier: + contributor_info["identifier"] = identifier + + contributors_list.append(contributor_info) + + elif isinstance(contributor, str): + name = contributor.strip() + if name: + contributors_list.append({ + "value": name, + "type": constants.AGENT + }) + + return contributors_list + + def parse_codemeta_json_file(file_path, metadata_result: Result, source): """ @@ -290,6 +366,17 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): source ) + if "contributor" in data: + contributors = parse_contributors(data["contributor"]) + for contributor in contributors: + metadata_result.add_result( + constants.CAT_CONTRIBUTORS, + contributor, + 1, + constants.TECHNIQUE_CODE_CONFIG_PARSER, + source + ) + if "issueTracker" in data: metadata_result.add_result( constants.CAT_ISSUE_TRACKER, @@ -570,7 +657,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): if author_name: author_info = { "value": author_name, - "type": constants.STRING + 
"type": constants.AGENT } if "email" in author: @@ -604,7 +691,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): if author_name: author_info = { "value": author_name, - "type": constants.STRING + "type": constants.AGENT } if "email" in author: diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index 4c6ceb57..84d1a7f6 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -618,6 +618,55 @@ def test_issue_886_apache_code(self): os.remove(test_data_path + "test_issue_886_apache_code.json") + + + def test_issue_936_contributors(self): + """Checks whether contributors are correctly extracted from the repository""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "codemeta_repo", + doc_src=None, + in_file=None, + output=None, + graph_out=None, + graph_format="turtle", + codemeta_out=test_data_path + "test_issue_936_contributors.json", + pretty=True, + missing=False, + readme_only=False) + + text_file = open(test_data_path + "test_issue_936_contributors.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) + + contributors = json_content[constants.CAT_CODEMETA_CONTRIBUTOR] + print(contributors) + self.assertTrue(any( + c["name"] == "Abby Cabunoc Mayes" and + c.get("givenName") == "Abby Cabunoc" + for c in contributors + ), + "Expected contributor Abby Cabunoc Mayes with givenName='Abby Cabunoc' not found") + + self.assertTrue(any( + c["name"] == "Arfon Smith" and + c.get("@id") == "http://orcid.org/0000-0002-3957-2474" + for c in contributors + ), + "Expected contributor Arfon Smith with @id='http://orcid.org/0000-0002-3957-2474' not found") + + self.assertTrue(any( + c["name"] == "Dan Katz" and + c.get("email") == "dskatz@illinois.edu" + for c in contributors + ), + "Expected contributor Dan Katz with email='dskatz@illinois.edu' not found") + + 
os.remove(test_data_path + "test_issue_936_contributors.json") + + @classmethod def tearDownClass(cls): """delete temp file JSON just if all the test pass""" diff --git a/src/somef/test/test_codemeta_parser.py b/src/somef/test/test_codemeta_parser.py index f56b9a85..de36c783 100644 --- a/src/somef/test/test_codemeta_parser.py +++ b/src/somef/test/test_codemeta_parser.py @@ -17,6 +17,8 @@ def load_expected(self, repo_name): """Load expected YAML for a given repo.""" yaml_path = EXPECT_DIR / f"{repo_name}.yaml" if not yaml_path.exists(): + if repo_name == "codemeta_repo": + return {} self.skipTest(f"No expected YAML for repository '{repo_name}'") with open(yaml_path, "r", encoding="utf-8") as f: return yaml.safe_load(f) @@ -62,5 +64,34 @@ def test_parse_multiple_codemeta_files(self): f"[{repo_folder}] Mismatch in {cat_name}" ) + + def test_parse_contributors(self): + codemeta_path = REPOS_DIR / "codemeta_repo" / "codemeta.json" + result = Result() + + metadata_result = parse_codemeta_json_file(codemeta_path, result, "https://example.org/codemeta.json") + + self.assertIn(constants.CAT_CONTRIBUTORS, metadata_result.results) + contributors = result.results[constants.CAT_CONTRIBUTORS] + + self.assertTrue(any( + c["result"]["name"] == "Abby Cabunoc Mayes" and + c["result"].get("given_name") == "Abby Cabunoc" + for c in contributors + )) + + self.assertTrue(any( + c["result"]["name"] == "Arfon Smith" and + c["result"].get("identifier") == "http://orcid.org/0000-0002-3957-2474" + for c in contributors + )) + + self.assertTrue(any( + c["result"]["name"] == "Dan Katz" and + c["result"].get("email") == "dskatz@illinois.edu" + for c in contributors + )) + + if __name__ == "__main__": unittest.main() diff --git a/src/somef/test/test_data/expected/aladin-lite.yaml b/src/somef/test/test_data/expected/aladin-lite.yaml index 2c4b6541..0497706d 100644 --- a/src/somef/test/test_data/expected/aladin-lite.yaml +++ b/src/somef/test/test_data/expected/aladin-lite.yaml @@ -15,7 +15,7 @@ 
CAT_IDENTIFIER: 10.5281/zenodo.7638833 # Passed CAT_DESCRIPTION: An astronomical HiPS visualizer in the browser. # Passed CAT_AUTHORS: # Passed value: Matthieu Baumann - type: String + type: Agent email: matthieu.baumann@unistra.fr affiliation: "Universit\u00e9 de Strasbourg, CNRS, Observatoire astronomique de Strasbourg, UMR 7550, F-67000 Strasbourg, France" identifier: "https://orcid.org/0000-0002-7123-773X" diff --git a/src/somef/test/test_data/expected/gammapy.yaml b/src/somef/test/test_data/expected/gammapy.yaml index 4d89b73d..b3953f03 100644 --- a/src/somef/test/test_data/expected/gammapy.yaml +++ b/src/somef/test/test_data/expected/gammapy.yaml @@ -28,4 +28,4 @@ CAT_REQUIREMENTS: # Passed CAT_AUTHORS: # Passed value: Fabio Acero - type: String \ No newline at end of file + type: Agent \ No newline at end of file diff --git a/src/somef/test/test_data/expected/r3broot2.yaml b/src/somef/test/test_data/expected/r3broot2.yaml index 89538725..e2525b61 100644 --- a/src/somef/test/test_data/expected/r3broot2.yaml +++ b/src/somef/test/test_data/expected/r3broot2.yaml @@ -36,7 +36,7 @@ CAT_REQUIREMENTS: # Passed version: Null CAT_AUTHORS: # Passed value: "Jose Luis Rodr\u00edguez-S\u00e1nchez" - type: String + type: Agent email: j.l.rodriguez.sanchez@udc.es affiliation: "CITENI, Industrial Campus of Ferrol, University of Coruña, 15403 Ferrol, Spain" # Passed identifier: https://orcid.org/0000-0002-4702-5294 diff --git a/src/somef/test/test_data/repositories/codemeta_repo/codemeta.json b/src/somef/test/test_data/repositories/codemeta_repo/codemeta.json new file mode 100644 index 00000000..e6054571 --- /dev/null +++ b/src/somef/test/test_data/repositories/codemeta_repo/codemeta.json @@ -0,0 +1,214 @@ +{ + "@context": "https://w3id.org/codemeta/3.0", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", + 
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "3.1", + "author": [ + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "contributor": [ + { + "@type": "Person", + "givenName": "Abby Cabunoc", + "familyName": "Mayes", + "email": "abbycabs@gmail.com" + }, + { + "@type": "Person", + "givenName": "Arfon", + "familyName": "Smith", + "email": "arfon.smith@gmail.com", + "@id": "http://orcid.org/0000-0002-3957-2474" + }, + { + "@type": "Person", + "givenName": "Peter", + "familyName": "Slaughter", + "email": "slaughter@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0002-2192-403X" + }, + { + "@type": "Person", + "givenName": "Kyle", + "familyName": "Niemeyer", + "email": "Kyle.Niemeyer@oregonstate.edu", + "@id": "http://orcid.org/0000-0003-4425-7097" + }, + { + "@type": "Person", + "givenName": "Yolanda", + "familyName": "Gil", + "email": "GIL@ISI.EDU", + "@id": "http://orcid.org/0000-0001-8465-8341" + }, + { + "@type": "Person", + "givenName": "Krzysztof", + "familyName": "Nowak" + }, + { + "@type": "Person", + "givenName": "Martin", + "familyName": "Fenner", + "@id": "http://orcid.org/0000-0003-1419-2405" + }, + { + "@type": "Person", + "givenName": "Mark", + "familyName": "Hahnel", + "@id": "http://orcid.org/0000-0003-4741-0309" + }, + { + "@type": "Person", + "givenName": "Luke", + "familyName": "Coy", + "email": "luke.coy@rit.edu" + }, + { + "@type": "Person", + "givenName": "Alice", + "familyName": "Allen", + "email": "aallen@ascl.net", + "@id": 
"http://orcid.org/0000-0003-3477-2845" + }, + { + "@type": "Person", + "givenName": "Mercè", + "familyName": "Crosas", + "@id": "http://orcid.org/0000-0003-1304-1939" + }, + { + "@type": "Person", + "givenName": "Ashley", + "familyName": "Sands", + "@id": "http://orcid.org/0000-0001-5636-0433" + }, + { + "@type": "Person", + "givenName": "Neil", + "familyName": "Chue Hong", + "email": "n.chuehong@epcc.ed.ac.uk", + "@id": "http://orcid.org/0000-0002-8876-7606" + }, + { + "@type": "Person", + "givenName": "Patricia", + "familyName": "Cruse", + "@id": "http://orcid.org/0000-0002-9300-5278" + }, + { + "@type": "Person", + "givenName": "Dan", + "familyName": "Katz", + "email": "dskatz@illinois.edu", + "@id": "http://orcid.org/0000-0003-2720-0339" + }, + { + "@type": "Person", + "givenName": "Carole", + "familyName": "Goble", + "email": "carole.goble@manchester.ac.uk", + "@id": "http://orcid.org/0000-0003-1219-2137" + }, + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Stephan", + "familyName": "Druskat", + "email": "mail@sdruskat.net", + "@id": "http://orcid.org/0000-0003-4925-7248" + } + ], + "maintainer": [ + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + }, + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Abby Cabunoc", + "familyName": "Mayes", + "email": "abbycabs@gmail.com" + }, + { + "@type": "Person", + "givenName": "Arfon", + "familyName": "Smith", + "email": "arfon.smith@gmail.com", + "@id": "http://orcid.org/0000-0002-3957-2474" + }, + { + "@type": "Person", + "givenName": "Morane", + "familyName": "Gruenpeter", + "email": 
"morane@softwareheritage.org" + }, + { + "@type": "Person", + "givenName": "Valentin", + "familyName": "Lorentz", + "email": "vlorentz@softwareheritage.org" + }, + { + "@type": "Person", + "givenName": "Thomas", + "familyName": "Morrell", + "email": "tmorrell@library.caltech.edu" + }, + { + "@type": "Person", + "givenName": "Daniel", + "familyName": "Garijo" + } + ], + "continuousIntegration": "https://github.com/codemeta/codemeta/actions", + "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/3.0.zip", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "dateCreated":"2017-06-05", + "datePublished":"2023-07-23", + "programmingLanguage": "JSON-LD" +} \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index a9ae7a9b..abda6638 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -424,6 +424,7 @@ class RepositoryType(Enum): CAT_CODEMETA_AUTHOR = "author" CAT_CODEMETA_BUILDINSTRUCTIONS = "buildInstructions" CAT_CODEMETA_CODEREPOSITORY = "codeRepository" +CAT_CODEMETA_CONTRIBUTOR = "contributor" CAT_CODEMETA_CONTINUOUSINTEGRATION = "continuousIntegration" CAT_CODEMETA_COPYRIGHTHOLDER = "copyrightHolder" CAT_CODEMETA_COPYRIGHTYEAR = "copyrightYear"