Skip to content

Commit c82bc1f

Browse files
authored
Merge pull request #944 from juanjemdIos/master
Contributors in parser codemeta and codemeta export. Type agent in authors. Test. Fixes #936
2 parents 066df83 + 46a6fc3 commit c82bc1f

14 files changed

Lines changed: 472 additions & 14 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
3535
- **Contact**: Contact person responsible for maintaining a software component
3636
- **Continuous integration**: Link to continuous integration service(s)
3737
- **Contribution guidelines**: Text indicating how to contribute to this code repository
38-
- **Contributors**: Contributors to a software component
38+
- **Contributors**: Contributors to a software component. Note: contributor metadata is extracted from metadata files (e.g., CodeMeta, CONTRIBUTORS), not from git logs.
3939
- **Creation date**: Date when the repository was created
4040
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
4141
- **Date updated**: Date of last release.

docs/codemetajson.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ These fields are defined in the [Codemeta specification](https://github.com/code
1515
| citation - doi | citation[i].result.doi | referencePublication.identifier |
1616
| code_repository | code_repository[i].result.value | codeRepository |
1717
| continuous_integration | continuous_integration[i].result.value | contIntegration |
18+
| contributors - value | contributors[i].result.value | contributor.givenName + contributor.familyName or just name if organization |
19+
| contributors - name | contributors[i].result.name | contributor.givenName + contributor.familyName or just name if organization |
20+
| contributors - last_name | contributors[i].result.last_name | contributor.familyName |
21+
| contributors - given_name | contributors[i].result.given_name | contributor.givenName |
22+
| contributors - identifier | contributors[i].result.identifier | contributor.@id |
23+
| contributors - email | contributors[i].result.email | contributor.email |
1824
| date_created | date_created[i].result.value | dateCreated |
1925
| date_updated | date_updated[i].result.value | dateModified |
2026
| date_published | date_published[i].result.value | datePublished |

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca
4242
- **Contact**: Contact person responsible for maintaining a software component
4343
- **Continuous integration**: Link to continuous integration service(s)
4444
- **Contribution guidelines**: Text indicating how to contribute to this code repository
45-
- **Contributors**: Contributors to a software component
45+
- **Contributors**: Contributors to a software component. Note: contributor metadata is extracted from metadata files (e.g., CodeMeta, CONTRIBUTORS), not from git logs.
4646
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
4747
- **Creation date**: Date when the repository was created
4848
- **Date updated**: Date of last release.

docs/output.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
7474
- `contact`: Contact person responsible for maintaining a software component.
7575
- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab.
7676
- `contributing_guidelines`: Guidelines indicating how to contribute to a software component.
77-
- `contributors`: Contributors to a software component
77+
- `contributors`: Contributors to a software component. Note: contributor metadata is extracted from metadata files (e.g., CodeMeta, CONTRIBUTORS), not from git logs.
7878
- `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available.
7979
- `date_created`: Date when the software component was created.
8080
- `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction).

poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/somef/export/json_export.py

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,8 +581,10 @@ def format_date(date_string):
581581
if runtimes:
582582
codemeta_output[constants.CAT_CODEMETA_RUNTIMEPLATFORM] = ", ".join(runtimes)
583583

584-
# if "contributors" in repo_data:
585-
# codemeta_output["contributor"] = data_path(["contributors", "excerpt"])
584+
if constants.CAT_CONTRIBUTORS in repo_data:
585+
raw_contributors = repo_data[constants.CAT_CONTRIBUTORS]
586+
codemeta_output[constants.CAT_CODEMETA_CONTRIBUTOR] = parse_contributors(raw_contributors)
587+
586588
# A person is expected, and we extract text at the moment
587589
if descriptions_text:
588590
codemeta_output[constants.CAT_CODEMETA_DESCRIPTION] = descriptions_text
@@ -684,6 +686,74 @@ def map_requirement_type(t):
684686
# default
685687
return constants.SCHEMA_SOFTWARE_APPLICATION
686688

689+
def parse_contributors(raw):
690+
contributors = []
691+
seen = set()
692+
693+
for entry in raw:
694+
result = entry.get("result", {})
695+
rtype = result.get("type")
696+
name = result.get("value")
697+
698+
if not name:
699+
continue
700+
701+
if rtype == "Agent":
702+
703+
if name not in seen:
704+
705+
if re.search(constants.REGEXP_LTD_INC, name, re.IGNORECASE):
706+
type_contributor = "Organization"
707+
else:
708+
type_contributor = "Person"
709+
710+
contributor = {
711+
"@type": type_contributor,
712+
"name": name
713+
}
714+
if "given_name" in result:
715+
contributor["givenName"] = result["given_name"]
716+
717+
if "last_name" in result:
718+
contributor["familyName"] = result["last_name"]
719+
720+
if "email" in result:
721+
contributor["email"] = result["email"]
722+
723+
if "identifier" in result:
724+
contributor["@id"] = result["identifier"]
725+
726+
contributors.append(contributor)
727+
seen.add(name)
728+
729+
if rtype == "File_dump":
730+
for line in result.get("value", "").splitlines():
731+
line = line.strip()
732+
733+
if (not line or line.startswith(("#", "##", "|")) or "[" in line):
734+
continue
735+
736+
# avoid sentences
737+
if len(line.split()) > 4:
738+
continue
739+
740+
if line in seen:
741+
continue
742+
743+
if re.search(constants.REGEXP_LTD_INC, line, re.IGNORECASE):
744+
type_contributor = "Organization"
745+
else:
746+
type_contributor = "Person"
747+
748+
contributors.append({
749+
"@type": type_contributor,
750+
"name": line
751+
})
752+
753+
seen.add(line)
754+
755+
756+
return contributors
687757

688758

689759
"""

src/somef/parser/codemeta_parser.py

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,82 @@ def parse_programming_language(language_data):
234234

235235
return None
236236

237+
def parse_contributors(contributors_data):
238+
"""
239+
Parse contributors from codemeta.json
240+
241+
Parameters
242+
----------
243+
contributors_data: list, dict
244+
Contributor data from codemeta.json
245+
246+
Returns
247+
-------
248+
list
249+
List of contributor dictionaries
250+
"""
251+
contributors_list = []
252+
253+
if isinstance(contributors_data, dict):
254+
contributors_data = [contributors_data]
255+
256+
if not isinstance(contributors_data, list):
257+
return contributors_list
258+
259+
for contributor in contributors_data:
260+
261+
if isinstance(contributor, dict):
262+
263+
given = contributor.get("givenName")
264+
family = contributor.get("familyName")
265+
name = contributor.get("name")
266+
267+
if given and family:
268+
full_name = f"{given} {family}"
269+
elif name:
270+
full_name = name
271+
else:
272+
continue
273+
274+
contributor_info = {
275+
"value": full_name,
276+
"name": full_name,
277+
"type": constants.AGENT
278+
}
279+
280+
if given:
281+
contributor_info["given_name"] = given
282+
283+
if family:
284+
contributor_info["last_name"] = family
285+
286+
if "email" in contributor:
287+
contributor_info["email"] = contributor["email"]
288+
289+
affil = contributor.get("affiliation")
290+
if affil:
291+
if isinstance(affil, dict) and affil.get("name"):
292+
contributor_info["affiliation"] = affil["name"]
293+
elif isinstance(affil, str):
294+
contributor_info["affiliation"] = affil
295+
296+
identifier = contributor.get("identifier") or contributor.get("@id")
297+
if identifier:
298+
contributor_info["identifier"] = identifier
299+
300+
contributors_list.append(contributor_info)
301+
302+
elif isinstance(contributor, str):
303+
name = contributor.strip()
304+
if name:
305+
contributors_list.append({
306+
"value": name,
307+
"type": constants.AGENT
308+
})
309+
310+
return contributors_list
311+
312+
237313
def parse_codemeta_json_file(file_path, metadata_result: Result, source):
238314
"""
239315
@@ -290,6 +366,17 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
290366
source
291367
)
292368

369+
if "contributor" in data:
370+
contributors = parse_contributors(data["contributor"])
371+
for contributor in contributors:
372+
metadata_result.add_result(
373+
constants.CAT_CONTRIBUTORS,
374+
contributor,
375+
1,
376+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
377+
source
378+
)
379+
293380
if "issueTracker" in data:
294381
metadata_result.add_result(
295382
constants.CAT_ISSUE_TRACKER,
@@ -570,7 +657,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
570657
if author_name:
571658
author_info = {
572659
"value": author_name,
573-
"type": constants.STRING
660+
"type": constants.AGENT
574661
}
575662

576663
if "email" in author:
@@ -604,7 +691,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
604691
if author_name:
605692
author_info = {
606693
"value": author_name,
607-
"type": constants.STRING
694+
"type": constants.AGENT
608695
}
609696

610697
if "email" in author:

src/somef/test/test_codemeta_export.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,55 @@ def test_issue_886_apache_code(self):
618618

619619
os.remove(test_data_path + "test_issue_886_apache_code.json")
620620

621+
622+
623+
def test_issue_936_contributors(self):
624+
"""Checks whether contributors are correctly extracted from the repository"""
625+
somef_cli.run_cli(threshold=0.8,
626+
ignore_classifiers=False,
627+
repo_url=None,
628+
local_repo=test_data_repositories + "codemeta_repo",
629+
doc_src=None,
630+
in_file=None,
631+
output=None,
632+
graph_out=None,
633+
graph_format="turtle",
634+
codemeta_out=test_data_path + "test_issue_936_contributors.json",
635+
pretty=True,
636+
missing=False,
637+
readme_only=False)
638+
639+
text_file = open(test_data_path + "test_issue_936_contributors.json", "r")
640+
data = text_file.read()
641+
text_file.close()
642+
json_content = json.loads(data)
643+
644+
contributors = json_content[constants.CAT_CODEMETA_CONTRIBUTOR]
645+
print(contributors)
646+
self.assertTrue(any(
647+
c["name"] == "Abby Cabunoc Mayes" and
648+
c.get("givenName") == "Abby Cabunoc"
649+
for c in contributors
650+
),
651+
"Expected contributor Abby Cabunoc Mayes with givenName='Abby Cabunoc' not found")
652+
653+
self.assertTrue(any(
654+
c["name"] == "Arfon Smith" and
655+
c.get("@id") == "http://orcid.org/0000-0002-3957-2474"
656+
for c in contributors
657+
),
658+
"Expected contributor Arfon Smith with @id='http://orcid.org/0000-0002-3957-2474' not found")
659+
660+
self.assertTrue(any(
661+
c["name"] == "Dan Katz" and
662+
c.get("email") == "dskatz@illinois.edu"
663+
for c in contributors
664+
),
665+
"Expected contributor Dan Katz with email='dskatz@illinois.edu' not found")
666+
667+
os.remove(test_data_path + "test_issue_936_contributors.json")
668+
669+
621670
@classmethod
622671
def tearDownClass(cls):
623672
"""delete temp file JSON just if all the test pass"""

src/somef/test/test_codemeta_parser.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ def load_expected(self, repo_name):
1717
"""Load expected YAML for a given repo."""
1818
yaml_path = EXPECT_DIR / f"{repo_name}.yaml"
1919
if not yaml_path.exists():
20+
if repo_name == "codemeta_repo":
21+
return {}
2022
self.skipTest(f"No expected YAML for repository '{repo_name}'")
2123
with open(yaml_path, "r", encoding="utf-8") as f:
2224
return yaml.safe_load(f)
@@ -62,5 +64,34 @@ def test_parse_multiple_codemeta_files(self):
6264
f"[{repo_folder}] Mismatch in {cat_name}"
6365
)
6466

67+
68+
def test_parse_contributors(self):
69+
codemeta_path = REPOS_DIR / "codemeta_repo" / "codemeta.json"
70+
result = Result()
71+
72+
metadata_result = parse_codemeta_json_file(codemeta_path, result, "https://example.org/codemeta.json")
73+
74+
self.assertIn(constants.CAT_CONTRIBUTORS, metadata_result.results)
75+
contributors = result.results[constants.CAT_CONTRIBUTORS]
76+
77+
self.assertTrue(any(
78+
c["result"]["name"] == "Abby Cabunoc Mayes" and
79+
c["result"].get("given_name") == "Abby Cabunoc"
80+
for c in contributors
81+
))
82+
83+
self.assertTrue(any(
84+
c["result"]["name"] == "Arfon Smith" and
85+
c["result"].get("identifier") == "http://orcid.org/0000-0002-3957-2474"
86+
for c in contributors
87+
))
88+
89+
self.assertTrue(any(
90+
c["result"]["name"] == "Dan Katz" and
91+
c["result"].get("email") == "dskatz@illinois.edu"
92+
for c in contributors
93+
))
94+
95+
6596
if __name__ == "__main__":
6697
unittest.main()

src/somef/test/test_data/expected/aladin-lite.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ CAT_IDENTIFIER: 10.5281/zenodo.7638833 # Passed
1515
CAT_DESCRIPTION: An astronomical HiPS visualizer in the browser. # Passed
1616
CAT_AUTHORS: # Passed
1717
value: Matthieu Baumann
18-
type: String
18+
type: Agent
1919
email: matthieu.baumann@unistra.fr
2020
affiliation: "Universit\u00e9 de Strasbourg, CNRS, Observatoire astronomique de Strasbourg, UMR 7550, F-67000 Strasbourg, France"
2121
identifier: "https://orcid.org/0000-0002-7123-773X"

0 commit comments

Comments
 (0)