Skip to content

Commit df18637

Browse files
authored
Merge pull request #927 from juanjemdIos/master
Copyright holder and year extracted from license. Fixes #886
2 parents 9e33125 + e4cf641 commit df18637

12 files changed

Lines changed: 141 additions & 111 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
3737
- **Contribution guidelines**: Text indicating how to contribute to this code repository
3838
- **Contributors**: Contributors to a software component
3939
- **Creation date**: Date when the repository was created
40+
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
4041
- **Date updated**: Date of last release.
4142
- **Description**: A description of what the software does
4243
- **Documentation**: Where to find additional documentation about a software component

docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca
4343
- **Continuous integration**: Link to continuous integration service(s)
4444
- **Contribution guidelines**: Text indicating how to contribute to this code repository
4545
- **Contributors**: Contributors to a software component
46+
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
4647
- **Creation date**: Date when the repository was created
4748
- **Date updated**: Date of last release.
4849
- **Description**: A description of what the software does

docs/output.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
7575
- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab.
7676
- `contributing guidelines`: Guidelines indicating how to contribute to a software component.
7777
- `contributors`: Contributors to a software component
78+
- `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available.
7879
- `date_created`: Date when the software component was created.
7980
- `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction).
8081
- `description`: A description of what the software component does.

src/somef/export/json_export.py

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from ..utils import constants
99
from ..regular_expressions import detect_license_spdx,extract_scholarly_article_natural, extract_scholarly_article_properties
1010

11-
1211
def save_json_output(repo_data, out_path, missing, pretty=False):
1312
"""
1413
Function that saves the final json Object in the output file
@@ -49,22 +48,6 @@ def format_date(date_string):
4948
date_object = date_parser.parse(date_string)
5049
return date_object.strftime("%Y-%m-%d")
5150

52-
# latest_release = None
53-
# releases = data_path(["releases", "excerpt"])
54-
#
55-
# if releases is not None and len(releases) > 0:
56-
# latest_release = releases[0]
57-
# latest_pub_date = date_parser.parse(latest_release["datePublished"])
58-
# for index in range(1, len(releases)):
59-
# release = releases[index]
60-
# pub_date = date_parser.parse(release["datePublished"])
61-
#
62-
# if pub_date > latest_pub_date:
63-
# latest_release = release
64-
# latest_pub_date = pub_date
65-
66-
# def release_path(path):
67-
# return DataGraph.resolve_path(latest_release, path)
6851
code_repository = None
6952
if constants.CAT_CODE_REPOSITORY in repo_data:
7053
code_repository = repo_data[constants.CAT_CODE_REPOSITORY][0][constants.PROP_RESULT][constants.PROP_VALUE]
@@ -110,12 +93,6 @@ def format_date(date_string):
11093

11194
descriptions_text = flat_descriptions
11295

113-
# descriptions_text = [d[constants.PROP_RESULT][constants.PROP_VALUE] for d in selected]
114-
# descriptions.sort(key=lambda x: (x[constants.PROP_CONFIDENCE] + (1 if x[constants.PROP_TECHNIQUE] == constants.GITHUB_API else 0)),
115-
# reverse=True)
116-
# descriptions_text = [x[constants.PROP_RESULT][constants.PROP_VALUE] for x in descriptions]
117-
118-
11996
codemeta_output = {
12097
"@context": "https://w3id.org/codemeta/3.0",
12198
"@type": ["SoftwareSourceCode", "SoftwareApplication"]
@@ -171,6 +148,13 @@ def format_date(date_string):
171148
value = repo_data[constants.CAT_DATE_UPDATED][0][constants.PROP_RESULT][constants.PROP_VALUE]
172149
if value:
173150
codemeta_output[constants.CAT_CODEMETA_DATEMODIFIED] = format_date(value)
151+
if constants.CAT_COPYRIGHT in repo_data:
152+
holder = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT][constants.PROP_VALUE]
153+
year = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT].get(constants.PROP_YEAR)
154+
if holder:
155+
codemeta_output[constants.CAT_CODEMETA_COPYRIGHTHOLDER] = holder
156+
if year:
157+
codemeta_output[constants.CAT_CODEMETA_COPYRIGHTYEAR] = year
174158
if constants.CAT_DOWNLOAD_URL in repo_data:
175159
codemeta_output[constants.CAT_CODEMETA_DOWNLOADURL] = repo_data[constants.CAT_DOWNLOAD_URL][0][constants.PROP_RESULT][constants.PROP_VALUE]
176160
if constants.CAT_NAME in repo_data:
@@ -192,10 +176,6 @@ def format_date(date_string):
192176
for item in items:
193177
if item not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
194178
codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(item)
195-
# for key in repo_data[constants.CAT_KEYWORDS]:
196-
# key_value = key[constants.PROP_RESULT][constants.PROP_VALUE]
197-
# if key_value not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
198-
# codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(key_value)
199179

200180
if constants.CAT_PROGRAMMING_LANGUAGES in repo_data:
201181
# Calculate the total code size of all the programming languages
@@ -239,7 +219,7 @@ def format_date(date_string):
239219
req_type = x[constants.PROP_RESULT].get("type")
240220
if req_type:
241221
entry["@type"] = map_requirement_type(req_type)
242-
222+
243223
if version:
244224
if isinstance(version, str):
245225
entry["version"] = version.strip()
@@ -268,10 +248,6 @@ def format_date(date_string):
268248
other_requirements.append(value)
269249
seen_text.add(normalized)
270250

271-
# if requirements_mode == "v":
272-
# codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements
273-
# else:
274-
# codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements + other_requirements
275251

276252
if requirements_mode == "v":
277253
codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements

src/somef/process_files.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
174174
repo_default_branch,
175175
repo_dir, repo_relative_path, filename, dir_path,
176176
metadata_result, constants.CAT_LICENSE)
177+
177178

178179
if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper():
179180
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
@@ -503,6 +504,38 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
503504
result[constants.PROP_NAME] = license_info['name']
504505
result[constants.PROP_SPDX_ID] = license_info['spdx_id']
505506

507+
508+
# Extract the copyright holder (and year, if present) from the license text
509+
matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE)
510+
511+
for year, holder in matches_copyright:
512+
holder = holder.strip() if holder else None
513+
year = year.strip() if year else None
514+
515+
if not holder:
516+
logging.info("Skipping copyright holder with empty name")
517+
continue
518+
519+
# sometimes the captured holder starts with unwanted punctuation (",", ";", ":") or spaces
520+
holder = holder.lstrip(",;: ").strip()
521+
522+
result_copy = {
523+
constants.PROP_VALUE: holder,
524+
constants.PROP_TYPE: constants.AGENT
525+
}
526+
527+
if year:
528+
result_copy[constants.PROP_YEAR] = year
529+
530+
logging.info(f"Extracted copyright holder: {holder.strip()} with year: {year}")
531+
metadata_result.add_result(
532+
constants.CAT_COPYRIGHT,
533+
result_copy,
534+
1,
535+
constants.TECHNIQUE_FILE_EXPLORATION,
536+
url
537+
)
538+
506539
if category is constants.CAT_AUTHORS:
507540
result = {}
508541
authors_list = parse_author_file(file_text)

src/somef/process_repository.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,6 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization,
472472
base = f"https://gitlab.com/{project_path}" if project_path else f"https://gitlab.com/{owner}/{repo_name}"
473473
primary_url = f"{base}/-/raw/{default_branch}/README.md"
474474
secondary_url = f"{base}/-/raw/master/README.md"
475-
# primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md"
476-
# secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md"
477475
elif repo_type is constants.RepositoryType.GITHUB:
478476
primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md"
479477
secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md"

src/somef/somef_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
3636
@param keep_tmp: path where to store TMP files in case SOMEF is instructed to keep them
3737
@param authorization: GitHub authorization token
3838
@param ignore_test_folder: Ignore contents of test folders
39-
@param requiriments_mode: flag to indicate what requirements show in codemeta
39+
@param requirements_mode: flag to indicate what requirements show in codemeta
4040
@param reconcile_authors: flag to indicate if additional authors should be extracted from certain files such as codeowners. Bear in mind that using this flag consumes more requests to the GitHub API.
4141
@param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata.
4242
@param tag: tag of the repository to analyze. Cannot be used together with the branch parameter.

src/somef/test/test_JSON_export.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,4 +662,61 @@ def test_issue_914(self):
662662
requirements = json_content.get(constants.CAT_REQUIREMENTS, [])
663663
self.assertEqual(requirements[0].get("result", {}).get("version"), "3.20.3")
664664

665-
os.remove(output_file)
665+
os.remove(output_file)
666+
667+
668+
def test_issue_886_bsd3(self):
669+
"""Checks whether the copyright holder and year are correctly extracted from BSD 3-Clause license text"""
670+
somef_cli.run_cli(threshold=0.8,
671+
ignore_classifiers=False,
672+
repo_url=None,
673+
local_repo=test_data_repositories + "captum",
674+
doc_src=None,
675+
in_file=None,
676+
output=test_data_path + "test_issue_886_bsd3.json",
677+
graph_out=None,
678+
graph_format="turtle",
679+
codemeta_out=None,
680+
pretty=True,
681+
missing=False,
682+
readme_only=False)
683+
684+
text_file = open(test_data_path + "test_issue_886_bsd3.json", "r")
685+
data = text_file.read()
686+
text_file.close()
687+
json_content = json.loads(data)
688+
689+
copyright_entries = json_content[constants.CAT_COPYRIGHT]
690+
copy = copyright_entries[0]["result"]
691+
assert copy["value"] == "PyTorch team"
692+
assert copy["year"] == "2019"
693+
694+
os.remove(test_data_path + "test_issue_886_bsd3.json")
695+
696+
697+
def test_issue_886_apache(self):
698+
"""Checks whether the copyright holder and year are correctly extracted from Apache license text"""
699+
somef_cli.run_cli(threshold=0.8,
700+
ignore_classifiers=False,
701+
repo_url=None,
702+
local_repo=test_data_repositories + "Widoco",
703+
doc_src=None,
704+
in_file=None,
705+
output=test_data_path + "test_issue_886_apache.json",
706+
graph_out=None,
707+
graph_format="turtle",
708+
codemeta_out=None,
709+
pretty=True,
710+
missing=False,
711+
readme_only=False)
712+
713+
text_file = open(test_data_path + "test_issue_886_apache.json", "r")
714+
data = text_file.read()
715+
text_file.close()
716+
json_content = json.loads(data)
717+
718+
copyright_entries = json_content[constants.CAT_COPYRIGHT]
719+
copy = copyright_entries[0]["result"]
720+
assert copy["value"] == "Daniel Garijo, Information Sciences Institute, USC."
721+
assert copy["year"] == "2016"
722+
os.remove(test_data_path + "test_issue_886_apache.json")

src/somef/test/test_bower_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def test_parse_bower_json(self):
6565
dependency = req_result["result"]
6666
if dependency.get("name") == "jquery" and dependency.get("dependency_type") == constants.DEPENDENCY_TYPE_RUNTIME:
6767
found_jquery = True
68-
self.assertEqual(dependency.get("dependency_resolver"),"bower","jQuery should come from the bower resolver")
68+
self.assertEqual(dependency.get("dependency_resolver"),"bower","Bower should come from the bower resolver")
6969

7070
self.assertTrue(found_jquery, "jQuery dependency not found")
7171

src/somef/test/test_codemeta_export.py

Lines changed: 26 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from .. import somef_cli
66
from ..parser import pom_xml_parser
77
from ..export import json_export
8+
from ..utils import constants
89

910
test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
1011
test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
@@ -587,69 +588,35 @@ def test_issue_891(self):
587588

588589
os.remove(output_path)
589590

590-
# def test_codemeta_local(self):
591-
592-
# """
593-
# codemeta local
594-
# """
595-
596-
# pom_xml_parser.processed_pom = False
597591

598-
# output_path = test_data_path + 'test_urban_pfr.json'
599-
# if os.path.exists(output_path):
600-
# os.remove(output_path)
601-
602-
# somef_cli.run_cli(threshold=0.9,
603-
# ignore_classifiers=False,
604-
# repo_url=None,
605-
# doc_src=None,
606-
# local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
607-
# in_file=None,
608-
# output=None,
609-
# graph_out=None,
610-
# graph_format="turtle",
611-
# codemeta_out= output_path,
612-
# pretty=True,
613-
# missing=False,
614-
# readme_only=False)
592+
def test_issue_886_apache_code(self):
593+
"""Checks whether the copyright holder and year are correctly extracted from Apache license text in codemeta"""
594+
somef_cli.run_cli(threshold=0.8,
595+
ignore_classifiers=False,
596+
repo_url=None,
597+
local_repo=test_data_repositories + "Widoco",
598+
doc_src=None,
599+
in_file=None,
600+
output=None,
601+
graph_out=None,
602+
graph_format="turtle",
603+
codemeta_out=test_data_path + "test_issue_886_apache_code.json",
604+
pretty=True,
605+
missing=False,
606+
readme_only=False)
615607

616-
# with open(output_path, "r") as f:
617-
# json_content = json.load(f)
618-
619-
# runtime = json_content.get("runtimePlatform", [])
620-
# assert runtime == "Java: 1.8", f"It was expected 'Java: 1.8' but it was '{runtime}'"
621-
# os.remove(output_path)
622-
623-
624-
# def test_codemeta_local_2(self):
625-
626-
# """
627-
# codemeta local
628-
# """
629-
630-
# pom_xml_parser.processed_pom = False
608+
text_file = open(test_data_path + "test_issue_886_apache_code.json", "r")
609+
data = text_file.read()
610+
text_file.close()
611+
json_content = json.loads(data)
631612

632-
# output_path = test_data_path + 'test_json_urban_pfr.json'
633-
# if os.path.exists(output_path):
634-
# os.remove(output_path)
635-
636-
# somef_cli.run_cli(threshold=0.9,
637-
# ignore_classifiers=False,
638-
# repo_url=None,
639-
# doc_src=None,
640-
# local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
641-
# in_file=None,
642-
# output=output_path,
643-
# graph_out=None,
644-
# graph_format="turtle",
645-
# codemeta_out= None,
646-
# pretty=True,
647-
# missing=False,
648-
# readme_only=False)
649-
650-
# with open(output_path, "r") as f:
651-
# json_content = json.load(f)
613+
copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER]
614+
copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR]
615+
616+
assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC."
617+
assert copyright_year == "2016"
652618

619+
os.remove(test_data_path + "test_issue_886_apache_code.json")
653620

654621
@classmethod
655622
def tearDownClass(cls):

0 commit comments

Comments
 (0)