diff --git a/README.md b/README.md index 4f8d0cc1..609375b0 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - **Contribution guidelines**: Text indicating how to contribute to this code repository - **Contributors**: Contributors to a software component - **Creation date**: Date when the repository was created +- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available. - **Date updated**: Date of last release. - **Description**: A description of what the software does - **Documentation**: Where to find additional documentation about a software component diff --git a/docs/index.md b/docs/index.md index fdf438dc..bcc1f7df 100644 --- a/docs/index.md +++ b/docs/index.md @@ -43,6 +43,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository - **Contributors**: Contributors to a software component +- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available. - **Creation date**: Date when the repository was created - **Date updated**: Date of last release. - **Description**: A description of what the software does diff --git a/docs/output.md b/docs/output.md index b618be3a..73a71dba 100644 --- a/docs/output.md +++ b/docs/output.md @@ -75,6 +75,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab. - `contributing guidelines`: Guidelines indicating how to contribute to a software component. - `contributors`: Contributors to a software component +- `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available. - `date_created`: Date when the software component was created. - `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction). - `description`: A description of what the software component does. diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index 537e2efa..1b2beddf 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -8,7 +8,6 @@ from ..utils import constants from ..regular_expressions import detect_license_spdx,extract_scholarly_article_natural, extract_scholarly_article_properties - def save_json_output(repo_data, out_path, missing, pretty=False): """ Function that saves the final json Object in the output file @@ -49,22 +48,6 @@ def format_date(date_string): date_object = date_parser.parse(date_string) return date_object.strftime("%Y-%m-%d") - # latest_release = None - # releases = data_path(["releases", "excerpt"]) - # - # if releases is not None and len(releases) > 0: - # latest_release = releases[0] - # latest_pub_date = date_parser.parse(latest_release["datePublished"]) - # for index in range(1, len(releases)): - # release = releases[index] - # pub_date = date_parser.parse(release["datePublished"]) - # - # if pub_date > latest_pub_date: - # latest_release = release - # latest_pub_date = pub_date - - # def release_path(path): - # return DataGraph.resolve_path(latest_release, path) code_repository = None if constants.CAT_CODE_REPOSITORY in repo_data: code_repository = repo_data[constants.CAT_CODE_REPOSITORY][0][constants.PROP_RESULT][constants.PROP_VALUE] @@ -110,12 +93,6 @@ def format_date(date_string): descriptions_text = flat_descriptions - # descriptions_text = [d[constants.PROP_RESULT][constants.PROP_VALUE] for d in selected] - # descriptions.sort(key=lambda x: (x[constants.PROP_CONFIDENCE] + (1 if x[constants.PROP_TECHNIQUE] == constants.GITHUB_API else 0)), - # reverse=True) - # descriptions_text = [x[constants.PROP_RESULT][constants.PROP_VALUE] for x in descriptions] - - codemeta_output = { "@context": "https://w3id.org/codemeta/3.0", "@type": ["SoftwareSourceCode", "SoftwareApplication"] @@ -171,6 +148,13 @@ def format_date(date_string): value = repo_data[constants.CAT_DATE_UPDATED][0][constants.PROP_RESULT][constants.PROP_VALUE] if value: codemeta_output[constants.CAT_CODEMETA_DATEMODIFIED] = format_date(value) + if constants.CAT_COPYRIGHT in repo_data: + holder = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT][constants.PROP_VALUE] + year = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT].get(constants.PROP_YEAR) + if holder: + codemeta_output[constants.CAT_CODEMETA_COPYRIGHTHOLDER] = holder + if year: + codemeta_output[constants.CAT_CODEMETA_COPYRIGHTYEAR] = year if constants.CAT_DOWNLOAD_URL in repo_data: codemeta_output[constants.CAT_CODEMETA_DOWNLOADURL] = repo_data[constants.CAT_DOWNLOAD_URL][0][constants.PROP_RESULT][constants.PROP_VALUE] if constants.CAT_NAME in repo_data: @@ -192,10 +176,6 @@ def format_date(date_string): for item in items: if item not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]: codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(item) - # for key in repo_data[constants.CAT_KEYWORDS]: - # key_value = key[constants.PROP_RESULT][constants.PROP_VALUE] - # if key_value not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]: - # codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(key_value) if constants.CAT_PROGRAMMING_LANGUAGES in repo_data: # Calculate the total code size of all the programming languages @@ -239,7 +219,7 @@ def format_date(date_string): req_type = x[constants.PROP_RESULT].get("type") if req_type: entry["@type"] = map_requirement_type(req_type) - + if version: if isinstance(version, str): entry["version"] = version.strip() @@ -268,10 +248,6 @@ def format_date(date_string): other_requirements.append(value) seen_text.add(normalized) - # if requirements_mode == "v": - # codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements - # else: - # codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements + other_requirements if requirements_mode == "v": codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements diff --git a/src/somef/process_files.py b/src/somef/process_files.py index c97a894b..9edc6baf 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -174,6 +174,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner repo_default_branch, repo_dir, repo_relative_path, filename, dir_path, metadata_result, constants.CAT_LICENSE) + if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper(): metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name, @@ -503,6 +504,38 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul result[constants.PROP_NAME] = license_info['name'] result[constants.PROP_SPDX_ID] = license_info['spdx_id'] + + # Extraction copyright holder from license text + matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE) + + for year, holder in matches_copyright: + holder = holder.strip() if holder else None + year = year.strip() if year else None + + if not holder: + logging.info("Skipping copyright holder with empty name") + continue + + # sometimes we get not desired characters + holder = holder.lstrip(",;: ").strip() + + result_copy = { + constants.PROP_VALUE: holder, + constants.PROP_TYPE: constants.AGENT + } + + if year: + result_copy[constants.PROP_YEAR] = year + + logging.info(f"Extracted copyright holder: {holder.strip()} with year: {year}") + metadata_result.add_result( + constants.CAT_COPYRIGHT, + result_copy, + 1, + constants.TECHNIQUE_FILE_EXPLORATION, + url + ) + if category is constants.CAT_AUTHORS: result = {} authors_list = parse_author_file(file_text) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 2946dd33..fc6b4231 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -472,8 +472,6 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, base = f"https://gitlab.com/{project_path}" if project_path else f"https://gitlab.com/{owner}/{repo_name}" primary_url = f"{base}/-/raw/{default_branch}/README.md" secondary_url = f"{base}/-/raw/master/README.md" - # primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md" - # secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md" elif repo_type is constants.RepositoryType.GITHUB: primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md" secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md" diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index f9b6db16..4bd4efbc 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -36,7 +36,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc @param keep_tmp: path where to store TMP files in case SOMEF is instructed to keep them @param authorization: GitHub authorization token @param ignore_test_folder: Ignore contents of test folders - @param requiriments_mode: flag to indicate what requirements show in codemeta + @param requirements_mode: flag to indicate what requirements show in codemeta @param reconcile_authors: flag to indicate if additional should be extracted from certain files as codeowners. Bear in mind that using this flags consumes more requests to the GitHub API. @param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata. @param tag: tag of the repository to analyze. Cannot be used together with the branch parameter. diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index 4eb95c6e..afaf5b14 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -662,4 +662,61 @@ def test_issue_914(self): requirements = json_content.get(constants.CAT_REQUIREMENTS, []) self.assertEqual(requirements[0].get("result", {}).get("version"), "3.20.3") - os.remove(output_file) \ No newline at end of file + os.remove(output_file) + + + def test_issue_886_bsd3(self): + """Checks whether copyright holder are correctly extracted from BSD 3-Clause license text""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "captum", + doc_src=None, + in_file=None, + output=test_data_path + "test_issue_886_bsd3.json", + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) + + text_file = open(test_data_path + "test_issue_886_bsd3.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) + + copyright_entries = json_content[constants.CAT_COPYRIGHT] + copy = copyright_entries[0]["result"] + assert copy["value"] == "PyTorch team" + assert copy["year"] == "2019" + + os.remove(test_data_path + "test_issue_886_bsd3.json") + + + def test_issue_886_apache(self): + """Checks whether copyright holder are correctly extracted from Apache license text""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "Widoco", + doc_src=None, + in_file=None, + output=test_data_path + "test_issue_886_apache.json", + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) + + text_file = open(test_data_path + "test_issue_886_apache.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) + + copyright_entries = json_content[constants.CAT_COPYRIGHT] + copy = copyright_entries[0]["result"] + assert copy["value"] == "Daniel Garijo, Information Sciences Institute, USC." + assert copy["year"] == "2016" + os.remove(test_data_path + "test_issue_886_apache.json") \ No newline at end of file diff --git a/src/somef/test/test_bower_parser.py b/src/somef/test/test_bower_parser.py index d3d36235..3a52c0bd 100644 --- a/src/somef/test/test_bower_parser.py +++ b/src/somef/test/test_bower_parser.py @@ -65,7 +65,7 @@ def test_parse_bower_json(self): dependency = req_result["result"] if dependency.get("name") == "jquery" and dependency.get("dependency_type") == constants.DEPENDENCY_TYPE_RUNTIME: found_jquery = True - self.assertEqual(dependency.get("dependency_resolver"),"bower","jQuery should come from the bower resolver") + self.assertEqual(dependency.get("dependency_resolver"),"bower","Bower should come from the bower resolver") self.assertTrue(found_jquery, "jQuery dependency not found") diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index 82576799..4c6ceb57 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -5,6 +5,7 @@ from .. import somef_cli from ..parser import pom_xml_parser from ..export import json_export +from ..utils import constants test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep @@ -587,69 +588,35 @@ def test_issue_891(self): os.remove(output_path) - # def test_codemeta_local(self): - - # """ - # codemeta local - # """ - - # pom_xml_parser.processed_pom = False - # output_path = test_data_path + 'test_urban_pfr.json' - # if os.path.exists(output_path): - # os.remove(output_path) - - # somef_cli.run_cli(threshold=0.9, - # ignore_classifiers=False, - # repo_url=None, - # doc_src=None, - # local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg", - # in_file=None, - # output=None, - # graph_out=None, - # graph_format="turtle", - # codemeta_out= output_path, - # pretty=True, - # missing=False, - # readme_only=False) + def test_issue_886_apache_code(self): + """Checks whether copyright holder are correctly extracted from Apache license text in codemeta""" + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "Widoco", + doc_src=None, + in_file=None, + output=None, + graph_out=None, + graph_format="turtle", + codemeta_out=test_data_path + "test_issue_886_apache_code.json", + pretty=True, + missing=False, + readme_only=False) - # with open(output_path, "r") as f: - # json_content = json.load(f) - - # runtime = json_content.get("runtimePlatform", []) - # assert runtime == "Java: 1.8", f"It was expected 'Java: 1.8' but it was '{runtime}'" - # os.remove(output_path) - - - # def test_codemeta_local_2(self): - - # """ - # codemeta local - # """ - - # pom_xml_parser.processed_pom = False + text_file = open(test_data_path + "test_issue_886_apache_code.json", "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) - # output_path = test_data_path + 'test_json_urban_pfr.json' - # if os.path.exists(output_path): - # os.remove(output_path) - - # somef_cli.run_cli(threshold=0.9, - # ignore_classifiers=False, - # repo_url=None, - # doc_src=None, - # local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg", - # in_file=None, - # output=output_path, - # graph_out=None, - # graph_format="turtle", - # codemeta_out= None, - # pretty=True, - # missing=False, - # readme_only=False) - - # with open(output_path, "r") as f: - # json_content = json.load(f) + copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER] + copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR] + + assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC." + assert copyright_year == "2016" + os.remove(test_data_path + "test_issue_886_apache_code.json") @classmethod def tearDownClass(cls): diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index 012908de..fbf64373 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -4,10 +4,11 @@ import json from pathlib import Path +from ..parser import pom_xml_parser from .. import process_repository, process_files, somef_cli from ..utils import constants from ..process_results import Result -from somef.parser import pom_xml_parser + test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 258f4f2f..a9ae7a9b 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -39,14 +39,6 @@ REGEXP_PROJECT_HOMEPAGE = r'\[\!\[Project homepage\]([^\]]+)\]\(([^)]+)\)' # Readthedocs badges' -# REGEXP_READTHEDOCS_BADGES = r"https?://[^\s]*readthedocs\.org/projects/[^\s]*/badge/\?version=[^\s]*(?:.|\n)*?:target:\s*(https?://[^\s]+)" -# REGEXP_READTHEDOCS_BADGES = r"https?://readthedocs\.org/projects/[^/\s]+/badge/\?version=[^)\s]+" -# REGEXP_READTHEDOCS_BADGES = ( -# r"https?://readthedocs\.org/projects/[^/\s]+/badge/\?version=[^)\s]+" -# r"(?:.|\n)*?:target:\s*(https?://[^\s]+)" # rst -# r"|" -# r"\((https?://readthedocs\.org/projects/[^/\s]+/[^)\s]+)\)" # md -# ) REGEXP_READTHEDOCS_RST = ( r"https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s]*" r"[^\n]*?:target:\s*(https?://[^\s\"']+)" @@ -55,16 +47,12 @@ r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)" ) -# REGEXP_READTHEDOCS_HTML = ( -# r"]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>" -# r"(?:(?!)[\s\S])*?" -# r"]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*" -# ) REGEXP_READTHEDOCS_HTML = r""" ]*href=['"](https?://[^'"]+)['"][^>]*> # Capture href (?:\s*|\n*) # breaklines and optional spaces ]*src=['"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'"]+['"] # Badge """ + # For natural language citation REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+' REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b' @@ -99,6 +87,9 @@ REGEXP_ZENODO_DOI = r'https://zenodo\.org/badge/DOI/\d+' REGEXP_ZENODO_JSON_LD = r"]*type=['\"]application/ld\+json['\"][^>]*>(.*?)" +# Detect copyright information in license files. +REGEXP_COPYRIGHT = r"copyright\s*(?:\(c\)|©|\(C\))?\s*\{?(\d{4}(?:-\d{4})?)\}?\s*\{?([^\n}]+)\}?" + LICENSES_DICT = { "Apache License 2.0": {"regex": REGEXP_APACHE, "spdx_id": "Apache-2.0"}, "GNU General Public License v3.0": {"regex": REGEXP_GPL3, "spdx_id": "GPL-3.0"}, @@ -125,6 +116,7 @@ CAT_COC = "code_of_conduct" CAT_CODE_REPOSITORY = "code_repository" CAT_CONTACT = "contact" +CAT_COPYRIGHT = "copyright_holder" CAT_DATE_CREATED = "date_created" CAT_DATE_UPDATED = "date_updated" CAT_DATE_PUBLISHED = "date_published" @@ -196,7 +188,7 @@ # list with all categories all_categories = [CAT_APPLICATION_DOMAIN, CAT_ACKNOWLEDGEMENT, CAT_AUTHORS, CAT_CITATION, CAT_CONTRIBUTORS, CAT_CONTRIBUTING_GUIDELINES, CAT_CONTINUOUS_INTEGRATION, - CAT_COC, CAT_CODE_REPOSITORY, CAT_CONTACT, CAT_DESCRIPTION, CAT_DATE_CREATED, CAT_DATE_UPDATED, + CAT_COC, CAT_CODE_REPOSITORY, CAT_CONTACT, CAT_COPYRIGHT, CAT_DESCRIPTION, CAT_DATE_CREATED, CAT_DATE_UPDATED, CAT_DOCUMENTATION, CAT_DOWNLOAD, CAT_DOWNLOAD_URL, CAT_EXECUTABLE_EXAMPLE, CAT_FAQ, CAT_FORK_COUNTS, CAT_FORKS_URLS, CAT_FULL_NAME, CAT_FULL_TITLE, CAT_HAS_BUILD_FILE, CAT_HAS_SCRIPT_FILE, CAT_IDENTIFIER, CAT_IMAGE, CAT_INSTALLATION, @@ -250,6 +242,7 @@ PROP_URL = "url" PROP_USERNAME = "username" PROP_VERSION = "version" +PROP_YEAR = "year" PROP_ZIPBALL_URL = "zipball_url" PROP_TARBALL_URL = "tarball_url" # Publications @@ -432,6 +425,8 @@ class RepositoryType(Enum): CAT_CODEMETA_BUILDINSTRUCTIONS = "buildInstructions" CAT_CODEMETA_CODEREPOSITORY = "codeRepository" CAT_CODEMETA_CONTINUOUSINTEGRATION = "continuousIntegration" +CAT_CODEMETA_COPYRIGHTHOLDER = "copyrightHolder" +CAT_CODEMETA_COPYRIGHTYEAR = "copyrightYear" CAT_CODEMETA_DATECREATED = "dateCreated" CAT_CODEMETA_DATEMODIFIED = "dateModified" CAT_CODEMETA_DATEPUBLISHED = "datePublished"