diff --git a/README.md b/README.md
index 4f8d0cc1..609375b0 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
- **Contribution guidelines**: Text indicating how to contribute to this code repository
- **Contributors**: Contributors to a software component
- **Creation date**: Date when the repository was created
+- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
- **Date updated**: Date of last release.
- **Description**: A description of what the software does
- **Documentation**: Where to find additional documentation about a software component
diff --git a/docs/index.md b/docs/index.md
index fdf438dc..bcc1f7df 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -43,6 +43,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca
- **Continuous integration**: Link to continuous integration service(s)
- **Contribution guidelines**: Text indicating how to contribute to this code repository
- **Contributors**: Contributors to a software component
+- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
- **Creation date**: Date when the repository was created
- **Date updated**: Date of last release.
- **Description**: A description of what the software does
diff --git a/docs/output.md b/docs/output.md
index b618be3a..73a71dba 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -75,6 +75,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab.
- `contributing guidelines`: Guidelines indicating how to contribute to a software component.
- `contributors`: Contributors to a software component
+- `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available.
- `date_created`: Date when the software component was created.
- `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction).
- `description`: A description of what the software component does.
diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py
index 537e2efa..1b2beddf 100644
--- a/src/somef/export/json_export.py
+++ b/src/somef/export/json_export.py
@@ -8,7 +8,6 @@
from ..utils import constants
from ..regular_expressions import detect_license_spdx,extract_scholarly_article_natural, extract_scholarly_article_properties
-
def save_json_output(repo_data, out_path, missing, pretty=False):
"""
Function that saves the final json Object in the output file
@@ -49,22 +48,6 @@ def format_date(date_string):
date_object = date_parser.parse(date_string)
return date_object.strftime("%Y-%m-%d")
- # latest_release = None
- # releases = data_path(["releases", "excerpt"])
- #
- # if releases is not None and len(releases) > 0:
- # latest_release = releases[0]
- # latest_pub_date = date_parser.parse(latest_release["datePublished"])
- # for index in range(1, len(releases)):
- # release = releases[index]
- # pub_date = date_parser.parse(release["datePublished"])
- #
- # if pub_date > latest_pub_date:
- # latest_release = release
- # latest_pub_date = pub_date
-
- # def release_path(path):
- # return DataGraph.resolve_path(latest_release, path)
code_repository = None
if constants.CAT_CODE_REPOSITORY in repo_data:
code_repository = repo_data[constants.CAT_CODE_REPOSITORY][0][constants.PROP_RESULT][constants.PROP_VALUE]
@@ -110,12 +93,6 @@ def format_date(date_string):
descriptions_text = flat_descriptions
- # descriptions_text = [d[constants.PROP_RESULT][constants.PROP_VALUE] for d in selected]
- # descriptions.sort(key=lambda x: (x[constants.PROP_CONFIDENCE] + (1 if x[constants.PROP_TECHNIQUE] == constants.GITHUB_API else 0)),
- # reverse=True)
- # descriptions_text = [x[constants.PROP_RESULT][constants.PROP_VALUE] for x in descriptions]
-
-
codemeta_output = {
"@context": "https://w3id.org/codemeta/3.0",
"@type": ["SoftwareSourceCode", "SoftwareApplication"]
@@ -171,6 +148,13 @@ def format_date(date_string):
value = repo_data[constants.CAT_DATE_UPDATED][0][constants.PROP_RESULT][constants.PROP_VALUE]
if value:
codemeta_output[constants.CAT_CODEMETA_DATEMODIFIED] = format_date(value)
+ if constants.CAT_COPYRIGHT in repo_data:
+ holder = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT][constants.PROP_VALUE]
+ year = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT].get(constants.PROP_YEAR)
+ if holder:
+ codemeta_output[constants.CAT_CODEMETA_COPYRIGHTHOLDER] = holder
+ if year:
+ codemeta_output[constants.CAT_CODEMETA_COPYRIGHTYEAR] = year
if constants.CAT_DOWNLOAD_URL in repo_data:
codemeta_output[constants.CAT_CODEMETA_DOWNLOADURL] = repo_data[constants.CAT_DOWNLOAD_URL][0][constants.PROP_RESULT][constants.PROP_VALUE]
if constants.CAT_NAME in repo_data:
@@ -192,10 +176,6 @@ def format_date(date_string):
for item in items:
if item not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(item)
- # for key in repo_data[constants.CAT_KEYWORDS]:
- # key_value = key[constants.PROP_RESULT][constants.PROP_VALUE]
- # if key_value not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
- # codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(key_value)
if constants.CAT_PROGRAMMING_LANGUAGES in repo_data:
# Calculate the total code size of all the programming languages
@@ -239,7 +219,7 @@ def format_date(date_string):
req_type = x[constants.PROP_RESULT].get("type")
if req_type:
entry["@type"] = map_requirement_type(req_type)
-
+
if version:
if isinstance(version, str):
entry["version"] = version.strip()
@@ -268,10 +248,6 @@ def format_date(date_string):
other_requirements.append(value)
seen_text.add(normalized)
- # if requirements_mode == "v":
- # codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements
- # else:
- # codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements + other_requirements
if requirements_mode == "v":
codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements
diff --git a/src/somef/process_files.py b/src/somef/process_files.py
index c97a894b..9edc6baf 100644
--- a/src/somef/process_files.py
+++ b/src/somef/process_files.py
@@ -174,6 +174,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
repo_default_branch,
repo_dir, repo_relative_path, filename, dir_path,
metadata_result, constants.CAT_LICENSE)
+
if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper():
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
@@ -503,6 +504,38 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
result[constants.PROP_NAME] = license_info['name']
result[constants.PROP_SPDX_ID] = license_info['spdx_id']
+
+ # Extract the copyright holder (and year, if present) from the license text
+ matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE)
+
+ for year, holder in matches_copyright:
+ holder = holder.strip() if holder else None
+ year = year.strip() if year else None
+
+ if not holder:
+ logging.info("Skipping copyright holder with empty name")
+ continue
+
+ # The match sometimes starts with unwanted punctuation (',', ';', ':'); strip it from the holder
+ holder = holder.lstrip(",;: ").strip()
+
+ result_copy = {
+ constants.PROP_VALUE: holder,
+ constants.PROP_TYPE: constants.AGENT
+ }
+
+ if year:
+ result_copy[constants.PROP_YEAR] = year
+
+ logging.info(f"Extracted copyright holder: {holder.strip()} with year: {year}")
+ metadata_result.add_result(
+ constants.CAT_COPYRIGHT,
+ result_copy,
+ 1,
+ constants.TECHNIQUE_FILE_EXPLORATION,
+ url
+ )
+
if category is constants.CAT_AUTHORS:
result = {}
authors_list = parse_author_file(file_text)
diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
index 2946dd33..fc6b4231 100644
--- a/src/somef/process_repository.py
+++ b/src/somef/process_repository.py
@@ -472,8 +472,6 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization,
base = f"https://gitlab.com/{project_path}" if project_path else f"https://gitlab.com/{owner}/{repo_name}"
primary_url = f"{base}/-/raw/{default_branch}/README.md"
secondary_url = f"{base}/-/raw/master/README.md"
- # primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md"
- # secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md"
elif repo_type is constants.RepositoryType.GITHUB:
primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md"
secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md"
diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py
index f9b6db16..4bd4efbc 100644
--- a/src/somef/somef_cli.py
+++ b/src/somef/somef_cli.py
@@ -36,7 +36,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
@param keep_tmp: path where to store TMP files in case SOMEF is instructed to keep them
@param authorization: GitHub authorization token
@param ignore_test_folder: Ignore contents of test folders
- @param requiriments_mode: flag to indicate what requirements show in codemeta
+ @param requirements_mode: flag to indicate what requirements show in codemeta
@param reconcile_authors: flag to indicate if additional should be extracted from certain files as codeowners. Bear in mind that using this flags consumes more requests to the GitHub API.
@param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata.
@param tag: tag of the repository to analyze. Cannot be used together with the branch parameter.
diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py
index 4eb95c6e..afaf5b14 100644
--- a/src/somef/test/test_JSON_export.py
+++ b/src/somef/test/test_JSON_export.py
@@ -662,4 +662,61 @@ def test_issue_914(self):
requirements = json_content.get(constants.CAT_REQUIREMENTS, [])
self.assertEqual(requirements[0].get("result", {}).get("version"), "3.20.3")
- os.remove(output_file)
\ No newline at end of file
+ os.remove(output_file)
+
+
+ def test_issue_886_bsd3(self):
+ """Checks whether the copyright holder and year are correctly extracted from a BSD 3-Clause license text"""
+ somef_cli.run_cli(threshold=0.8,
+ ignore_classifiers=False,
+ repo_url=None,
+ local_repo=test_data_repositories + "captum",
+ doc_src=None,
+ in_file=None,
+ output=test_data_path + "test_issue_886_bsd3.json",
+ graph_out=None,
+ graph_format="turtle",
+ codemeta_out=None,
+ pretty=True,
+ missing=False,
+ readme_only=False)
+
+ text_file = open(test_data_path + "test_issue_886_bsd3.json", "r")
+ data = text_file.read()
+ text_file.close()
+ json_content = json.loads(data)
+
+ copyright_entries = json_content[constants.CAT_COPYRIGHT]
+ copy = copyright_entries[0]["result"]
+ assert copy["value"] == "PyTorch team"
+ assert copy["year"] == "2019"
+
+ os.remove(test_data_path + "test_issue_886_bsd3.json")
+
+
+ def test_issue_886_apache(self):
+ """Checks whether the copyright holder and year are correctly extracted from an Apache license text"""
+ somef_cli.run_cli(threshold=0.8,
+ ignore_classifiers=False,
+ repo_url=None,
+ local_repo=test_data_repositories + "Widoco",
+ doc_src=None,
+ in_file=None,
+ output=test_data_path + "test_issue_886_apache.json",
+ graph_out=None,
+ graph_format="turtle",
+ codemeta_out=None,
+ pretty=True,
+ missing=False,
+ readme_only=False)
+
+ text_file = open(test_data_path + "test_issue_886_apache.json", "r")
+ data = text_file.read()
+ text_file.close()
+ json_content = json.loads(data)
+
+ copyright_entries = json_content[constants.CAT_COPYRIGHT]
+ copy = copyright_entries[0]["result"]
+ assert copy["value"] == "Daniel Garijo, Information Sciences Institute, USC."
+ assert copy["year"] == "2016"
+ os.remove(test_data_path + "test_issue_886_apache.json")
\ No newline at end of file
diff --git a/src/somef/test/test_bower_parser.py b/src/somef/test/test_bower_parser.py
index d3d36235..3a52c0bd 100644
--- a/src/somef/test/test_bower_parser.py
+++ b/src/somef/test/test_bower_parser.py
@@ -65,7 +65,7 @@ def test_parse_bower_json(self):
dependency = req_result["result"]
if dependency.get("name") == "jquery" and dependency.get("dependency_type") == constants.DEPENDENCY_TYPE_RUNTIME:
found_jquery = True
- self.assertEqual(dependency.get("dependency_resolver"),"bower","jQuery should come from the bower resolver")
+ self.assertEqual(dependency.get("dependency_resolver"),"bower","Bower should come from the bower resolver")
self.assertTrue(found_jquery, "jQuery dependency not found")
diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py
index 82576799..4c6ceb57 100644
--- a/src/somef/test/test_codemeta_export.py
+++ b/src/somef/test/test_codemeta_export.py
@@ -5,6 +5,7 @@
from .. import somef_cli
from ..parser import pom_xml_parser
from ..export import json_export
+from ..utils import constants
test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
@@ -587,69 +588,35 @@ def test_issue_891(self):
os.remove(output_path)
- # def test_codemeta_local(self):
-
- # """
- # codemeta local
- # """
-
- # pom_xml_parser.processed_pom = False
- # output_path = test_data_path + 'test_urban_pfr.json'
- # if os.path.exists(output_path):
- # os.remove(output_path)
-
- # somef_cli.run_cli(threshold=0.9,
- # ignore_classifiers=False,
- # repo_url=None,
- # doc_src=None,
- # local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
- # in_file=None,
- # output=None,
- # graph_out=None,
- # graph_format="turtle",
- # codemeta_out= output_path,
- # pretty=True,
- # missing=False,
- # readme_only=False)
+ def test_issue_886_apache_code(self):
+ """Checks whether the copyright holder and year from an Apache license text are correctly exported to codemeta"""
+ somef_cli.run_cli(threshold=0.8,
+ ignore_classifiers=False,
+ repo_url=None,
+ local_repo=test_data_repositories + "Widoco",
+ doc_src=None,
+ in_file=None,
+ output=None,
+ graph_out=None,
+ graph_format="turtle",
+ codemeta_out=test_data_path + "test_issue_886_apache_code.json",
+ pretty=True,
+ missing=False,
+ readme_only=False)
- # with open(output_path, "r") as f:
- # json_content = json.load(f)
-
- # runtime = json_content.get("runtimePlatform", [])
- # assert runtime == "Java: 1.8", f"It was expected 'Java: 1.8' but it was '{runtime}'"
- # os.remove(output_path)
-
-
- # def test_codemeta_local_2(self):
-
- # """
- # codemeta local
- # """
-
- # pom_xml_parser.processed_pom = False
+ text_file = open(test_data_path + "test_issue_886_apache_code.json", "r")
+ data = text_file.read()
+ text_file.close()
+ json_content = json.loads(data)
- # output_path = test_data_path + 'test_json_urban_pfr.json'
- # if os.path.exists(output_path):
- # os.remove(output_path)
-
- # somef_cli.run_cli(threshold=0.9,
- # ignore_classifiers=False,
- # repo_url=None,
- # doc_src=None,
- # local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
- # in_file=None,
- # output=output_path,
- # graph_out=None,
- # graph_format="turtle",
- # codemeta_out= None,
- # pretty=True,
- # missing=False,
- # readme_only=False)
-
- # with open(output_path, "r") as f:
- # json_content = json.load(f)
+ copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER]
+ copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR]
+
+ assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC."
+ assert copyright_year == "2016"
+ os.remove(test_data_path + "test_issue_886_apache_code.json")
@classmethod
def tearDownClass(cls):
diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py
index 012908de..fbf64373 100644
--- a/src/somef/test/test_process_repository.py
+++ b/src/somef/test/test_process_repository.py
@@ -4,10 +4,11 @@
import json
from pathlib import Path
+from ..parser import pom_xml_parser
from .. import process_repository, process_files, somef_cli
from ..utils import constants
from ..process_results import Result
-from somef.parser import pom_xml_parser
+
test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 258f4f2f..a9ae7a9b 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -39,14 +39,6 @@
REGEXP_PROJECT_HOMEPAGE = r'\[\!\[Project homepage\]([^\]]+)\]\(([^)]+)\)'
# Readthedocs badges'
-# REGEXP_READTHEDOCS_BADGES = r"https?://[^\s]*readthedocs\.org/projects/[^\s]*/badge/\?version=[^\s]*(?:.|\n)*?:target:\s*(https?://[^\s]+)"
-# REGEXP_READTHEDOCS_BADGES = r"https?://readthedocs\.org/projects/[^/\s]+/badge/\?version=[^)\s]+"
-# REGEXP_READTHEDOCS_BADGES = (
-# r"https?://readthedocs\.org/projects/[^/\s]+/badge/\?version=[^)\s]+"
-# r"(?:.|\n)*?:target:\s*(https?://[^\s]+)" # rst
-# r"|"
-# r"\((https?://readthedocs\.org/projects/[^/\s]+/[^)\s]+)\)" # md
-# )
REGEXP_READTHEDOCS_RST = (
r"https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s]*"
r"[^\n]*?:target:\s*(https?://[^\s\"']+)"
@@ -55,16 +47,12 @@
r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)"
)
-# REGEXP_READTHEDOCS_HTML = (
-# r"]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>"
-# r"(?:(?!)[\s\S])*?"
-# r"
]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*"
-# )
REGEXP_READTHEDOCS_HTML = r"""
]*href=['"](https?://[^'"]+)['"][^>]*> # Capture href
(?:\s*|\n*) # breaklines and optional spaces
]*src=['"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'"]+['"] # Badge
"""
+
# For natural language citation
REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b'
@@ -99,6 +87,9 @@
REGEXP_ZENODO_DOI = r'https://zenodo\.org/badge/DOI/\d+'
REGEXP_ZENODO_JSON_LD = r""
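+# Detect copyright notices in license files. Group 1: year or year range (e.g. 2016 or 2016-2020); group 2: holder text up to the end of the line or a closing '}'.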
+REGEXP_COPYRIGHT = r"copyright\s*(?:\(c\)|©|\(C\))?\s*\{?(\d{4}(?:-\d{4})?)\}?\s*\{?([^\n}]+)\}?"
+
LICENSES_DICT = {
"Apache License 2.0": {"regex": REGEXP_APACHE, "spdx_id": "Apache-2.0"},
"GNU General Public License v3.0": {"regex": REGEXP_GPL3, "spdx_id": "GPL-3.0"},
@@ -125,6 +116,7 @@
CAT_COC = "code_of_conduct"
CAT_CODE_REPOSITORY = "code_repository"
CAT_CONTACT = "contact"
+CAT_COPYRIGHT = "copyright_holder"
CAT_DATE_CREATED = "date_created"
CAT_DATE_UPDATED = "date_updated"
CAT_DATE_PUBLISHED = "date_published"
@@ -196,7 +188,7 @@
# list with all categories
all_categories = [CAT_APPLICATION_DOMAIN, CAT_ACKNOWLEDGEMENT, CAT_AUTHORS, CAT_CITATION, CAT_CONTRIBUTORS,
CAT_CONTRIBUTING_GUIDELINES, CAT_CONTINUOUS_INTEGRATION,
- CAT_COC, CAT_CODE_REPOSITORY, CAT_CONTACT, CAT_DESCRIPTION, CAT_DATE_CREATED, CAT_DATE_UPDATED,
+ CAT_COC, CAT_CODE_REPOSITORY, CAT_CONTACT, CAT_COPYRIGHT, CAT_DESCRIPTION, CAT_DATE_CREATED, CAT_DATE_UPDATED,
CAT_DOCUMENTATION, CAT_DOWNLOAD, CAT_DOWNLOAD_URL, CAT_EXECUTABLE_EXAMPLE,
CAT_FAQ, CAT_FORK_COUNTS, CAT_FORKS_URLS, CAT_FULL_NAME, CAT_FULL_TITLE, CAT_HAS_BUILD_FILE,
CAT_HAS_SCRIPT_FILE, CAT_IDENTIFIER, CAT_IMAGE, CAT_INSTALLATION,
@@ -250,6 +242,7 @@
PROP_URL = "url"
PROP_USERNAME = "username"
PROP_VERSION = "version"
+PROP_YEAR = "year"
PROP_ZIPBALL_URL = "zipball_url"
PROP_TARBALL_URL = "tarball_url"
# Publications
@@ -432,6 +425,8 @@ class RepositoryType(Enum):
CAT_CODEMETA_BUILDINSTRUCTIONS = "buildInstructions"
CAT_CODEMETA_CODEREPOSITORY = "codeRepository"
CAT_CODEMETA_CONTINUOUSINTEGRATION = "continuousIntegration"
+CAT_CODEMETA_COPYRIGHTHOLDER = "copyrightHolder"
+CAT_CODEMETA_COPYRIGHTYEAR = "copyrightYear"
CAT_CODEMETA_DATECREATED = "dateCreated"
CAT_CODEMETA_DATEMODIFIED = "dateModified"
CAT_CODEMETA_DATEPUBLISHED = "datePublished"