Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
- **Contribution guidelines**: Text indicating how to contribute to this code repository
- **Contributors**: Contributors to a software component
- **Creation date**: Date when the repository was created
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
- **Date updated**: Date of last release.
- **Description**: A description of what the software does
- **Documentation**: Where to find additional documentation about a software component
Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca
- **Continuous integration**: Link to continuous integration service(s)
- **Contribution guidelines**: Text indicating how to contribute to this code repository
- **Contributors**: Contributors to a software component
- **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
- **Creation date**: Date when the repository was created
- **Date updated**: Date of last release.
- **Description**: A description of what the software does
Expand Down
1 change: 1 addition & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab.
- `contributing_guidelines`: Guidelines indicating how to contribute to a software component.
- `contributors`: Contributors to a software component
- `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available.
- `date_created`: Date when the software component was created.
- `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction).
- `description`: A description of what the software component does.
Expand Down
40 changes: 8 additions & 32 deletions src/somef/export/json_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from ..utils import constants
from ..regular_expressions import detect_license_spdx,extract_scholarly_article_natural, extract_scholarly_article_properties


def save_json_output(repo_data, out_path, missing, pretty=False):
"""
Function that saves the final json Object in the output file
Expand Down Expand Up @@ -49,22 +48,6 @@ def format_date(date_string):
date_object = date_parser.parse(date_string)
return date_object.strftime("%Y-%m-%d")

# latest_release = None
# releases = data_path(["releases", "excerpt"])
#
# if releases is not None and len(releases) > 0:
# latest_release = releases[0]
# latest_pub_date = date_parser.parse(latest_release["datePublished"])
# for index in range(1, len(releases)):
# release = releases[index]
# pub_date = date_parser.parse(release["datePublished"])
#
# if pub_date > latest_pub_date:
# latest_release = release
# latest_pub_date = pub_date

# def release_path(path):
# return DataGraph.resolve_path(latest_release, path)
code_repository = None
if constants.CAT_CODE_REPOSITORY in repo_data:
code_repository = repo_data[constants.CAT_CODE_REPOSITORY][0][constants.PROP_RESULT][constants.PROP_VALUE]
Expand Down Expand Up @@ -110,12 +93,6 @@ def format_date(date_string):

descriptions_text = flat_descriptions

# descriptions_text = [d[constants.PROP_RESULT][constants.PROP_VALUE] for d in selected]
# descriptions.sort(key=lambda x: (x[constants.PROP_CONFIDENCE] + (1 if x[constants.PROP_TECHNIQUE] == constants.GITHUB_API else 0)),
# reverse=True)
# descriptions_text = [x[constants.PROP_RESULT][constants.PROP_VALUE] for x in descriptions]


codemeta_output = {
"@context": "https://w3id.org/codemeta/3.0",
"@type": ["SoftwareSourceCode", "SoftwareApplication"]
Expand Down Expand Up @@ -171,6 +148,13 @@ def format_date(date_string):
value = repo_data[constants.CAT_DATE_UPDATED][0][constants.PROP_RESULT][constants.PROP_VALUE]
if value:
codemeta_output[constants.CAT_CODEMETA_DATEMODIFIED] = format_date(value)
if constants.CAT_COPYRIGHT in repo_data:
holder = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT][constants.PROP_VALUE]
year = repo_data[constants.CAT_COPYRIGHT][0][constants.PROP_RESULT].get(constants.PROP_YEAR)
if holder:
codemeta_output[constants.CAT_CODEMETA_COPYRIGHTHOLDER] = holder
if year:
codemeta_output[constants.CAT_CODEMETA_COPYRIGHTYEAR] = year
if constants.CAT_DOWNLOAD_URL in repo_data:
codemeta_output[constants.CAT_CODEMETA_DOWNLOADURL] = repo_data[constants.CAT_DOWNLOAD_URL][0][constants.PROP_RESULT][constants.PROP_VALUE]
if constants.CAT_NAME in repo_data:
Expand All @@ -192,10 +176,6 @@ def format_date(date_string):
for item in items:
if item not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(item)
# for key in repo_data[constants.CAT_KEYWORDS]:
# key_value = key[constants.PROP_RESULT][constants.PROP_VALUE]
# if key_value not in codemeta_output[constants.CAT_CODEMETA_KEYWORDS]:
# codemeta_output[constants.CAT_CODEMETA_KEYWORDS].append(key_value)

if constants.CAT_PROGRAMMING_LANGUAGES in repo_data:
# Calculate the total code size of all the programming languages
Expand Down Expand Up @@ -239,7 +219,7 @@ def format_date(date_string):
req_type = x[constants.PROP_RESULT].get("type")
if req_type:
entry["@type"] = map_requirement_type(req_type)

if version:
if isinstance(version, str):
entry["version"] = version.strip()
Expand Down Expand Up @@ -268,10 +248,6 @@ def format_date(date_string):
other_requirements.append(value)
seen_text.add(normalized)

# if requirements_mode == "v":
# codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements
# else:
# codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements + other_requirements

if requirements_mode == "v":
codemeta_output[constants.CAT_CODEMETA_SOFTWAREREQUIREMENTS] = code_parser_requirements
Expand Down
33 changes: 33 additions & 0 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
repo_default_branch,
repo_dir, repo_relative_path, filename, dir_path,
metadata_result, constants.CAT_LICENSE)


if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper():
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
Expand Down Expand Up @@ -503,6 +504,38 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
result[constants.PROP_NAME] = license_info['name']
result[constants.PROP_SPDX_ID] = license_info['spdx_id']


# Extract the copyright holder (and year, if present) from the license text
matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE)

for year, holder in matches_copyright:
holder = holder.strip() if holder else None
year = year.strip() if year else None

if not holder:
logging.info("Skipping copyright holder with empty name")
continue

# Sometimes the match starts with unwanted punctuation or spaces; strip them
holder = holder.lstrip(",;: ").strip()

result_copy = {
constants.PROP_VALUE: holder,
constants.PROP_TYPE: constants.AGENT
}

if year:
result_copy[constants.PROP_YEAR] = year

logging.info(f"Extracted copyright holder: {holder.strip()} with year: {year}")
metadata_result.add_result(
constants.CAT_COPYRIGHT,
result_copy,
1,
constants.TECHNIQUE_FILE_EXPLORATION,
url
)

if category is constants.CAT_AUTHORS:
result = {}
authors_list = parse_author_file(file_text)
Expand Down
2 changes: 0 additions & 2 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,6 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization,
base = f"https://gitlab.com/{project_path}" if project_path else f"https://gitlab.com/{owner}/{repo_name}"
primary_url = f"{base}/-/raw/{default_branch}/README.md"
secondary_url = f"{base}/-/raw/master/README.md"
# primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md"
# secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md"
elif repo_type is constants.RepositoryType.GITHUB:
primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md"
secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md"
Expand Down
2 changes: 1 addition & 1 deletion src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
@param keep_tmp: path where to store TMP files in case SOMEF is instructed to keep them
@param authorization: GitHub authorization token
@param ignore_test_folder: Ignore contents of test folders
@param requiriments_mode: flag to indicate what requirements show in codemeta
@param requirements_mode: flag to indicate what requirements show in codemeta
@param reconcile_authors: flag to indicate if additional author information should be extracted from certain files, such as codeowners. Bear in mind that using this flag consumes more requests to the GitHub API.
@param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata.
@param tag: tag of the repository to analyze. Cannot be used together with the branch parameter.
Expand Down
59 changes: 58 additions & 1 deletion src/somef/test/test_JSON_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,4 +662,61 @@ def test_issue_914(self):
requirements = json_content.get(constants.CAT_REQUIREMENTS, [])
self.assertEqual(requirements[0].get("result", {}).get("version"), "3.20.3")

os.remove(output_file)
os.remove(output_file)


def test_issue_886_bsd3(self):
    """Check that the copyright holder and year are extracted from a BSD 3-Clause license.

    Runs SOMEF on the local ``captum`` test repository, writes the JSON
    output to a file in the test data folder and inspects the first entry
    of the copyright category.
    """
    output_file = test_data_path + "test_issue_886_bsd3.json"
    try:
        somef_cli.run_cli(threshold=0.8,
                          ignore_classifiers=False,
                          repo_url=None,
                          local_repo=test_data_repositories + "captum",
                          doc_src=None,
                          in_file=None,
                          output=output_file,
                          graph_out=None,
                          graph_format="turtle",
                          codemeta_out=None,
                          pretty=True,
                          missing=False,
                          readme_only=False)

        # The context manager closes the handle even if parsing fails.
        with open(output_file, "r") as text_file:
            json_content = json.load(text_file)
    finally:
        # Always clean up the generated file so a failing run does not leave
        # artifacts behind; the existence check keeps a missing file from
        # masking the original error.
        if os.path.exists(output_file):
            os.remove(output_file)

    copyright_entries = json_content[constants.CAT_COPYRIGHT]
    copyright_result = copyright_entries[0]["result"]
    self.assertEqual(copyright_result["value"], "PyTorch team")
    self.assertEqual(copyright_result["year"], "2019")


def test_issue_886_apache(self):
    """Check that the copyright holder and year are extracted from an Apache license.

    Runs SOMEF on the local ``Widoco`` test repository, writes the JSON
    output to a file in the test data folder and inspects the first entry
    of the copyright category.
    """
    output_file = test_data_path + "test_issue_886_apache.json"
    try:
        somef_cli.run_cli(threshold=0.8,
                          ignore_classifiers=False,
                          repo_url=None,
                          local_repo=test_data_repositories + "Widoco",
                          doc_src=None,
                          in_file=None,
                          output=output_file,
                          graph_out=None,
                          graph_format="turtle",
                          codemeta_out=None,
                          pretty=True,
                          missing=False,
                          readme_only=False)

        # The context manager closes the handle even if parsing fails.
        with open(output_file, "r") as text_file:
            json_content = json.load(text_file)
    finally:
        # Always clean up the generated file so a failing run does not leave
        # artifacts behind; the existence check keeps a missing file from
        # masking the original error.
        if os.path.exists(output_file):
            os.remove(output_file)

    copyright_entries = json_content[constants.CAT_COPYRIGHT]
    copyright_result = copyright_entries[0]["result"]
    self.assertEqual(copyright_result["value"],
                     "Daniel Garijo, Information Sciences Institute, USC.")
    self.assertEqual(copyright_result["year"], "2016")
2 changes: 1 addition & 1 deletion src/somef/test/test_bower_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_parse_bower_json(self):
dependency = req_result["result"]
if dependency.get("name") == "jquery" and dependency.get("dependency_type") == constants.DEPENDENCY_TYPE_RUNTIME:
found_jquery = True
self.assertEqual(dependency.get("dependency_resolver"),"bower","jQuery should come from the bower resolver")
self.assertEqual(dependency.get("dependency_resolver"),"bower","Bower should come from the bower resolver")

self.assertTrue(found_jquery, "jQuery dependency not found")

Expand Down
85 changes: 26 additions & 59 deletions src/somef/test/test_codemeta_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .. import somef_cli
from ..parser import pom_xml_parser
from ..export import json_export
from ..utils import constants

test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
Expand Down Expand Up @@ -587,69 +588,35 @@ def test_issue_891(self):

os.remove(output_path)

# def test_codemeta_local(self):

# """
# codemeta local
# """

# pom_xml_parser.processed_pom = False

# output_path = test_data_path + 'test_urban_pfr.json'
# if os.path.exists(output_path):
# os.remove(output_path)

# somef_cli.run_cli(threshold=0.9,
# ignore_classifiers=False,
# repo_url=None,
# doc_src=None,
# local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
# in_file=None,
# output=None,
# graph_out=None,
# graph_format="turtle",
# codemeta_out= output_path,
# pretty=True,
# missing=False,
# readme_only=False)
def test_issue_886_apache_code(self):
"""Checks whether copyright holder are correctly extracted from Apache license text in codemeta"""
somef_cli.run_cli(threshold=0.8,
ignore_classifiers=False,
repo_url=None,
local_repo=test_data_repositories + "Widoco",
doc_src=None,
in_file=None,
output=None,
graph_out=None,
graph_format="turtle",
codemeta_out=test_data_path + "test_issue_886_apache_code.json",
pretty=True,
missing=False,
readme_only=False)

# with open(output_path, "r") as f:
# json_content = json.load(f)

# runtime = json_content.get("runtimePlatform", [])
# assert runtime == "Java: 1.8", f"It was expected 'Java: 1.8' but it was '{runtime}'"
# os.remove(output_path)


# def test_codemeta_local_2(self):

# """
# codemeta local
# """

# pom_xml_parser.processed_pom = False
text_file = open(test_data_path + "test_issue_886_apache_code.json", "r")
data = text_file.read()
text_file.close()
json_content = json.loads(data)

# output_path = test_data_path + 'test_json_urban_pfr.json'
# if os.path.exists(output_path):
# os.remove(output_path)

# somef_cli.run_cli(threshold=0.9,
# ignore_classifiers=False,
# repo_url=None,
# doc_src=None,
# local_repo=test_data_repositories + "urban_pfr_toolbox_hamburg",
# in_file=None,
# output=output_path,
# graph_out=None,
# graph_format="turtle",
# codemeta_out= None,
# pretty=True,
# missing=False,
# readme_only=False)

# with open(output_path, "r") as f:
# json_content = json.load(f)
copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER]
copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR]

assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC."
assert copyright_year == "2016"

os.remove(test_data_path + "test_issue_886_apache_code.json")

@classmethod
def tearDownClass(cls):
Expand Down
3 changes: 2 additions & 1 deletion src/somef/test/test_process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import json
from pathlib import Path

from ..parser import pom_xml_parser
from .. import process_repository, process_files, somef_cli
from ..utils import constants
from ..process_results import Result
from somef.parser import pom_xml_parser


test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
Expand Down
Loading
Loading