Skip to content
Merged

Dev #973

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 79 additions & 20 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
scipy = "^1.11.4"
inflect = "^7.0.0"
contractions = "^0.1.73"
chardet = "^5.2.0"
imbalanced-learn = "^0.12.0"
chardet = ">=5.2,<8.0"
imbalanced-learn = "^0.14.1"
pytest = ">=8,<10"
morph-kgc = "^2.7.0"
bibtexparser = "^1.4.1"
Expand Down
77 changes: 59 additions & 18 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
try:
parsed_build_files = set()

for dir_path, dir_names, filenames in os.walk(repo_dir):

dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS]
if is_local_repo:
dir_names[:] = [d for d in dir_names if d.lower() != "lib"]

for dir_path, dir_names, filenames in sorted(os.walk(repo_dir),key=lambda x: x[0].count(os.sep)):
dir_names.sort()
filenames.sort()
# dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS]
# if is_local_repo:
# dir_names[:] = [d for d in dir_names if d.lower() != "lib"]

# repo_relative_path = os.path.relpath(dir_path, repo_dir)
# current_dir = os.path.basename(repo_relative_path).lower()
repo_relative_path = os.path.relpath(dir_path, repo_dir)
path_parts = repo_relative_path.split(os.sep)
current_dir = os.path.basename(repo_relative_path).lower()
is_in_ignored = any(part.lower() in constants.IGNORED_DIRS for part in path_parts)
is_lib = is_local_repo and "lib" in path_parts
if is_in_ignored or is_lib:
continue
# if this is a test folder, we ignore it (except for the root repo)
# if ignore_test_folder and repo_relative_path != "." and "test" in repo_relative_path.lower():
if ignore_test_folder and repo_relative_path != "." and current_dir in constants.IGNORED_DIRS:
Expand Down Expand Up @@ -170,12 +178,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
logging.error(f"{type(err).__name__} was raised: {err}")
if ("LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD"== filename.upper()
or "LICENSE.RST"== filename.upper()):

metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
repo_default_branch,
repo_dir, repo_relative_path, filename, dir_path,
metadata_result, constants.CAT_LICENSE)
repo_default_branch,
repo_dir, repo_relative_path, filename, dir_path,
metadata_result, constants.CAT_LICENSE)


if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper():
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
repo_default_branch,
Expand Down Expand Up @@ -483,13 +491,26 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION):
new_file_path = extract_directory_path(url)
existing_path = extract_directory_path(entry[constants.PROP_SOURCE])
if new_file_path.startswith(existing_path):
# the existing file is higher, ignore this one

if new_file_path != existing_path and new_file_path.startswith(existing_path):
return metadata_result
else:
# replace result in hierarchy (below)
replace = True
break
# if new_file_path.startswith(existing_path):
# # the existing file is higher, ignore this one
# return metadata_result
# if existing_path == new_file_path:
# return metadata_result

# if (category == constants.CAT_CITATION and existing_path == "." and new_file_path != "."):
# return metadata_result

# replace = True
# break
# if repo_relative_path != "." and category == constants.CAT_LICENSE:
# return metadata_result
# else:
# # replace result in hierarchy (below)
# replace = True
# break
except Exception as e:
logging.warning("Error when trying to determine if redundant files exist " + str(e))
try:
Expand All @@ -500,9 +521,11 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
constants.PROP_VALUE: file_text,
constants.PROP_TYPE: constants.FILE_DUMP
}

if category is constants.CAT_LICENSE:
license_text = file_text
license_info = detect_license_spdx(license_text, 'JSON')

if license_info:
result[constants.PROP_NAME] = license_info['name']
result[constants.PROP_SPDX_ID] = license_info['spdx_id']
Expand All @@ -514,6 +537,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE)

for year, holder in matches_copyright:

holder = holder.strip() if holder else None
year = year.strip() if year else None

Expand Down Expand Up @@ -604,7 +628,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
)

pref = yaml_content.get("preferred-citation")
if pref:
if pref and not should_skip_citation(metadata_result, url):
pref_result = parse_cff_preferred(pref)
pref_result[constants.PROP_VALUE] = yaml.dump({"preferred-citation": pref}, default_flow_style=False)
# pref_result[constants.PROP_TYPE] = constants.FILE_DUMP
Expand Down Expand Up @@ -769,6 +793,23 @@ def parse_cff_preferred(pref):

return clean_nulls(result)

def should_skip_citation(metadata_result, new_path):
    """Return True when a citation found at *new_path* is redundant.

    A newly discovered citation (e.g. a CITATION.cff in a subfolder) is
    skipped when a previous file-exploration result already covers it:
    either an entry extracted from the exact same directory, or an entry
    found at the repository root ("."), which takes precedence over
    copies living in subfolders.

    Parameters
    ----------
    metadata_result : Result
        Accumulated extraction results; citation entries are read from
        ``results[constants.CAT_CITATION]``.
    new_path : str
        Directory path (relative) of the candidate citation file.

    Returns
    -------
    bool
        True if the candidate should be ignored, False otherwise.
    """
    for entry in metadata_result.results.get(constants.CAT_CITATION, []):
        # Only compare against results produced by scanning repository files.
        if entry.get(constants.PROP_TECHNIQUE) != constants.TECHNIQUE_FILE_EXPLORATION:
            continue

        source = entry.get(constants.PROP_SOURCE)
        if source is None:
            # Malformed entry without a source path; nothing to compare against.
            continue

        existing_path = extract_directory_path(source)

        # Same directory -> same citation file was already recorded.
        if existing_path == new_path:
            return True

        # A root-level citation already exists -> it blocks subfolder copies.
        if existing_path == ".":
            return True

    return False

def clean_nulls(d: dict) -> dict:
    """Return a shallow copy of *d* with None-valued and empty-string entries removed.

    Falsy-but-meaningful values such as 0, False, or empty containers are kept;
    only entries whose value equals None or "" are dropped.
    """
    cleaned = {}
    for key, value in d.items():
        if value in (None, ""):
            continue
        cleaned[key] = value
    return cleaned

Expand All @@ -790,14 +831,14 @@ def parse_license_cff(license_value, metadata_result, url):
else:
license_result[constants.PROP_NAME] = license_value


metadata_result.add_result(
constants.CAT_LICENSE,
license_result,
1,
constants.TECHNIQUE_FILE_EXPLORATION,
url
)

except Exception as e:
logging.error(f"Error parsing license from CFF: {str(e)}")

Expand Down
8 changes: 4 additions & 4 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,13 +548,13 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,

if url.netloc != constants.GITHUB_DOMAIN:
logging.error("Repository must be from Github")
return repository_metadata, "", "", ""
return repository_metadata, "", "", "", ""

path_components = url.path.split('/')

if len(path_components) < 3:
logging.error("Repository link is not correct. \nThe correct format is https://github.com/{owner}/{repo_name}.")
return repository_metadata, "", "", ""
return repository_metadata, "", "", "", ""

owner = path_components[1]
repo_name = path_components[2]
Expand All @@ -567,7 +567,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
logging.error(f"Github link is not correct. \n"
f"The correct format is https://github.com/{owner}/{repo_name}/tree/... \n"
f"or https://github.com/{owner}/{repo_name}/blob/....")
return repository_metadata, "", "", ""
return repository_metadata, "", "", "", ""

# we must join all after 4, as sometimes tags have "/" in them.
# default_branch = "/".join(path_components[4:])
Expand All @@ -580,7 +580,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
general_resp_raw, date = rate_limit_get(repo_api_base_url, headers=header)
if general_resp_raw is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or content-lenght none")
return repository_metadata, "", "", ""
return repository_metadata, "", "", "", ""

general_resp = general_resp_raw.json()

Expand Down
Loading