Skip to content

Commit f690bcd

Browse files
authored
Merge pull request #973 from KnowledgeCaptureAndDiscovery/dev
Dev
2 parents 5719ace + e257d8e commit f690bcd

4 files changed

Lines changed: 144 additions & 44 deletions

File tree

poetry.lock

Lines changed: 79 additions & 20 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
3333
scipy = "^1.11.4"
3434
inflect = "^7.0.0"
3535
contractions = "^0.1.73"
36-
chardet = "^5.2.0"
37-
imbalanced-learn = "^0.12.0"
36+
chardet = ">=5.2,<8.0"
37+
imbalanced-learn = "^0.14.1"
3838
pytest = ">=8,<10"
3939
morph-kgc = "^2.7.0"
4040
bibtexparser = "^1.4.1"

src/somef/process_files.py

Lines changed: 59 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -61,14 +61,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
6161
try:
6262
parsed_build_files = set()
6363

64-
for dir_path, dir_names, filenames in os.walk(repo_dir):
65-
66-
dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS]
67-
if is_local_repo:
68-
dir_names[:] = [d for d in dir_names if d.lower() != "lib"]
69-
64+
for dir_path, dir_names, filenames in sorted(os.walk(repo_dir),key=lambda x: x[0].count(os.sep)):
65+
dir_names.sort()
66+
filenames.sort()
67+
# dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS]
68+
# if is_local_repo:
69+
# dir_names[:] = [d for d in dir_names if d.lower() != "lib"]
70+
71+
# repo_relative_path = os.path.relpath(dir_path, repo_dir)
72+
# current_dir = os.path.basename(repo_relative_path).lower()
7073
repo_relative_path = os.path.relpath(dir_path, repo_dir)
74+
path_parts = repo_relative_path.split(os.sep)
7175
current_dir = os.path.basename(repo_relative_path).lower()
76+
is_in_ignored = any(part.lower() in constants.IGNORED_DIRS for part in path_parts)
77+
is_lib = is_local_repo and "lib" in path_parts
78+
if is_in_ignored or is_lib:
79+
continue
7280
# if this is a test folder, we ignore it (except for the root repo)
7381
# if ignore_test_folder and repo_relative_path != "." and "test" in repo_relative_path.lower():
7482
if ignore_test_folder and repo_relative_path != "." and current_dir in constants.IGNORED_DIRS:
@@ -170,12 +178,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
170178
logging.error(f"{type(err).__name__} was raised: {err}")
171179
if ("LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD"== filename.upper()
172180
or "LICENSE.RST"== filename.upper()):
181+
173182
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
174-
repo_default_branch,
175-
repo_dir, repo_relative_path, filename, dir_path,
176-
metadata_result, constants.CAT_LICENSE)
183+
repo_default_branch,
184+
repo_dir, repo_relative_path, filename, dir_path,
185+
metadata_result, constants.CAT_LICENSE)
177186

178-
179187
if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper():
180188
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
181189
repo_default_branch,
@@ -483,13 +491,26 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
483491
entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION):
484492
new_file_path = extract_directory_path(url)
485493
existing_path = extract_directory_path(entry[constants.PROP_SOURCE])
486-
if new_file_path.startswith(existing_path):
487-
# the existing file is higher, ignore this one
494+
495+
if new_file_path != existing_path and new_file_path.startswith(existing_path):
488496
return metadata_result
489-
else:
490-
# replace result in hierarchy (below)
491-
replace = True
492-
break
497+
# if new_file_path.startswith(existing_path):
498+
# # the existing file is higher, ignore this one
499+
# return metadata_result
500+
# if existing_path == new_file_path:
501+
# return metadata_result
502+
503+
# if (category == constants.CAT_CITATION and existing_path == "." and new_file_path != "."):
504+
# return metadata_result
505+
506+
# replace = True
507+
# break
508+
# if repo_relative_path != "." and category == constants.CAT_LICENSE:
509+
# return metadata_result
510+
# else:
511+
# # replace result in hierarchy (below)
512+
# replace = True
513+
# break
493514
except Exception as e:
494515
logging.warning("Error when trying to determine if redundant files exist " + str(e))
495516
try:
@@ -500,9 +521,11 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
500521
constants.PROP_VALUE: file_text,
501522
constants.PROP_TYPE: constants.FILE_DUMP
502523
}
524+
503525
if category is constants.CAT_LICENSE:
504526
license_text = file_text
505527
license_info = detect_license_spdx(license_text, 'JSON')
528+
506529
if license_info:
507530
result[constants.PROP_NAME] = license_info['name']
508531
result[constants.PROP_SPDX_ID] = license_info['spdx_id']
@@ -514,6 +537,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
514537
matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE)
515538

516539
for year, holder in matches_copyright:
540+
517541
holder = holder.strip() if holder else None
518542
year = year.strip() if year else None
519543

@@ -604,7 +628,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
604628
)
605629

606630
pref = yaml_content.get("preferred-citation")
607-
if pref:
631+
if pref and not should_skip_citation(metadata_result, url):
608632
pref_result = parse_cff_preferred(pref)
609633
pref_result[constants.PROP_VALUE] = yaml.dump({"preferred-citation": pref}, default_flow_style=False)
610634
# pref_result[constants.PROP_TYPE] = constants.FILE_DUMP
@@ -769,6 +793,23 @@ def parse_cff_preferred(pref):
769793

770794
return clean_nulls(result)
771795

796+
def should_skip_citation(metadata_result, new_path):
    """Tell whether a citation located at ``new_path`` should be skipped.

    Only citation entries already stored in ``metadata_result.results`` whose
    technique is ``constants.TECHNIQUE_FILE_EXPLORATION`` are considered. For
    each of them, the directory is obtained by passing the entry's source
    through ``extract_directory_path``.

    Args:
        metadata_result: object exposing a ``results`` mapping from category
            to a list of entry dictionaries.
        new_path: location of the candidate citation, compared as-is against
            the directory extracted from each existing entry.

    Returns:
        True if an existing file-exploration citation has an extracted
        directory equal to ``new_path`` or equal to ``"."``; False otherwise.

    NOTE(review): the visible caller passes a URL as ``new_path`` while the
    existing side is reduced with ``extract_directory_path`` -- confirm that
    both sides are expressed in the same form, otherwise the equality check
    below can never match.
    """
    for entry in metadata_result.results.get(constants.CAT_CITATION, []):
        # Entries produced by other techniques do not take part in the check.
        if entry.get(constants.PROP_TECHNIQUE) != constants.TECHNIQUE_FILE_EXPLORATION:
            continue

        existing_path = extract_directory_path(entry[constants.PROP_SOURCE])

        # Same file.
        if existing_path == new_path:
            return True

        # One already exists at the root -> block the ones in subfolders.
        if existing_path == ".":
            return True

    return False
812+
772813
def clean_nulls(d: dict) -> dict:
    """Return a copy of ``d`` without the entries whose value is ``None`` or ``""``.

    Other falsy values (``0``, ``False``, empty containers) are kept, and the
    input dictionary is not modified.
    """
    return {k: v for k, v in d.items() if v not in (None, "")}
774815

@@ -790,14 +831,14 @@ def parse_license_cff(license_value, metadata_result, url):
790831
else:
791832
license_result[constants.PROP_NAME] = license_value
792833

793-
794834
metadata_result.add_result(
795835
constants.CAT_LICENSE,
796836
license_result,
797837
1,
798838
constants.TECHNIQUE_FILE_EXPLORATION,
799839
url
800840
)
841+
801842
except Exception as e:
802843
logging.error(f"Error parsing license from CFF: {str(e)}")
803844

src/somef/process_repository.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -548,13 +548,13 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
548548

549549
if url.netloc != constants.GITHUB_DOMAIN:
550550
logging.error("Repository must be from Github")
551-
return repository_metadata, "", "", ""
551+
return repository_metadata, "", "", "", ""
552552

553553
path_components = url.path.split('/')
554554

555555
if len(path_components) < 3:
556556
logging.error("Repository link is not correct. \nThe correct format is https://github.com/{owner}/{repo_name}.")
557-
return repository_metadata, "", "", ""
557+
return repository_metadata, "", "", "", ""
558558

559559
owner = path_components[1]
560560
repo_name = path_components[2]
@@ -567,7 +567,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
567567
logging.error(f"Github link is not correct. \n"
568568
f"The correct format is https://github.com/{owner}/{repo_name}/tree/... \n"
569569
f"or https://github.com/{owner}/{repo_name}/blob/....")
570-
return repository_metadata, "", "", ""
570+
return repository_metadata, "", "", "", ""
571571

572572
# we must join all after 4, as sometimes tags have "/" in them.
573573
# default_branch = "/".join(path_components[4:])
@@ -580,7 +580,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
580580
general_resp_raw, date = rate_limit_get(repo_api_base_url, headers=header)
581581
if general_resp_raw is None:
582582
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or content-lenght none")
583-
return repository_metadata, "", "", ""
583+
return repository_metadata, "", "", "", ""
584584

585585
general_resp = general_resp_raw.json()
586586

0 commit comments

Comments
 (0)