diff --git a/poetry.lock b/poetry.lock index adc9c6ea..5d62d2a0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -89,14 +89,48 @@ files = [ [[package]] name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" +version = "7.4.3" +description = "Universal character encoding detector" optional = false -python-versions = ">=3.7" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, + {file = "chardet-7.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0c79b13c9908ac7dfe0a74116ebc9a0f28b2319d23c32f3dfcdfbe1279c7eaf"}, + {file = "chardet-7.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bba8bea1b28d927b3e99e47deafe53658d34497c0a891d95ff1ba8ff6663f01c"}, + {file = "chardet-7.4.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23163921dccf3103ce59540b0443c106d2c0a0ff2e0503e05196f5e6fdea453f"}, + {file = "chardet-7.4.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cfb54563fe5f130da17c44c6a4e2e8052ba628e5ab4eab7ef8190f736f0f8f72"}, + {file = "chardet-7.4.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3990fffcc6a6045f2234ab72752ad037e3b2d48c72037f244d42738db397eb75"}, + {file = "chardet-7.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:c7116b0452994734ccff35e154b44240090eb0f4f74b9106292668133557c175"}, + {file = "chardet-7.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:25a862cddc6a9ac07023e808aedd297115345fbaabc2690479481ddc0f980e09"}, + {file = "chardet-7.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7005c88da26fd95d8abb8acbe6281d833e9a9181b03cf49b4546c4555389bd97"}, + {file = 
"chardet-7.4.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc50f28bad067393cce0af9091052c3b8df7a23115afd8ba7b2e0947f0cef1f8"}, + {file = "chardet-7.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c3da294de1a681097848ab58bd3f2771a674f8039d2d87a5538b28856b815e9"}, + {file = "chardet-7.4.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:93c45e116dd51b66226a53ade3f9f635e870de5399b90e00ce45dcc311093bf4"}, + {file = "chardet-7.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:ccc1f83ab4bcfb901cf39e0c4ba6bc6e726fc6264735f10e24ceb5cb47387578"}, + {file = "chardet-7.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:75d3c65cc16bddf40b8da1fd25ba84fca5f8070f2b14e86083653c1c85aee971"}, + {file = "chardet-7.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:29af5999f654e8729d251f1724a62b538b1262d9292cccaefddf8a02aae1ef6a"}, + {file = "chardet-7.4.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:626f00299ad62dfe937058a09572beed442ccc7b58f87aa667949b20fd3db235"}, + {file = "chardet-7.4.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9a4904dd5f071b7a7d7f50b4a67a86db3c902d243bf31708f1d5cde2f68239cb"}, + {file = "chardet-7.4.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5d2879598bc220689e8ce509fe9c3f37ad2fca53a36be9c9bd91abdd91dd364f"}, + {file = "chardet-7.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:4b2799bd58e7245cfa8d4ab2e8ad1d76a5c3a5b1f32318eb6acca4c69a3e7101"}, + {file = "chardet-7.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a9e4486df251b8962e86ea9f139ca235aa6e0542a00f7844c9a04160afb99aa9"}, + {file = "chardet-7.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4fbff1907925b0c5a1064cffb5e040cd5e338585c9c552625f30de6bc2f3107a"}, + {file = 
"chardet-7.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:365135eaf37ba65a828f8e668eb0a8c38c479dcbec724dc25f4dfd781049c357"}, + {file = "chardet-7.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfc134b70c846c21ead8e43ada3ae1a805fff732f6922f8abcf2ff27b8f6493d"}, + {file = "chardet-7.4.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9acd9988a93e09390f3cd231201ea7166c415eb8da1b735928990ffc05cb9fbb"}, + {file = "chardet-7.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:e1b98790c284ff813f18f7cf7de5f05ea2435a080030c7f1a8318f3a4f80b131"}, + {file = "chardet-7.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d892d3dcd652fdef53e3d6327d39b17c0df40a899dfc919abaeb64c974497531"}, + {file = "chardet-7.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:acc46d1b8b7d5783216afe15db56d1c179b9a40e5a1558bc13164c4fd20674c4"}, + {file = "chardet-7.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ac3bf11c645734a1701a3804e43eabd98851838192267d08c353a834ab79fea"}, + {file = "chardet-7.4.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e3bd9f936e04bae89c254262af08d9e5b98f805175ba1e29d454e6cba3107b7"}, + {file = "chardet-7.4.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:27cc23da03630cdecc9aa81a895aa86629c211f995cd57651f0fbc280717bf93"}, + {file = "chardet-7.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:b95c934b9ad59e2ba8abb9be49df70d3ad1b0d95d864b9fdb7588d4fa8bd921c"}, + {file = "chardet-7.4.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:c77867f0c1cb8bd819502249fcdc500364aedb07881e11b743726fa2148e7b6e"}, + {file = "chardet-7.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cf1efeaf65a6ef2f5b9cc3a1df6f08ba2831b369ccaa4c7018eaf90aa757bb11"}, + {file = 
"chardet-7.4.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f3504c139a2ad544077dd2d9e412cd08b01786843d76997cd43bb6de311723c"}, + {file = "chardet-7.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457f619882ba66327d4d8d14c6c342269bdb1e4e1c38e8117df941d14d351b04"}, + {file = "chardet-7.4.3-py3-none-any.whl", hash = "sha256:1173b74051570cf08099d7429d92e4882d375ad4217f92a6e5240ccfb26f231e"}, + {file = "chardet-7.4.3.tar.gz", hash = "sha256:cc1d4eb92a4ec1c2df3b490836ffa46922e599d34ce0bb75cf41fd2bf6303d56"}, ] [[package]] @@ -621,28 +655,32 @@ all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2 [[package]] name = "imbalanced-learn" -version = "0.12.4" -description = "Toolbox for imbalanced dataset in machine learning." +version = "0.14.1" +description = "Toolbox for imbalanced dataset in machine learning" optional = false -python-versions = "*" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "imbalanced-learn-0.12.4.tar.gz", hash = "sha256:8153ba385d296b07d97e0901a2624a86c06b48c94c2f92da3a5354827697b7a3"}, - {file = "imbalanced_learn-0.12.4-py3-none-any.whl", hash = "sha256:d47fc599160d3ea882e712a3a6b02bdd353c1a6436d8d68d41b1922e6ee4a703"}, + {file = "imbalanced_learn-0.14.1-py3-none-any.whl", hash = "sha256:fcdff8d27870d6992ea3496230788b97ff98e24302e7f6c598701da525ae440f"}, + {file = "imbalanced_learn-0.14.1.tar.gz", hash = "sha256:46eeb5773a96b6fa92426356da66f3e4390a7ed8e715a633c6fb68ee4a3ccdcb"}, ] [package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3" -scikit-learn = ">=1.0.2" -scipy = ">=1.5.0" -threadpoolctl = ">=2.0.0" +joblib = ">=1.2.0,<2" +numpy = ">=1.25.2,<3" +scikit-learn = ">=1.4.2,<2" +scipy = ">=1.11.4,<2" +sklearn-compat = ">=0.1.5,<0.2" +threadpoolctl = ">=2.0.0,<4" [package.extras] -docs = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.5.0)", 
"pandas (>=1.0.5)", "pydata-sphinx-theme (>=0.13.3)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-gallery (>=0.13.0)", "sphinxcontrib-bibtex (>=2.4.1)", "tensorflow (>=2.4.3)"] -examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seaborn (>=0.9.0)", "tensorflow (>=2.4.3)"] -optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"] -tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"] +dev = ["ipykernel", "ipython", "jupyterlab"] +docs = ["matplotlib (>=3.7.3,<4)", "memory_profiler (>=0.61.0,<1)", "numpydoc (>=1.5.0,<2)", "pandas (>=2.0.3,<3)", "pydata-sphinx-theme (>=0.15.4,<1)", "seaborn (>=0.12.2,<1)", "sphinx (>=8.0.2,<9)", "sphinx-copybutton (>=0.5.2,<1)", "sphinx-design (>=0.6.1,<1)", "sphinx-gallery (>=0.13.0,<1)", "sphinxcontrib-bibtex (>=2.6.3,<3)", "tensorflow (>=2.16.1,<3)"] +keras = ["keras (>=3.3.3,<4)"] +linters = ["black (==23.3.0)", "pre-commit", "ruff (==0.14.2)"] +optional = ["pandas (>=2.0.3,<3)"] +tensorflow = ["tensorflow (>=2.16.1,<3)"] +tests = ["packaging (>=23.2,<25)", "pytest (>=7.2.2,<9)", "pytest-cov (>=4.1.0,<6)", "pytest-xdist (>=3.5.0,<4)"] [[package]] name = "inflect" @@ -2349,6 +2387,27 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "sklearn-compat" +version = "0.1.5" +description = "Ease support for compatible scikit-learn estimators across versions" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "sklearn_compat-0.1.5-py3-none-any.whl", hash = "sha256:dddd00c442027b6a2c2fd4a86667b804a7353cdb5093bfd0d5431f5e3c135fce"}, + {file = "sklearn_compat-0.1.5.tar.gz", hash = "sha256:1a0c3a2f384100e034def49ee5a6cfe984a826f79d032eb559f10445e012b02c"}, +] + +[package.dependencies] 
+scikit-learn = ">=1.2,<1.9" + +[package.extras] +dev = ["ipython"] +docs = ["mkdocs", "mkdocs-material"] +lint = ["pre-commit"] +tests = ["pandas", "polars", "pyarrow", "pytest", "pytest-cov", "pytest-xdist", "pytz"] + [[package]] name = "soupsieve" version = "2.8.3" @@ -2556,4 +2615,4 @@ scikit-learn = ["scikit-learn"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "1c7e4b421ae3f1d08158fc5027abc834cb81e3f2fada84497ae65e67aef1ce22" +content-hash = "55ed40695a5442715f1e3379fc3920e570cc0a1de6a95fdbe9aa8e160a4c46e3" diff --git a/pyproject.toml b/pyproject.toml index cac0f944..2cdb168c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,8 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef" scipy = "^1.11.4" inflect = "^7.0.0" contractions = "^0.1.73" - chardet = "^5.2.0" - imbalanced-learn = "^0.12.0" + chardet = ">=5.2,<8.0" + imbalanced-learn = "^0.14.1" pytest = ">=8,<10" morph-kgc = "^2.7.0" bibtexparser = "^1.4.1" diff --git a/src/somef/process_files.py b/src/somef/process_files.py index a531b01a..02734549 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -61,14 +61,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner try: parsed_build_files = set() - for dir_path, dir_names, filenames in os.walk(repo_dir): - - dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS] - if is_local_repo: - dir_names[:] = [d for d in dir_names if d.lower() != "lib"] - + for dir_path, dir_names, filenames in sorted(os.walk(repo_dir),key=lambda x: x[0].count(os.sep)): + dir_names.sort() + filenames.sort() + # dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS] + # if is_local_repo: + # dir_names[:] = [d for d in dir_names if d.lower() != "lib"] + + # repo_relative_path = os.path.relpath(dir_path, repo_dir) + # current_dir = os.path.basename(repo_relative_path).lower() repo_relative_path = 
os.path.relpath(dir_path, repo_dir) + path_parts = repo_relative_path.split(os.sep) current_dir = os.path.basename(repo_relative_path).lower() + is_in_ignored = any(part.lower() in constants.IGNORED_DIRS for part in path_parts) + is_lib = is_local_repo and "lib" in path_parts + if is_in_ignored or is_lib: + continue # if this is a test folder, we ignore it (except for the root repo) # if ignore_test_folder and repo_relative_path != "." and "test" in repo_relative_path.lower(): if ignore_test_folder and repo_relative_path != "." and current_dir in constants.IGNORED_DIRS: @@ -170,12 +178,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner logging.error(f"{type(err).__name__} was raised: {err}") if ("LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD"== filename.upper() or "LICENSE.RST"== filename.upper()): + metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name, - repo_default_branch, - repo_dir, repo_relative_path, filename, dir_path, - metadata_result, constants.CAT_LICENSE) + repo_default_branch, + repo_dir, repo_relative_path, filename, dir_path, + metadata_result, constants.CAT_LICENSE) - if "CODE_OF_CONDUCT" == filename.upper() or "CODE_OF_CONDUCT.MD" == filename.upper(): metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_default_branch, @@ -483,13 +491,26 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION): new_file_path = extract_directory_path(url) existing_path = extract_directory_path(entry[constants.PROP_SOURCE]) - if new_file_path.startswith(existing_path): - # the existing file is higher, ignore this one + + if new_file_path != existing_path and new_file_path.startswith(existing_path): return metadata_result - else: - # replace result in hierarchy (below) - replace = True - break + # if new_file_path.startswith(existing_path): 
+ # # the existing file is higher, ignore this one + # return metadata_result + # if existing_path == new_file_path: + # return metadata_result + + # if (category == constants.CAT_CITATION and existing_path == "." and new_file_path != "."): + # return metadata_result + + # replace = True + # break + # if repo_relative_path != "." and category == constants.CAT_LICENSE: + # return metadata_result + # else: + # # replace result in hierarchy (below) + # replace = True + # break except Exception as e: logging.warning("Error when trying to determine if redundant files exist " + str(e)) try: @@ -500,9 +521,11 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul constants.PROP_VALUE: file_text, constants.PROP_TYPE: constants.FILE_DUMP } + if category is constants.CAT_LICENSE: license_text = file_text license_info = detect_license_spdx(license_text, 'JSON') + if license_info: result[constants.PROP_NAME] = license_info['name'] result[constants.PROP_SPDX_ID] = license_info['spdx_id'] @@ -514,6 +537,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE) for year, holder in matches_copyright: + holder = holder.strip() if holder else None year = year.strip() if year else None @@ -604,7 +628,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul ) pref = yaml_content.get("preferred-citation") - if pref: + if pref and not should_skip_citation(metadata_result, url): pref_result = parse_cff_preferred(pref) pref_result[constants.PROP_VALUE] = yaml.dump({"preferred-citation": pref}, default_flow_style=False) # pref_result[constants.PROP_TYPE] = constants.FILE_DUMP @@ -769,6 +793,23 @@ def parse_cff_preferred(pref): return clean_nulls(result) +def should_skip_citation(metadata_result, new_path): + for entry in metadata_result.results.get(constants.CAT_CITATION, []): + if 
entry.get(constants.PROP_TECHNIQUE) != constants.TECHNIQUE_FILE_EXPLORATION: + continue + + existing_path = extract_directory_path(entry[constants.PROP_SOURCE]) + + # same file + if existing_path == new_path: + return True + + # one already exists at the root → blocks subfolders + if existing_path == ".": + return True + + return False + def clean_nulls(d: dict) -> dict: return {k: v for k, v in d.items() if v not in (None, "")} @@ -790,7 +831,6 @@ def parse_license_cff(license_value, metadata_result, url): else: license_result[constants.PROP_NAME] = license_value - metadata_result.add_result( constants.CAT_LICENSE, license_result, @@ -798,6 +838,7 @@ constants.TECHNIQUE_FILE_EXPLORATION, url ) + except Exception as e: logging.error(f"Error parsing license from CFF: {str(e)}") diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 7b4235e8..6cbd6e06 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -548,13 +548,13 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, if url.netloc != constants.GITHUB_DOMAIN: logging.error("Repository must be from Github") - return repository_metadata, "", "", "" + return repository_metadata, "", "", "", "" path_components = url.path.split('/') if len(path_components) < 3: logging.error("Repository link is not correct. \nThe correct format is https://github.com/{owner}/{repo_name}.") - return repository_metadata, "", "", "" + return repository_metadata, "", "", "", "" owner = path_components[1] repo_name = path_components[2] @@ -567,7 +567,7 @@ logging.error(f"Github link is not correct. \n" f"The correct format is https://github.com/{owner}/{repo_name}/tree/... 
\n" f"or https://github.com/{owner}/{repo_name}/blob/....") - return repository_metadata, "", "", "" + return repository_metadata, "", "", "", "" # we must join all after 4, as sometimes tags have "/" in them. # default_branch = "/".join(path_components[4:]) @@ -580,7 +580,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, general_resp_raw, date = rate_limit_get(repo_api_base_url, headers=header) if general_resp_raw is None: logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or content-lenght none") - return repository_metadata, "", "", "" + return repository_metadata, "", "", "", "" general_resp = general_resp_raw.json()