From a7d2e3eab5d64d535daa6f4316f956bdba85c05d Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 31 Mar 2026 09:41:15 +0200 Subject: [PATCH 1/5] nttk to 3.9.4. Test with mockups by Priya. Contributors in reame.md --- README.md | 3 + poetry.lock | 6 +- src/somef/process_repository.py | 179 +++++++++++++++++----- src/somef/test/test_process_repository.py | 173 ++++++++++++++++++++- 4 files changed, 318 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 609375b0..c969bad3 100644 --- a/README.md +++ b/README.md @@ -361,6 +361,9 @@ Try SOMEF in Binder with our sample notebook: [![Binder](https://mybinder.org/ba If you want to contribute with a pull request, please do so by submitting it to the `dev` branch. +## Contributors: +Priyanka O. + ## Next features: To see upcoming features, please have a look at our [open issues](https://github.com/KnowledgeCaptureAndDiscovery/somef/issues) and [milestones](https://github.com/KnowledgeCaptureAndDiscovery/somef/milestones) diff --git a/poetry.lock b/poetry.lock index 0cf032be..ed414950 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1231,14 +1231,14 @@ test = ["pep440", "pre-commit", "pytest", "testpath"] [[package]] name = "nltk" -version = "3.9.3" +version = "3.9.4" description = "Natural Language Toolkit" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "nltk-3.9.3-py3-none-any.whl", hash = "sha256:60b3db6e9995b3dd976b1f0fa7dec22069b2677e759c28eb69b62ddd44870522"}, - {file = "nltk-3.9.3.tar.gz", hash = "sha256:cb5945d6424a98d694c2b9a0264519fab4363711065a46aa0ae7a2195b92e71f"}, + {file = "nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f"}, + {file = "nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0"}, ] [package.dependencies] diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index fc6b4231..f1eb42c0 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -51,10 +51,19 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const parsed = urlparse(url) is_api_request = "api.github.com" in parsed.netloc content_length = None - # just verify size if NOT is a request to api.github.com + # Check file size before downloading the full body (skip for GitHub API requests, + # which are always small JSON payloads). if not is_api_request: try: - head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) + # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) + # Use a proper HEAD request to read only the response headers. + # Previously this used requests.get(..., stream=True) which opens a full + # TCP connection and starts the response stream but never closes it — + # leaking a socket for every archive we inspect. HEAD is the correct + # tool here: it retrieves headers without downloading the body. + head_response = requests.head(url, allow_redirects=True, + timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs) + head_response.close() # release the connection back to the pool immediately content_length = head_response.headers.get("Content-Length") if content_length is not None: size_bytes = int(content_length) @@ -789,12 +798,108 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe return None +# def download_github_files(directory, owner, repo_name, repo_ref, authorization): +# """ +# Download all repository files from a GitHub repository +# Parameters +# ---------- +# repo_ref: link to branch of the repo +# repo_name: name of the repo +# owner: GitHub owner +# directory: directory where to extract all downloaded files +# authorization: GitHub authorization token + +# Returns +# ------- +# path to the folder where all files have been downloaded +# """ +# # download the repo at the selected branch with the link +# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip" +# logging.info(f"Downloading {repo_archive_url}") + +# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) + +# if repo_download is None: +# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") +# return None + +# if repo_download.status_code == 300: +# logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution") + +# for ref_type in ["tags", "heads"]: +# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip" +# logging.info(f"Trying to download {repo_archive_url}") + +# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) + +# if repo_download is None: +# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.") +# return None + +# if repo_download.status_code == 200: +# break + +# if repo_download.status_code == 404: +# logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") +# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip" +# logging.info(f"Trying to download {repo_archive_url}") +# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) +# if repo_download is None: +# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") +# return None + +# if repo_download.status_code != 200: +# logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") +# return None + +# repo_zip = repo_download.content + +# repo_name_full = owner + "_" + repo_name +# repo_zip_file = os.path.join(directory, repo_name_full + ".zip") +# repo_extract_dir = os.path.join(directory, repo_name_full) + +# with open(repo_zip_file, "wb") as f: +# f.write(repo_zip) + +# try: +# with zipfile.ZipFile(repo_zip_file, "r") as zip_ref: +# zip_ref.extractall(repo_extract_dir) +# except zipfile.BadZipFile: +# logging.error("Downloaded archive is not a valid zip (repo may be empty)") +# return None + +# repo_folders = os.listdir(repo_extract_dir) +# if not repo_folders: +# logging.warning("Repository archive is empty") +# return None + +# repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) +# return repo_dir + def download_github_files(directory, owner, repo_name, repo_ref, authorization): """ - Download all repository files from a GitHub repository + Download all repository files from a GitHub repository. + + GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple + Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the + same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called + ``v2.0``). In that case we must use the fully-qualified ref URLs: + - ``/archive/refs/heads/{ref}.zip`` (explicit branch) + - ``/archive/refs/tags/{ref}.zip`` (explicit tag) + + We also keep the legacy ``main.zip`` fallback for repositories that renamed their + default branch to ``main`` after being created with ``master`` (or vice-versa) so + that the GitHub API default_branch value is momentarily stale. + + Fallback order tried in sequence until one returns HTTP 200: + 1. ``/archive/{ref}.zip`` — short form, works for unambiguous refs + 2. ``/archive/refs/heads/{ref}.zip`` — unambiguous branch (fixes HTTP 300) + 3. ``/archive/refs/tags/{ref}.zip`` — unambiguous tag (fixes HTTP 300) + 4. ``/archive/main.zip`` — legacy branch-rename fallback + Parameters ---------- - repo_ref: link to branch of the repo + repo_ref: default branch (or tag) returned by the GitHub API repo_name: name of the repo owner: GitHub owner directory: directory where to extract all downloaded files @@ -802,46 +907,44 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): Returns ------- - path to the folder where all files have been downloaded + Path to the folder where all files have been downloaded, or None on failure. """ - # download the repo at the selected branch with the link - repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip" - logging.info(f"Downloading {repo_archive_url}") - - repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) - if repo_download is None: - logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") - return None - - if repo_download.status_code == 300: - logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution") - - for ref_type in ["tags", "heads"]: - repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip" - logging.info(f"Trying to download {repo_archive_url}") - - repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) - - if repo_download is None: - logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.") - return None - - if repo_download.status_code == 200: - break - - if repo_download.status_code == 404: - logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") - repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip" - logging.info(f"Trying to download {repo_archive_url}") + # Candidate archive URLs tried in order. We start with the short form because it + # works for the vast majority of repos and avoids an extra HTTP round-trip. When + # that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the + # fully-qualified refs/heads/ and refs/tags/ forms before falling back to main. + candidate_urls = [ + f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/refs/tags/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/main.zip", + ] + repo_download = None + repo_archive_url = None + for repo_archive_url in candidate_urls: + logging.info(f"Downloading {repo_archive_url}") repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) if repo_download is None: - logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") + # Size limit exceeded or streaming error — no point trying other URLs + logging.warning( + f"Repository archive skipped due to size limit: " + f"{constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length." + ) return None - if repo_download.status_code != 200: - logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") - return None + if repo_download.status_code == 200: + break + logging.warning( + f"Archive URL {repo_archive_url} returned HTTP {repo_download.status_code}, " + f"trying next fallback..." + ) + if repo_download is None or repo_download.status_code != 200: + logging.error( + f"All archive download attempts failed for {owner}/{repo_name} " + f"(last status: {getattr(repo_download, 'status_code', 'N/A')})" + ) + return None repo_zip = repo_download.content diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index fbf64373..e6b552da 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -1,9 +1,11 @@ +import io import os import tempfile import unittest import json +import zipfile from pathlib import Path - +from unittest.mock import MagicMock, patch, call from ..parser import pom_xml_parser from .. import process_repository, process_files, somef_cli from ..utils import constants @@ -335,4 +337,171 @@ def test_issue_905_tag(self): source = version[0].get("source", "") assert "Widoco/v1.4.25" in source, f"The downloaded tag does not match the requested one. Source: {source}" - os.remove(test_data_path + "test_905_tag.json") \ No newline at end of file + os.remove(test_data_path + "test_905_tag.json") + + +def _make_mock_response(status_code, content=b""): + """Helper: create a minimal mock requests.Response.""" + resp = MagicMock() + resp.status_code = status_code + resp.content = content + resp.headers = {} + return resp + + +def _make_zip_bytes(inner_dir="owner_repo"): + """Helper: build a minimal valid zip archive containing one file.""" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr(f"{inner_dir}/README.md", "# Test") + return buf.getvalue() + +class TestIssue909ArchiveFallback(unittest.TestCase): + """ + Tests for download_github_files HTTP-300 fallback chain (issue #909). + + GitHub returns HTTP 300 (Multiple Choices) when the short-form archive URL + /archive/{ref}.zip is ambiguous — i.e. a branch and a tag share the same + name. The fix adds a cascade of unambiguous fallback URLs so SOMEF can + still download the archive instead of crashing. + """ + + @patch("somef.process_repository.rate_limit_get") + def test_http_300_falls_back_to_refs_heads(self, mock_rlg): + """ + HTTP 300 on the short-form URL must trigger the refs/heads/ fallback. + Scenario: balaje/icefem whose default branch 'v2.0' is also a tag name. + """ + zip_bytes = _make_zip_bytes("balaje_icefem") + mock_rlg.side_effect = [ + (_make_mock_response(300), ""), # /archive/v2.0.zip → 300 + (_make_mock_response(200, zip_bytes), ""), # refs/heads/v2.0.zip → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "balaje", "icefem", "v2.0", None) + + self.assertIsNotNone(result, "Should succeed via refs/heads/ fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("archive/v2.0.zip", urls_tried[0]) + self.assertIn("refs/heads/v2.0.zip", urls_tried[1]) + + @patch("somef.process_repository.rate_limit_get") + def test_http_300_falls_back_to_refs_tags(self, mock_rlg): + """ + When refs/heads/ also fails, refs/tags/ must be tried next. + """ + zip_bytes = _make_zip_bytes("owner_repo") + mock_rlg.side_effect = [ + (_make_mock_response(300), ""), # short form → 300 + (_make_mock_response(404), ""), # refs/heads/ → 404 + (_make_mock_response(200, zip_bytes), ""), # refs/tags/ → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "v1.0", None) + + self.assertIsNotNone(result, "Should succeed via refs/tags/ fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("refs/tags/v1.0.zip", urls_tried[2]) + + @patch("somef.process_repository.rate_limit_get") + def test_http_404_falls_back_to_main(self, mock_rlg): + """ + HTTP 404 on all ref-specific URLs must reach the legacy main.zip fallback. + """ + zip_bytes = _make_zip_bytes("owner_repo") + mock_rlg.side_effect = [ + (_make_mock_response(404), ""), # short form + (_make_mock_response(404), ""), # refs/heads/ + (_make_mock_response(404), ""), # refs/tags/ + (_make_mock_response(200, zip_bytes), ""), # main.zip → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "oldmaster", None) + + self.assertIsNotNone(result, "Should succeed via main.zip fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("archive/main.zip", urls_tried[-1]) + + @patch("somef.process_repository.rate_limit_get") + def test_all_fallbacks_fail_returns_none_not_exit(self, mock_rlg): + """ + When all four candidate URLs fail, download_github_files must return None + instead of calling sys.exit() (which would crash the whole process). + """ + mock_rlg.return_value = (_make_mock_response(404), "") + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "branch", None) + + self.assertIsNone(result) + # All four candidates should have been attempted + self.assertEqual(mock_rlg.call_count, 4) + + @patch("somef.process_repository.rate_limit_get") + def test_size_limit_stops_loop_immediately(self, mock_rlg): + """ + When rate_limit_get returns None (size limit exceeded), the fallback loop + must stop immediately — there is no point retrying other URLs for the same + oversized archive. + """ + mock_rlg.return_value = (None, None) + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "main", None) + + self.assertIsNone(result) + self.assertEqual(mock_rlg.call_count, 1, "Should stop after first None response") + + +class TestRateLimitGetHeadRequest(unittest.TestCase): + """ + Tests for the socket-leak fix in rate_limit_get (issue #909 follow-up). + + The previous implementation used requests.get(..., stream=True) to inspect + the Content-Length header before downloading. A streaming GET opens a full + TCP connection whose socket is never released if the stream body is never + read and the response is never closed. The fix uses requests.head() instead, + which retrieves headers only and whose connection is explicitly closed. + """ + + @patch("somef.process_repository.requests.get") + @patch("somef.process_repository.requests.head") + def test_head_used_instead_of_streaming_get(self, mock_head, mock_get): + """rate_limit_get must call requests.head() (not streaming GET) for size check.""" + head_resp = MagicMock() + head_resp.headers = {"Content-Length": "1024"} + head_resp.close = MagicMock() + mock_head.return_value = head_resp + + get_resp = MagicMock() + get_resp.status_code = 200 + get_resp.headers = {} + # Simulate a small non-streaming response + get_resp.iter_content = MagicMock(return_value=iter([b"data"])) + mock_get.return_value = get_resp + + process_repository.rate_limit_get( + "https://github.com/owner/repo/archive/main.zip" + ) + + mock_head.assert_called_once() + head_resp.close.assert_called_once() + + @patch("somef.process_repository.requests.get") + @patch("somef.process_repository.requests.head") + def test_head_response_closed_on_size_exceeded(self, mock_head, mock_get): + """ + The HEAD response must be closed even when the size check triggers an + early return — otherwise the connection stays open in the pool indefinitely. + """ + oversized = (constants.SIZE_DOWNLOAD_LIMIT_MB + 1) * 1024 * 1024 + head_resp = MagicMock() + head_resp.headers = {"Content-Length": str(oversized)} + head_resp.close = MagicMock() + mock_head.return_value = head_resp + + result, _ = process_repository.rate_limit_get( + "https://github.com/owner/repo/archive/main.zip" + ) + + self.assertIsNone(result) + head_resp.close.assert_called_once() + mock_get.assert_not_called() # full GET should never be issued \ No newline at end of file From b66b48630dfc71a86923c53c48e2da25a284ef29 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Tue, 31 Mar 2026 11:18:13 +0200 Subject: [PATCH 2/5] Apply suggestion from @dgarijo --- src/somef/process_repository.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index f1eb42c0..fb1ad41a 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -57,7 +57,6 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const try: # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) # Use a proper HEAD request to read only the response headers. - # Previously this used requests.get(..., stream=True) which opens a full # TCP connection and starts the response stream but never closes it — # leaking a socket for every archive we inspect. HEAD is the correct # tool here: it retrieves headers without downloading the body. From 7988b808c8eeb37ae2c2c35c0b8df08042458577 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Tue, 31 Mar 2026 11:18:32 +0200 Subject: [PATCH 3/5] Apply suggestion from @dgarijo --- src/somef/process_repository.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index fb1ad41a..b3916cfa 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -57,7 +57,6 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const try: # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) # Use a proper HEAD request to read only the response headers. - # TCP connection and starts the response stream but never closes it — # leaking a socket for every archive we inspect. HEAD is the correct # tool here: it retrieves headers without downloading the body. head_response = requests.head(url, allow_redirects=True, From f4740fd457b1056f92882d9aa517661ffe69c5fc Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Tue, 31 Mar 2026 11:19:05 +0200 Subject: [PATCH 4/5] Apply suggestion from @dgarijo --- src/somef/process_repository.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index b3916cfa..2a57e596 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -57,7 +57,6 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const try: # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) # Use a proper HEAD request to read only the response headers. - # leaking a socket for every archive we inspect. HEAD is the correct # tool here: it retrieves headers without downloading the body. head_response = requests.head(url, allow_redirects=True, timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs) From 0dc6125e112b54e020c6531c7c2205a84dae8f35 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Tue, 31 Mar 2026 11:19:14 +0200 Subject: [PATCH 5/5] Apply suggestion from @dgarijo --- src/somef/process_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 2a57e596..79932aae 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -57,7 +57,7 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const try: # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) # Use a proper HEAD request to read only the response headers. - # tool here: it retrieves headers without downloading the body. + head_response = requests.head(url, allow_redirects=True, timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs) head_response.close() # release the connection back to the pool immediately