From 069211d5b92b600940c3b0bc66d145ab385b722e Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Fri, 5 Jun 2026 12:49:56 +0200 Subject: [PATCH 1/3] codeberg repositories --- README.md | 12 +- docs/index.md | 6 +- docs/output.md | 31 +++- src/somef/process_files.py | 33 +++- src/somef/process_repository.py | 152 ++++++++++++++++++ src/somef/somef_cli.py | 16 +- src/somef/test/test_codeberg.py | 128 +++++++++++++++ .../expected/runtime_platform_repo.yaml | 2 +- src/somef/utils/constants.py | 50 ++++++ 9 files changed, 405 insertions(+), 25 deletions(-) create mode 100644 src/somef/test/test_codeberg.py diff --git a/README.md b/README.md index 68cad3d1..6d5285a0 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ A command line interface for automatically extracting relevant metadata from cod ## Features -Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the following categories (if present), listed in alphabetical order: +Given a readme file (or a GitHub/Gitlab/Codeberg repository) SOMEF will extract the following categories (if present), listed in alphabetical order: - **Acknowledgement**: Text acknowledging funding sources or contributors - **Application domain**: The application domain of the repository. Current supported domains include: Astrophysics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantc web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). 
Find more information in our [documentation](https://somef.readthedocs.io/en/latest/) @@ -38,7 +38,7 @@ We recognize the following properties: - Year: Year of publication - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project -- **Code repository**: Link to the GitHub/GitLab repository used for the extraction +- **Code repository**: Link to the GitHub/GitLab/Codeberg repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository @@ -72,7 +72,7 @@ We recognize the following properties: - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) -- **Releases** (GitHub only): Pointer to the available versions of a software component. For each release, somef will track the following properties: +- **Releases**: Pointer to the available versions of a software component. For each release, somef will track the following properties: - Description: Release notes - Author: Agent responsible of creating the release - Name: Name of the release @@ -93,7 +93,7 @@ We recognize the following properties: - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository -We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). 
Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) +We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) ## Documentation @@ -297,10 +297,10 @@ Usage: somef describe [OPTIONS] Options: -t, --threshold FLOAT Threshold to classify the text [required] Input: [mutually_exclusive, required] - -r, --repo_url URL Github/Gitlab Repository URL + -r, --repo_url URL Github/Gitlab/Codeberg Repository URL -d, --doc_src PATH Path to the README file source -i, --in_file PATH A file of newline separated links to GitHub/ - Gitlab repositories + Gitlab/Codeberg repositories -l, --local_repo PATH Path to the local repository source. 
No APIs will be used Output: [required_any] diff --git a/docs/index.md b/docs/index.md index e3ac76e7..eea7b08e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ We recognize the following properties: - Year: Year of publication - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project -- **Code repository**: Link to the GitHub/GitLab repository used for the extraction +- **Code repository**: Link to the GitHub/GitLab/Codeberg repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository @@ -80,7 +80,7 @@ We recognize the following properties: - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) -- **Releases** (GitHub and Gitlab): Pointer to the available versions of a software component. For each release, somef will track the following properties: +- **Releases** (GitHub, Gitlab and Codeberg): Pointer to the available versions of a software component. 
For each release, somef will track the following properties: - Assets: files attached to the release - Description: Release notes - Author: Agent responsible of creating the release @@ -102,7 +102,7 @@ We recognize the following properties: - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository -We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) +We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) 1 The available application domains currently are: diff --git a/docs/output.md b/docs/output.md index fcaf96ea..4aa9354a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -73,7 +73,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `code_of_conduct`: Link to the code of conduct file of the project - `code_repository`: Link to the source code (typically the repository where the readme can be found) - `contact`: Contact person responsible for maintaining a software component. 
-- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab. +- `continuous_integration`: Link to continuous integration service, supported on GitHub, GitLab and Codeberg. - `contributing guidelines`: Guidelines indicating how to contribute to a software component. - `contributor`: Contributors to this software. Note: Contributor metadata is exported from metadata files (e.g., CodeMeta, CONTRIBUTORS, etc.) not from git logs. - `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available. @@ -167,7 +167,7 @@ Depending on the `type` of the result, additional properties may be found. The following object `types` are currently supported: -- `Release`: software releases of the current code repository, as available from GitHub. +- `Release`: software releases of the current code repository, as available from GitHub, GitLab and Codeberg. - `Programming_language`: Programming language used in the repository. - `License`: object representing all the metadata SOMEF extracts from a license. - `Agent`: user (typically, a person) or organization responsible for authoring a software release or a paper. @@ -317,6 +317,7 @@ The techniques can be of several types: - `file_exploration`: the result comes from an exploration of the files in the repository - `GitHub_API`: the result was obtained from the GitHub API. - `GitLab_API`: the result was obtained from the GitLab API. +- `Codeberg_API`: the result was obtained from the Codeberg API. - `regular_expression`: the result was obtained after performing regular expressions on the files in the repository. - `software_type_heuristics`: the result was obtained from analysis of the repository based on various heuristics from the README, code and extension analysis. - `supervised_classification`: the results were obtained after running text classifiers trained for detecting that type of header. 
@@ -405,6 +406,32 @@ A more detailed explanation is provided in the [wiki](https://github.com/oeg-upm ``` As shown in the Turtle snippet above, SOMEF represents the software as an entity, its relationship with each release (software version), the license found in the repository and the Person who owns it. --> +## Codeberg API Crosswalk + +When analyzing a Codeberg repository, SOMEF uses the [Codeberg API](https://codeberg.org/api/v1/swagger) +(`GET /api/v1/repos/{owner}/{repo}`) to retrieve metadata. The table below shows how Codeberg API +fields map to SOMEF categories: + +| SOMEF category | Codeberg API field | Notes | +|---|---|---| +| `name` | `name` | | +| `description` | `description` | | +| `code_repository` | `html_url` | | +| `owner` | `owner.login` | | +| `date_created` | `created_at` | | +| `date_updated` | `updated_at` | | +| `stars` | `stars_count` | In GitHub this field is `stargazers_count` | +| `forks_count` | `forks_count` | | +| `homepage` | `website` | In GitHub this field is `homepage` | +| `keywords` | `topics` | | +| `issue_tracker` | *(constructed)* | Built as `{html_url}/issues` | +| `license` | *(not available)* | Codeberg API does not return license information | +| `programming_languages` | `languages_url` | Additional GET request to the languages endpoint | +| `releases` | `/repos/{owner}/{repo}/releases` | Additional GET request | + +For releases, the field mapping is identical to GitHub. The only differences are that Codeberg +uses `attachments` instead of `assets` for release files, and it does not provide +`author.type` (`AGENT_TYPE`) for release authors. 
## Citation Reconciliation diff --git a/src/somef/process_files.py b/src/somef/process_files.py index 2b80ad0f..87a5795e 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -345,14 +345,20 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner { constants.PROP_VALUE: workflow_url, constants.PROP_TYPE: constants.URL - }, 1, constants.TECHNIQUE_FILE_EXPLORATION) + }, 1, constants.TECHNIQUE_FILE_EXPLORATION) + elif repo_type == constants.RepositoryType.CODEBERG: + if (file_path.startswith(".forgejo/workflows/") or file_path.startswith(".gitea/workflows/")): + category = constants.CAT_CONTINUOUS_INTEGRATION + else: + category = None + + if category: + workflow_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, + repo_dir, repo_relative_path, filename) + metadata_result.add_result(category, + {constants.PROP_VALUE: workflow_url, constants.PROP_TYPE: constants.URL}, + 1, constants.TECHNIQUE_FILE_EXPLORATION) elif repo_type == constants.RepositoryType.GITHUB: - # if file_path.startswith(".github/workflows/"): - # category = constants.CAT_WORKFLOWS - # elif filename in [".travis.yml", "azure-pipelines.yml", "jenkinsfile"] or file_path.startswith(".circleci/"): - # category = constants.CAT_CONTINUOUS_INTEGRATION - # else: - # category = None if file_path.startswith(".github/workflows/"): category = constants.CAT_CONTINUOUS_INTEGRATION else: @@ -413,6 +419,8 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner docs_url = f"https://github.com/{owner}/{repo_name}/tree/{urllib.parse.quote(repo_default_branch)}/{docs_path}" elif repo_type == constants.RepositoryType.GITLAB: docs_url = f"https://{domain_gitlab}/{owner}/{repo_name}/-/tree/{urllib.parse.quote(repo_default_branch)}/{docs_path}" + elif repo_type == constants.RepositoryType.CODEBERG: + docs_url = f"https://codeberg.org/{owner}/{repo_name}/src/branch/{urllib.parse.quote(repo_default_branch)}/{docs_path}" 
else: docs_url = os.path.join(repo_dir, docs_path) # docs.append(docs_url) @@ -452,6 +460,8 @@ def get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, r return convert_to_raw_user_content_github(file_path, owner, repo_name, repo_default_branch) elif repo_type == constants.RepositoryType.GITLAB: return convert_to_raw_user_content_gitlab(file_path, owner, repo_name, repo_default_branch) + elif repo_type == constants.RepositoryType.CODEBERG: + return convert_to_raw_user_content_codeberg(file_path, owner, repo_name, repo_default_branch) else: return os.path.join(repo_dir, repo_relative_path, filename) @@ -695,6 +705,15 @@ def convert_to_raw_user_content_github(partial, owner, repo_name, repo_ref): return f"https://raw.githubusercontent.com/{owner}/{repo_name}/{repo_ref}/{urllib.parse.quote(partial)}" +def convert_to_raw_user_content_codeberg(partial, owner, repo_name, repo_ref): + """Converts Codeberg paths into raw content URLs""" + if partial.startswith("./"): + partial = partial.replace("./", "") + if partial.startswith(".\\"): + partial = partial.replace(".\\", "") + return f"https://codeberg.org/{owner}/{repo_name}/raw/branch/{repo_ref}/{urllib.parse.quote(partial)}" + + def convert_to_raw_user_content_gitlab(partial, owner, repo_name, repo_ref): """Converts GitLab paths into raw.githubuser content URLs, accessible by users""" if partial.startswith("./"): diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 6cbd6e06..1bcc773c 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -483,6 +483,9 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, elif repo_type is constants.RepositoryType.GITHUB: primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md" secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md" + elif repo_type is constants.RepositoryType.CODEBERG: + primary_url = 
f"https://codeberg.org/{owner}/{repo_name}/raw/branch/{default_branch}/README.md" + secondary_url = f"https://codeberg.org/{owner}/{repo_name}/raw/branch/master/README.md" else: logging.error("Repository type not supported") return None @@ -531,6 +534,8 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, """ if repo_type == constants.RepositoryType.GITLAB: return load_gitlab_repository_metadata(repository_metadata, repository_url) + elif repo_type == constants.RepositoryType.CODEBERG: + return load_codeberg_repository_metadata(repository_metadata, repository_url) elif repo_type == constants.RepositoryType.LOCAL: logging.warning("Trying to download metadata from a local repository") return None @@ -796,6 +801,8 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe return download_github_files(target_dir, owner, repo_name, default_branch, authorization) elif repo_type == constants.RepositoryType.GITLAB: return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref) + elif repo_type == constants.RepositoryType.CODEBERG: + return download_codeberg_files(target_dir, owner, repo_name, default_branch) else: logging.error("Cannot download files from a local repository!") return None @@ -1143,3 +1150,148 @@ def get_all_paginated_results(base_url, headers, per_page=100): return all_results + +def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): + logging.info(f"Loading Repository {repository_url} Information....") + if repository_url[-1] == '/': + repository_url = repository_url[:-1] + url = urlparse(repository_url) + + path_components = [p for p in url.path.split('/') if p] + if len(path_components) < 2: + logging.error("Codeberg link is not correct. 
Expected https://codeberg.org/<owner>/<repo>") + return repo_metadata, "", "", "", "" + + owner = path_components[0] + repo_name = path_components[1] + default_branch = None + + if len(path_components) >= 4 and path_components[2] == "tree": + default_branch = path_components[3] + + repo_api_url = f"{constants.CODEBERG_API}/{owner}/{repo_name}" + resp = requests.get(repo_api_url) + if resp.status_code != 200: + logging.error(f"Error fetching Codeberg repository: {resp.status_code}") + return repo_metadata, "", "", "", "" + general_resp = resp.json() + + if default_branch is None: + default_branch = general_resp.get('default_branch', 'main') + + filtered_resp = do_crosswalk(general_resp, constants.codeberg_crosswalk_table) + if 'html_url' in general_resp: + filtered_resp[constants.CAT_ISSUE_TRACKER] = f"{general_resp['html_url']}/issues" + + filtered_resp[constants.CAT_DOWNLOAD_URL] = f"https://codeberg.org/{owner}/{repo_name}/releases" + + for category, value in filtered_resp.items(): + value_type = constants.STRING + if category in constants.all_categories: + if category == constants.CAT_ISSUE_TRACKER: + value = value.replace("{/number}", "") if isinstance(value, str) else value + if category == constants.CAT_OWNER: + value_type = "User" + if category == constants.CAT_KEYWORDS: + value = '%s,' % (', '.join(value)) + value = value.rstrip(',') + if category in [constants.CAT_CODE_REPOSITORY, constants.CAT_ISSUE_TRACKER, + constants.CAT_DOWNLOAD_URL, constants.CAT_HOMEPAGE]: + value_type = constants.URL + if category in [constants.CAT_DATE_CREATED, constants.CAT_DATE_UPDATED]: + value_type = constants.DATE + if category in [constants.CAT_FORK_COUNTS, constants.CAT_STARS]: + value_type = constants.NUMBER + # Skip CAT_LICENSE because the Codeberg API does not return it + + result = { + constants.PROP_VALUE: value, + constants.PROP_TYPE: value_type + } + if result['value']: + repo_metadata.add_result(category, result, 1, constants.TECHNIQUE_CODEBERG_API) + + if 'languages_url' in 
filtered_resp: + lang_resp = requests.get(filtered_resp['languages_url']) + if lang_resp.status_code == 200: + languages = lang_resp.json() + for l, s in languages.items(): + result = { + constants.PROP_VALUE: l, + constants.PROP_NAME: l, + constants.PROP_TYPE: constants.LANGUAGE, + constants.PROP_SIZE: s, + } + repo_metadata.add_result(constants.CAT_PROGRAMMING_LANGUAGES, result, 1, + constants.TECHNIQUE_CODEBERG_API) + + releases_url = f"{constants.CODEBERG_API}/{owner}/{repo_name}/releases" + releases_resp = requests.get(releases_url) + if releases_resp.status_code == 200: + releases_list = releases_resp.json() + release_list_filtered = [do_crosswalk(r, constants.release_codeberg_crosswalk_table) + for r in releases_list] + for release in release_list_filtered: + release_obj = { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: release.get(constants.PROP_URL, "") + } + for category, value in release.items(): + if category == constants.PROP_AUTHOR: + value = { + constants.PROP_NAME: value, + constants.PROP_TYPE: release.get(constants.AGENT_TYPE, "Person") + } + if value: + release_obj[category] = value + if category == constants.CAT_ASSETS and isinstance(value, list): + assets_filtered = [do_crosswalk(a, constants.release_assets_codeberg) for a in value] + key_mapping = { + constants.PROP_BROWSER_URL: constants.PROP_CONTENT_URL, + constants.PROP_SIZE: constants.PROP_CONTENT_SIZE, + constants.PROP_CONTENT_TYPE: constants.PROP_ENCODING_FORMAT, + constants.PROP_DATE_CREATED_AT: constants.PROP_UPLOAD_DATE + } + assets_filtered = [{key_mapping.get(k, k): v for k, v in a.items()} for a in assets_filtered] + release_obj[category] = assets_filtered + repo_metadata.add_result(constants.CAT_RELEASES, release_obj, 1, constants.TECHNIQUE_CODEBERG_API) + + logging.info("Repository information successfully loaded.\n") + return repo_metadata, owner, repo_name, default_branch, "/".join(path_components) + + +def download_codeberg_files(directory, owner, repo_name, 
repo_branch): + """ + Download all repository files from a Codeberg repository. + """ + repo_archive_url = f"https://codeberg.org/{owner}/{repo_name}/archive/{repo_branch}.zip" + logging.info(f"Downloading {repo_archive_url}") + + repo_download = requests.get(repo_archive_url) + if repo_download.status_code != 200: + logging.error(f"Error downloading Codeberg archive: HTTP {repo_download.status_code}") + return None + + repo_zip = repo_download.content + + repo_name_full = owner + "_" + repo_name + repo_zip_file = os.path.join(directory, repo_name_full + ".zip") + repo_extract_dir = os.path.join(directory, repo_name_full) + + with open(repo_zip_file, "wb") as f: + f.write(repo_zip) + + try: + with zipfile.ZipFile(repo_zip_file, "r") as zip_ref: + zip_ref.extractall(repo_extract_dir) + except zipfile.BadZipFile: + logging.error("Downloaded archive is not a valid zip") + return None + + repo_folders = os.listdir(repo_extract_dir) + if not repo_folders: + logging.warning("Repository archive is empty") + return None + + repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) + return repo_dir diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 371489bb..9364aee9 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -78,16 +78,20 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc url = urlparse(repo_url) servidor = url.netloc bGitLab = False + bCodeberg = False if process_repository.is_gitlab(servidor): logging.info(f"{servidor} is GitLab.") + repo_type = constants.RepositoryType.GITLAB bGitLab = True - # if reconcile_authors: - # logging.info("Author enrichment disabled: GitLab repositories are not supported for GitHub user enrichment.") - # reconcile_authors = False + logging.info(f"DEBUG: {servidor} is_gitlab = {bGitLab}") + elif servidor == constants.CODEBERG_DOMAIN: + repo_type = constants.RepositoryType.CODEBERG + bCodeberg = True + logging.info(f"DEBUG: {servidor} is_codeberg = {bCodeberg}") + - 
logging.info(f"DEBUG: {servidor} is_gitlab = {bGitLab}") - if bGitLab: - repo_type = constants.RepositoryType.GITLAB + # if bGitLab: + # repo_type = constants.RepositoryType.GITLAB logging.info("Processing repository metadata.") repository_metadata, owner, repo_name, def_branch, project_path = process_repository.load_online_repository_metadata( diff --git a/src/somef/test/test_codeberg.py b/src/somef/test/test_codeberg.py new file mode 100644 index 00000000..4d36880d --- /dev/null +++ b/src/somef/test/test_codeberg.py @@ -0,0 +1,128 @@ +import os +import unittest +import json +from pathlib import Path +from .. import somef_cli +from ..utils import constants +from .. import process_repository +from ..process_results import Result +from unittest.mock import patch, MagicMock + +test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep + + +def _make_mock_response(status_code, content=b""): + """Helper: create a minimal mock requests.Response.""" + resp = MagicMock() + resp.status_code = status_code + resp.content = content + resp.headers = {} + try: + resp.json.return_value = json.loads(content) + except json.JSONDecodeError: + pass + return resp + +class TestCodebergRepository(unittest.TestCase): + + @patch("somef.process_repository.requests.get") + def test_load_codeberg_metadata(self, mock_get): + """Load Codeberg repository metadata via mocked API. 
From the API response previously saved in local""" + # load api response + repo_json = json.load(open(test_data_path + "api_responses/codeberg/codeberg_forgejo.json")) + lang_json = json.load(open(test_data_path + "api_responses/codeberg/codeberg_forgejo_languages.json")) + releases_json = json.load(open(test_data_path + "api_responses/codeberg/codeberg_forgejo_releases.json")) + + mock_get.side_effect = [ + _make_mock_response(200, json.dumps(repo_json).encode()), + _make_mock_response(200, json.dumps(lang_json).encode()), + _make_mock_response(200, json.dumps(releases_json).encode()) + ] + + result, owner, repo_name, branch, path = process_repository.load_codeberg_repository_metadata( + Result(), "https://codeberg.org/forgejo/forgejo" + ) + + self.assertIn(constants.CAT_NAME, result.results) + self.assertEqual(result.results[constants.CAT_NAME][0]["result"]["value"], "forgejo") + self.assertIn(constants.CAT_STARS, result.results) + self.assertIn(constants.CAT_DESCRIPTION, result.results) + self.assertIn(constants.CAT_ISSUE_TRACKER, result.results) + self.assertEqual(owner, "forgejo") + self.assertEqual(repo_name, "forgejo") + # language + self.assertIn(constants.CAT_PROGRAMMING_LANGUAGES, result.results) + # releases + self.assertIn(constants.CAT_RELEASES, result.results) + + @patch("somef.process_repository.requests.get") + def test_codeberg_api_error_returns_empty(self, mock_get): + """HTTP error must return empty tuples, not crash.""" + mock_get.return_value = _make_mock_response(404) + result, owner, repo_name, branch, path = \ + process_repository.load_codeberg_repository_metadata( + Result(), "https://codeberg.org/nonexistent/repo" + ) + self.assertEqual(owner, "") + self.assertNotIn(constants.CAT_NAME, result.results) + + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally") + def test_codeberg_integration_cli(self): + """End-to-end CLI test against a real Codeberg repository.""" + output_file = test_data_path 
+ "test-codeberg-integration.json" + + somef_cli.run_cli( + threshold=0.8, + ignore_classifiers=False, + repo_url="https://codeberg.org/forgejo/forgejo", + local_repo=None, + doc_src=None, + in_file=None, + output=output_file, + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=True, + readme_only=False, + reconcile_authors=False + ) + + with open(output_file, "r") as f: + json_content = json.load(f) + + name_entries = json_content.get(constants.CAT_NAME, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_CODEBERG_API and e["result"]["value"] == "forgejo" + for e in name_entries), + "Name from Codeberg_API should be 'forgejo'" + ) + + desc_entries = json_content.get(constants.CAT_DESCRIPTION, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_CODEBERG_API and e["result"]["value"] == "Beyond coding. We forge." + for e in desc_entries), + "Description from Codeberg_API should match" + ) + + code_repo_entries = json_content.get(constants.CAT_CODE_REPOSITORY, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_CODEBERG_API and + e["result"]["value"] == "https://codeberg.org/forgejo/forgejo" + for e in code_repo_entries) + ) + + homepage_entries = json_content.get(constants.CAT_HOMEPAGE, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_CODEBERG_API and + e["result"]["value"] == "https://forgejo.org" + for e in homepage_entries) + ) + self.assertIn(constants.CAT_STARS, json_content) + self.assertIn(constants.CAT_FORK_COUNTS, json_content) + + + ci_entries = json_content.get(constants.CAT_CONTINUOUS_INTEGRATION, []) + self.assertTrue(len(ci_entries) >= 10, f"Expected at least 10 CI workflows, got {len(ci_entries)}") + + os.remove(output_file) \ No newline at end of file diff --git a/src/somef/test/test_data/expected/runtime_platform_repo.yaml b/src/somef/test/test_data/expected/runtime_platform_repo.yaml index d0f08f14..6de7c6bb 100644 --- 
a/src/somef/test/test_data/expected/runtime_platform_repo.yaml +++ b/src/somef/test/test_data/expected/runtime_platform_repo.yaml @@ -6,4 +6,4 @@ CAT_PROGRAMMING_LANGUAGES: name: Java value: Java version: "1.8" - type: Language + type: Programming_language diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index bec49c19..9e30ed4e 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -311,6 +311,7 @@ TECHNIQUE_CODE_CONFIG_PARSER = "code_parser" TECHNIQUE_GITHUB_API = "GitHub_API" TECHNIQUE_GITLAB_API = "GitLab_API" +TECHNIQUE_CODEBERG_API = "Codeberg_API" TECHNIQUE_HEURISTICS = "software_type_heuristics" # GitHub properties @@ -318,6 +319,10 @@ GITHUB_ACCEPT_HEADER = "application/vnd.github.v3+json" GITHUB_API = "https://api.github.com/repos" +#Codeberg properties +CODEBERG_DOMAIN = "codeberg.org" +CODEBERG_API = "https://codeberg.org/api/v1/repos" + # Software Heritage SWH_ROOT = "https://archive.softwareheritage.org/" REGEXP_SWH = r'\[\!\[SWH\]([^\]]+)\]\(([^)]+)\)' @@ -349,6 +354,25 @@ CAT_HOMEPAGE: "homepage" } +# Crosswalk to retrieve easily contents of interest from the codeberg response +codeberg_crosswalk_table = { + CAT_CODE_REPOSITORY: "html_url", + "languages_url": "languages_url", + CAT_OWNER: ["owner", "login"], + # AGENT_TYPE: ["owner", "type"], + CAT_DATE_CREATED: "created_at", + CAT_DATE_UPDATED: "updated_at", + # CAT_LICENSE: "license", + CAT_DESCRIPTION: "description", + CAT_NAME: "name", + CAT_FULL_NAME: "full_name", + # CAT_ISSUE_TRACKER: "issues_url", + CAT_STARS: "stars_count", + CAT_KEYWORDS: "topics", + CAT_FORK_COUNTS: "forks_count", + CAT_HOMEPAGE: "website" +} + # Mapping for releases release_crosswalk_table = { PROP_TAG: 'tag_name', @@ -391,6 +415,31 @@ PROP_DOWNLOAD_COUNT: "download_count" } +release_assets_codeberg = { + PROP_URL: "url", + PROP_NAME: "name", + PROP_SIZE: "size", + PROP_BROWSER_URL: "browser_download_url", + PROP_CONTENT_TYPE: "content_type", + PROP_DATE_CREATED_AT: 
"created_at", + PROP_DOWNLOAD_COUNT: "download_count" +} + +release_codeberg_crosswalk_table = { + PROP_TAG: 'tag_name', + PROP_NAME: 'name', + PROP_AUTHOR: ['author', 'login'], + # AGENT_TYPE: ['author', 'type'], + PROP_DESCRIPTION: 'body', + PROP_TARBALL_URL: 'tarball_url', + PROP_ZIPBALL_URL: 'zipball_url', + PROP_HTML_URL: 'html_url', + PROP_URL: 'url', + PROP_RELEASE_ID: 'id', + PROP_DATE_CREATED: 'created_at', + PROP_DATE_PUBLISHED: "published_at", + # CAT_ASSETS: "attachments" +} # Minimum percentage of total bytes a programming language must have to be considered relevant in CodeMeta file. MINIMUM_PERCENTAGE_LANGUAGE_PROGRAMMING = 10 @@ -423,6 +472,7 @@ class RepositoryType(Enum): GITHUB = 1 GITLAB = 2 LOCAL = 3 + CODEBERG = 4 # Media/script/non-software sets workflow_extensions=('.ga','.cwl','.nf','.knwf','.t2flow','.dag','.kar','.wdl',".smk",".snake") From 0b63ac921eeb1ac4c31ec2f94348085d20f15b2c Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Mon, 8 Jun 2026 11:11:18 +0200 Subject: [PATCH 2/3] rate limit in repo codeberg. 
Token authorization will be added in a future PR --- src/somef/process_repository.py | 33 ++++++++++++++++++++++++--------- src/somef/utils/constants.py | 3 +++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 1bcc773c..fabbafae 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -535,7 +535,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, if repo_type == constants.RepositoryType.GITLAB: return load_gitlab_repository_metadata(repository_metadata, repository_url) elif repo_type == constants.RepositoryType.CODEBERG: - return load_codeberg_repository_metadata(repository_metadata, repository_url) + return load_codeberg_repository_metadata(repository_metadata, repository_url, authorization) elif repo_type == constants.RepositoryType.LOCAL: logging.warning("Trying to download metadata from a local repository") return None @@ -802,12 +802,11 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe elif repo_type == constants.RepositoryType.GITLAB: return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref) elif repo_type == constants.RepositoryType.CODEBERG: - return download_codeberg_files(target_dir, owner, repo_name, default_branch) + return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization) else: logging.error("Cannot download files from a local repository!") return None - # def download_github_files(directory, owner, repo_name, repo_ref, authorization): # """ # Download all repository files from a GitHub repository @@ -1151,8 +1150,12 @@ def get_all_paginated_results(base_url, headers, per_page=100): return all_results -def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): +def load_codeberg_repository_metadata(repo_metadata: Result, repository_url, authorization=None): logging.info(f"Loading Repository 
{repository_url} Information....") + + file_paths = configuration.get_configuration_file() + headers = codeberg_header_template(authorization) + if repository_url[-1] == '/': repository_url = repository_url[:-1] url = urlparse(repository_url) @@ -1170,7 +1173,8 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): default_branch = path_components[3] repo_api_url = f"{constants.CODEBERG_API}/{owner}/{repo_name}" - resp = requests.get(repo_api_url) + # resp = requests.get(repo_api_url) + resp, _ = rate_limit_get(repo_api_url, headers=headers) if resp.status_code != 200: logging.error(f"Error fetching Codeberg repository: {resp.status_code}") return repo_metadata, "", "", "", "" @@ -1212,7 +1216,7 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): repo_metadata.add_result(category, result, 1, constants.TECHNIQUE_CODEBERG_API) if 'languages_url' in filtered_resp: - lang_resp = requests.get(filtered_resp['languages_url']) + lang_resp, _ = rate_limit_get(filtered_resp['languages_url'], headers=headers) if lang_resp.status_code == 200: languages = lang_resp.json() for l, s in languages.items(): @@ -1226,7 +1230,7 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): constants.TECHNIQUE_CODEBERG_API) releases_url = f"{constants.CODEBERG_API}/{owner}/{repo_name}/releases" - releases_resp = requests.get(releases_url) + releases_resp, _ = rate_limit_get(releases_url, headers=headers) if releases_resp.status_code == 200: releases_list = releases_resp.json() release_list_filtered = [do_crosswalk(r, constants.release_codeberg_crosswalk_table) @@ -1260,14 +1264,16 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url): return repo_metadata, owner, repo_name, default_branch, "/".join(path_components) -def download_codeberg_files(directory, owner, repo_name, repo_branch): +def download_codeberg_files(directory, owner, repo_name, repo_branch,authorization=None): """ Download 
all repository files from a Codeberg repository. """ repo_archive_url = f"https://codeberg.org/{owner}/{repo_name}/archive/{repo_branch}.zip" logging.info(f"Downloading {repo_archive_url}") - repo_download = requests.get(repo_archive_url) + headers = codeberg_header_template(authorization) + + repo_download, _ = rate_limit_get(repo_archive_url, headers=headers) if repo_download.status_code != 200: logging.error(f"Error downloading Codeberg archive: HTTP {repo_download.status_code}") return None @@ -1295,3 +1301,12 @@ def download_codeberg_files(directory, owner, repo_name, repo_branch): repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) return repo_dir + +def codeberg_header_template(authorization=None): + header = {} + file_paths = configuration.get_configuration_file() + if authorization is not None: + header["Authorization"] = authorization + elif constants.CONF_CODEBERG_AUTHORIZATION in file_paths: + header["Authorization"] = file_paths[constants.CONF_CODEBERG_AUTHORIZATION] + return header \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 9e30ed4e..20d9a568 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -323,6 +323,9 @@ CODEBERG_DOMAIN = "codeberg.org" CODEBERG_API = "https://codeberg.org/api/v1/repos" +# Token codeberg +CONF_CODEBERG_AUTHORIZATION = "codeberg_authorization" + # Software Heritage SWH_ROOT = "https://archive.softwareheritage.org/" REGEXP_SWH = r'\[\!\[SWH\]([^\]]+)\]\(([^)]+)\)' From c75b852b39895c5fc5e6dc4209e71f5668faa87c Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 9 Jun 2026 10:32:56 +0200 Subject: [PATCH 3/3] add bitbucket repos to describe. Docs. 
Fixes #1007 --- .gitignore | 2 + README.md | 10 +- docs/index.md | 6 +- docs/output.md | 29 +- src/somef/process_files.py | 39 ++- src/somef/process_repository.py | 166 ++++++++++- src/somef/somef_cli.py | 4 + src/somef/test/test_bitbucket.py | 134 +++++++++ .../bitbucket/bitbucket_response.json | 127 +++++++++ .../bitbucket/bitbucket_tags.json | 1 + .../codeberg/codeberg_forgejo.json | 104 +++++++ .../codeberg/codeberg_forgejo_languages.json | 16 ++ .../codeberg/codeberg_forgejo_releases.json | 261 ++++++++++++++++++ src/somef/utils/constants.py | 27 ++ 14 files changed, 897 insertions(+), 29 deletions(-) create mode 100644 src/somef/test/test_bitbucket.py create mode 100644 src/somef/test/test_data/api_responses/bitbucket/bitbucket_response.json create mode 100644 src/somef/test/test_data/api_responses/bitbucket/bitbucket_tags.json create mode 100644 src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo.json create mode 100644 src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_languages.json create mode 100644 src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_releases.json diff --git a/.gitignore b/.gitignore index 7858c187..c02e1e33 100644 --- a/.gitignore +++ b/.gitignore @@ -24,5 +24,7 @@ repos.txt !package_neors.json !package_npm.json !test_data/api_responses/*.json +!**/test_data/api_responses/codeberg/*.json +!**/test_data/api_responses/bitbucket/*.json uv.lock .python-version diff --git a/README.md b/README.md index 6d5285a0..380dcdb1 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ A command line interface for automatically extracting relevant metadata from cod ## Features -Given a readme file (or a GitHub/Gitlab/Codeberg repository) SOMEF will extract the following categories (if present), listed in alphabetical order: +Given a readme file (or a GitHub/Gitlab/Codeberg/Bitbucket repository) SOMEF will extract the following categories (if present), listed in alphabetical order: - **Acknowledgement**: Text 
acknowledging funding sources or contributors - **Application domain**: The application domain of the repository. Current supported domains include: Astrophysics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantc web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/) @@ -38,7 +38,7 @@ We recognize the following properties: - Year: Year of publication - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project -- **Code repository**: Link to the GitHub/GitLab/Codeberg repository used for the extraction +- **Code repository**: Link to the GitHub/GitLab/Codeberg/Bitbucket repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository @@ -93,7 +93,7 @@ We recognize the following properties: - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository -We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step.
For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) +We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg/Bitbucket API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) ## Documentation @@ -297,10 +297,10 @@ Usage: somef describe [OPTIONS] Options: -t, --threshold FLOAT Threshold to classify the text [required] Input: [mutually_exclusive, required] - -r, --repo_url URL Github/Gitlab/Codeberg Repository URL -d, --doc_src PATH Path to the README file source -i, --in_file PATH A file of newline separated links to GitHub/ - Gitlab/Codeberg repositories + -r, --repo_url URL Github/Gitlab/Codeberg/Bitbucket Repository URL -d, --doc_src PATH Path to the README file source -i, --in_file PATH A file of newline separated links to GitHub/ + Gitlab/Codeberg/Bitbucket repositories -l, --local_repo PATH Path to the local repository source.
No APIs will be used Output: [required_any] diff --git a/docs/index.md b/docs/index.md index eea7b08e..00fe5ed8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ We recognize the following properties: - Year: Year of publication - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project -- **Code repository**: Link to the GitHub/GitLab/Codeberg repository used for the extraction +- **Code repository**: Link to the GitHub/GitLab/Codeberg/Bitbucket repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component - **Continuous integration**: Link to continuous integration service(s) - **Contribution guidelines**: Text indicating how to contribute to this code repository @@ -80,7 +80,7 @@ We recognize the following properties: - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) -- **Releases** (GitHub, Gitlab and Codeberg): Pointer to the available versions of a software component. For each release, somef will track the following properties: +- **Releases** (GitHub, Gitlab, Codeberg and Bitbucket): Pointer to the available versions of a software component. 
For each release, somef will track the following properties: - Assets: files attached to the release - Description: Release notes - Author: Agent responsible of creating the release @@ -102,7 +102,7 @@ We recognize the following properties: - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository -We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) +We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab/Codeberg/Bitbucket API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) 1 The available application domains currently are: diff --git a/docs/output.md b/docs/output.md index 4aa9354a..973c61ca 100644 --- a/docs/output.md +++ b/docs/output.md @@ -73,7 +73,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `code_of_conduct`: Link to the code of conduct file of the project - `code_repository`: Link to the source code (typically the repository where the readme can be found) - `contact`: Contact person responsible for maintaining a software component. 
-- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab and Codeberg. +- `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab, Codeberg and Bitbucket. - `contributing guidelines`: Guidelines indicating how to contribute to a software component. - `contributor`: Contributors to this software. Note: Contributor metadata is exported from metadata files (e.g., CodeMeta, CONTRIBUTORS, etc.) not from git logs. - `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available. @@ -318,6 +318,7 @@ The techniques can be of several types: - `GitHub_API`: the result was obtained from the GitHub API. - `GitLab_API`: the result was obtained from the GitLab API. - `Codeberg_API`: the result was obtained from the Codeberg API. +- `Bitbucket_API`: the result was obtained from the Bitbucket API. - `regular_expression`: the result was obtained after performing regular expressions on the files in the repository. - `software_type_heuristics`: the result was obtained from analysis of the repository based on various heuristics from the README, code and extension analysis. - `supervised_classification`: the results were obtained after running text classifiers trained for detecting that type of header. @@ -433,6 +434,32 @@ For releases, the field mapping is identical to GitHub. The only differences are uses `attachments` instead of `assets` for release files, and it does not provide `author.type` (`AGENT_TYPE`) for release authors. + +## Bitbucket API Crosswalk + +When analyzing a Bitbucket repository, SOMEF uses the [Bitbucket Cloud API](https://developer.atlassian.com/cloud/bitbucket/rest/api-group-repositories/) +(`GET /2.0/repositories/{workspace}/{repo_slug}`) to retrieve metadata. 
The table below shows how Bitbucket API +fields map to SOMEF categories: + +| SOMEF category | Bitbucket API field | Notes | +|---|---|---| +| `name` | `slug` | | +| `description` | `description` | | +| `full_name` | `full_name` | Format: `{workspace}/{slug}` | +| `code_repository` | `links.html.href` | | +| `owner` | `owner.nickname` | Falls back to `owner.username` for team workspaces | +| `date_created` | `created_on` | | +| `date_updated` | `updated_on` | | +| `homepage` | `website` | | +| `forks_url` | `links.forks.href` | | +| `download_url` | *(constructed)* | Built as `{html_url}/downloads` | +| `issue_tracker` | *(constructed)* | Built as `{html_url}/issues` when `has_issues` is true | +| `programming_languages` | `language` | Single string, not a dictionary with sizes | +| `releases` | `/refs/tags` | Bitbucket has no dedicated releases endpoint; uses the tags endpoint | +| `stars` | *(not available)* | Bitbucket does not have a stargazers feature | +| `forks_count` | *(not available)* | Bitbucket does not expose fork counts in its API | + + ## Citation Reconciliation SOMEF reconciles extracted citations by matching unique identifiers (such as DOIs or URLs) to ensure a single, non-duplicated list of references. During this process, each entry retains its provenance, including the original source and the extraction technique used, ensuring full metadata transparency. 
diff --git a/src/somef/process_files.py b/src/somef/process_files.py index 87a5795e..ae3daa6e 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -328,6 +328,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner # if repo_type == constants.RepositoryType.GITLAB: if filename.endswith(".yml"): + category = None if repo_type == constants.RepositoryType.GITLAB: analysis = extract_workflows.is_file_continuous_integration_gitlab(os.path.join(repo_dir, file_path)) if analysis: @@ -351,26 +352,23 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner category = constants.CAT_CONTINUOUS_INTEGRATION else: category = None - - if category: - workflow_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, - repo_dir, repo_relative_path, filename) - metadata_result.add_result(category, - {constants.PROP_VALUE: workflow_url, constants.PROP_TYPE: constants.URL}, - 1, constants.TECHNIQUE_FILE_EXPLORATION) + elif repo_type == constants.RepositoryType.BITBUCKET: + if os.path.basename(file_path) == "bitbucket-pipelines.yml": + category = constants.CAT_CONTINUOUS_INTEGRATION + else: + category = None elif repo_type == constants.RepositoryType.GITHUB: if file_path.startswith(".github/workflows/"): category = constants.CAT_CONTINUOUS_INTEGRATION else: category = None - if category: - workflow_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, - repo_dir, repo_relative_path, filename) - metadata_result.add_result(category, - {constants.PROP_VALUE: workflow_url, constants.PROP_TYPE: constants.URL}, - 1, constants.TECHNIQUE_FILE_EXPLORATION) - + if category: + workflow_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, + repo_dir, repo_relative_path, filename) + metadata_result.add_result(category, + {constants.PROP_VALUE: workflow_url, constants.PROP_TYPE: constants.URL}, + 1, constants.TECHNIQUE_FILE_EXPLORATION) if 
filename.endswith(".ga") or filename.endswith(".cwl") or filename.endswith(".nf") or ( filename.endswith(".snake") or filename.endswith( ".smk") or "Snakefile" == filename_no_ext) or filename.endswith(".knwf") or filename.endswith( @@ -421,6 +419,8 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner docs_url = f"https://{domain_gitlab}/{owner}/{repo_name}/-/tree/{urllib.parse.quote(repo_default_branch)}/{docs_path}" elif repo_type == constants.RepositoryType.CODEBERG: docs_url = f"https://codeberg.org/{owner}/{repo_name}/src/branch/{urllib.parse.quote(repo_default_branch)}/{docs_path}" + elif repo_type == constants.RepositoryType.BITBUCKET: + docs_url = f"https://bitbucket.org/{owner}/{repo_name}/src/{urllib.parse.quote(repo_default_branch)}/{docs_path}" else: docs_url = os.path.join(repo_dir, docs_path) # docs.append(docs_url) @@ -462,6 +462,8 @@ def get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, r return convert_to_raw_user_content_gitlab(file_path, owner, repo_name, repo_default_branch) elif repo_type == constants.RepositoryType.CODEBERG: return convert_to_raw_user_content_codeberg(file_path, owner, repo_name, repo_default_branch) + elif repo_type == constants.RepositoryType.BITBUCKET: + return convert_to_raw_user_content_bitbucket(file_path, owner, repo_name, repo_default_branch) else: return os.path.join(repo_dir, repo_relative_path, filename) @@ -714,6 +716,15 @@ def convert_to_raw_user_content_codeberg(partial, owner, repo_name, repo_ref): return f"https://codeberg.org/{owner}/{repo_name}/raw/branch/{repo_ref}/{urllib.parse.quote(partial)}" +def convert_to_raw_user_content_bitbucket(partial, owner, repo_name, repo_ref): + """Converts Bitbucket paths into raw content URLs""" + if partial.startswith("./"): + partial = partial.replace("./", "") + if partial.startswith(".\\"): + partial = partial.replace(".\\", "") + return 
f"https://bitbucket.org/{owner}/{repo_name}/raw/{repo_ref}/{urllib.parse.quote(partial)}" + + def convert_to_raw_user_content_gitlab(partial, owner, repo_name, repo_ref): """Converts GitLab paths into raw.githubuser content URLs, accessible by users""" if partial.startswith("./"): diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index fabbafae..4d83396c 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -85,20 +85,23 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const stream=use_stream, **kwargs ) - # Detect invalid or insufficient GitHub token + # Detect invalid or insufficient token if response.status_code == 401: - raise Exception("Invalid GitHub token. Run `somef configure` to set a valid token.") + raise Exception("Invalid token. Run `somef configure` to set a valid token.") if response.status_code == 403: - raise Exception("GitHub token lacks required permissions or scopes.") + raise Exception("Token lacks required permissions or scopes.") date = response.headers.get("Date", "") # Show rate limit information if available if "X-RateLimit-Remaining" in response.headers: rate_limit_remaining = response.headers["X-RateLimit-Remaining"] epochtime = int(response.headers["X-RateLimit-Reset"]) + if epochtime < 1000000000: + epochtime = int(time.time()) + epochtime + date_reset = datetime.fromtimestamp(epochtime) logging.info( - "Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str( + "Remaining repository API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str( date_reset)) if not use_stream: @@ -486,6 +489,9 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, elif repo_type is constants.RepositoryType.CODEBERG: primary_url = f"https://codeberg.org/{owner}/{repo_name}/raw/branch/{default_branch}/README.md" secondary_url = 
f"https://codeberg.org/{owner}/{repo_name}/raw/branch/master/README.md" + elif repo_type is constants.RepositoryType.BITBUCKET: + primary_url = f"https://bitbucket.org/{owner}/{repo_name}/raw/{default_branch}/README.md" + secondary_url = f"https://bitbucket.org/{owner}/{repo_name}/raw/master/README.md" else: logging.error("Repository type not supported") return None @@ -536,6 +542,8 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, return load_gitlab_repository_metadata(repository_metadata, repository_url) elif repo_type == constants.RepositoryType.CODEBERG: return load_codeberg_repository_metadata(repository_metadata, repository_url, authorization) + elif repo_type == constants.RepositoryType.BITBUCKET: + return load_bitbucket_repository_metadata(repository_metadata, repository_url, authorization) elif repo_type == constants.RepositoryType.LOCAL: logging.warning("Trying to download metadata from a local repository") return None @@ -773,7 +781,7 @@ def do_crosswalk(data, crosswalk_table): if value is not None: output[somef_key] = value else: - logging.error(f"Error: key {path} not present in github repository") + logging.error(f"Error: key {path} not present in repository") return output @@ -803,6 +811,8 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref) elif repo_type == constants.RepositoryType.CODEBERG: return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization) + elif repo_type == constants.RepositoryType.BITBUCKET: + return download_bitbucket_files(target_dir, owner, repo_name, default_branch, authorization) else: logging.error("Cannot download files from a local repository!") return None @@ -1302,6 +1312,7 @@ def download_codeberg_files(directory, owner, repo_name, repo_branch,authorizati repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) return repo_dir + def 
codeberg_header_template(authorization=None): header = {} file_paths = configuration.get_configuration_file() @@ -1309,4 +1320,147 @@ def codeberg_header_template(authorization=None): header["Authorization"] = authorization elif constants.CONF_CODEBERG_AUTHORIZATION in file_paths: header["Authorization"] = file_paths[constants.CONF_CODEBERG_AUTHORIZATION] - return header \ No newline at end of file + return header + + +def bitbucket_header_template(authorization=None): + header = {} + file_paths = configuration.get_configuration_file() + if authorization is not None: + header["Authorization"] = authorization + elif constants.CONF_BITBUCKET_AUTHORIZATION in file_paths: + header["Authorization"] = file_paths[constants.CONF_BITBUCKET_AUTHORIZATION] + return header + + +def load_bitbucket_repository_metadata(repo_metadata: Result, repository_url, authorization=None): + logging.info(f"Loading Repository {repository_url} Information....") + if repository_url[-1] == '/': + repository_url = repository_url[:-1] + url = urlparse(repository_url) + + path_components = [p for p in url.path.split('/') if p] + if len(path_components) < 2: + logging.error("Bitbucket link is not correct. 
Expected https://bitbucket.org//") + return repo_metadata, "", "", "", "" + + owner = path_components[0] + repo_name = path_components[1] + default_branch = None + + if len(path_components) >= 4 and path_components[2] == "tree": + default_branch = path_components[3] + + # API call + repo_api_url = f"{constants.BITBUCKET_API}/{owner}/{repo_name}" + headers = bitbucket_header_template(authorization) + resp, _ = rate_limit_get(repo_api_url, headers=headers) + if resp.status_code != 200: + logging.error(f"Error fetching Bitbucket repository: {resp.status_code}") + return repo_metadata, "", "", "", "" + general_resp = resp.json() + + if default_branch is None: + default_branch = general_resp.get('mainbranch', {}).get('name', 'main') + + filtered_resp = do_crosswalk(general_resp, constants.bitbucket_crosswalk_table) + + if constants.CAT_OWNER not in filtered_resp or not filtered_resp[constants.CAT_OWNER]: + owner_obj = general_resp.get('owner', {}) + owner_val = owner_obj.get('nickname') or owner_obj.get('username') + if owner_val: + filtered_resp[constants.CAT_OWNER] = owner_val + + # Issue tracker + if general_resp.get('has_issues', False) and 'links' in general_resp and 'html' in general_resp['links']: + html_url = general_resp['links']['html']['href'] + filtered_resp[constants.CAT_ISSUE_TRACKER] = f"{html_url}/issues" + + + if 'language' in general_resp and general_resp['language']: + lang_value = general_resp['language'] + result = { + constants.PROP_VALUE: lang_value, + constants.PROP_NAME: lang_value, + constants.PROP_TYPE: constants.LANGUAGE, + } + repo_metadata.add_result(constants.CAT_PROGRAMMING_LANGUAGES, result, 1, + constants.TECHNIQUE_BITBUCKET_API) + + + if 'links' in general_resp and 'html' in general_resp['links']: + filtered_resp[constants.CAT_DOWNLOAD_URL] = f"{general_resp['links']['html']['href']}/downloads" + + for category, value in filtered_resp.items(): + value_type = constants.STRING + if category in constants.all_categories: + if category == 
constants.CAT_OWNER: + value_type = "User" + if category in [constants.CAT_CODE_REPOSITORY, constants.CAT_ISSUE_TRACKER, + constants.CAT_DOWNLOAD_URL, constants.CAT_HOMEPAGE, constants.CAT_FORKS_URLS]: + value_type = constants.URL + if category in [constants.CAT_DATE_CREATED, constants.CAT_DATE_UPDATED]: + value_type = constants.DATE + if category == constants.CAT_PROGRAMMING_LANGUAGES: + value_type = constants.LANGUAGE + + result = { + constants.PROP_VALUE: value, + constants.PROP_TYPE: value_type + } + if result['value']: + repo_metadata.add_result(category, result, 1, constants.TECHNIQUE_BITBUCKET_API) + + # Releases from /refs/tags + tags_url = f"{constants.BITBUCKET_API}/{owner}/{repo_name}/refs/tags" + tags_resp, _ = rate_limit_get(tags_url, headers=headers) + if tags_resp.status_code == 200: + tags_data = tags_resp.json() + tags_list = tags_data.get('values', []) + for tag in tags_list: + release_obj = do_crosswalk(tag, constants.release_bitbucket_crosswalk_table) + release_obj[constants.PROP_TYPE] = constants.RELEASE + release_obj[constants.PROP_VALUE] = tag.get('name', '') + repo_metadata.add_result(constants.CAT_RELEASES, release_obj, 1, + constants.TECHNIQUE_BITBUCKET_API) + + logging.info("Repository information successfully loaded.\n") + return repo_metadata, owner, repo_name, default_branch, "/".join(path_components) + + +def download_bitbucket_files(directory, owner, repo_name, repo_branch, authorization=None): + repo_archive_url = f"https://bitbucket.org/{owner}/{repo_name}/get/{repo_branch}.zip" + logging.info(f"Downloading {repo_archive_url}") + + headers = bitbucket_header_template(authorization) + repo_download, _ = rate_limit_get(repo_archive_url, headers=headers) + if repo_download is None: + logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length") + return None + if repo_download.status_code != 200: + logging.error(f"Error downloading Bitbucket archive: HTTP 
{repo_download.status_code}") + return None + + repo_zip = repo_download.content + + repo_name_full = owner + "_" + repo_name + repo_zip_file = os.path.join(directory, repo_name_full + ".zip") + repo_extract_dir = os.path.join(directory, repo_name_full) + + with open(repo_zip_file, "wb") as f: + f.write(repo_zip) + + try: + with zipfile.ZipFile(repo_zip_file, "r") as zip_ref: + zip_ref.extractall(repo_extract_dir) + except zipfile.BadZipFile: + logging.error("Downloaded archive is not a valid zip") + return None + + repo_folders = os.listdir(repo_extract_dir) + if not repo_folders: + logging.warning("Repository archive is empty") + return None + + repo_dir = os.path.join(repo_extract_dir, repo_folders[0]) + return repo_dir \ No newline at end of file diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 9364aee9..dff53b03 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -79,6 +79,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc servidor = url.netloc bGitLab = False bCodeberg = False + bBitbucket = False if process_repository.is_gitlab(servidor): logging.info(f"{servidor} is GitLab.") repo_type = constants.RepositoryType.GITLAB @@ -88,6 +89,9 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc repo_type = constants.RepositoryType.CODEBERG bCodeberg = True logging.info(f"DEBUG: {servidor} is_codeberg = {bCodeberg}") + elif "bitbucket.org" in servidor: + repo_type = constants.RepositoryType.BITBUCKET + bBitbucket = True # if bGitLab: diff --git a/src/somef/test/test_bitbucket.py b/src/somef/test/test_bitbucket.py new file mode 100644 index 00000000..ef6bcbb4 --- /dev/null +++ b/src/somef/test/test_bitbucket.py @@ -0,0 +1,134 @@ +import os +import unittest +import json +from pathlib import Path +from .. import somef_cli +from ..utils import constants +from .. 
import process_repository +from ..process_results import Result +from unittest.mock import patch, MagicMock + +test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep + + +def _make_mock_response(status_code, content=b""): + """Helper: create a minimal mock requests.Response.""" + resp = MagicMock() + resp.status_code = status_code + resp.content = content + resp.headers = {} + try: + resp.json.return_value = json.loads(content) + except json.JSONDecodeError: + pass + return resp + + +class TestBitbucketRepository(unittest.TestCase): + + @patch("somef.process_repository.requests.get") + def test_load_bitbucket_metadata(self, mock_get): + repo_json = json.load(open(test_data_path + "api_responses/bitbucket/bitbucket_response.json")) + tags_json = json.load(open(test_data_path + "api_responses/bitbucket/bitbucket_tags.json")) + + mock_get.side_effect = [ + _make_mock_response(200, json.dumps(repo_json).encode()), + _make_mock_response(200, json.dumps(tags_json).encode()) + ] + + result, owner, repo_name, branch, path = \ + process_repository.load_bitbucket_repository_metadata( + Result(), "https://bitbucket.org/bitbucketpipelines/pipelines-guide-python" + ) + + self.assertIn(constants.CAT_NAME, result.results) + self.assertIn(constants.CAT_DESCRIPTION, result.results) + self.assertIn(constants.CAT_CODE_REPOSITORY, result.results) + self.assertIn(constants.CAT_FULL_NAME, result.results) + self.assertIn(constants.CAT_FORKS_URLS, result.results) + self.assertIn(constants.CAT_PROGRAMMING_LANGUAGES, result.results) + # this repo has not issues + self.assertNotIn(constants.CAT_ISSUE_TRACKER, result.results) + # this repo has not releases + self.assertNotIn(constants.CAT_RELEASES, result.results) + self.assertEqual(owner, "bitbucketpipelines") + self.assertEqual(repo_name, "pipelines-guide-python") + self.assertEqual(branch, "master") + + + @patch("somef.process_repository.requests.get") + def test_bitbucket_api_error_returns_empty(self, mock_get): + 
mock_get.return_value = _make_mock_response(404) + result, owner, repo_name, branch, path = \ + process_repository.load_bitbucket_repository_metadata( + Result(), "https://bitbucket.org/nonexistent/repo" + ) + self.assertEqual(owner, "") + self.assertNotIn(constants.CAT_NAME, result.results) + + + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally") + def test_bitbucket_integration_cli(self): + output_file = test_data_path + "test-bitbucket-integration.json" + somef_cli.run_cli( + threshold=0.8, + ignore_classifiers=False, + repo_url="https://bitbucket.org/bitbucketpipelines/pipelines-guide-python", + local_repo=None, + doc_src=None, + in_file=None, + output=output_file, + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=True, + readme_only=False, + reconcile_authors=False + ) + + with open(output_file, "r") as f: + json_content = json.load(f) + + name_entries = json_content.get(constants.CAT_NAME, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_BITBUCKET_API and e["result"]["value"] == "pipelines-guide-python" + for e in name_entries), + "Name from Bitbucket_API should be 'pipelines-guide-python'" + ) + + code_repo_entries = json_content.get(constants.CAT_CODE_REPOSITORY, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_BITBUCKET_API and e["result"]["value"] == "https://bitbucket.org/bitbucketpipelines/pipelines-guide-python" + for e in code_repo_entries) + ) + + full_name_entries = json_content.get(constants.CAT_FULL_NAME, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_BITBUCKET_API and e["result"]["value"] == "bitbucketpipelines/pipelines-guide-python" + for e in full_name_entries) + ) + + pl_entries = json_content.get(constants.CAT_PROGRAMMING_LANGUAGES, []) + self.assertTrue( + any(e["technique"] == constants.TECHNIQUE_BITBUCKET_API and e["result"]["value"] == "python" + for e in pl_entries) + ) + + + ft_entries = 
json_content.get(constants.CAT_FULL_TITLE, []) + self.assertTrue( + any(e.get("result", {}).get("value") == "Pipelines Python" + for e in ft_entries) + ) + + # print(json_content[constants.CAT_FULL_TITLE]) + # print(json_content[constants.CAT_FORKS_URLS]) + self.assertTrue(any(e["technique"] == constants.TECHNIQUE_BITBUCKET_API + for e in json_content.get(constants.CAT_FORKS_URLS, []))) + + self.assertTrue(any(e["technique"] == "file_exploration" + for e in json_content.get(constants.CAT_HAS_BUILD_FILE, []))) + + os.remove(output_file) + diff --git a/src/somef/test/test_data/api_responses/bitbucket/bitbucket_response.json b/src/somef/test/test_data/api_responses/bitbucket/bitbucket_response.json new file mode 100644 index 00000000..6d4f1d31 --- /dev/null +++ b/src/somef/test/test_data/api_responses/bitbucket/bitbucket_response.json @@ -0,0 +1,127 @@ +{ + "type": "repository", + "full_name": "bitbucketpipelines/pipelines-guide-python", + "links": { + "self": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python" + }, + "html": { + "href": "https://bitbucket.org/bitbucketpipelines/pipelines-guide-python" + }, + "avatar": { + "href": "https://bytebucket.org/ravatar/%7Bcfe0b566-69b6-44a1-a3cf-c124a7ce0f24%7D?ts=python" + }, + "pullrequests": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/pullrequests" + }, + "commits": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/commits" + }, + "forks": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/forks" + }, + "watchers": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/watchers" + }, + "branches": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/refs/branches" + }, + "tags": { + "href": 
"https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/refs/tags" + }, + "downloads": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/downloads" + }, + "source": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/src" + }, + "clone": [ + { + "name": "https", + "href": "https://bitbucket.org/bitbucketpipelines/pipelines-guide-python.git" + }, + { + "name": "ssh", + "href": "git@bitbucket.org:bitbucketpipelines/pipelines-guide-python.git" + } + ], + "hooks": { + "href": "https://api.bitbucket.org/2.0/repositories/bitbucketpipelines/pipelines-guide-python/hooks" + } + }, + "name": "pipelines-guide-python", + "slug": "pipelines-guide-python", + "description": "This is an example repo showing pipelines with python", + "scm": "git", + "website": "", + "owner": { + "display_name": "Bitbucket Pipelines", + "links": { + "self": { + "href": "https://api.bitbucket.org/2.0/workspaces/%7Bb2919ebc-eb34-4395-b60f-cd937aaa001c%7D" + }, + "avatar": { + "href": "https://bitbucket.org/account/bitbucketpipelines/avatar/" + }, + "html": { + "href": "https://bitbucket.org/%7Bb2919ebc-eb34-4395-b60f-cd937aaa001c%7D/" + } + }, + "type": "team", + "uuid": "{b2919ebc-eb34-4395-b60f-cd937aaa001c}", + "username": "bitbucketpipelines" + }, + "workspace": { + "type": "workspace", + "uuid": "{b2919ebc-eb34-4395-b60f-cd937aaa001c}", + "name": "Bitbucket Pipelines", + "slug": "bitbucketpipelines", + "links": { + "avatar": { + "href": "https://bitbucket.org/workspaces/bitbucketpipelines/avatar/?ts=1569518620" + }, + "html": { + "href": "https://bitbucket.org/bitbucketpipelines/" + }, + "self": { + "href": "https://api.bitbucket.org/2.0/workspaces/bitbucketpipelines" + } + } + }, + "is_private": false, + "project": { + "type": "project", + "key": "DOC", + "uuid": "{92800567-05cc-4bbd-8485-114b0e689717}", + "name": "documentation", + "links": { + "self": { + "href": 
"https://api.bitbucket.org/2.0/workspaces/bitbucketpipelines/projects/DOC" + }, + "html": { + "href": "https://bitbucket.org/bitbucketpipelines/workspace/projects/DOC" + }, + "avatar": { + "href": "https://bitbucket.org/bitbucketpipelines/workspace/projects/DOC/avatar/32?ts=1506484642" + } + } + }, + "fork_policy": "allow_forks", + "created_on": "2018-10-05T04:53:15.008411+00:00", + "updated_on": "2026-01-29T16:48:15.149268+00:00", + "size": 1373327, + "language": "python", + "uuid": "{cfe0b566-69b6-44a1-a3cf-c124a7ce0f24}", + "mainbranch": { + "name": "master", + "type": "branch" + }, + "override_settings": { + "default_merge_strategy": false, + "branching_model": false + }, + "parent": null, + "enforced_signed_commits": null, + "has_issues": false, + "has_wiki": false +} \ No newline at end of file diff --git a/src/somef/test/test_data/api_responses/bitbucket/bitbucket_tags.json b/src/somef/test/test_data/api_responses/bitbucket/bitbucket_tags.json new file mode 100644 index 00000000..3ba8febb --- /dev/null +++ b/src/somef/test/test_data/api_responses/bitbucket/bitbucket_tags.json @@ -0,0 +1 @@ +{"values": [], "pagelen": 10, "size": 0, "page": 1} \ No newline at end of file diff --git a/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo.json b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo.json new file mode 100644 index 00000000..20610ef3 --- /dev/null +++ b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo.json @@ -0,0 +1,104 @@ +{ + "id": 73144, + "owner": { + "id": 70422, + "login": "forgejo", + "login_name": "", + "source_id": 0, + "full_name": "Forgejo", + "email": "forgejo@noreply.codeberg.org", + "avatar_url": "https://codeberg.org/avatars/dae8ab126a96f6fbd6942cf08ab92382", + "html_url": "https://codeberg.org/forgejo", + "language": "", + "is_admin": false, + "last_login": "0001-01-01T00:00:00Z", + "created": "2022-11-06T07:18:11+01:00", + "restricted": false, + "active": false, + "prohibit_login": false, + 
"location": "", + "pronouns": "", + "website": "https://forgejo.org", + "description": "Beyond coding. We forge.", + "visibility": "public", + "followers_count": 519, + "following_count": 0, + "starred_repos_count": 0, + "username": "forgejo" + }, + "name": "forgejo", + "full_name": "forgejo/forgejo", + "description": "Beyond coding. We forge.", + "empty": false, + "private": false, + "fork": false, + "template": false, + "parent": null, + "mirror": false, + "size": 328744, + "language": "Go", + "languages_url": "https://codeberg.org/api/v1/repos/forgejo/forgejo/languages", + "html_url": "https://codeberg.org/forgejo/forgejo", + "url": "https://codeberg.org/api/v1/repos/forgejo/forgejo", + "link": "", + "ssh_url": "ssh://git@codeberg.org/forgejo/forgejo.git", + "clone_url": "https://codeberg.org/forgejo/forgejo.git", + "original_url": "https://github.com/go-gitea/gitea", + "website": "https://forgejo.org", + "stars_count": 4804, + "forks_count": 827, + "watchers_count": 118, + "open_issues_count": 1320, + "open_pr_counter": 139, + "release_counter": 105, + "default_branch": "forgejo", + "archived": false, + "created_at": "2022-11-06T07:24:57+01:00", + "updated_at": "2026-06-02T01:13:56+02:00", + "archived_at": "1970-01-01T01:00:00+01:00", + "permissions": { + "admin": false, + "push": false, + "pull": true + }, + "has_issues": true, + "internal_tracker": { + "enable_time_tracker": false, + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true + }, + "has_wiki": false, + "has_wiki_contents": false, + "wiki_branch": "master", + "wiki_ssh_url": "ssh://git@codeberg.org/forgejo/forgejo.wiki.git", + "wiki_clone_url": "https://codeberg.org/forgejo/forgejo.wiki.git", + "globally_editable_wiki": false, + "has_pull_requests": true, + "has_projects": true, + "has_releases": true, + "has_packages": true, + "has_actions": true, + "ignore_whitespace_conflicts": false, + "allow_merge_commits": true, + "allow_rebase": true, + "allow_rebase_explicit": 
true, + "allow_squash_merge": true, + "allow_fast_forward_only_merge": false, + "allow_rebase_update": true, + "default_delete_branch_after_merge": true, + "default_merge_style": "squash", + "default_allow_maintainer_edit": true, + "default_update_style": "merge", + "avatar_url": "https://codeberg.org/repo-avatars/73144-c883a242dec5299fbc06bbe3ee71d8c6", + "internal": false, + "mirror_interval": "", + "object_format_name": "sha1", + "mirror_updated": "0001-01-01T00:00:00Z", + "repo_transfer": null, + "topics": [ + "forge", + "forgejo", + "git", + "self-hosted" + ] +} \ No newline at end of file diff --git a/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_languages.json b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_languages.json new file mode 100644 index 00000000..02acc451 --- /dev/null +++ b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_languages.json @@ -0,0 +1,16 @@ +{ + "Go": 16886294, + "go-html-template": 2256939, + "JavaScript": 563991, + "CSS": 395828, + "TypeScript": 377005, + "Vue": 136433, + "Roff": 58206, + "Makefile": 46509, + "Shell": 44058, + "Jsonnet": 15443, + "Dockerfile": 7283, + "Less": 5467, + "Scheme": 2253, + "Nix": 705 +} \ No newline at end of file diff --git a/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_releases.json b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_releases.json new file mode 100644 index 00000000..7bc17202 --- /dev/null +++ b/src/somef/test/test_data/api_responses/codeberg/codeberg_forgejo_releases.json @@ -0,0 +1,261 @@ +[ + { + "id": 9279765, + "tag_name": "v15.0.2", + "target_commitish": "", + "name": "v15.0.2", + "body": "See https://codeberg.org/forgejo/forgejo/src/branch/forgejo/release-notes-published/15.0.2.md", + "url": "https://codeberg.org/api/v1/repos/forgejo/forgejo/releases/9279765", + "html_url": "https://codeberg.org/forgejo/forgejo/releases/tag/v15.0.2", + "tarball_url": 
"https://codeberg.org/forgejo/forgejo/archive/v15.0.2.tar.gz", + "zipball_url": "https://codeberg.org/forgejo/forgejo/archive/v15.0.2.zip", + "hide_archive_links": false, + "upload_url": "https://codeberg.org/api/v1/repos/forgejo/forgejo/releases/9279765/assets", + "draft": false, + "prerelease": false, + "created_at": "2026-05-12T12:09:16+02:00", + "published_at": "2026-05-12T12:09:16+02:00", + "author": { + "id": 70541, + "login": "release-team", + "login_name": "", + "source_id": 0, + "full_name": "Forgejo Release Team", + "email": "release-team@noreply.codeberg.org", + "avatar_url": "https://codeberg.org/avatars/83623afe083e82955f79b2dfeea58bd9f34897fae0f6fd001bc282d3a40efc8b", + "html_url": "https://codeberg.org/release-team", + "language": "", + "is_admin": false, + "last_login": "0001-01-01T00:00:00Z", + "created": "2022-11-07T10:25:48+01:00", + "restricted": false, + "active": false, + "prohibit_login": false, + "location": "", + "pronouns": "", + "website": "https://codeberg.org/forgejo/forgejo", + "description": "Account for publishing Forgejo releases using the Forgejo CI", + "visibility": "public", + "followers_count": 16, + "following_count": 0, + "starred_repos_count": 0, + "username": "release-team" + }, + "assets": [ + { + "id": 1404560, + "name": "forgejo-15.0.2-linux-amd64", + "size": 0, + "download_count": 0, + "created_at": "2026-06-03T14:03:49+02:00", + "uuid": "4ba17d97-2885-4c38-a5cc-7c9a7f8be084", + "browser_download_url": "https://code.forgejo.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64", + "type": "external" + }, + { + "id": 1291356, + "name": "forgejo-15.0.2-linux-amd64.asc", + "size": 228, + "download_count": 1261, + "created_at": "2026-05-12T12:09:19+02:00", + "uuid": "4c7724d9-d9d4-41fc-9758-e2abf194fd61", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64.asc", + "type": "attachment" + }, + { + "id": 1291359, + "name": 
"forgejo-15.0.2-linux-amd64.sha256", + "size": 93, + "download_count": 6882, + "created_at": "2026-05-12T12:09:20+02:00", + "uuid": "814194b6-26ee-4ce8-aa58-e9b854f0ef40", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64.sha256", + "type": "attachment" + }, + { + "id": 1404536, + "name": "forgejo-15.0.2-linux-amd64.xz", + "size": 33843308, + "download_count": 68, + "created_at": "2026-06-03T13:57:40+02:00", + "uuid": "de77717e-973e-44aa-847f-37b5bdc9c0dc", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64.xz", + "type": "attachment" + }, + { + "id": 1291365, + "name": "forgejo-15.0.2-linux-amd64.xz.asc", + "size": 228, + "download_count": 158, + "created_at": "2026-05-12T12:09:21+02:00", + "uuid": "9d6858ef-6b0b-44c3-931f-e643448d71e2", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64.xz.asc", + "type": "attachment" + }, + { + "id": 1291368, + "name": "forgejo-15.0.2-linux-amd64.xz.sha256", + "size": 96, + "download_count": 251, + "created_at": "2026-05-12T12:09:22+02:00", + "uuid": "7e553e73-d979-4bd6-9b08-d1f0c14446c4", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-amd64.xz.sha256", + "type": "attachment" + }, + { + "id": 1404557, + "name": "forgejo-15.0.2-linux-arm-6", + "size": 0, + "download_count": 0, + "created_at": "2026-06-03T14:03:49+02:00", + "uuid": "5fd81cd6-8afe-470b-9705-53a2b19908b2", + "browser_download_url": "https://code.forgejo.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6", + "type": "external" + }, + { + "id": 1291374, + "name": "forgejo-15.0.2-linux-arm-6.asc", + "size": 228, + "download_count": 18, + "created_at": "2026-05-12T12:09:25+02:00", + "uuid": "c5cf70db-8b76-4ddb-aebf-9a08164a7d88", + "browser_download_url": 
"https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6.asc", + "type": "attachment" + }, + { + "id": 1291377, + "name": "forgejo-15.0.2-linux-arm-6.sha256", + "size": 93, + "download_count": 21, + "created_at": "2026-05-12T12:09:25+02:00", + "uuid": "51a849fa-543a-456b-81f0-70d2b54c08ac", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6.sha256", + "type": "attachment" + }, + { + "id": 1404542, + "name": "forgejo-15.0.2-linux-arm-6.xz", + "size": 30797300, + "download_count": 1, + "created_at": "2026-06-03T13:59:10+02:00", + "uuid": "4bfb8fad-8795-4da2-acb3-121bbec2aa8c", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6.xz", + "type": "attachment" + }, + { + "id": 1291383, + "name": "forgejo-15.0.2-linux-arm-6.xz.asc", + "size": 228, + "download_count": 16, + "created_at": "2026-05-12T12:09:26+02:00", + "uuid": "2f3e578d-6fef-4335-9ca0-e5c61432d836", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6.xz.asc", + "type": "attachment" + }, + { + "id": 1291386, + "name": "forgejo-15.0.2-linux-arm-6.xz.sha256", + "size": 96, + "download_count": 19, + "created_at": "2026-05-12T12:09:27+02:00", + "uuid": "4ef9e8fc-8a72-4f0c-8133-2ce6042c0159", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm-6.xz.sha256", + "type": "attachment" + }, + { + "id": 1404563, + "name": "forgejo-15.0.2-linux-arm64", + "size": 0, + "download_count": 0, + "created_at": "2026-06-03T14:03:49+02:00", + "uuid": "c3e8726e-087a-415d-bc73-4a9630f83c19", + "browser_download_url": "https://code.forgejo.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64", + "type": "external" + }, + { + "id": 1291392, + "name": "forgejo-15.0.2-linux-arm64.asc", + "size": 228, + "download_count": 118, + 
"created_at": "2026-05-12T12:09:30+02:00", + "uuid": "ce472907-e213-41a1-bc58-c9998737cce4", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64.asc", + "type": "attachment" + }, + { + "id": 1291395, + "name": "forgejo-15.0.2-linux-arm64.sha256", + "size": 93, + "download_count": 165, + "created_at": "2026-05-12T12:09:30+02:00", + "uuid": "df1a91a9-1c94-4545-bd35-710f87e695ad", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64.sha256", + "type": "attachment" + }, + { + "id": 1404539, + "name": "forgejo-15.0.2-linux-arm64.xz", + "size": 30275260, + "download_count": 6, + "created_at": "2026-06-03T13:59:10+02:00", + "uuid": "e7ef71c6-93a2-44ff-8af0-c07ba4d12a01", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64.xz", + "type": "attachment" + }, + { + "id": 1291401, + "name": "forgejo-15.0.2-linux-arm64.xz.asc", + "size": 228, + "download_count": 44, + "created_at": "2026-05-12T12:09:32+02:00", + "uuid": "6f0b2d62-1a06-4bc2-bb4d-2d632a6b68e4", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64.xz.asc", + "type": "attachment" + }, + { + "id": 1291404, + "name": "forgejo-15.0.2-linux-arm64.xz.sha256", + "size": 96, + "download_count": 84, + "created_at": "2026-05-12T12:09:32+02:00", + "uuid": "a6b6e43d-0e40-4ff5-bf8d-f0793658b90a", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-15.0.2-linux-arm64.xz.sha256", + "type": "attachment" + }, + { + "id": 1404545, + "name": "forgejo-src-15.0.2.tar.gz", + "size": 49711959, + "download_count": 20, + "created_at": "2026-06-03T14:00:54+02:00", + "uuid": "45465223-70f5-4705-9289-c2d5fe7d269c", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-src-15.0.2.tar.gz", + "type": 
"attachment" + }, + { + "id": 1291410, + "name": "forgejo-src-15.0.2.tar.gz.asc", + "size": 228, + "download_count": 42, + "created_at": "2026-05-12T12:09:34+02:00", + "uuid": "6db70858-d902-4d68-8bf2-10359c325599", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-src-15.0.2.tar.gz.asc", + "type": "attachment" + }, + { + "id": 1291413, + "name": "forgejo-src-15.0.2.tar.gz.sha256", + "size": 92, + "download_count": 36, + "created_at": "2026-05-12T12:09:34+02:00", + "uuid": "5e710ccf-62f2-499e-96e9-f4833eaa9c72", + "browser_download_url": "https://codeberg.org/forgejo/forgejo/releases/download/v15.0.2/forgejo-src-15.0.2.tar.gz.sha256", + "type": "attachment" + } + ], + "archive_download_count": { + "zip": 224, + "tar_gz": 363 + } + } +] \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 20d9a568..fb2a1438 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -312,6 +312,7 @@ TECHNIQUE_GITHUB_API = "GitHub_API" TECHNIQUE_GITLAB_API = "GitLab_API" TECHNIQUE_CODEBERG_API = "Codeberg_API" +TECHNIQUE_BITBUCKET_API = "Bitbucket_API" TECHNIQUE_HEURISTICS = "software_type_heuristics" # GitHub properties @@ -323,8 +324,14 @@ CODEBERG_DOMAIN = "codeberg.org" CODEBERG_API = "https://codeberg.org/api/v1/repos" +# Bitbucket properties +BITBUCKET_DOMAIN = "bitbucket.org" +BITBUCKET_API = "https://api.bitbucket.org/2.0/repositories" + # Token codeberg CONF_CODEBERG_AUTHORIZATION = "codeberg_authorization" +# Token bitbucket +CONF_BITBUCKET_AUTHORIZATION = "bitbucket_authorization" # Software Heritage SWH_ROOT = "https://archive.softwareheritage.org/" @@ -376,6 +383,20 @@ CAT_HOMEPAGE: "website" } + +bitbucket_crosswalk_table = { + CAT_FULL_NAME: "full_name", + CAT_NAME: "name", + CAT_DESCRIPTION: "description", + CAT_DATE_CREATED: "created_on", + CAT_DATE_UPDATED: "updated_on", + CAT_OWNER: ["owner", "nickname"], + CAT_CODE_REPOSITORY: ["links", "html", 
"href"], + CAT_HOMEPAGE: "website", + CAT_FORKS_URLS: ["links", "forks", "href"] + # CAT_PROGRAMMING_LANGUAGES: "language", +} + # Mapping for releases release_crosswalk_table = { PROP_TAG: 'tag_name', @@ -443,6 +464,11 @@ PROP_DATE_PUBLISHED: "published_at", # CAT_ASSETS: "attachments" } + +release_bitbucket_crosswalk_table = { + PROP_TAG: "name", + PROP_NAME: "name", +} # Minimum percentage of total bytes a programming language must have to be considered relevant in CodeMeta file. MINIMUM_PERCENTAGE_LANGUAGE_PROGRAMMING = 10 @@ -476,6 +502,7 @@ class RepositoryType(Enum): GITLAB = 2 LOCAL = 3 CODEBERG = 4 + BITBUCKET = 5 # Media/script/non-software sets workflow_extensions=('.ga','.cwl','.nf','.knwf','.t2flow','.dag','.kar','.wdl',".smk",".snake")