diff --git a/LICENSE b/LICENSE index 335ea9d0..d6cbe095 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2018 The Python Packaging Authority +Copyright (c) 2019 Daniel Garijo, Ontology Engineering Group, Universidad Politécnica de Madrid Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/src/somef/__main__.py b/src/somef/__main__.py index 0b666830..2681d84d 100644 --- a/src/somef/__main__.py +++ b/src/somef/__main__.py @@ -1,7 +1,11 @@ # -*- coding: utf-8 -*- import click -from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup, RequiredAnyOptionGroup +from click_option_group import ( + optgroup, + RequiredMutuallyExclusiveOptionGroup, + RequiredAnyOptionGroup, +) import logging from . import configuration @@ -13,33 +17,53 @@ class URLParamType(click.types.StringParamType): name = "url" -@click.group(context_settings={'help_option_names': ['-h', '--help']}) +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.version_option(__version__) def cli(): click.echo("SOftware Metadata Extraction Framework (SOMEF) Command Line Interface") -@cli.command(help="Configure GitHub credentials and classifiers file path") -@click.option('-a', '--auto', help="Automatically configure SOMEF", is_flag=True, default=False) -@click.option('-b', '--base_uri', type=URLParamType(), help="Base URI for somef transformations", - default=constants.CONF_DEFAULT_BASE_URI) +@cli.command(help="Configure GitHub/GitLab credentials and classifiers file path") +@click.option("-a", "--auto", help="Automatically configure SOMEF", is_flag=True, default=False) +@click.option( + "-b", + "--base_uri", + type=URLParamType(), + help="Base URI for somef transformations", + default=constants.CONF_DEFAULT_BASE_URI, +) def configure(auto, base_uri): if auto: - click.echo( - "Configuring SOMEF automatically. To assign credentials edit the configuration file or run " - "the interactive mode") + click.echo("Configuring SOMEF automatically. To assign credentials edit the configuration file or run the interactive mode") configuration.configure() elif base_uri is not constants.CONF_DEFAULT_BASE_URI: configuration.update_base_uri(base_uri) else: - authorization = click.prompt("Authorization", default="") - description = click.prompt("Documentation classifier model file", default=configuration.default_description) + github_authorization = click.prompt("GitHub Authorization token (leave blank to skip)", default="") + gitlab_authorization = click.prompt( + "GitLab Authorization token (leave blank to skip; works for gitlab.com and self-hosted instances)", + default="", + ) + description = click.prompt( + "Documentation classifier model file", + default=configuration.default_description, + ) invocation = click.prompt("Invocation classifier model file", default=configuration.default_invocation) - installation = click.prompt("Installation classifier model file", default=configuration.default_installation) + installation = click.prompt( + "Installation classifier model file", + default=configuration.default_installation, + ) citation = click.prompt("Citation classifier model file", default=configuration.default_citation) base_uri = click.prompt("Base URI for RDF generation", default=base_uri) - # configuration.configure() - configuration.configure(authorization, description, invocation, installation, citation, base_uri) + configuration.configure( + github_authorization, + gitlab_authorization, + description, + invocation, + installation, + citation, + base_uri, + ) click.secho(f"Success", fg="green") @@ -57,14 +81,14 @@ def configure(auto, base_uri): "-ic", is_flag=True, default=False, - help="Flag to ignore running the classifiers (by default False)" + help="Flag to ignore running the classifiers (by default False)", ) @click.option( "--ignore_github_metadata", "-igm", is_flag=True, default=False, - help="Flag to ignore Github Metadata (by default False)" + help="Flag to ignore Github Metadata (by default False)", ) @click.option( "--readme_only", @@ -72,9 +96,9 @@ def configure(auto, base_uri): is_flag=True, default=False, help="Flag to retrieve only the README.md file from the Github/Gitlab Repository URL. If such file does not exist, " - "no metadata will be retrieved (by default False)" + "no metadata will be retrieved (by default False)", ) -@optgroup.group('Input', cls=RequiredMutuallyExclusiveOptionGroup) +@optgroup.group("Input", cls=RequiredMutuallyExclusiveOptionGroup) @optgroup.option( "--repo_url", "-r", @@ -85,13 +109,13 @@ def configure(auto, base_uri): "--doc_src", "-d", type=click.Path(exists=True), - help="Path to the README file source" + help="Path to the README file source", ) @optgroup.option( "--local_repo", "-l", type=click.Path(exists=True), - help="Path to local repository" + help="Path to local repository", ) @optgroup.option( "--in_file", @@ -99,9 +123,9 @@ def configure(auto, base_uri): type=click.Path(exists=True), help=""""A file of newline separated links to GitHub/Gitlab repositories to process in bulk. Each repository will be stored in a different file called $out_$url.json where $out is the name selected as out file and $url is the - url of the target repository (url encoded)""" + url of the target repository (url encoded)""", ) -@optgroup.group('Output', cls=RequiredAnyOptionGroup) +@optgroup.group("Output", cls=RequiredAnyOptionGroup) @optgroup.option( "--output", "-o", @@ -112,34 +136,34 @@ def configure(auto, base_uri): "--codemeta_out", "-c", type=click.Path(), - help="Path to an output codemeta file" + help="Path to an output codemeta file", ) @optgroup.option( "--google_codemeta_out", "-gc", type=click.Path(), - help="Path to an output Google-compliant codemeta file" + help="Path to an output Google-compliant codemeta file", ) @optgroup.option( "--graph_out", "-g", type=click.Path(), help="""Path to the output Knowledge Graph export file. If supplied, the output will be a Knowledge Graph, - in the format given in the --format option chosen (turtle, json-ld)""" + in the format given in the --format option chosen (turtle, json-ld)""", ) @click.option( "--graph_format", "-f", type=click.Choice(["turtle", "json-ld"]), default="turtle", - help="""If the --graph_out option is given, this is the format that the graph will be stored in""" + help="""If the --graph_out option is given, this is the format that the graph will be stored in""", ) @click.option( "--pretty", "-p", is_flag=True, default=False, - help="""Pretty print the JSON output file so that it is easy to compare to another JSON output file.""" + help="""Pretty print the JSON output file so that it is easy to compare to another JSON output file.""", ) @click.option( "--missing", @@ -147,55 +171,63 @@ def configure(auto, base_uri): is_flag=True, default=False, help="""The JSON will include a field missing_categories to report with the missing metadata fields SOMEF was not - able to find. """ + able to find. """, ) @click.option( "--keep_tmp", "-kt", type=click.Path(), help="""SOMEF will NOT delete the temporary folder where files are stored for analysis. Files will be stored at the - desired path""" + desired path""", ) @click.option( "--ignore_test_folder", "-itf", is_flag=True, default=True, - help="""SOMEF will ignore the contents of all files within folders named test (True by default)""" + help="""SOMEF will ignore the contents of all files within folders named test (True by default)""", ) @click.option( "--requirements_all", "-all", is_flag=True, default=False, - help="Export all detected requirements, including text and libraries (default)." + help="Export all detected requirements, including text and libraries (default).", ) @click.option( "--requirements_v", "-v", is_flag=True, default=False, - help="Export only requirements from structured sources (pom.xml, requirements.txt, etc.)" + help="Export only requirements from structured sources (pom.xml, requirements.txt, etc.)", ) @click.option( "--reconcile_authors", "-ra", is_flag=True, default=False, - help="""SOMEF will extract additional information from certain files like CODEOWNERS, etc.""" + help="""SOMEF will extract additional information from certain files like CODEOWNERS, etc.""", ) @click.option( "--branch", "-b", type=str, default=None, - help="Branch of the repository to analyze. Overrides the default branch." + help="Branch of the repository to analyze. Overrides the default branch.", +) +@click.option( + "--gitlab_token", + "-gt", + type=str, + default=None, + help="GitLab personal access token to avoid API rate limits. Works for gitlab.com and self-hosted GitLab instances. " + "Can also be set via the SOMEF_GITLAB_TOKEN environment variable or with 'somef configure'.", ) @click.option( "--tag", type=str, default=None, - help="Tag of the repository to analyze. Incompatible with --branch" + help="Tag of the repository to analyze. Incompatible with --branch", ) def describe(requirements_v, requirements_all, **kwargs): # import so missing packages get installed when appropriate @@ -204,9 +236,9 @@ def describe(requirements_v, requirements_all, **kwargs): elif requirements_all: kwargs["requirements_mode"] = "all" else: - kwargs["requirements_mode"] = "all" + kwargs["requirements_mode"] = "all" from . import somef_cli + somef_cli.run_cli(**kwargs) click.secho(f"Success", fg="green") - \ No newline at end of file diff --git a/src/somef/configuration.py b/src/somef/configuration.py index d0d4004b..19042164 100644 --- a/src/somef/configuration.py +++ b/src/somef/configuration.py @@ -48,12 +48,15 @@ def update_base_uri(base_uri): json.dump(data, fh) -def configure(authorization="", - description=default_description, - invocation=default_invocation, - installation=default_installation, - citation=default_citation, - base_uri=constants.CONF_DEFAULT_BASE_URI): +def configure( + github_authorization="", + gitlab_authorization="", + description=default_description, + invocation=default_invocation, + installation=default_installation, + citation=default_citation, + base_uri=constants.CONF_DEFAULT_BASE_URI, +): """ Function to configure the main program""" import nltk nltk.download('wordnet') @@ -72,16 +75,21 @@ def configure(authorization="", # data = json.load(fh) # else: data = { - constants.CONF_AUTHORIZATION: "token " + authorization, constants.CONF_DESCRIPTION: description, constants.CONF_INVOCATION: invocation, constants.CONF_INSTALLATION: installation, constants.CONF_CITATION: citation, - constants.CONF_BASE_URI: base_uri + constants.CONF_BASE_URI: base_uri, } - if data[constants.CONF_AUTHORIZATION] == "token ": - del data[constants.CONF_AUTHORIZATION] + if github_authorization: + data[constants.CONF_GITHUB_AUTHORIZATION] = "token " + github_authorization + + if gitlab_authorization: + token = gitlab_authorization + if not token.lower().startswith("bearer "): + token = "Bearer " + token + data[constants.CONF_GITLAB_AUTHORIZATION] = token with credentials_file.open("w") as fh: credentials_file.parent.chmod(0o700) diff --git a/src/somef/parser/codeowners_parser.py b/src/somef/parser/codeowners_parser.py index 1c7c92a7..6d27b54c 100644 --- a/src/somef/parser/codeowners_parser.py +++ b/src/somef/parser/codeowners_parser.py @@ -105,7 +105,7 @@ def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_ # return None -def enrich_user(username, repo_type, server_url=None): +def enrich_user(username, repo_type, server_url=None, gitlab_authorization=None): """ Enrich user metadata using the appropriate platform API. @@ -115,6 +115,8 @@ def enrich_user(username, repo_type, server_url=None): repo_type : str "GITHUB" or "GITLAB" server_url : str, optional Base URL of GitLab instance if repo_type is "GITLAB" + gitlab_authorization : str, optional + GitLab personal access token (works for gitlab.com and self-hosted instances) Returns ------- @@ -146,7 +148,14 @@ def enrich_user(username, repo_type, server_url=None): if not server_url.startswith("http"): server_url = "https://" + server_url api_url = f"{server_url.rstrip('/')}/api/v4/users?username={username}" - response = requests.get(api_url, timeout=5) + # Build auth header — same token works for gitlab.com and self-hosted instances + gl_headers = {} + if gitlab_authorization: + token = gitlab_authorization + if not token.startswith("Bearer "): + token = "Bearer " + token + gl_headers["Authorization"] = token + response = requests.get(api_url, headers=gl_headers, timeout=5) if response.status_code != 200: logging.warning(f"GitLab API request failed for {username}: {response.status_code}") return None diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 6cbd6e06..236ac8bf 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -27,6 +27,23 @@ def header_template(authorization=None): return header +def gitlab_header_template(gitlab_authorization=None): + """Construct headers for GitLab requests using explicit token or config fallback.""" + header = {} + file_paths = configuration.get_configuration_file() + + token = gitlab_authorization + if not token: + token = file_paths.get(constants.CONF_GITLAB_AUTHORIZATION) + + if token: + if not token.lower().startswith("bearer "): + token = "Bearer " + token + header["Authorization"] = token + + return header + + def is_gitlab(gitlab_server): api_url = f"https://{gitlab_server}/api/v4/projects" try: @@ -152,7 +169,7 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const return response, date -def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): +def load_gitlab_repository_metadata(repo_metadata: Result, repository_url, gitlab_authorization=None): """ Function uses the repository_url provided to load required information from gitlab. Information kept from the repository is written in keep_keys. @@ -215,25 +232,25 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): # could be gitlab.com or some gitlab self-hosted GitLab servers like gitlab.in2p3.fr if repository_url.rfind("gitlab.com") > 0: - project_id = get_project_id(repository_url, False) + project_id = get_project_id(repository_url, False, gitlab_authorization=gitlab_authorization) project_api_url = f"https://gitlab.com/api/v4/projects/{project_id}" else: project_path = url.path.lstrip("/") # "gammalearn/gammalearn" encoded_project_path = quote(project_path, safe="") # Codifica "/" como "%2F" # Build url of api to get id api_url = f"https://{url.netloc}/api/v4/projects/{encoded_project_path}" - project_id = get_project_id(api_url, True) + project_id = get_project_id(api_url, True, gitlab_authorization=gitlab_authorization) logging.info(f'Project_id: {project_id}') project_api_url = f"https://{url.netloc}/api/v4/projects/{project_id}" logging.info(f"Downloading {project_api_url}") - details = requests.get(project_api_url) + details = requests.get(project_api_url, headers=gitlab_header_template(gitlab_authorization)) project_details = details.json() date = details.headers["date"] repo_api_base_url = f"{repository_url}" # releases = get_gitlab_releases(project_id, f"https://{url.netloc}") - all_releases = get_all_gitlab_releases(project_api_url) + all_releases = get_all_gitlab_releases(project_api_url, gitlab_authorization=gitlab_authorization) release_list_filtered = [do_crosswalk(release, constants.release_gitlab_crosswalk_table) for release in all_releases] for release in release_list_filtered: @@ -411,7 +428,7 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): -def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref): +def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, gitlab_authorization=None): """ Download all repository files from a GitHub repository Parameters @@ -440,7 +457,7 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref): ) logging.info(f"Downloading {repo_archive_url}") - repo_download = requests.get(repo_archive_url) + repo_download = requests.get(repo_archive_url, headers=gitlab_header_template(gitlab_authorization)) repo_zip = repo_download.content repo_zip_file = os.path.join(directory, "repo.zip") @@ -461,7 +478,7 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref): return None -def download_readme(owner, repo_name, default_branch, repo_type, authorization, project_path = None): +def download_readme(owner, repo_name, default_branch, repo_type, authorization, project_path=None, gitlab_authorization=None): """ Method that given a repository owner, name and default branch, it downloads the readme content only. The readme is assumed to be README.md @@ -488,7 +505,13 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, return None logging.info(f"Downloading {primary_url}") - repo_download, date = rate_limit_get(primary_url, headers=header_template(authorization)) + headers = ( + gitlab_header_template(gitlab_authorization) + if repo_type is constants.RepositoryType.GITLAB + else header_template(authorization) + ) + + repo_download, date = rate_limit_get(primary_url, headers=headers) if repo_download is None: logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or content-lenght none") @@ -496,7 +519,7 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, if repo_download.status_code == 404: logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") logging.info(f"Trying to download {secondary_url}") - repo_download, date = rate_limit_get(secondary_url, headers=header_template(authorization)) + repo_download, date = rate_limit_get(secondary_url, headers=headers) if repo_download is None: logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or content-lenght none") return None @@ -510,7 +533,7 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, def load_online_repository_metadata(repository_metadata: Result, repository_url, ignore_api_metadata=False, repo_type=constants.RepositoryType.GITHUB, authorization=None, reconcile_authors=False, - branch=None,tag=None): + branch=None,tag=None, gitlab_authorization=None): """ Function uses the repository_url provided to load required information from GitHub or Gitlab. Information kept from the repository is written in keep_keys. @@ -530,7 +553,11 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, @return: Result object with the available metadata from online APIs plus its owner, repo name and default branch """ if repo_type == constants.RepositoryType.GITLAB: - return load_gitlab_repository_metadata(repository_metadata, repository_url) + return load_gitlab_repository_metadata( + repository_metadata, + repository_url, + gitlab_authorization=gitlab_authorization, + ) elif repo_type == constants.RepositoryType.LOCAL: logging.warning("Trying to download metadata from a local repository") return None @@ -773,7 +800,7 @@ def do_crosswalk(data, crosswalk_table): def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, - authorization=None): + authorization=None, gitlab_authorization=None): """ Given a repository, this method will download its files and return the readme text Parameters @@ -795,7 +822,14 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe if repo_type == constants.RepositoryType.GITHUB: return download_github_files(target_dir, owner, repo_name, default_branch, authorization) elif repo_type == constants.RepositoryType.GITLAB: - return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref) + return download_gitlab_files( + target_dir, + owner, + repo_name, + default_branch, + repo_ref, + gitlab_authorization=gitlab_authorization, + ) else: logging.error("Cannot download files from a local repository!") return None @@ -974,7 +1008,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): return repo_dir -def get_project_id(repository_url,self_hosted): +def get_project_id(repository_url, self_hosted, gitlab_authorization=None): """ Function to download a repository, given its URL Parameters: @@ -985,7 +1019,7 @@ def get_project_id(repository_url,self_hosted): """ logging.info(f"Downloading {repository_url}") - response = requests.get(repository_url) + response = requests.get(repository_url, headers=gitlab_header_template(gitlab_authorization)) project_id = "-1" if self_hosted: @@ -1014,14 +1048,14 @@ def get_project_id(repository_url,self_hosted): project_id = response_str[start:end] return project_id -def get_all_gitlab_releases(repo_api_base_url): +def get_all_gitlab_releases(repo_api_base_url, gitlab_authorization=None): all_releases = [] page = 1 while True: url = f"{repo_api_base_url}/releases?page={page}&per_page=100" logging.info(f"Getting releases from: {url}") - response = requests.get(url) + response = requests.get(url, headers=gitlab_header_template(gitlab_authorization)) logging.info(f"Response: {response.status_code}") content_type = response.headers.get("Content-Type", "") if response.status_code != 200 or "application/json" not in content_type: diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 371489bb..06a73af8 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -8,20 +8,30 @@ import urllib.parse from os import path -from . import header_analysis, regular_expressions, process_repository, configuration, process_files, \ - supervised_classification +from . import header_analysis, regular_expressions, process_repository, configuration, process_files from .process_results import Result from .utils import constants, markdown_utils -from .parser import mardown_parser, create_excerpts -from .export.turtle_export import DataGraph from .export import json_export from .export import google_codemeta_export from .extract_software_type import check_repository_type from urllib.parse import urlparse, quote -def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None, - ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None, - ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None) -> Result: +def cli_get_data( + threshold, + ignore_classifiers, + repo_url=None, + doc_src=None, + local_repo=None, + ignore_github_metadata=False, + readme_only=False, + keep_tmp=None, + github_authorization=None, + ignore_test_folder=True, + requirements_mode='all', + reconcile_authors=False, + branch=None, tag=None, + gitlab_authorization=None + ) -> Result: """ Main function to get the data through the command line Parameters @@ -34,12 +44,14 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc @param ignore_github_metadata: flag used to avoid doing extra requests to the GitHub API @param readme_only: flag to indicate that only the readme should be analyzed @param keep_tmp: path where to store TMP files in case SOMEF is instructed to keep them - @param authorization: GitHub authorization token + @param github_authorization: GitHub authorization token @param ignore_test_folder: Ignore contents of test folders - @param requirements_mode: flag to indicate what requirements show in codemeta + @param requirements_mode: flag to indicate what requirements show in codemeta @param reconcile_authors: flag to indicate if additional should be extracted from certain files as codeowners. Bear in mind that using this flags consumes more requests to the GitHub API. @param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata. @param tag: tag of the repository to analyze. Cannot be used together with the branch parameter. + @param gitlab_authorization: GitLab personal access token (works for gitlab.com and self-hosted instances) + Returns ------- @return: Dictionary with the results found by SOMEF, formatted as a Result object. @@ -64,58 +76,51 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc if repo_url is not None: try: - """ - It is necessary to make changes to all methods related to GitLab because, until now, - they only worked with repositories on GitLab.com but not with self-hosted GitLab servers like gitlab.in2p3.fr, for example. - We are going to split the process so that it also takes these servers into account. - """ + # It is necessary to make changes to all methods related to GitLab because, until now, + # they only worked with repositories on GitLab.com but not with self-hosted GitLab servers + # like gitlab.in2p3.fr, for example. We are going to split the process so that it also + # takes these servers into account. - """ - The only sure way to know if a server is from GitLab is by checking its API. - GitLab servers are usually of the type gitlab.com, gitlab.in2p3.fr, or even salsa.debian.org, - so you cannot discriminate solely with the string 'gitlab'. - """ + # The only sure way to know if a server is from GitLab is by checking its API. + # GitLab servers are usually of the type gitlab.com, gitlab.in2p3.fr, or even + # salsa.debian.org, so you cannot discriminate solely with the string 'gitlab'. url = urlparse(repo_url) servidor = url.netloc bGitLab = False if process_repository.is_gitlab(servidor): logging.info(f"{servidor} is GitLab.") bGitLab = True - # if reconcile_authors: - # logging.info("Author enrichment disabled: GitLab repositories are not supported for GitHub user enrichment.") - # reconcile_authors = False logging.info(f"DEBUG: {servidor} is_gitlab = {bGitLab}") if bGitLab: repo_type = constants.RepositoryType.GITLAB - logging.info("Processing repository metadata.") repository_metadata, owner, repo_name, def_branch, project_path = process_repository.load_online_repository_metadata( repository_metadata, repo_url, ignore_github_metadata, repo_type, - authorization, + github_authorization, reconcile_authors, branch=branch, - tag=tag + tag=tag, + gitlab_authorization=gitlab_authorization ) # download files and obtain path to download folder if readme_only: - logging.info("Downloading README only...") # download readme only with the information above - readme_text = process_repository.download_readme(owner, repo_name, def_branch, repo_type, authorization, project_path) + readme_text = process_repository.download_readme(owner, repo_name, def_branch, repo_type, github_authorization, project_path, gitlab_authorization=gitlab_authorization) elif keep_tmp is not None: # save downloaded files locally os.makedirs(keep_tmp, exist_ok=True) local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type, - keep_tmp, repo_url, authorization) + keep_tmp, repo_url, github_authorization, + gitlab_authorization=gitlab_authorization) if local_folder is not None: readme_text, full_repository_metadata = process_files.process_repository_files(local_folder, repository_metadata, - repo_type, - owner, + repo_type, owner, repo_name, def_branch, ignore_test_folder, @@ -128,12 +133,12 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc with tempfile.TemporaryDirectory() as temp_dir: local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type, - temp_dir, repo_url, authorization) + temp_dir, repo_url, github_authorization, + gitlab_authorization=gitlab_authorization) if local_folder is not None: readme_text, full_repository_metadata = process_files.process_repository_files(local_folder, repository_metadata, - repo_type, - owner, + repo_type, owner, repo_name, def_branch, ignore_test_folder, @@ -154,7 +159,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc readme_text, full_repository_metadata = process_files.process_repository_files(local_repo, repository_metadata, repo_type, - ignore_test_folder = ignore_test_folder, + ignore_test_folder, reconcile_authors = reconcile_authors) if readme_text == "": logging.warning("Warning: README document does not exist in the local repository") @@ -177,28 +182,13 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc logging.info("Extracted categories from headers successfully.") readme_text_unmarked = markdown_utils.unmark(readme_text) logging.info("readme text unmarked successfully.") - if not ignore_classifiers and readme_unfiltered_text != '': - logging.info("Supervised classification") - repository_metadata = supervised_classification.run_category_classification(readme_unfiltered_text, - threshold, - repository_metadata) - logging.info("Create excerpts") - excerpts = create_excerpts.create_excerpts(string_list) - logging.info("Extract text excerpts headers") - excerpts_headers = mardown_parser.extract_text_excerpts_header(readme_unfiltered_text) - header_parents = mardown_parser.extract_headers_parents(readme_unfiltered_text) - score_dict = supervised_classification.run_classifiers(excerpts, file_paths) - repository_metadata = supervised_classification.classify(score_dict, threshold, excerpts_headers, - header_parents, repository_metadata) + if readme_text_unmarked != "": try: readme_source = repository_metadata.results[constants.CAT_README_URL][0] readme_source = readme_source[constants.PROP_RESULT][constants.PROP_VALUE] except: readme_source = "README.md" - - - logging.info("Extracting regular expressions...") repository_metadata = regular_expressions.extract_bibtex(readme_unfiltered_text, repository_metadata, readme_source) repository_metadata = regular_expressions.extract_doi_badges(readme_unfiltered_text, repository_metadata, @@ -266,7 +256,8 @@ def run_cli(*, requirements_mode="all", reconcile_authors=False, branch=None, - tag=None + tag=None, + gitlab_token=None ): """Function to run all the required components of the cli for a repository""" # check if it is a valid url @@ -300,14 +291,15 @@ def run_cli(*, encoded_url = encoded_url.replace(".","") #removing dots just in case repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, - keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, + reconcile_authors=reconcile_authors, branch=branch, tag=tag, + gitlab_authorization=gitlab_token) if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() repo_data = json_export.unify_results(repo_data.results) - + if output is not None: output = output.replace(".json","") output = output + "_" + encoded_url + ".json" @@ -334,37 +326,29 @@ def run_cli(*, if repo_url: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, - keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, + reconcile_authors=reconcile_authors, branch=branch, tag=tag, + gitlab_authorization=gitlab_token) elif local_repo: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, - local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, + reconcile_authors=reconcile_authors,branch=branch, tag=tag) else: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, - doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) - + doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, + reconcile_authors=reconcile_authors,branch=branch, tag=tag) + if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() repo_data = json_export.unify_results(repo_data.results) - + if output is not None: json_export.save_json_output(repo_data, output, missing, pretty=pretty) if codemeta_out is not None: json_export.save_codemeta_output(repo_data, codemeta_out, pretty=pretty, requirements_mode=requirements_mode) if google_codemeta_out is not None: google_codemeta_export.save_google_codemeta_output(repo_data, google_codemeta_out, pretty=pretty, requirements_mode=requirements_mode) - if graph_out is not None: - logging.info("Generating triples...") - data_graph = DataGraph() - if multiple_repos: - for repo in repo_data: - data_graph.somef_data_to_graph(repo.results) - else: - data_graph.somef_data_to_graph(repo_data.results) - data_graph.export_to_file(graph_out, graph_format) diff --git a/src/somef/test/test_gitlab_token_handling.py b/src/somef/test/test_gitlab_token_handling.py new file mode 100644 index 00000000..351bd38d --- /dev/null +++ b/src/somef/test/test_gitlab_token_handling.py @@ -0,0 +1,117 @@ +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from .. import somef_cli +from ..parser import codeowners_parser +from ..process_results import Result +from ..utils import constants + + +class TestGitlabTokenHandling(unittest.TestCase): + @patch("somef.parser.codeowners_parser.requests.get") + def test_enrich_user_gitlab_adds_bearer_prefix(self, mock_get): + # Simulate a successful GitLab user lookup response. + response = MagicMock() + response.status_code = 200 + response.json.return_value = [ + { + "name": "Alice", + "organization": "Example Org", + "public_email": "alice@example.org", + } + ] + mock_get.return_value = response + + # Pass a raw token and verify the request header is normalized to Bearer. + result = codeowners_parser.enrich_user( + "alice", + constants.RepositoryType.GITLAB, + server_url="gitlab.example.org", + gitlab_authorization="my-token", + ) + + # Ensure request URL, auth header, and timeout are built correctly. + mock_get.assert_called_once_with( + "https://gitlab.example.org/api/v4/users?username=alice", + headers={"Authorization": "Bearer my-token"}, + timeout=5, + ) + # Ensure selected fields from the GitLab payload are mapped into SOMEF output. + self.assertEqual(result[constants.PROP_CODEOWNERS_NAME], "Alice") + self.assertEqual(result[constants.PROP_CODEOWNERS_COMPANY], "Example Org") + self.assertEqual(result[constants.PROP_CODEOWNERS_EMAIL], "alice@example.org") + + @patch("somef.parser.codeowners_parser.requests.get") + def test_enrich_user_gitlab_preserves_existing_bearer_prefix(self, mock_get): + # Simulate a valid response; this test focuses on header formatting only. + response = MagicMock() + response.status_code = 200 + response.json.return_value = [{"name": "Bob"}] + mock_get.return_value = response + + # Pass an already-prefixed token and verify no duplicate prefix is introduced. + codeowners_parser.enrich_user( + "bob", + constants.RepositoryType.GITLAB, + server_url="https://gitlab.example.org", + gitlab_authorization="Bearer already-prefixed", + ) + + mock_get.assert_called_once_with( + "https://gitlab.example.org/api/v4/users?username=bob", + headers={"Authorization": "Bearer already-prefixed"}, + timeout=5, + ) + + @patch("somef.somef_cli.json_export.unify_results", return_value={}) + @patch("somef.somef_cli.cli_get_data") + @patch("somef.somef_cli.validators.url", return_value=True) + def test_run_cli_forwards_gitlab_token_single_repo(self, _mock_valid_url, mock_cli_get_data, _mock_unify): + # Avoid network/repository processing and only assert argument forwarding. + mock_cli_get_data.return_value = Result() + + # Run single-repository mode with a GitLab token. + somef_cli.run_cli( + repo_url="https://gitlab.com/group/project", + output=None, + codemeta_out=None, + google_codemeta_out=None, + gitlab_token="token-123", + ) + + # Ensure CLI propagates the token into the lower-level authorization argument. + self.assertEqual(mock_cli_get_data.call_count, 1) + kwargs = mock_cli_get_data.call_args.kwargs + self.assertEqual(kwargs["gitlab_authorization"], "token-123") + + @patch("somef.somef_cli.json_export.unify_results", return_value={}) + @patch("somef.somef_cli.cli_get_data") + @patch("somef.somef_cli.validators.url", return_value=True) + def test_run_cli_forwards_gitlab_token_in_file_mode(self, _mock_valid_url, mock_cli_get_data, _mock_unify): + # Avoid network/repository processing and only assert argument forwarding. + mock_cli_get_data.return_value = Result() + + # Build a temporary input file to trigger batch mode. + with tempfile.TemporaryDirectory() as tmp_dir: + input_file = Path(tmp_dir) / "repos.txt" + input_file.write_text("https://gitlab.com/group/project\n", encoding="utf-8") + + # Run file mode with a GitLab token. + somef_cli.run_cli( + in_file=str(input_file), + output=None, + codemeta_out=None, + google_codemeta_out=None, + gitlab_token="token-456", + ) + + # Ensure CLI propagates the token in batch mode as well. + self.assertEqual(mock_cli_get_data.call_count, 1) + kwargs = mock_cli_get_data.call_args.kwargs + self.assertEqual(kwargs["gitlab_authorization"], "token-456") + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index bec49c19..e2d80e62 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -3,7 +3,10 @@ from pathlib import Path # constants about SOMEF configuration -CONF_AUTHORIZATION = "Authorization" +CONF_GITHUB_AUTHORIZATION = "GitHubAuthorization" +CONF_GITLAB_AUTHORIZATION = "GitlabAuthorization" +# Backward-compatible key used in legacy code paths for GitHub authorization. +CONF_AUTHORIZATION = CONF_GITHUB_AUTHORIZATION CONF_DESCRIPTION = "description" CONF_INVOCATION = "invocation" CONF_INSTALLATION = "installation"