diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 3eba9233b..d35293189 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -21,9 +21,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v5 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install dependencies diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index efb84d712..9f5e18475 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - master - - 1.0.x + - 1.1.x jobs: install_and_run_tests: @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12', '3.13' ] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14t'] services: # mongo: @@ -41,9 +41,9 @@ jobs: steps: - name: Checkout source - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0d3fc1b10..9c9d14cae 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -11,12 +11,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12', '3.13' ] + python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13', '3.14t' ] steps: - name: Checkout source - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CHANGES.txt b/CHANGES.txt index 
970cb4e6f..dc8ee9420 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,35 @@ +v1.1.0 (2026/05/27) + Highlights: + - Added Python 3.14 support and dropped Python 3.8 support. + - Reworked biothings CLI configuration, pathing, and dataplugin command organization. + - Added MongoDB build cleanup management APIs and commands. + + biothings.hub improvements: + - Added prefix support to DataTransformMDB. ([#411](https://github.com/biothings/biothings.api/pull/411)) + - Added TAR and Zstandard file handling. ([#424](https://github.com/biothings/biothings.api/pull/424), [#428](https://github.com/biothings/biothings.api/pull/428)) + - Added support for custom local uploader and dumper classes in dataplugin manifests. ([#431](https://github.com/biothings/biothings.api/pull/431)) + - Added MongoDB build cleanup validation and deletion tooling. ([#437](https://github.com/biothings/biothings.api/pull/437)) + + biothings.utils improvements: + - Fixed dict_sweep handling for NaN-like values, including pandas.NA and pandas.NaT. ([#442](https://github.com/biothings/biothings.api/pull/442)) + - Centralized orjson usage across the library. ([#435](https://github.com/biothings/biothings.api/pull/435)) + + biothings.web improvements: + - Removed deprecated Google Analytics event support. ([#439](https://github.com/biothings/biothings.api/pull/439)) + - Removed old doc_type and deprecated Elasticsearch compatibility code. + - Improved Elasticsearch exception handling and query response behavior. + - Optimized Elasticsearch memory usage when using scroll API. + + biothings.cli improvements: + - Added CLI config and pathing commands. ([#418](https://github.com/biothings/biothings.api/pull/418), [#422](https://github.com/biothings/biothings.api/pull/422)) + - Added CLI dump mark-success support. ([#423](https://github.com/biothings/biothings.api/pull/423)) + - Fixed Typer/rich_utils loading behavior. 
([#426](https://github.com/biothings/biothings.api/pull/426)) + + Misc improvements: + - Updated dependencies and package metadata for Python 3.9+, including tornado, typer, pymongo, orjson, and zstandard. + - Updated GitHub Actions test, build, and publish workflows. + v1.0.2 (2025/10/15) Bugfix: - Fixed an import issue in inspector.py that was causing the mapping inspect to fail. diff --git a/biothings/__init__.py b/biothings/__init__.py index 67d5f1993..93255fdd2 100644 --- a/biothings/__init__.py +++ b/biothings/__init__.py @@ -8,7 +8,7 @@ class _version_info(NamedTuple): micro: int -version_info = _version_info(1, 0, 2) +version_info = _version_info(1, 1, 0) __version__ = ".".join(map(str, version_info)) diff --git a/biothings/cli/__init__.py b/biothings/cli/__init__.py index cfb946c6d..2dbe6f2c1 100644 --- a/biothings/cli/__init__.py +++ b/biothings/cli/__init__.py @@ -2,34 +2,41 @@ Entrypoint for the biothings-cli tool """ +from typing import Literal import importlib.util import logging import os import sys +import typer +from rich.logging import RichHandler -from biothings.cli.settings import ( - setup_biothings_configuration, - setup_commandline_configuration, - setup_logging_configuration, -) +from biothings.cli.commands.admin import build_admin_application +from biothings.cli.commands.config import config_application, load_configuration +from biothings.cli.commands.dataplugin import dataplugin_application +from biothings.cli.commands.pathing import path_application -def check_module_import_status(module: str) -> bool: +def setup_logging_configuration(logging_level: Literal[10, 20, 30, 40, 50]) -> None: """ - Verify that we can import a module prior to proceeding with creating our commandline - tooling that depends on those modules + Configures the logging based off our environment configuration """ - module_specification = 
importlib.util.find_spec(module) - status = module_specification is not None - return status + rich_handler = RichHandler( + level=logging_level, + markup=True, + rich_tracebacks=False, # typer creates it already + show_path=False, + tracebacks_suppress=[typer], + ) + logging.basicConfig(level=logging_level, format="%(message)s", datefmt="[%X]", handlers=[rich_handler]) def main(): """ - The entrypoint for running the BioThings CLI to test your local data plugin + The entrypoint for running the BioThings CLI """ - typer_status = check_module_import_status("typer") + module_specification = importlib.util.find_spec("typer") + typer_status = module_specification is not None if not typer_status: logging.error( ( @@ -48,14 +55,14 @@ def main(): cli_debug_flag = os.environ.get("BTCLI_DEBUG", False) cli_rich_traceback_flag = os.environ.get("BTCLI_RICH_TRACEBACK", False) - cli = setup_commandline_configuration(debug=cli_debug_flag, rich_traceback=cli_rich_traceback_flag) + admin_application = build_admin_application(debug=cli_debug_flag, rich_traceback=cli_rich_traceback_flag) logging_level = logging.WARNING if cli_debug_flag: logging_level = logging.DEBUG setup_logging_configuration(logging_level) - setup_biothings_configuration() + load_configuration() - from biothings.cli.dataplugin import dataplugin_application - - cli.add_typer(dataplugin_application, name="dataplugin") - return cli() + admin_application.add_typer(dataplugin_application, name="dataplugin") + admin_application.add_typer(config_application, name="config") + admin_application.add_typer(path_application, name="path") + return admin_application() diff --git a/biothings/cli/assistant.py b/biothings/cli/assistant.py index 0e0ecf7d0..7aad1f5e8 100644 --- a/biothings/cli/assistant.py +++ b/biothings/cli/assistant.py @@ -38,7 +38,7 @@ class CLIAssistant(BaseAssistant): plugin_type = "CLI" - def __init__(self, plugin_name: Optional[str] = None, job_manager: "JobManager" = None): + def __init__(self, plugin_name: 
Optional[str] = None, job_manager: CLIJobManager = None): from biothings import config from biothings.hub.databuild.builder import BuilderManager from biothings.hub.dataindex.indexer import IndexManager diff --git a/biothings/cli/commands/__init__.py b/biothings/cli/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/biothings/cli/commands/admin.py b/biothings/cli/commands/admin.py new file mode 100644 index 000000000..e735c57fe --- /dev/null +++ b/biothings/cli/commands/admin.py @@ -0,0 +1,48 @@ +""" +Configuration settings for the biothings-cli tool + +> Logging +> Tool Configuration + > Creates a mock config used in the biothings.api backend +""" + +import sys + +import typer +import typer.rich_utils + + +def build_admin_application(debug: bool, rich_traceback: bool) -> typer.Typer: + """ + Builds the main administrative command line application for the + biothings-cli application + """ + pretty_exceptions_show_locals = False + pretty_exceptions_enable = False + sys.tracebacklimit = 1 + + if rich_traceback: + pretty_exceptions_enable = True + sys.tracebacklimit = 1000 + + if debug: + pretty_exceptions_enable = True + pretty_exceptions_show_locals = True + sys.tracebacklimit = 1000 + + # prevent dimming the help text from the 2nd line + # see: https://github.com/tiangolo/typer/issues/437#issuecomment-1224149402 + typer.rich_utils.STYLE_HELPTEXT = "" + + context_settings = {"help_option_names": ["-h", "--help"]} + typer_instance = typer.Typer( + help="[green]BioThings Admin CLI to test your local data plugins. 
See helps for each command for specific usage.[/green]", + rich_help_panel="Help and Others", + rich_markup_mode="rich", + context_settings=context_settings, + no_args_is_help=True, + pretty_exceptions_show_locals=pretty_exceptions_show_locals, + pretty_exceptions_enable=pretty_exceptions_enable, + ) + + return typer_instance diff --git a/biothings/cli/commands/config.py b/biothings/cli/commands/config.py new file mode 100644 index 000000000..1f2df74ba --- /dev/null +++ b/biothings/cli/commands/config.py @@ -0,0 +1,350 @@ +""" +Module for creating the CLI application for the configuration interface + +Provides the following capabilities: + +- View the default configuration file +- Generate a local configuration file +- Delete a local configuration file +- Modify the default configuration file + +By default, a config.py module isn't required to the biothings-cli locally. +A default config module is setup at launch, however an additional config module +can be provided to override the default config settings. + +The available config settings can be found at biothings.hub.default_config module (note that +not all settings are relevant to the CLI) + +*** HUB MODE *** +config.py +.biothings_hub + .data_src_database + archive + biothings_hubdb +data_plugin0 + ... +data_plugin1 + ... +data_plugin2 + ... 
+ +*** SINGULAR MODE *** +config.py +.biothings_hub + .data_src_database + archive + biothings_hubdb +manifest.json +parser.py + +*** Example Configuration *** +######################################## +# DATA PLUGIN CONFIGURATION VARIABLES # +######################################## +DATA_SRC_DATABASE = '.data_src_database' +DATA_HUB_DB_DATABASE = 'data_hub_db_database' +HUB_DB_BACKEND = { + "module": "biothings.utils.sqlite3", + "sqlite_db_folder": ".biothings_hub" +} +DATA_ARCHIVE_ROOT = ".biothings_hub/archive" + +# Add new entry in DOCKER_CONFIG if you want to use a different docker host for your +# docker-based data plugin, other than the default docker host running on your localhost. +DOCKER_CONFIG = { + "docker1": {"tls_cert_path": None, "tls_key_path": None, "client_url": ""}, + "localhost": {"client_url": "unix://var/run/docker.sock"}, +} +""" +import enum +import importlib +import json +import logging +import os +import pathlib +import sys +import types +from typing import Union + +from rich import box +from rich.console import Console +from rich.panel import Panel +import typer +from typing_extensions import Annotated + +from biothings.utils.common import DummyConfig +from biothings.utils.configuration import ConfigurationError + + + +SHORT_HELP = "[green]CLI tool for handling the biothings configuration.[/green]" +FULL_HELP = ( + f"{SHORT_HELP}" + "\n" + "\n[green] * View the default configuration file [/green]" + "\n[green] * Generate a local configuration file [/green]" + "\n[green] * Modify the backend storage configuration [/green]" + "\n" + "\nBy default, a config.py module isn't required to the biothings-cli locally." + "\nA default config module is setup at launch, however an additional config module " + "\ncan be provided to override the default config settings." 
+ "\n" + "\nThe available config settings can be found at biothings.hub.default_config module " + "\n(note that not all settings are relevant to the CLI)" +) + +config_application = typer.Typer( + help=FULL_HELP, + short_help=SHORT_HELP, + no_args_is_help=True, + rich_markup_mode="rich", +) + +logger = logging.getLogger(name="biothings-cli") + + +@config_application.command(name="display") +def display_default_configuration(): + """ + Displays the default configuration stored for the biothings-cli + """ + default_configuration = default_biothings_configuration() + console = Console() + panel = Panel( + f"{build_configuration_repr(default_configuration)}\n", + title="[white]Default Biothings Configuration[/white]", + title_align="left", + box=box.ROUNDED, + ) + console.print(panel) + + +@config_application.command(name="create") +def create_local_configuration( + db_backend: Annotated[ + bool, + typer.Option("--override-backend", help="If provided, will prompt for overriding the HUB_DB_BACKEND value"), + ] = False, + index_backend: Annotated[ + bool, + typer.Option("--override-index", help="If provided, will prompt for overriding the INDEX_CONFIG value"), + ] = False, +): + """ + Creates a local configuration file (named config.py) in the current working directory + """ + configuration = default_biothings_configuration() + + class BackendType(str, enum.Enum): + SQLITE3 = "sqlite3" + MONGODB = "mongodb" + + if db_backend: + db_type = typer.prompt( + "What backend would you like to use? 
(supported options \"sqlite3\"|\"mongodb\")", + type=BackendType + + ) + if db_type == "sqlite3": + backend = { + "module": "biothings.utils.sqlite3", + "sqlite_db_folder": ".biothings_hub", + } + logger.info("Setting HUB_DB_BACKEND:\n%s", json.dumps(backend, indent=2)) + elif db_type == "mongodb": + backend = { + "module" : "biothings.utils.mongo", + "uri" : "mongodb://localhost:27017", + } + custom_uri = typer.prompt( + "Please specify the server uri for mongodb", default="mongodb://localhost:27017" + ) + if custom_uri is not None: + backend["uri"] = custom_uri + logger.info("Setting HUB_DB_BACKEND:\n%s", json.dumps(backend, indent=2)) + configuration["HUB_DB_BACKEND"] = backend + + if index_backend: + host_address = typer.prompt( + "Please specify the host address for elasticsearch you would like to use", default="http://localhost:9200" + ) + backend = { + "indexer_select": {}, + "env": { + "commandhub": { + "host": host_address, + "indexer": { + "args": { + "request_timeout": 300, + "retry_on_timeout": True, + "max_retries": 10 + } + } + } + } + } + logger.info("Setting INDEX_CONFIG:\n%s", json.dumps(backend, indent=2)) + configuration["INDEX_CONFIG"] = backend + + with open("config.py", "w", encoding="utf-8") as handle: + configuration_repr = build_configuration_repr(configuration) + handle.write(configuration_repr) + + console = Console() + panel = Panel( + f"{build_configuration_repr(configuration)}\n", + title="[white]Local Biothings Configuration[/white]", + title_align="left", + box=box.ROUNDED, + ) + console.print(panel) + + + +def build_configuration_repr(configuration_values: dict) -> str: + """ + Generates a string representation of the configuration + """ + header_string = ( + "########################################\n" + "# DATA PLUGIN CONFIGURATION VARIABLES #\n" + "########################################" + ) + configuration_repr = [header_string] + for configuration_key, configuration_value in configuration_values.items(): + if 
isinstance(configuration_value, dict): + mapping_repr = f"{configuration_key} = {json.dumps(configuration_value, indent=2)}" + mapping_repr = mapping_repr.replace("true", "True") + mapping_repr = mapping_repr.replace("false", "False") + configuration_repr.append(mapping_repr) + elif isinstance(configuration_value, (pathlib.Path, str)): + configuration_repr.append( + f"{configuration_key} = \"{configuration_value}\"" + ) + else: + configuration_repr.append( + f"{configuration_key} = {configuration_value}" + ) + return "\n".join(configuration_repr).rstrip("\n") + + +def default_biothings_configuration() -> dict: + """ + Function call to build the default biothings configuration + + Stores all the default values for the biothings configuration + for reference and updating + """ + + configuration = { + "HUB_DB_BACKEND": { + "module": "biothings.utils.sqlite3", + "sqlite_db_folder": ".biothings_hub", + }, + "DATA_SRC_SERVER": "localhost", + "DATA_SRC_DATABASE": "data_src_database", + "DATA_ARCHIVE_ROOT": ".biothings_hub/archive", + "LOG_FOLDER": ".biothings_hub/logs", + "DATA_PLUGIN_FOLDER": pathlib.Path().cwd(), + "DATA_TARGET_SERVER": "localhost", + "DATA_TARGET_PORT": 27017, + "DATA_TARGET_DATABASE": "plugin-hub", + "INDEX_CONFIG": { + "indexer_select": {}, + "env": { + "commandhub": { + "host": "http://localhost:9200", + "indexer": { + "args": { + "request_timeout": 300, + "retry_on_timeout": True, + "max_retries": 10 + } + } + } + } + }, + "RUN_DIR": pathlib.Path().cwd(), + "HUB_MAX_WORKERS": os.cpu_count(), + "MAX_QUEUED_JOBS": 1000 + } + + # specific attributes to the biothings-cli application + cli_configuration = { + "BIOTHINGS_CLI_PATH": "biothings_hub/path", + } + configuration.update(cli_configuration) + + return configuration + + + +def load_local_configuration() -> types.ModuleType: + """ + Attempts to load a local configuration file first before + falling back to a default configuration + """ + current_directory = pathlib.Path.cwd() + config_module_file = 
current_directory.joinpath("config.py") + if config_module_file.exists(): + spec = importlib.util.spec_from_file_location("config", location=str(config_module_file)) + config_module = importlib.util.module_from_spec(spec) + sys.modules["config"] = config_module + sys.modules["biothings.config"] = config_module + + spec.loader.exec_module(config_module) + + try: + backend = getattr(config_module, "HUB_DB_BACKEND") + setattr(config_module, "hub_db", importlib.import_module(backend["module"])) + except ImportError as import_err: + logging.exception(import_err) + raise import_err + + for attr in dir(config_module): + value = getattr(config_module, attr) + if isinstance(value, ConfigurationError): + raise ConfigurationError(f"{attr}: {value}") + + return config_module + return None + + +def load_default_configuration(): + """ + Loads the default configuration into a DummyConfig + """ + config_module = DummyConfig("config") + default_configuration_values = default_biothings_configuration() + for configuration_key, configuration_value in default_configuration_values.items(): + setattr(config_module, configuration_key, configuration_value) + + try: + backend = getattr(config_module, "HUB_DB_BACKEND") + setattr(config_module, "hub_db", importlib.import_module(backend["module"])) + except ImportError as import_err: + logging.exception(import_err) + raise import_err + + sys.modules["config"] = config_module + sys.modules["biothings.config"] = config_module + + return config_module + +def load_configuration() -> Union[types.ModuleType, DummyConfig]: + """ + Setup a config module necessary to launch the biothings-cli. + + Attempts to load a local file named config.py in the current working directory, + otherwise loads a default configuration through a DummyConfig instance + + Depending on the backend hub database, the order of configuration + matters. 
If we attempt to load a module that checks for the configuration + we'll have to ensure that the configuration is properly configured prior + to loading the module + """ + configuration = load_local_configuration() + if configuration is None: + logging.debug("Unable to find `config` module. Using the default configuration") + configuration = load_default_configuration() + return configuration diff --git a/biothings/cli/dataplugin.py b/biothings/cli/commands/dataplugin.py similarity index 96% rename from biothings/cli/dataplugin.py rename to biothings/cli/commands/dataplugin.py index 8b805db97..c202fa0e6 100644 --- a/biothings/cli/dataplugin.py +++ b/biothings/cli/commands/dataplugin.py @@ -8,7 +8,7 @@ import typer from typing_extensions import Annotated -from biothings.cli import operations +from biothings.cli.commands import operations SHORT_HELP = "[green]CLI tool for locally evaluating a biothings dataplugin. Allows for simple querying and data inspection.[/green]" FULL_HELP = ( @@ -54,6 +54,10 @@ def create_data_plugin( @dataplugin_application.command(name="dump") def dump_source( plugin_name: Annotated[Optional[str], typer.Option("--name", "-n", help=PLUGIN_NAME_HELP)] = None, + mark_success: Annotated[ + Optional[bool], + typer.Option("--mark-success", "-m", help="Mark dump as success without attempting to actually dump the files"), + ] = False, show_dump: Annotated[ Optional[bool], typer.Option("--show-dump", help="Displays the dump source result output after dump operation"), @@ -62,7 +66,7 @@ def dump_source( """ Download the source data files to the local file system """ - asyncio.run(operations.do_dump(plugin_name=plugin_name, show_dumped=show_dump)) + asyncio.run(operations.do_dump(plugin_name=plugin_name, show_dumped=show_dump, mark_success=mark_success)) @dataplugin_application.command(name="upload") @@ -133,7 +137,7 @@ def listing( @dataplugin_application.command(name="inspect") -def inspect_source( +def inspect_source( # pylint: 
disable=too-many-arguments,too-many-positional-arguments plugin_name: Annotated[Optional[str], typer.Option("--name", "-n", help=PLUGIN_NAME_HELP)] = None, sub_source_name: Annotated[ Optional[str], typer.Option("--sub-source-name", "-s", help="Your sub source name") diff --git a/biothings/cli/commands/decorators.py b/biothings/cli/commands/decorators.py new file mode 100644 index 000000000..5289ab01b --- /dev/null +++ b/biothings/cli/commands/decorators.py @@ -0,0 +1,139 @@ +""" +Collection of decorators for usage within the biothings-cli + +These are often method we want associated with many of the plugin methods we +use, but don't directly impact the logic of the actual operation. Typically things +related to paths and configurations that apply to large swaths of the cli +would make sense as a decorator +""" + +import functools +import inspect +import logging +import pathlib +import sys +from typing import Callable + +from biothings.cli.exceptions import MissingPluginName + +logger = logging.getLogger(name="biothings-cli") + + +def get_biothings_config(): + try: + return sys.modules["biothings.config"] + except KeyError as exc: + raise RuntimeError("BioThings CLI configuration has not been loaded") from exc + + +def operation_mode(operation: Callable): + """ + Based off the directory structure for where the biothings-cli + was invoked we set the "mode" to one of two states: + + 0) singular + The current working directory contains a singular data-plugin + + In this case we don't require a plugin_name argument to be passed + at the command-line + + 1) hub + The current working directory contains N directories operating as a + "hub" or collection of data-plugins under one umbrella + + In this case we do require a plugin_name argument to be passed + at the command-line. Otherwise we have no idea which data-plugin to + refer to + + We attempt to load the plugin from this working directory. 
If we successfully load + either a manifest or advanced plugin, then we can safely say this is a singular + dataplugin + + If we cannot load either a manifest or advanced plugin then we assume by default that + the mode is hub + """ + + @functools.wraps(operation) + def determine_operation_mode(*args, **kwargs): + + def determine_hub_mode(): + working_directory = pathlib.Path.cwd() + working_directory_files = {file.name for file in working_directory.iterdir()} + + mode = None + if "manifest.json" in working_directory_files or "manifest.yaml" in working_directory_files: + logger.debug("Inferring singular manifest plugin from directory structure") + mode = "SINGULAR" + elif "__init__.py" in working_directory_files: + logger.debug("Inferring singular advanced plugin from directory structure") + mode = "SINGULAR" + else: + logger.debug("Inferring multiple plugins from directory structure") + mode = "HUB" + + if mode == "SINGULAR": + if kwargs.get("plugin_name", None) is not None: + kwargs["plugin_name"] = None + elif mode == "HUB": + if kwargs.get("plugin_name", None) is None: + raise MissingPluginName(working_directory) + + @functools.wraps(operation) + def handle_function(*args, **kwargs): + operation_result = operation(*args, **kwargs) + return operation_result + + @functools.wraps(operation) + async def handle_corountine(*args, **kwargs): + operation_result = await operation(*args, **kwargs) + return operation_result + + determine_hub_mode() + + if inspect.iscoroutinefunction(operation): + return handle_corountine(*args, **kwargs) + return handle_function(*args, **kwargs) + + return determine_operation_mode + + +def cli_system_path(operation: Callable): # pylint: disable=unused-argument + """ + Used for ensuring that if we've appended files to biothings-cli + path file (stored under config.BIOTHINGS_CLI_PATH), then we need to update + the system path so we can discover the modules at runtime + """ + + @functools.wraps(operation) + def update_system_path(*args, 
**kwargs): + + def update_system_path_from_file(): + config = get_biothings_config() + discovery_path = pathlib.Path(config.BIOTHINGS_CLI_PATH).resolve().absolute() + path_file = discovery_path.joinpath("biothings_cli.pth") + + if path_file.exists(): + with open(path_file, "r", encoding="utf-8") as handle: + path_entries = handle.readlines() + path_entries = [entry.strip("\n") for entry in path_entries] + sys.path.extend(path_entries) + for path in path_entries: + logger.debug("Adding %s to system path", path) + + @functools.wraps(operation) + def handle_function(*args, **kwargs): + operation_result = operation(*args, **kwargs) + return operation_result + + @functools.wraps(operation) + async def handle_corountine(*args, **kwargs): + operation_result = await operation(*args, **kwargs) + return operation_result + + update_system_path_from_file() + + if inspect.iscoroutinefunction(operation): + return handle_corountine(*args, **kwargs) + return handle_function(*args, **kwargs) + + return update_system_path diff --git a/biothings/cli/operations.py b/biothings/cli/commands/operations.py similarity index 89% rename from biothings/cli/operations.py rename to biothings/cli/commands/operations.py index 0be38234d..d55119ec8 100644 --- a/biothings/cli/operations.py +++ b/biothings/cli/commands/operations.py @@ -47,7 +47,6 @@ """ import asyncio -import functools import logging import multiprocessing import os @@ -57,7 +56,8 @@ import shutil import sys import uuid -from typing import Callable, Optional, Union +from importlib import import_module +from typing import Optional, Union import jsonschema import rich @@ -67,7 +67,8 @@ from rich.console import Console from rich.panel import Panel -from biothings.cli.exceptions import MissingPluginName, UnknownUploaderSource +from biothings.cli.commands.decorators import cli_system_path, get_biothings_config, operation_mode +from biothings.cli.exceptions import UnknownUploaderSource from biothings.cli.structure import 
TEMPLATE_DIRECTORY from biothings.cli.utils import ( clean_dumped_files, @@ -87,65 +88,12 @@ logger = logging.getLogger(name="biothings-cli") -def operation_mode(operation_method: Callable): - """ - Based off the directory structure for where the biothings-cli - was invoked we set the "mode" to one of two states: - - 0) singular - The current working directory contains a singular data-plugin - - In this case we don't require a plugin_name argument to be passed - at the command-line - - 1) hub - The current working directory contains N directories operating as a - "hub" or collection of data-plugins under one umbrella - - In this case we do require a plugin_name argument to be passed - at the command-line. Otherwise we have no idea which data-plugin to - refer to - - We attempt to load the plugin from this working directory. If we sucessfully load - either a manifest or advanced plugin, then we can safely say this is a singular - dataplugin - - If we cannot load either a manifest or advanced plugin then we default assume that - the mode is hub - """ - - @functools.wraps(operation_method) - def determine_operation_mode(*args, **kwargs): - working_directory = pathlib.Path.cwd() - working_directory_files = {file.name for file in working_directory.iterdir()} - - mode = None - if "manifest.json" in working_directory_files or "manifest.yaml" in working_directory_files: - logger.debug("Inferring singular manifest plugin from directory structure") - mode = "SINGULAR" - elif "__init__.py" in working_directory_files: - logger.debug("Inferring singular advanced plugin from directory structure") - mode = "SINGULAR" - else: - logger.debug("Inferring multiple plugins from directory structure") - mode = "HUB" - - if mode == "SINGULAR": - if kwargs.get("plugin_name", None) is not None: - kwargs["plugin_name"] = None - elif mode == "HUB": - if kwargs.get("plugin_name", None) is None: - raise MissingPluginName(working_directory) - - operation_result = operation_method(*args, 
**kwargs) - return operation_result - - return determine_operation_mode +def _load_attr(module_path: str, attr_name: str): + return getattr(import_module(module_path), attr_name) # do not apply operation_mode decorator since this operation means to create a new plugin # regardless what the current working directory has -# @operation_mode def do_create(plugin_name: str, multi_uploaders: bool = False, parallelizer: bool = False): """ Create a new data plugin from the template @@ -178,14 +126,15 @@ def do_create(plugin_name: str, multi_uploaders: bool = False, parallelizer: boo logger.info("Successfully created data plugin template at: %s\n", new_plugin_directory) +@cli_system_path @operation_mode -async def do_dump(plugin_name: Optional[str] = None, show_dumped: bool = True) -> None: +async def do_dump(plugin_name: Optional[str] = None, show_dumped: bool = True, mark_success: bool = False) -> None: """ Perform dump for the given plugin """ - from biothings import config - from biothings.cli.assistant import CLIAssistant - from biothings.utils import hub_db + config = get_biothings_config() + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") + hub_db = import_module("biothings.utils.hub_db") hub_db.setup(config) assistant_instance = CLIAssistant(plugin_name) @@ -206,11 +155,15 @@ async def do_dump(plugin_name: Optional[str] = None, show_dumped: bool = True) - ) logger.warning(attribute_warning) - dump_job = dumper_instance.dump( - job_manager=assistant_instance.job_manager, - force=False, - ) - await asyncio.gather(dump_job) + if mark_success: + logger.warning("Marking dump as successful without running the dumper") + dumper_instance.mark_success(dry_run=True) + else: + dump_job = dumper_instance.dump( + job_manager=assistant_instance.job_manager, + force=False, + ) + await asyncio.gather(dump_job) dp = hub_db.get_data_plugin() dp.remove({"_id": assistant_instance.plugin_name}) @@ -223,6 +176,7 @@ async def do_dump(plugin_name: Optional[str] = None, 
show_dumped: bool = True) - show_dumped_files(data_folder, assistant_instance.plugin_name) +@cli_system_path @operation_mode async def do_upload(plugin_name: Optional[str] = None, batch_limit: int = 10000, show_uploaded: bool = True) -> None: """ @@ -235,8 +189,7 @@ async def do_upload(plugin_name: Optional[str] = None, batch_limit: int = 10000, >>> self.commands["upload_all"] = self.managers["upload_manager"].upload_all >>> self.commands["update_source_meta"] = self.managers["upload_manager"].update_source_meta """ - from biothings.cli.assistant import CLIAssistant - + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") assistant_instance = CLIAssistant(plugin_name) uploader_classes = assistant_instance.get_uploader_class() for uploader_class in uploader_classes: @@ -277,6 +230,7 @@ async def do_upload(plugin_name: Optional[str] = None, batch_limit: int = 10000, show_uploaded_sources(pathlib.Path(assistant_instance.plugin_directory), assistant_instance.plugin_name) +@cli_system_path @operation_mode async def do_parallel_upload( plugin_name: Optional[str] = None, batch_limit: int = 10000, show_uploaded: bool = True @@ -293,8 +247,7 @@ async def do_parallel_upload( This is a modified version of the ParallelUploader `update_data` source call """ - from biothings.cli.assistant import CLIAssistant - + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") assistant_instance = CLIAssistant(plugin_name) uploader_classes = assistant_instance.get_uploader_class() for uploader_class in uploader_classes: @@ -344,6 +297,7 @@ async def do_parallel_upload( show_uploaded_sources(pathlib.Path(assistant_instance.plugin_directory), assistant_instance.plugin_name) +@cli_system_path @operation_mode async def do_dump_and_upload(plugin_name: str) -> None: """ @@ -354,6 +308,7 @@ async def do_dump_and_upload(plugin_name: str) -> None: logger.info("[green]Success![/green] :rocket:", extra={"markup": True}) +@cli_system_path @operation_mode async def 
do_index(plugin_name: Optional[str] = None, sub_source_name: Optional[str] = None) -> None: """ @@ -416,10 +371,10 @@ async def do_index(plugin_name: Optional[str] = None, sub_source_name: Optional[ The default location is localhost:9200. If successful a couple frames detailing the build and index information will be displayed to the enduser """ - from biothings import config - from biothings.cli.assistant import CLIAssistant - from biothings.hub.databuild.builder import BuilderException - from biothings.utils.manager import JobManager + config = get_biothings_config() + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") + BuilderException = _load_attr("biothings.hub.databuild.builder", "BuilderException") + JobManager = _load_attr("biothings.utils.manager", "JobManager") if platform.system() == "Windows": logger.warning("The `biothings-cli dataplugin index` command isn't supported on windows") @@ -540,6 +495,7 @@ async def do_index(plugin_name: Optional[str] = None, sub_source_name: Optional[ await show_source_index(index_name, assistant_instance.index_manager, elasticsearch_mapping) +@cli_system_path @operation_mode async def do_list( plugin_name: Optional[str] = None, dump: bool = True, upload: bool = True, hubdb: bool = False @@ -547,8 +503,7 @@ async def do_list( """ List the dumped files, uploaded sources, or hubdb content. """ - from biothings.cli.assistant import CLIAssistant - + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") assistant_instance = CLIAssistant(plugin_name) if dump: dumper_instance = assistant_instance.get_dumper_class() @@ -569,8 +524,9 @@ async def do_list( show_hubdb_content() +@cli_system_path @operation_mode -async def do_inspect( +async def do_inspect( # pylint: disable=too-many-arguments,too-many-positional-arguments plugin_name: Optional[str] = None, sub_source_name: Optional[str] = None, mode: str = "type,stats", @@ -581,8 +537,7 @@ async def do_inspect( """ Perform inspection on a data plugin. 
""" - from biothings.cli.assistant import CLIAssistant - + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") assistant_instance = CLIAssistant(plugin_name) uploader_classes = assistant_instance.get_uploader_class() @@ -633,14 +588,15 @@ async def do_inspect( write_mapping_to_file(sub_output, inspection_mapping) +@cli_system_path @operation_mode async def do_serve(plugin_name: Optional[str] = None, host: str = "localhost", port: int = 9999): """ Handles creation of a basic web server for hosting files using for a dataplugin """ - from biothings.cli.assistant import CLIAssistant - from biothings.cli.web_app import main - from biothings.utils import hub_db + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") + main = _load_attr("biothings.cli.web_app", "main") + hub_db = import_module("biothings.utils.hub_db") assistant_instance = CLIAssistant(plugin_name) uploader_classes = assistant_instance.get_uploader_class() @@ -651,6 +607,7 @@ async def do_serve(plugin_name: Optional[str] = None, host: str = "localhost", p await main(host=host, port=port, db=src_db, table_space=table_space) +@cli_system_path @operation_mode async def do_clean( plugin_name: Optional[str] = None, dump: bool = False, upload: bool = False, clean_all: bool = False @@ -658,8 +615,7 @@ async def do_clean( """ Clean the dumped files, uploaded sources, or both. 
""" - from biothings.cli.assistant import CLIAssistant - + CLIAssistant = _load_attr("biothings.cli.assistant", "CLIAssistant") if clean_all: dump = True upload = True @@ -688,8 +644,7 @@ async def display_schema(): Loads the jsonschema definition file and displays it to the console """ - from biothings.hub.dataplugin.loaders.schema import load_manifest_schema - + load_manifest_schema = _load_attr("biothings.hub.dataplugin.loaders.schema", "load_manifest_schema") manifest_schema = load_manifest_schema() schema_validator = jsonschema.validators.validator_for(manifest_schema) valid_schema = False @@ -714,14 +669,14 @@ async def display_schema(): console.print(panel) +@cli_system_path @operation_mode async def validate_manifest(plugin_name: Optional[str] = None): """ Loads the manifest file and validates it against the schema file If an error exists it will display the error to the enduser """ - from biothings.hub.dataplugin.loaders.loader import ManifestBasedPluginLoader - + ManifestBasedPluginLoader = _load_attr("biothings.hub.dataplugin.loaders.loader", "ManifestBasedPluginLoader") if plugin_name is None: plugin_directory = pathlib.Path.cwd().resolve().absolute() plugin_name = plugin_directory.name diff --git a/biothings/cli/commands/pathing.py b/biothings/cli/commands/pathing.py new file mode 100644 index 000000000..3874752bb --- /dev/null +++ b/biothings/cli/commands/pathing.py @@ -0,0 +1,168 @@ +""" +Module for creating the cli interface for the path interface +""" + +import logging +import pathlib +import sys + +import typer +from rich.console import Console +from rich.table import Table + +from biothings.cli.commands.decorators import cli_system_path, get_biothings_config, operation_mode + +SHORT_HELP = ( + "[green]CLI tool for viewing the python system path and adding external directories to the system path[/green]" +) +FULL_HELP = ( + SHORT_HELP + + "\n\n[magenta] :sparkles: Run from an existing data plugin folder to evaluate a singular data 
plugin.[/magenta]" +) +path_application = typer.Typer( + help=FULL_HELP, + short_help=SHORT_HELP, + no_args_is_help=True, + rich_markup_mode="rich", +) + +logger = logging.getLogger(name="biothings-cli") + + +@path_application.command(name="view") +def view_system_path() -> None: + """ + View the system paths current discovered by python, along with potential hub directories of interest + that the user may wish to add to the system path for usage in data plugin testing + """ + display_system_paths() + + +@path_application.command(name="add") +def add_parser_to_system_path() -> None: + """ + Add discovered hub directory paths to the python system path for aiding in testing various data plugins + Creates the file "bt_custom.pth" (uses .pth extension to mimic the `site` module internal to + python). It creates this file in the .biothings_hub/path directory. If found while running a + command, then the paths in the file with be added the system path prior to executing the command + """ + update_system_paths() + display_system_paths() + + +@path_application.command(name="remove") +def remove_parser_from_system_path() -> None: + """ + Remove the hub directories discovered from the python system path + Simply removes the bt_custom.pth file from the biothings-cli directory + """ + remove_system_paths() + + +@cli_system_path +@operation_mode +def display_system_paths() -> None: + """ + Method for displaying the system path information used for the + biothing-cli application + + External method so we can call it from multiple typer commands + """ + path_table = Table(title="Python System Path(s)") + + path_table.add_column("Index", style="cyan") + path_table.add_column("Paths", style="green") + + system_paths = sys.path + for index, system_path in enumerate(system_paths): + path_table.add_row(str(index), str(system_path)) + + parser_table = Table(title="External Parser Path(s)") + + parser_table.add_column("Index", style="cyan") + parser_table.add_column("Paths", 
style="magenta") + parser_table.add_column("On System Path?", style="steel_blue1") + + hub_parser_paths = find_hub_parsers() + for index, parser_path in enumerate(hub_parser_paths): + parser_table.add_row(str(index), str(parser_path), str(str(parser_path.parent) in system_paths)) + + console = Console() + console.print(path_table) + console.print(parser_table) + + +@cli_system_path +@operation_mode +def update_system_paths() -> None: + config = get_biothings_config() + discovery_path = pathlib.Path(config.BIOTHINGS_CLI_PATH).resolve().absolute() + discovery_path.mkdir(parents=True, exist_ok=True) + + hub_parser_paths = find_hub_parsers() + + # The actual path that needs to be added is the parent of the hub directory + hub_parser_paths = [path.parent for path in hub_parser_paths] + + path_file = discovery_path.joinpath("biothings_cli.pth") + with open(path_file, "w", encoding="utf-8") as path_handle: + for parser_path in hub_parser_paths: + logger.info("Adding %s -> %s", parser_path, path_file) + path_handle.write(f"{parser_path}\n") + + +@cli_system_path +@operation_mode +def remove_system_paths() -> None: + config = get_biothings_config() + discovery_path = pathlib.Path(config.BIOTHINGS_CLI_PATH).resolve().absolute() + path_file = discovery_path.joinpath("biothings_cli.pth") + path_file.unlink(missing_ok=True) + + hub_parser_paths = find_hub_parsers() + for parser_path in hub_parser_paths: + try: + sys.path.remove(str(parser_path)) + except ValueError: + pass + + +def find_hub_parsers(upward_depth: int = 2) -> list[pathlib.Path]: + """ + Attempts to locate any potential hub-based parsers that are use across different plugins + within a shared hub instance + + Will attempt to traverse recursively by levels (defaults to 2 levels) above the present working directory + The typical hub structure has the plugins directory at the same level as the hub directory + + pending.api structure: + root + ├── hub + ├── plugins + + (mygene, mychem, myvariant, ...) 
structure + root + ├── src + │   ├── hub + │   ├── plugins + + In either structure, the user is expected to be operating within the directory of a specific + plugin (root/plugin/plugin_directory/) or acting as a HUB within the (root/plugin) directory + Either case we should be able to find the shared parsers within 2 upper levels + """ + directory_pointer = pathlib.Path.cwd() + + traversal_counter = 0 + external_parser_paths = [] + + # Match any path ending explicitly in hub. The bracket "[a]" matches the character literal + # enclosed in the bracket, so [h][u][b] matches the literal hub + match_expr = "**/[h][u][b]" + while traversal_counter < upward_depth: + directory_pointer = directory_pointer.parent + for hub_path in directory_pointer.glob(match_expr): + hub_dataload = hub_path.joinpath("dataload") + if hub_dataload.exists(): + external_parser_paths.append(hub_path.resolve().absolute()) + traversal_counter += 1 + return external_parser_paths diff --git a/biothings/cli/configuration/config.py.sample b/biothings/cli/configuration/config.py.sample deleted file mode 100644 index 5e89c5d3a..000000000 --- a/biothings/cli/configuration/config.py.sample +++ /dev/null @@ -1,47 +0,0 @@ -######################################## -# DATA PLUGIN CONFIGURATION VARIABLES # -######################################## -# Typicaly, you don't need to include a config.py module to run the BioThings CLI tool to -# test your data plugin locally. A default config module is setup at the launch of the CLI. -# However, you can always include an additional config.py module to override the default -# config settings, e.g. alternative DATA_ARCHIVE_ROOT, HUB_DB_BACKEND for different db path. 
-# The available config settings can be found at biothings.hub.default_config module (note that -# not all settings are relevant to the CLI) - -# This file should be place at the same directory with developed data plugin: - -# When using dataplugin-hub sub commands -# $ ls -al -# config.py -# .biothings_hub -# .data_src_database -# archive -# biothings_hubdb -# your_data_plugin_folder -# manifest.json -# parser.py - -# When using dataplugin sub commands inside a data plugin folder -# $ ls -al -# config.py -# .biothings_hub -# .data_src_database -# archive -# biothings_hubdb -# manifest.json -# parser.py - -DATA_SRC_DATABASE = '.data_src_database' -DATA_HUB_DB_DATABASE = 'data_hub_db_database' -HUB_DB_BACKEND = { - "module": "biothings.utils.sqlite3", - "sqlite_db_folder": ".biothings_hub"" -} -DATA_ARCHIVE_ROOT = ".biothings_hub/archive" - -# Add new entry in DOCKER_CONFIG if you want to use a different docker host for your -# docker-based data plugin, other than the default docker host running on your localhost. -# DOCKER_CONFIG = { -# "docker1": {"tls_cert_path": None, "tls_key_path": None, "client_url": ""}, -# "localhost": {"client_url": "unix://var/run/docker.sock"}, -# } diff --git a/biothings/cli/manager.py b/biothings/cli/manager.py index 511903b9a..eb0cd565a 100644 --- a/biothings/cli/manager.py +++ b/biothings/cli/manager.py @@ -29,13 +29,13 @@ async def defer_to_process(self, pinfo=None, func=None, *args, **kwargs): async def defer_to_thread(self, pinfo=None, func=None, *args): """keep the same signature as JobManager.defer_to_thread. 
The passed pinfo is ignored""" - async def run(fut, func): + async def run(fut, func, *args): try: - res = func() + res = func(*args) fut.set_result(res) except Exception as gen_exc: fut.set_exception(gen_exc) fut = self.loop.create_future() - self.loop.create_task(run(fut, func)) + self.loop.create_task(run(fut, func, *args)) return fut diff --git a/biothings/cli/settings.py b/biothings/cli/settings.py deleted file mode 100644 index 4cb9416ce..000000000 --- a/biothings/cli/settings.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Configuration settings for the biothings-cli tool - -> Logging -> Tool Configuration - > Creates a mock config used in the biothings.api backend -""" - -from typing import Literal -import importlib -import importlib.util -import logging -import os -import pathlib -import sys - -from rich.logging import RichHandler -import typer - - -from biothings.utils.common import DummyConfig -from biothings.utils.configuration import ConfigurationError - - -def setup_commandline_configuration(debug: bool, rich_traceback: bool) -> typer.Typer: - """ - Sets up the typer command line tooling - """ - pretty_exceptions_show_locals = False - pretty_exceptions_enable = False - sys.tracebacklimit = 1 - - if rich_traceback: - pretty_exceptions_enable = True - sys.tracebacklimit = 1000 - - if debug: - pretty_exceptions_enable = True - pretty_exceptions_show_locals = True - sys.tracebacklimit = 1000 - - # prevent dimming the help text from the 2nd line - # see: https://github.com/tiangolo/typer/issues/437#issuecomment-1224149402 - typer.rich_utils.STYLE_HELPTEXT = "" - - context_settings = {"help_option_names": ["-h", "--help"]} - typer_instance = typer.Typer( - help="[green]BioThings Admin CLI to test your local data plugins. 
See helps for each command for specific usage.[/green]", - rich_help_panel="Help and Others", - rich_markup_mode="rich", - context_settings=context_settings, - no_args_is_help=True, - pretty_exceptions_show_locals=pretty_exceptions_show_locals, - pretty_exceptions_enable=pretty_exceptions_enable, - ) - - return typer_instance - - -def setup_logging_configuration(logging_level: Literal[10, 20, 30, 40, 50]) -> None: - """ - Configures the logging based off our environment configuration - """ - rich_handler = RichHandler( - level=logging_level, - markup=True, - rich_tracebacks=False, # typer creates it already - show_path=False, - tracebacks_suppress=[typer], - ) - logging.basicConfig(level=logging_level, format="%(message)s", datefmt="[%X]", handlers=[rich_handler]) - - -def setup_biothings_configuration(): - """ - Setup a config module necessary to launch the CLI - - Depending on the backend hub database, the order of configuration - matters. If we attempt to load a module that checks for the configuration - we'll have to ensure that the configuration is properly configured prior - to loading the module - """ - working_dir = pathlib.Path().resolve() - configuration_instance = DummyConfig("config") - - try: - config_mod = importlib.import_module("config") - for attr in dir(config_mod): - value = getattr(config_mod, attr) - if isinstance(value, ConfigurationError): - raise ConfigurationError(f"{attr}: {value}") - setattr(configuration_instance, attr, value) - except ModuleNotFoundError: - logging.debug(ModuleNotFoundError) - logging.debug("Unable to find `config` module. 
Using the default configuration") - finally: - sys.modules["config"] = configuration_instance - sys.modules["biothings.config"] = configuration_instance - - configuration_instance.HUB_DB_BACKEND = { - "module": "biothings.utils.sqlite3", - "sqlite_db_folder": ".biothings_hub", - } - configuration_instance.DATA_SRC_SERVER = "localhost" - configuration_instance.DATA_SRC_DATABASE = "data_src_database" - configuration_instance.DATA_ARCHIVE_ROOT = ".biothings_hub/archive" - configuration_instance.LOG_FOLDER = ".biothings_hub/logs" - configuration_instance.DATA_PLUGIN_FOLDER = f"{working_dir}" - - configuration_instance.DATA_TARGET_SERVER = "localhost" - configuration_instance.DATA_TARGET_PORT = 27017 - configuration_instance.DATA_TARGET_DATABASE = "plugin-hub" - configuration_instance.INDEX_CONFIG = { - "indexer_select": {}, - "env": { - "commandhub": { - "host": "http://localhost:9200", - "indexer": {"args": {"request_timeout": 300, "retry_on_timeout": True, "max_retries": 10}}, - } - }, - } - - # job manager configuration properties - configuration_instance.RUN_DIR = pathlib.Path().cwd() - configuration_instance.HUB_MAX_WORKERS = os.cpu_count() - configuration_instance.MAX_QUEUED_JOBS = 1000 - - try: - configuration_instance.hub_db = importlib.import_module(configuration_instance.HUB_DB_BACKEND["module"]) - except ImportError as import_err: - logging.exception(import_err) - raise import_err - - return configuration_instance diff --git a/biothings/cli/templates/manifest.yaml.tpl b/biothings/cli/templates/manifest.yaml.tpl index ce53daa28..d398efbb2 100644 --- a/biothings/cli/templates/manifest.yaml.tpl +++ b/biothings/cli/templates/manifest.yaml.tpl @@ -4,7 +4,7 @@ display_name: # Optional. This will be displayed as friendly name on the Biothin biothing_type: # Optional. Can be used to provide the default value to some hub functions (e.g. in quick_index as the default doc_type value. __metadata__: # Optional. license_url: https://example.com/ # Optional. 
Put your license url here - licence: ABCXYZ # Optional. Your license name + license: ABCXYZ # Optional. Your license name url: https://example.com/ # Your site url description: # Optional. More description for this data plugin requires: # Optional. Listing all extra packages if need diff --git a/biothings/hub/__init__.py b/biothings/hub/__init__.py index a792c45db..faf716055 100644 --- a/biothings/hub/__init__.py +++ b/biothings/hub/__init__.py @@ -101,7 +101,7 @@ def _config_for_app(config_mod=None): # _config.HUB_DB_BACKEND = { # "module": "biothings.utils.sqlite3", # "sqlite_db_folder": "."} -# _config.DATA_HUB_DB_DATABASE = ".hubdb" +# _config.DATA_HUB_DB_DATABASE = "biothings_hubdb" # _config_for_app(_config) @@ -534,6 +534,14 @@ def configure(self): def configure_ioloop(self): import tornado.platform.asyncio + # In Python 3.14, get_event_loop raises a RuntimeError if there is no current event loop. + # Eventually this probably should not be needed when tornado handles this internally. 
+ try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + tornado.platform.asyncio.AsyncIOMainLoop().install() def before_start(self): @@ -693,6 +701,7 @@ def configure_index_manager(self): def configure_snapshot_manager(self): assert "index" in self.features, "'snapshot' feature requires 'index'" + from biothings.hub.dataindex.mongo_build_cleanup import MongoBuildCleanupManager from biothings.hub.dataindex.snapshooter import SnapshotManager args = self.mixargs("snapshot") @@ -705,6 +714,7 @@ def configure_snapshot_manager(self): snapshot_manager.configure(config.SNAPSHOT_CONFIG) snapshot_manager.poll("snapshot", snapshot_manager.snapshot_a_build) self.managers["snapshot_manager"] = snapshot_manager + self.managers["mongo_build_cleanup_manager"] = MongoBuildCleanupManager(job_manager=self.managers["job_manager"]) def configure_auto_snapshot_cleaner_manager(self): assert "snapshot" in self.features, "'auto_snapshot_cleaner' feature requires 'snapshot'" @@ -1140,6 +1150,10 @@ def configure_commands(self): self.commands["list_snapshots"] = self.managers["snapshot_manager"].list_snapshots self.commands["delete_snapshots"] = self.managers["snapshot_manager"].delete_snapshots self.commands["validate_snapshots"] = self.managers["snapshot_manager"].validate_snapshots + if self.managers.get("mongo_build_cleanup_manager"): + self.commands["list_mongo_builds"] = self.managers["mongo_build_cleanup_manager"].list_mongo_builds + self.commands["delete_mongo_builds"] = self.managers["mongo_build_cleanup_manager"].delete_mongo_builds + self.commands["validate_mongo_builds"] = self.managers["mongo_build_cleanup_manager"].validate_mongo_builds # data release commands if self.managers.get("release_manager"): self.commands["create_release_note"] = self.managers["release_manager"].create_release_note @@ -1506,6 +1520,16 @@ def configure_api_endpoints(self): ) if "validate_snapshots" in cmdnames: 
self.api_endpoints["validate_snapshots"] = EndpointDefinition(name="validate_snapshots", method="post") + if "list_mongo_builds" in cmdnames: + self.api_endpoints["mongo_builds"] = EndpointDefinition(name="list_mongo_builds", method="get") + if "delete_mongo_builds" in cmdnames: + self.api_endpoints["mongo_builds/delete"] = EndpointDefinition( + name="delete_mongo_builds", method="put", force_bodyargs=True + ) + if "validate_mongo_builds" in cmdnames: + self.api_endpoints["mongo_builds/validate"] = EndpointDefinition( + name="validate_mongo_builds", method="post" + ) if "sync" in cmdnames: self.api_endpoints["sync"] = EndpointDefinition(name="sync", method="post", force_bodyargs=True) if "whatsnew" in cmdnames: diff --git a/biothings/hub/api/handlers/base.py b/biothings/hub/api/handlers/base.py index 3746ef9c1..1da37fb39 100644 --- a/biothings/hub/api/handlers/base.py +++ b/biothings/hub/api/handlers/base.py @@ -6,10 +6,10 @@ # see https://github.com/biothings/biothings.api/commit/59c0d78f758018b0d87836657a2b5d1a700503a1 # import pandas.io.json as pdjson # replace pandas json encoder with orjson: -import orjson from tornado.web import RequestHandler from biothings import config +from biothings.utils import serializer class DefaultHandler(RequestHandler): @@ -26,10 +26,9 @@ def write(self, result): # "result": result, # "status": "ok" # }, iso_dates=True) - orjson.dumps( - {"result": result, "status": "ok"}, - option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC, - ).decode() + serializer.to_json({ + "result": result, "status": "ok" + }) ) def write_error(self, status_code, **kwargs): diff --git a/biothings/hub/autoupdate/uploader.py b/biothings/hub/autoupdate/uploader.py index 439f03716..1a358c347 100644 --- a/biothings/hub/autoupdate/uploader.py +++ b/biothings/hub/autoupdate/uploader.py @@ -348,13 +348,13 @@ async def apply_diff(self, build_meta, job_manager, **kwargs): # ---------------------------------------- # self.target_backend.target_name, # 
---------------------------------------- - self.target_backend.target_esidxer._doc_type, + # self.target_backend.target_esidxer._doc_type, # remove the use of doc_type here, remove this line if confirmed ) # new: index's data we will reach once updated (just informative) new = ( self.target_backend.target_esidxer.es_host, meta["new"]["backend"], - self.target_backend.target_esidxer._doc_type, + # self.target_backend.target_esidxer._doc_type, # remove the use of doc_type here, remove this line if confirmed ) await self.syncer_func(old_db_col_names=old, new_db_col_names=new, diff_folder=self.data_folder) # return current number of docs in index (even if diff update) diff --git a/biothings/hub/databuild/backend.py b/biothings/hub/databuild/backend.py index a86de24c9..0b41ae32c 100644 --- a/biothings/hub/databuild/backend.py +++ b/biothings/hub/databuild/backend.py @@ -315,7 +315,6 @@ def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs): is_mongo = False idxr = ESIndexer( index=db_col_names[1], - doc_type=db_col_names[2], es_host=db_col_names[0], **kwargs, ) diff --git a/biothings/hub/databuild/builder.py b/biothings/hub/databuild/builder.py index f9275b147..46b5bbac6 100644 --- a/biothings/hub/databuild/builder.py +++ b/biothings/hub/databuild/builder.py @@ -20,6 +20,15 @@ from biothings import config as btconfig from biothings.hub import BUILDER_CATEGORY, UPLOADER_CATEGORY +from biothings.hub.databuild.backend import ( + LinkTargetDocMongoBackend, + SourceDocMongoBackend, + TargetDocMongoBackend, + create_backend, +) +from biothings.hub.databuild.buildconfig import AutoBuildConfig +from biothings.hub.databuild.mapper import TransparentMapper +from biothings.hub.dataload.uploader import ResourceNotReady from biothings.hub.manager import BaseManager from biothings.utils import mongo from biothings.utils.backend import DocMongoBackend @@ -36,24 +45,14 @@ get_source_fullname, get_src_build, get_src_build_config, + get_src_db, get_src_dump, 
get_src_master, - get_src_db, ) from biothings.utils.loggers import get_logger from biothings.utils.manager import JobManager from biothings.utils.mongo import doc_feeder, id_feeder -from biothings.hub.databuild.backend import ( - LinkTargetDocMongoBackend, - SourceDocMongoBackend, - TargetDocMongoBackend, - create_backend, -) -from biothings.hub.databuild.buildconfig import AutoBuildConfig -from biothings.hub.databuild.mapper import TransparentMapper -from biothings.hub.dataload.uploader import ResourceNotReady - logging = btconfig.logger @@ -1618,7 +1617,6 @@ def build_info( only_archived=True will return archived merges only status: will return only successful/failed builds. Can be "success" or "failed" """ - res = {} q = self.get_query_for_list_merge(only_archived=only_archived, status=status) if id is not None: q = {"_id": id} @@ -1640,6 +1638,12 @@ def build_info( b["status"] = "unknown" if jobs: b["status"] = jobs[-1]["status"] + stored_total = b.get("_meta", {}).get("stats", {}).get("total") + if stored_total is not None: + b["count"] = stored_total + continue + + # Fallback for older build docs missing _meta.stats.total. 
try: backend = create_backend(b["backend_url"]) b["count"] = backend.count() diff --git a/biothings/hub/databuild/syncer.py b/biothings/hub/databuild/syncer.py index 2935e411b..916e4ad17 100644 --- a/biothings/hub/databuild/syncer.py +++ b/biothings/hub/databuild/syncer.py @@ -202,7 +202,7 @@ async def sync_cols( if diff_mapping_file: # old_db_col_names is actually the index name in that case index_name = old_db_col_names[1] - doc_type = self._meta["build_config"]["doc_type"] + # doc_type = self._meta["build_config"]["doc_type"] # remove doc_type, delete the line above after confirmed indexer = create_backend(old_db_col_names).target_esidxer pinfo["step"] = "mapping" pinfo["description"] = diff_mapping_file @@ -212,7 +212,8 @@ def update_mapping(): ops = loadobj(diffm) mapping = indexer.get_mapping() # we should have the same doc type declared in the mapping - mapping[doc_type]["properties"] = jsonpatch.apply_patch(mapping[doc_type]["properties"], ops) + # mapping[doc_type]["properties"] = jsonpatch.apply_patch(mapping[doc_type]["properties"], ops) + mapping["properties"] = jsonpatch.apply_patch(mapping["properties"], ops) # remove doc_type, delete the line above after confirmed res = indexer.update_mapping(mapping) return res @@ -311,7 +312,7 @@ def synced(f): if "meta" in steps and self.target_backend_type == "es": # old_db_col_names is actually the index name in that case index_name = old_db_col_names[1] - doc_type = self._meta["build_config"]["doc_type"] + # doc_type = self._meta["build_config"]["doc_type"] # remove doc_type, delete this line after confirmed indexer = create_backend(old_db_col_names).target_esidxer new_meta = self._meta["_meta"] pinfo["step"] = "metadata" @@ -619,13 +620,13 @@ def sync_es_jsondiff_worker( # (not allowed within an ES document (_source)) [d.pop("_timestamp", None) for d in docs] try: - res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0] + res["added"] += indexer.index_bulk(docs, batch_size, 
op_type="create")[0] except BulkIndexError: for doc in docs: _id = doc.pop("_id") try: # force action=create to spot docs already added - indexer.index(doc, _id, action="create") + indexer.index(doc, _id, op_type="create") res["added"] += 1 except ConflictError: # already added diff --git a/biothings/hub/dataindex/indexer_task.py b/biothings/hub/dataindex/indexer_task.py index d9f4a25a4..cfcfb52a4 100644 --- a/biothings/hub/dataindex/indexer_task.py +++ b/biothings/hub/dataindex/indexer_task.py @@ -9,6 +9,7 @@ from biothings.utils.es import ESIndex as BaseESIndex from biothings.utils.loggers import get_logger +from biothings.utils.serializer import to_json try: from biothings.utils.mongo import doc_feeder @@ -92,8 +93,12 @@ def _action(doc): self.logger.error(error) self.logger.error("Document ID %s failed: %s", document_id, reason) - self.logger.warning("Discovered errors during the bulk index task. Defaulting to 0 indexed documents") - return 0 + serialized_errors = to_json(errors, indent=True) + message = ( + f"Bulk indexing failed for index '{self.index_name}'. " + f"Elasticsearch responded with errors:\n{serialized_errors}" + ) + raise helpers.BulkIndexError(message, errors) from e # NOTE # Why doesn't "mget", "mexists", "mindex" belong to the base class? 
diff --git a/biothings/hub/dataindex/mongo_build_cleanup.py b/biothings/hub/dataindex/mongo_build_cleanup.py new file mode 100644 index 000000000..71eb086f5 --- /dev/null +++ b/biothings/hub/dataindex/mongo_build_cleanup.py @@ -0,0 +1,182 @@ +from datetime import datetime +from functools import partial + +from config import logger as logging + +from biothings import config as btconfig +from biothings.hub.manager import BaseManager +from biothings.utils import mongo +from biothings.utils.hub_db import get_src_build + + +class MongoBuildCleaner: + def __init__(self, job_manager): + self.job_manager = job_manager + + def list_builds(self, build_config=None, build_name=None, year=None): + collection = get_src_build() + + filters = {} + if build_config: + filters["build_config._id"] = build_config + if build_name: + filters["_id"] = build_name + if year: + year = int(year) + filters["started_at"] = { + "$gte": datetime(year, 1, 1), + "$lt": datetime(year + 1, 1, 1), + } + + projection = { + "_id": 1, + "build_config": 1, + "started_at": 1, + "archived": 1, + "target_name": 1, + } + builds = list(collection.find(filters, projection).sort("started_at", -1)) + + grouped = {} + for build in builds: + group_name = build.get("build_config", {}).get("_id") or "N/A" + grouped.setdefault(group_name, []).append(build) + + return [{"_id": key, "items": items} for key, items in grouped.items()] + + async def delete_builds(self, build_ids): + if not build_ids: + return { + "deleted_count": 0, + "target_collections_deleted_count": 0, + "target_collections_deleted": [], + } + + conn = mongo.get_hub_db_async_conn() + try: + src_build = mongo.get_src_build_async(conn) + + build_docs = [] + async for doc in src_build.find({"_id": {"$in": build_ids}}, {"_id": 1, "target_name": 1}): + build_docs.append(doc) + + target_collection_candidates = set() + for doc in build_docs: + build_id = doc["_id"] + target_name = doc.get("target_name") + target_collection_candidates.add(build_id) + if 
target_name and target_name != build_id: + target_collection_candidates.add(target_name) + + target_collections_deleted = [] + if target_collection_candidates: + target_db = conn[btconfig.DATA_TARGET_DATABASE] + existing_collections = await target_db.list_collection_names( + filter={"name": {"$in": list(target_collection_candidates)}} + ) + + for collection_name in existing_collections: + await target_db[collection_name].drop() + target_collections_deleted.append(collection_name) + + result = await src_build.delete_many({"_id": {"$in": build_ids}}) + return { + "deleted_count": result.deleted_count, + "target_collections_deleted_count": len(target_collections_deleted), + "target_collections_deleted": sorted(target_collections_deleted), + } + finally: + await conn.close() + + async def validate_builds(self): + """Validate that target collections exist for each build record. + + Checks every build in src_build to see if its target collection still + exists in the target database. Build records whose target collections + have been removed are deleted, keeping the database in sync with the + actual data. + + Returns a dict with ``builds_removed`` (count) and ``builds_removed_names``. 
+ """ + logging.info("Starting validation of MongoDB builds...") + conn = mongo.get_hub_db_async_conn() + try: + src_build = mongo.get_src_build_async(conn) + target_db = conn[btconfig.DATA_TARGET_DATABASE] + + existing_collections = set(await target_db.list_collection_names()) + + orphaned_ids = [] + async for doc in src_build.find({}, {"_id": 1, "target_name": 1}): + build_id = doc["_id"] + target_name = doc.get("target_name") or build_id + if target_name not in existing_collections: + orphaned_ids.append(build_id) + + if orphaned_ids: + result = await src_build.delete_many({"_id": {"$in": orphaned_ids}}) + deleted_count = result.deleted_count + else: + deleted_count = 0 + + logging.info( + "Build validation complete: removed %d orphaned build record(s)", + deleted_count, + extra={"notify": True}, + ) + return { + "builds_removed": deleted_count, + "builds_removed_names": sorted(orphaned_ids), + } + finally: + await conn.close() + + def done(self, future): + try: + result = future.result() + logging.info( + "Deleted %d MongoDB builds and dropped %d target collections", + result.get("deleted_count", 0), + result.get("target_collections_deleted_count", 0), + extra={"notify": True}, + ) + except Exception as exc: + logging.exception("Failed to delete MongoDB builds: %s", exc, extra={"notify": True}) + + def validate_done(self, future): + try: + result = future.result() + logging.info( + "Build validation complete: removed %d orphaned build record(s)", + result.get("builds_removed", 0), + extra={"notify": True}, + ) + except Exception as exc: + logging.exception("Failed to validate MongoDB builds: %s", exc, extra={"notify": True}) + + +class MongoBuildCleanupManager(BaseManager): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cleaner = MongoBuildCleaner(self.job_manager) + + def list_mongo_builds(self, build_config=None, build_name=None, year=None): + return self.cleaner.list_builds(build_config=build_config, build_name=build_name, 
year=year) + + def delete_mongo_builds(self, build_ids): + try: + job = self.job_manager.submit(partial(self.cleaner.delete_builds, build_ids)) + job.add_done_callback(self.cleaner.done) + except Exception as ex: + logging.exception("Error while submitting MongoDB build deletion job: %s", ex, extra={"notify": True}) + raise + return job + + def validate_mongo_builds(self): + try: + job = self.job_manager.submit(partial(self.cleaner.validate_builds)) + job.add_done_callback(self.cleaner.validate_done) + except Exception as ex: + logging.exception("Error while submitting MongoDB build validation job: %s", ex, extra={"notify": True}) + raise + return job diff --git a/biothings/hub/dataindex/snapshooter.py b/biothings/hub/dataindex/snapshooter.py index 153c87659..7f62cfeb2 100644 --- a/biothings/hub/dataindex/snapshooter.py +++ b/biothings/hub/dataindex/snapshooter.py @@ -8,9 +8,8 @@ from functools import partial import boto3 -from config import logger as logging from elasticsearch import Elasticsearch -from elasticsearch.exceptions import TransportError, NotFoundError +from elasticsearch.exceptions import NotFoundError, TransportError from biothings import config as btconfig from biothings.hub import SNAPSHOOTER_CATEGORY @@ -22,6 +21,7 @@ from biothings.utils.hub import template_out from biothings.utils.hub_db import get_src_build from biothings.utils.loggers import get_logger +from config import logger as logging from . 
import snapshot_cleanup as cleaner, snapshot_registrar as registrar from .snapshot_repo import Repository diff --git a/biothings/hub/dataload/dumper.py b/biothings/hub/dataload/dumper.py index 44476fedf..b4e44c785 100644 --- a/biothings/hub/dataload/dumper.py +++ b/biothings/hub/dataload/dumper.py @@ -20,6 +20,8 @@ from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union from urllib import parse as urlparse +from biothings.utils import serializer + try: import docker from docker.errors import ImageNotFound, NotFound, NullResource @@ -28,7 +30,6 @@ except ImportError: docker_avail = False -import orjson import requests from biothings import config as btconfig @@ -873,9 +874,9 @@ class HTTPDumper(BaseDumper): Dumper using HTTP protocol and "requests" library """ - VERIFY_CERT = True - IGNORE_HTTP_CODE = [] # list of HTTP code to ignore in case on non-200 response - RESOLVE_FILENAME = False # global trigger to get filenames from headers + VERIFY_CERT: bool = True + IGNORE_HTTP_CODE: List[int] = [] # list of HTTP code to ignore in case on non-200 response + RESOLVE_FILENAME: bool = False # global trigger to get filenames from headers def prepare_client(self) -> None: self.client = requests.Session() @@ -1883,7 +1884,7 @@ def _run_api_and_store_to_disk( try: for filename, obj in fn(): fn_byte_arr = buffer.setdefault(filename, bytearray()) - fn_byte_arr.extend(orjson.dumps(obj) + b"\n") + fn_byte_arr.extend(serializer.to_json(obj, return_bytes=True) + b"\n") if len(fn_byte_arr) >= buffer_size: with open(f"{filename}.{pid}", "ab") as f: f.write(fn_byte_arr) diff --git a/biothings/hub/dataplugin/loaders/dumper.py.tpl b/biothings/hub/dataplugin/loaders/dumper.py.tpl index 3e714e81f..1e7a02521 100644 --- a/biothings/hub/dataplugin/loaders/dumper.py.tpl +++ b/biothings/hub/dataplugin/loaders/dumper.py.tpl @@ -6,6 +6,7 @@ from config import DATA_ARCHIVE_ROOT from biothings.utils.common import uncompressall +import $PLUGIN_MODULE import 
biothings.hub.dataload.dumper diff --git a/biothings/hub/dataplugin/loaders/loader.py b/biothings/hub/dataplugin/loaders/loader.py index 55dd7d89d..ef36355c6 100644 --- a/biothings/hub/dataplugin/loaders/loader.py +++ b/biothings/hub/dataplugin/loaders/loader.py @@ -201,10 +201,10 @@ def get_code_for_mod_name(self, plugin_directory: Union[str, pathlib.Path], mod_ """ try: module, funcname = map(str.strip, mod_name.split(":")) - except ValueError: + except ValueError as exc: raise LoaderException( "Invalid format for module '%s', it must be use the following format 'module:func'", mod_name - ) + ) from exc plugin_directory = pathlib.Path(plugin_directory).resolve().absolute() module_file = plugin_directory.joinpath(module).with_suffix(".py") @@ -271,6 +271,8 @@ def get_dumper_dynamic_class( dumper_class = self.dumper_registry.get(scheme) dumper_configuration["BASE_CLASSES"] = "biothings.hub.dataload.dumper.%s" % dumper_class.__name__ + dumper_configuration["PLUGIN_MODULE"] = dumper_configuration["BASE_CLASSES"].split(".")[0] + if not dumper_class: raise LoaderException("No dumper class registered to handle scheme '%s'", scheme) @@ -282,18 +284,12 @@ def get_dumper_dynamic_class( if dumper_section.get("release"): indentfunc, func = self.get_code_for_mod_name(plugin_directory, dumper_section["release"]) assert func != "set_release", "'set_release' is a reserved method name, pick another name" - dumper_configuration["SET_RELEASE_FUNC"] = ( - """ -%s + dumper_configuration["SET_RELEASE_FUNC"] = f""" +{indentfunc} def set_release(self): - self.release = self.%s() + self.release = self.{func}() """ - % ( - indentfunc, - func, - ) - ) else: dumper_configuration["SET_RELEASE_FUNC"] = "" @@ -352,20 +348,17 @@ def get_uploader_dynamic_class( if uploader_section.get("parser_kwargs"): parser_kwargs_serialized = repr(uploader_section["parser_kwargs"]) - confdict["PARSER_FACTORY_CODE"] = textwrap.dedent( - f""" + confdict["PARSER_FACTORY_CODE"] = textwrap.dedent(f""" # Setup 
parser to parser factory from {mod} import {func} as parser_func parser_kwargs = {parser_kwargs_serialized} - """ - ) + """) else: # create empty parser_kwargs to pass to parser_func parser_kwargs_serialized = repr({}) - confdict["PARSER_FACTORY_CODE"] = textwrap.dedent( - f""" + confdict["PARSER_FACTORY_CODE"] = textwrap.dedent(f""" # when code is exported, import becomes relative try: from {self.plugin_path_name}.{mod} import {func} as parser_func @@ -381,8 +374,7 @@ def get_uploader_dynamic_class( importlib.reload({mod}) from {mod} import {func} as parser_func parser_kwargs = {parser_kwargs_serialized} - """ - ) + """) except ValueError as value_error: loader_error_message = ( f"`parser` must be defined as `module:parser_func` but got: `{uploader_section['parser']}`" @@ -426,38 +418,35 @@ def get_uploader_dynamic_class( assert func != "jobs", "'jobs' is a reserved method name, pick another name" confdict["BASE_CLASSES"] = "biothings.hub.dataload.uploader.ParallelizedSourceUploader" confdict["IMPORT_FROM_PARALLELIZER"] = "" - confdict["JOBS_FUNC"] = ( - """ -%s + confdict["JOBS_FUNC"] = f""" +{indentfunc} def jobs(self): - return self.%s() + return self.{func}() """ - % ( - indentfunc, - func, - ) - ) else: - confdict["BASE_CLASSES"] = "biothings.hub.dataload.uploader.BaseSourceUploader" + # use specified custom class + klass = uploader_section.get("class") + if klass: + get_class_from_classpath(klass) + confdict["BASE_CLASSES"] = klass + else: + confdict["BASE_CLASSES"] = "biothings.hub.dataload.uploader.BaseSourceUploader" + + confdict["PLUGIN_MODULE"] = confdict["BASE_CLASSES"].split(".")[0] + confdict["JOBS_FUNC"] = "" if uploader_section.get("mapping"): indentfunc, func = self.get_code_for_mod_name(plugin_directory, uploader_section["mapping"]) assert func != "get_mapping", "'get_mapping' is a reserved class method name, pick another name" - confdict["MAPPING_FUNC"] = ( - """ + confdict["MAPPING_FUNC"] = f""" @classmethod -%s +{indentfunc} @classmethod def 
get_mapping(cls): - return cls.%s() + return cls.{func}() """ - % ( - indentfunc, - func, - ) - ) else: confdict["MAPPING_FUNC"] = "" @@ -587,8 +576,7 @@ def can_load_plugin(self) -> bool: if df.exists(): data_folder_files = {file.name for file in df.iterdir()} return "__init__.py" in data_folder_files - else: - return False + return False def load_plugin(self): plugin = self.get_plugin_obj() diff --git a/biothings/hub/dataplugin/loaders/schema/manifest.json b/biothings/hub/dataplugin/loaders/schema/manifest.json index 950dc2d09..4fe9b3fe9 100644 --- a/biothings/hub/dataplugin/loaders/schema/manifest.json +++ b/biothings/hub/dataplugin/loaders/schema/manifest.json @@ -143,6 +143,10 @@ "disabled": { "type": "boolean", "description": "If true, then the dumper will not be run. This is useful for testing purposes or if you want to disable the dumper without removing it from the manifest" + }, + "class": { + "type": "string", + "description": "Reference to a locally defined dumper class. Format: 'module:class_name'" } }, "required": [ @@ -264,7 +268,11 @@ "examples": [ "parallelizer:parallel_jobs" ] - } + }, + "class": { + "type": "string", + "description": "Reference to a locally defined uploader class. 
Format: 'module:class_name'" + } }, "required": [ "name", diff --git a/biothings/hub/dataplugin/loaders/subuploader.py.tpl b/biothings/hub/dataplugin/loaders/subuploader.py.tpl index 1d7ef8a5b..11e090292 100644 --- a/biothings/hub/dataplugin/loaders/subuploader.py.tpl +++ b/biothings/hub/dataplugin/loaders/subuploader.py.tpl @@ -4,7 +4,7 @@ import biothings, config biothings.config_for_app(config) import biothings.hub.dataload.uploader - +import $PLUGIN_MODULE $PARSER_FACTORY_CODE diff --git a/biothings/hub/dataplugin/loaders/uploader.py.tpl b/biothings/hub/dataplugin/loaders/uploader.py.tpl index 8f621eff3..9e23ab5bc 100644 --- a/biothings/hub/dataplugin/loaders/uploader.py.tpl +++ b/biothings/hub/dataplugin/loaders/uploader.py.tpl @@ -5,6 +5,7 @@ biothings.config_for_app(config) import biothings.hub.dataload.uploader +import $PLUGIN_MODULE $PARSER_FACTORY_CODE diff --git a/biothings/hub/datatransform/datatransform_mdb.py b/biothings/hub/datatransform/datatransform_mdb.py index 687ff715e..abd5b9d17 100644 --- a/biothings/hub/datatransform/datatransform_mdb.py +++ b/biothings/hub/datatransform/datatransform_mdb.py @@ -188,6 +188,11 @@ def __init__(self, graph, *args, **kwargs): source document regardless as to weather it matches an edge or not. (advanced usage) :type copy_from_doc: bool + + Note: Prefixes can be defined at the node level using: + graph.add_node("chebi", prefix="CHEBI") + When an identifier is converted to a node with a prefix attribute, + the prefix will be automatically added to the _id. """ if not isinstance(graph, nx.DiGraph): raise ValueError("key_lookup configuration error: graph must be of type nx.DiGraph") @@ -198,6 +203,29 @@ def __init__(self, graph, *args, **kwargs): super(DataTransformMDB, self).__init__(*args, **kwargs) self._precompute_paths() + def _apply_prefix(self, identifier, output_type): + """ + Apply prefix to identifier based on output type. 
+ + Prefixes are defined as node attributes in the graph: + graph.add_node("chebi", prefix="CHEBI") + + :param identifier: The identifier value to potentially prefix + :param output_type: The output type to check for prefix + :return: The identifier with prefix applied if configured + """ + # Check if the node has a prefix attribute + if output_type in self.graph.nodes(): + node_data = self.graph.nodes[output_type] + if 'prefix' in node_data: + prefix = node_data['prefix'] + identifier_str = str(identifier) + # Only add prefix if it's not already there + if not identifier_str.startswith(prefix + ":"): + return f"{prefix}:{identifier_str}" + + return str(identifier) + def _valid_input_type(self, input_type): return input_type.lower() in self.graph.nodes() @@ -292,7 +320,7 @@ def key_lookup_batch(self, batchiter): (hit_lst, miss_lst) = self.travel(input_type, output_type, miss_lst) # or if copy is allowed, we get the value from the doc elif self.copy_from_doc: - (hit_lst, miss_lst) = self._copy(input_type, miss_lst) + (hit_lst, miss_lst) = self._copy(input_type, output_type, miss_lst) else: (hit_lst, miss_lst) = self.travel(input_type, output_type, miss_lst) @@ -305,15 +333,15 @@ def key_lookup_batch(self, batchiter): for doc in miss_lst: yield doc - def _copy(self, input_type, doc_lst): + def _copy(self, input_type, output_type, doc_lst): """Copy ids in the case where input_type == output_type""" hit_lst = [] miss_lst = [] for doc in doc_lst: val = nested_lookup(doc, input_type[1]) if val: - # ensure _id is always a str - doc["_id"] = str(val) + # ensure _id is always a str and apply prefix if configured + doc["_id"] = self._apply_prefix(val, output_type) hit_lst.append(doc) # retain debug information if available (assumed dt_debug already in place) if self.debug: @@ -371,8 +399,8 @@ def _build_hit_miss_lsts(doc_lst, id_strct, debug): value = nested_lookup(doc, input_type[1]) for lookup_id in id_strct.find_left(value): new_doc = copy.deepcopy(doc) - # ensure _id is 
always a str - new_doc["_id"] = str(lookup_id) + # ensure _id is always a str and apply prefix if configured + new_doc["_id"] = self._apply_prefix(lookup_id, target) # capture debug information if debug: new_doc["dt_debug"]["start_field"] = input_type[1] diff --git a/biothings/hub/default_config.py b/biothings/hub/default_config.py index 4a5084dd2..e7c7f3f5c 100644 --- a/biothings/hub/default_config.py +++ b/biothings/hub/default_config.py @@ -83,8 +83,15 @@ import biothings.utils.jsondiff # set_default_folder is needed for evaluating some default values below -from biothings.utils.configuration import set_default_folder # pylint: disable=unused-import # noqa -from biothings.utils.configuration import ConfigurationDefault, ConfigurationError, ConfigurationValue +from biothings.utils.configuration import ( + ConfigurationDefault, + ConfigurationError, + ConfigurationValue, + set_default_folder, +) + +# ConfigurationValue evaluates these symbols dynamically from this module's namespace. +_CONFIGURATION_EVAL_SYMBOLS = (logging, set_default_folder) # * 1. 
General *# # Hub name/icon url/version, for display purpose @@ -508,11 +515,21 @@ #################################################### # for running tests locally in our biothings hub with testing api -APITEST_PATH = ConfigurationError("Define path to folder which will contain pytests") - +APITEST_ROOT = ConfigurationDefault( + default="./tests", + desc="Define the root path to a folder that contains API tests", +) +APITEST_PATH = ConfigurationDefault( + default="", + desc="Define the path to a sub-folder of `APITEST_ROOT` that contains API tests", +) +APITEST_CONFIG_ROOT = ConfigurationDefault( + default=".", + desc="Define the root path containing the config_web to run a dev API for testing", +) APITEST_CONFIG = ConfigurationDefault( - default=ConfigurationValue("""'config_web_local'"""), - desc="Provide a default hub logger instance (use setup_default_log(name,log_folder)", + default="config_web_local", + desc="Define the name of the config_web file to run a dev API for testing", ) diff --git a/biothings/utils/backend.py b/biothings/utils/backend.py index 100874311..11a4d0e67 100644 --- a/biothings/utils/backend.py +++ b/biothings/utils/backend.py @@ -216,7 +216,7 @@ def count_from_ids(self, ids, step=100000): """ total_cnt = 0 for i in range(0, len(ids), step): - _ids = ids[i : i + step] + _ids = ids[i:i + step] _cnt = self.target_collection.count_documents({"_id": {"$in": _ids}}) total_cnt += _cnt return total_cnt @@ -230,7 +230,7 @@ def finalize(self): def remove_from_ids(self, ids, step=10000): deleted = 0 for i in range(0, len(ids), step): - res = self.target_collection.delete_many({"_id": {"$in": ids[i : i + step]}}) + res = self.target_collection.delete_many({"_id": {"$in": ids[i:i + step]}}) deleted += res.deleted_count return deleted @@ -244,7 +244,7 @@ class DocESBackend(DocBackendBase): def __init__(self, esidxer=None): """esidxer is an instance of utils.es.ESIndexer class.""" - if type(esidxer) == partial: + if isinstance(esidxer, partial): 
self._target_esidxer_provider = esidxer self._target_esidxer = None else: @@ -253,7 +253,7 @@ def __init__(self, esidxer=None): @property def target_esidxer(self): - if not self._target_esidxer: + if not self._target_esidxer and self._target_esidxer_provider: self._target_esidxer = self._target_esidxer_provider() return self._target_esidxer @@ -339,14 +339,15 @@ def query(self, query=None, verbose=False, step=10000, scroll="10m", only_source def create_from_options(cls, options): """Function that recreates itself from a DocBackendOptions class. Probably a needless rewrite of __init__...""" - if not options.es_index or not options.es_host or not options.es_doc_type: + if not options.es_index or not options.es_host: raise Exception( - "Cannot create backend class from options, ensure that es_index, es_host, and es_doc_type are set" + "Cannot create backend class from options, ensure that es_index, es_host are set" ) - return cls(ESIndexer(index=options.es_index, doc_type=options.es_doc_type, es_host=options.es_host)) + return cls(ESIndexer(index=options.es_index, es_host=options.es_host)) class DocBackendOptions(object): + # Deprecated, not used anywhere def __init__( self, cls, es_index=None, es_host=None, es_doc_type=None, mongo_target_db=None, mongo_target_collection=None ): diff --git a/biothings/utils/common.py b/biothings/utils/common.py index ec5577908..fe31a49ea 100644 --- a/biothings/utils/common.py +++ b/biothings/utils/common.py @@ -19,14 +19,17 @@ import os.path import pickle import random +import shutil import string import sys +import tarfile +import tempfile import time import types import urllib.parse import warnings from collections import UserDict, UserList -from contextlib import contextmanager +from contextlib import closing, contextmanager from datetime import date, datetime, timezone from functools import partial from itertools import islice @@ -35,6 +38,11 @@ import requests import yaml +try: + import zstandard as zstd +except ImportError: + 
zstd = None + # from json serial, catching special type # import _sre # TODO: unused import;remove it once confirmed @@ -160,7 +168,7 @@ def safewfile(filename, prompt=True, default="C", mode="w"): def anyfile(infile, mode="r"): """ - return a file handler with the support for gzip/zip comppressed files. + return a file handler with the support for gzip/zip compressed files. if infile is a two value tuple, then first one is the compressed file; the second one is the actual filename in the compressed file. e.g., ('a.zip', 'aa.txt') @@ -171,6 +179,58 @@ def anyfile(infile, mode="r"): else: rawfile = os.path.splitext(infile)[0] filetype = os.path.splitext(infile)[1].lower() + + # check if lower version zst handling is needed + lower_version_zst = False + if sys.version_info < (3, 14) and filetype == ".zst": + if zstd is None: + raise ImportError("zstandard is required to open .zst files on Python versions below 3.14") + lower_version_zst = True + + # tarfile handling. works for zst in Python >= 3.14 + if lower_version_zst or tarfile.is_tarfile(infile): + if lower_version_zst: + with open(infile, "rb") as compressed_file: + dctx = zstd.ZstdDecompressor() + with closing(dctx.stream_reader(compressed_file)) as reader: + with tarfile.open(fileobj=reader, mode="r|") as tar_file: + for member in tar_file: + if member.name == rawfile: + extracted = tar_file.extractfile(member) + break + else: + extracted = None + + # Keep the returned file readable after closing the tar and zst streams. 
+ if extracted is not None: + with extracted: + spooled_file = tempfile.SpooledTemporaryFile( # pylint: disable=consider-using-with + max_size=1024 * 1024 + ) + shutil.copyfileobj(extracted, spooled_file) + spooled_file.seek(0) + + # extracted member is not a regular file or link + if extracted is None: + raise ValueError("invalid target file: must be a regular file or a link") + + return spooled_file + + tar_file = tarfile.open(infile, mode) # pylint: disable=consider-using-with + try: + extracted = tar_file.extractfile(rawfile) + except KeyError as exc: + # provided rawfile does not appear in the tarball + tar_file.close() + raise FileNotFoundError("target member does not contain the provided tar file.") from exc + + # extracted member is not a regular file or link + if extracted is None: + tar_file.close() + raise ValueError("invalid target file: must be a regular file or a link") + + return io.TextIOWrapper(extracted) + if filetype == ".gz": # import gzip in_f = io.TextIOWrapper(gzip.GzipFile(infile, mode)) @@ -747,7 +807,7 @@ def sanitize_tarfile(tar_object, directory): abs_target = os.path.abspath(target) prefix = os.path.commonprefix([abs_directory, abs_target]) if not prefix == abs_directory: - raise Exception("Attempted Path Traversal in Tar File") + raise ValueError("Attempted Path Traversal in Tar File") def sizeof_fmt(num, suffix="B"): diff --git a/biothings/utils/dataload.py b/biothings/utils/dataload.py index 8a733b440..41c416db3 100644 --- a/biothings/utils/dataload.py +++ b/biothings/utils/dataload.py @@ -9,6 +9,7 @@ # from __future__ import unicode_literals import itertools import json +import math import os import os.path from collections import Counter, OrderedDict @@ -21,9 +22,57 @@ csv.field_size_limit(10000000) # default is 131072, too small for some big files +def _missing_value_kind(val): + """Return a stable kind for NaN-like values without importing optional deps.""" + val_cls = val.__class__ + cls_module = getattr(val_cls, "__module__", 
"") + cls_name = getattr(val_cls, "__name__", "") + + if (cls_module == "pandas" or cls_module.startswith("pandas.")) and cls_name in ("NAType", "NaTType"): + return cls_name + + try: + if math.isnan(val): + return "NaN" + except (TypeError, ValueError): + pass + + return None + + +def _val_to_delete(val, vals): + """Return True if val is considered as a value to delete, False otherwise. + + NaN-like values (float NaN, pandas.NA, pandas.NaT) are only removed when + explicitly included in the vals list. + """ + if is_str(vals): + vals = [vals] + + val_missing_kind = _missing_value_kind(val) + + for candidate in vals: + candidate_missing_kind = _missing_value_kind(candidate) + if val_missing_kind or candidate_missing_kind: + if val_missing_kind == candidate_missing_kind: + return True + continue + + try: + if val == candidate: + return True + except (TypeError, ValueError): + continue + + return False + + def dict_sweep(d, vals=None, remove_invalid_list=False): """ - Remove keys whose values are ".", "-", "", "NA", "none", " "; and remove empty dictionaries + Remove keys whose values are ".", "-", "", "NA", "none", " "; and remove empty dictionaries. + + NaN-like values (float NaN, pandas.NA, pandas.NaT) are only removed when + explicitly included in the ``vals`` list. 
Args: d (dict): a dictionary @@ -45,11 +94,11 @@ def dict_sweep(d, vals=None, remove_invalid_list=False): # set default supported vals for empty values vals = vals or [".", "-", "", "NA", "none", " ", "Not Available", "unknown"] for key, val in list(d.items()): - if val in vals: + if _val_to_delete(val, vals): del d[key] elif isinstance(val, list): if remove_invalid_list: - val = [v for v in val if v not in vals] + val = [v for v in val if not _val_to_delete(v, vals)] for item in val: if isinstance(item, dict): dict_sweep(item, vals, remove_invalid_list=remove_invalid_list) @@ -59,12 +108,14 @@ def dict_sweep(d, vals=None, remove_invalid_list=False): else: d[key] = val else: - for item in val: - if item in vals: - val.remove(item) + i = 0 + while i < len(val): + item = val[i] + if _val_to_delete(item, vals): + del val[i] elif isinstance(item, dict): dict_sweep(item, vals, remove_invalid_list=remove_invalid_list) - # if len(val) == 0: + i += 1 if not val: del d[key] elif isinstance(val, dict): diff --git a/biothings/utils/dotfield.py b/biothings/utils/dotfield.py index 27c0c541e..bcffcd61b 100644 --- a/biothings/utils/dotfield.py +++ b/biothings/utils/dotfield.py @@ -1,4 +1,4 @@ -import orjson +from biothings.utils import serializer def make_object(attr, value): @@ -21,10 +21,9 @@ def make_object(attr, value): # s += "}" * (len(attr_list)) # return json.loads(s) - # New implementation using orjson module - s += orjson.dumps(value).decode("utf-8") # decoding is necessary because orjson dumps into bytes + s += serializer.to_json(value) s += "}" * (len(attr_list)) - return orjson.loads(s) + return serializer.load_json(s) def merge_object(obj1, obj2): diff --git a/biothings/utils/es.py b/biothings/utils/es.py index 3cea7352d..cb7c78271 100644 --- a/biothings/utils/es.py +++ b/biothings/utils/es.py @@ -40,13 +40,11 @@ def verify_ids( doc_iter, es_host, index, - doc_type=None, step=100000, ): """verify how many docs from input interator/list overlapping with existing 
docs.""" index = index - doc_type = doc_type es = get_es(es_host) q = {"query": {"ids": {"values": []}}} total_cnt = 0 @@ -56,7 +54,7 @@ def verify_ids( id_li = [doc["_id"] for doc in doc_batch] # id_li = [doc['_id'].replace('chr', '') for doc in doc_batch] q["query"]["ids"]["values"] = id_li - xres = es.search(index=index, doc_type=doc_type, body=q, _source=False) + xres = es.search(index=index, body=q, source=False) found_cnt += xres["hits"]["total"] total_cnt += len(id_li) out.extend([x["_id"] for x in xres["hits"]["hits"]]) @@ -64,7 +62,7 @@ def verify_ids( def get_es(es_host, timeout=120, max_retries=3, retry_on_timeout=False): - es = Elasticsearch(es_host, timeout=timeout, max_retries=max_retries, retry_on_timeout=retry_on_timeout) + es = Elasticsearch(es_host, request_timeout=timeout, max_retries=max_retries, retry_on_timeout=retry_on_timeout) return es @@ -76,9 +74,7 @@ def wrapper(func): def outter_fn(*args, **kwargs): self = args[0] index = kwargs.pop("index", self._index) # pylint: disable=protected-access - doc_type = kwargs.pop("doc_type", self._doc_type) # pylint: disable=protected-access self._index = index # pylint: disable=protected-access - self._doc_type = doc_type # pylint: disable=protected-access return func(*args, **kwargs) outter_fn.__doc__ = func.__doc__ @@ -141,7 +137,6 @@ class ESIndexer: def __init__( self, index, - doc_type="_doc", es_host="http://localhost:9200", step=500, step_size=10, # elasticsearch.helpers.bulk @@ -150,6 +145,8 @@ def __init__( check_index=True, **kwargs, ): + # some old caller may still pass doc_type, we will ignore it here since it's no longer used. 
+ kwargs.pop("doc_type", None) self.es_host = es_host self._es = get_es(es_host, **kwargs) self._host_major_ver = int(self._es.info()["version"]["number"].split(".")[0]) @@ -161,18 +158,6 @@ def __init__( # the real underlying index self.check_index() - self._doc_type = None - if doc_type: - self._doc_type = doc_type - else: - # assuming index exists, get mapping to discover doc_type - try: - m = self.get_mapping() - assert len(m) == 1, "Expected only one doc type, got: %s" % m.keys() - self._doc_type = list(m).pop() - except Exception as e: # pylint: disable=broad-except - if check_index: - logging.info("Failed to guess doc_type: %s", e) # set number_of_shards when create_index self.number_of_shards = number_of_shards # set number_of_replicas when create_index @@ -204,7 +189,7 @@ def check_index(self): @wrapper def get_biothing(self, bid, only_source=False, **kwargs): - rawdoc = self._es.get(index=self._index, id=bid, doc_type=self._doc_type, **kwargs) + rawdoc = self._es.get(index=self._index, id=bid, **kwargs) if not only_source: return rawdoc else: @@ -226,7 +211,6 @@ def mexists(self, bid_list): q = {"query": {"ids": {"values": bid_list}}} res = self._es.search( index=self._index, - doc_type=self._doc_type, body=q, stored_fields=None, size=len(bid_list), @@ -240,7 +224,7 @@ def count(self, q=None, raw=False): try: count_kwargs = {"index": self._index} if q is not None: - count_kwargs.update({"doc_type": self._doc_type, "q": q}) + count_kwargs.update({"q": q}) _res = self._es.count(**count_kwargs) return _res if raw else _res["count"] except NotFoundError: @@ -294,16 +278,15 @@ def exists_index(self, index: Optional[str] = None): index = self._index return self._es.indices.exists(index=index) - def index(self, doc, id=None, action="index"): # pylint: disable=redefined-builtin + def index(self, doc, id=None, op_type="index"): # pylint: disable=redefined-builtin """add a doc to the index. If id is not None, the existing doc will be updated. 
""" - self._es.index(index=self._index, doc_type=self._doc_type, body=doc, id=id, params={"op_type": action}) + self._es.index(index=self._index, body=doc, id=id, op_type=op_type) - def index_bulk(self, docs, step=None, action="index"): + def index_bulk(self, docs, step=None, op_type="index"): self._populate_es_version() index_name = self._index - doc_type = self._doc_type step = step or self.step def _get_bulk(doc): @@ -312,12 +295,9 @@ def _get_bulk(doc): ndoc.update( { "_index": index_name, - "_type": doc_type, - "_op_type": action, + "_op_type": op_type, } ) - if self._host_major_ver > 6: - ndoc.pop("_type") return ndoc actions = (_get_bulk(doc) for doc in docs) @@ -329,19 +309,15 @@ def _get_bulk(doc): def delete_doc(self, id): # pylint: disable=redefined-builtin """delete a doc from the index based on passed id.""" - return self._es.delete(index=self._index, doc_type=self._doc_type, id=id) + return self._es.delete(index=self._index, id=id) def delete_docs(self, ids, step=None): """delete a list of docs in bulk.""" index_name = self._index - doc_type = self._doc_type step = step or self.step def _get_bulk(_id): - if self._host_major_ver >= 7: - doc = {"_op_type": "delete", "_index": index_name, "_id": _id} - else: - doc = {"_op_type": "delete", "_index": index_name, "_type": doc_type, "_id": _id} + doc = {"_op_type": "delete", "_index": index_name, "_id": _id} return doc actions = (_get_bulk(_id) for _id in ids) @@ -359,27 +335,17 @@ def update(self, id, extra_doc, upsert=True): # pylint: disable=redefined-built body = {"doc": extra_doc} if upsert: body["doc_as_upsert"] = True - return self._es.update(index=self._index, doc_type=self._doc_type, id=id, body=body) + return self._es.update(index=self._index, id=id, body=body) def update_docs(self, partial_docs, upsert=True, step=None, **kwargs): """update a list of partial_docs in bulk. allow to set upsert=True, to insert new docs. 
""" index_name = self._index - doc_type = self._doc_type step = step or self.step def _get_bulk(doc): - if self._host_major_ver >= 7: - doc = {"_op_type": "update", "_index": index_name, "_id": doc["_id"], "doc": doc} - else: - doc = { - "_op_type": "update", - "_index": index_name, - "_type": doc_type, - "_id": doc["_id"], - "doc": doc, - } + doc = {"_op_type": "update", "_index": index_name, "_id": doc["_id"], "doc": doc} if upsert: doc["doc_as_upsert"] = True return doc @@ -387,31 +353,22 @@ def _get_bulk(doc): actions = (_get_bulk(doc) for doc in partial_docs) return helpers.bulk(self._es, actions, chunk_size=step, **kwargs) - def get_mapping(self): + def get_mapping(self, with_doc_type=False): """return the current index mapping""" - if self._host_major_ver <= 6: - m = self._es.indices.get_mapping( - index=self._index, - doc_type=self._doc_type, - ) - return m[self._index]["mappings"] - elif self._host_major_ver <= 8: + if self._host_major_ver >= 7: m = self._es.indices.get_mapping(index=self._index) - # fake the mapping doc_type - m = {self._doc_type: m[self._index]["mappings"]} - return m - else: - raise RuntimeError( - f"Server Elasticsearch version is {self._host_major_ver} " - "which is unsupported when using old ESIndexer class" - ) + if with_doc_type: + # use "_doc" as a fake doc_type to make it compatible with old behavior + # in case some caller expects a doc_type level key + return {"_doc": m[self._index]["mappings"]} + return m[self._index]["mappings"] + raise RuntimeError( + f"Server Elasticsearch version is {self._host_major_ver} " + "which is unsupported (must >=7) when using old ESIndexer class" + ) def update_mapping(self, m): - if self._host_major_ver <= 6: - assert list(m) == [self._doc_type], "Bad mapping format, should have one doc_type, got: %s" % list(m) - assert "properties" in m[self._doc_type], "Bad mapping format, no 'properties' key" - return self._es.indices.put_mapping(index=self._index, doc_type=self._doc_type, body=m) - elif 
self._host_major_ver <= 8: + if self._host_major_ver >= 7: # this is basically guessing based on heuristics if len(m) == 1: if "properties" not in m: # basically {'_doc': mapping} @@ -425,21 +382,13 @@ def update_mapping(self, m): else: raise RuntimeError( f"Server Elasticsearch version is {self._host_major_ver} " - "which is unsupported when using old ESIndexer class" + "which is unsupported (must >=7) when using old ESIndexer class" ) def get_mapping_meta(self): """return the current _meta field.""" m = self.get_mapping() - doc_type = self._doc_type - if doc_type is None: - # fetch doc_type from mapping - - assert len(m) == 1, ( - "More than one doc_type found, not supported when self._doc_type " + "is not initialized" - ) - doc_type = list(m.keys())[0] - return {"_meta": m[doc_type]["_meta"]} + return {"_meta": m["_meta"]} def update_mapping_meta(self, meta): allowed_keys = {"_meta", "_timestamp"} @@ -450,9 +399,11 @@ def update_mapping_meta(self, meta): index=self._index, body=meta, ) - else: # not sure if _type needs to be specified - body = {self._doc_type: meta} - return self._es.indices.put_mapping(doc_type=self._doc_type, body=body, index=self._index) + else: + raise RuntimeError( + f"Server Elasticsearch version is {self._host_major_ver} " + "which is unsupported (must >=7) when using old ESIndexer class" + ) else: raise ValueError('Input "meta" should have and only have "_meta" field.') @@ -531,10 +482,10 @@ def rate_control(cnt, t): def optimize(self, max_num_segments=1): """optimize the default index.""" params = { - "wait_for_merge": False, + "wait_for_completion": False, "max_num_segments": max_num_segments, } - return self._es.indices.forcemerge(index=self._index, params=params) + return self._es.indices.forcemerge(index=self._index, **params) def clean_field(self, field, dryrun=True, step=5000): """remove a top-level field from ES index, if the field is the only field of the doc, @@ -543,7 +494,7 @@ def clean_field(self, field, dryrun=True, 
step=5000): try first with dryrun turned on, and then perform the actual updates with dryrun off. """ if self._host_major_ver >= 7: - raise RuntimeError("clean_field is no longer supported") + raise RuntimeError("clean_field is no longer supported") # It may still work, but untested yet q = {"query": {"constant_score": {"filter": {"exists": {"field": field}}}}} cnt_orphan_doc = 0 cnt = 0 @@ -552,10 +503,10 @@ def clean_field(self, field, dryrun=True, step=5000): if set(doc) == {"_id", field}: cnt_orphan_doc += 1 # delete orphan doc - _li.append({"delete": {"_index": self._index, "_type": self._doc_type, "_id": doc["_id"]}}) + _li.append({"delete": {"_index": self._index, "_id": doc["_id"]}}) else: # otherwise, just remove the field from the doc - _li.append({"update": {"_index": self._index, "_type": self._doc_type, "_id": doc["_id"]}}) + _li.append({"update": {"_index": self._index, "_id": doc["_id"]}}) # this script update requires "script.disable_dynamic: false" setting # in elasticsearch.yml _li.append({"script": 'ctx._source.remove("{}")'.format(field)}) @@ -581,7 +532,6 @@ def doc_feeder_using_helper(self, step=None, verbose=True, query=None, scroll="1 query=q, scroll=scroll, index=self._index, - doc_type=self._doc_type, **kwargs, ): if rawdoc.get("_source", False): @@ -596,6 +546,8 @@ def doc_feeder(self, step=None, verbose=True, query=None, scroll="10m", only_sou step = step or self.step q = query if query else {"query": {"match_all": {}}} _q_cnt = self.count(q=q, raw=True) + if not _q_cnt: + return n = _q_cnt["count"] n_shards = _q_cnt["_shards"]["total"] assert n_shards == _q_cnt["_shards"]["successful"] @@ -609,7 +561,6 @@ def doc_feeder(self, step=None, verbose=True, query=None, scroll="10m", only_sou res = self._es.search( index=self._index, - doc_type=self._doc_type, body=q, size=_size, search_type="scan", @@ -656,10 +607,7 @@ def get_docs(self, ids, step=None, only_source=True, **mget_args): # chunkify step = step or self.step for chunk in 
iter_n(ids, step): - if self._host_major_ver > 6: - chunk_res = self._es.mget(body={"ids": chunk}, index=self._index, **mget_args) - else: - chunk_res = self._es.mget(body={"ids": chunk}, index=self._index, doc_type=self._doc_type, **mget_args) + chunk_res = self._es.mget(body={"ids": chunk}, index=self._index, **mget_args) for rawdoc in chunk_res["docs"]: if ("found" not in rawdoc) or (("found" in rawdoc) and not rawdoc["found"]): continue @@ -677,9 +625,9 @@ def find_biggest_doc(self, fields_li, min=5, return_doc=False): # pylint: disab q = " AND ".join(["_exists_:" + field for field in field_set]) q = {"query": {"query_string": {"query": q}}} cnt = self.count(q) - if cnt > 0: + if cnt and cnt > 0: if return_doc: - res = self._es.search(index=self._index, doc_type=self._doc_type, body=q, size=cnt) + res = self._es.search(index=self._index, body=q, size=cnt) return res else: return (cnt, q) @@ -704,7 +652,7 @@ def snapshot(self, repo, snapshot, mode=None, **params): # ok, nothing to delete/purge pass try: - return self._es.snapshot.create(repository=repo, snapshot=snapshot, body=body, params=params) + return self._es.snapshot.create(repository=repo, snapshot=snapshot, body=body, **params) except RequestError as e: try: err_msg = e.info["error"]["reason"] @@ -950,7 +898,7 @@ def update_settings(self, settings, close=False, **params): self._es.indices.put_settings( body=settings, index=self._index, - params=params, + **params, ) if close: diff --git a/biothings/utils/manager.py b/biothings/utils/manager.py index 929672e54..2c611a183 100644 --- a/biothings/utils/manager.py +++ b/biothings/utils/manager.py @@ -3,8 +3,10 @@ import copy import datetime import glob +import multiprocessing import os import re +import sys import threading import time import types @@ -166,6 +168,23 @@ class JobManager: HEADERLINE = "{pid:^10}|{source:^35}|{category:^10}|{step:^20}|{description:^30}|{mem:^10}|{cpu:^6}|{started_at:^20}|{duration:^10}" DATALINE = HEADERLINE.replace("^", "<") + 
def _get_process_executor(self): + kwargs = {} + if sys.version_info >= (3, 7): + # since Python 3.14, multiprocessing uses `forkserver` as the default, instead of 'fork' + # on POSIX systems. This breaks our current biothings JobManager when creating dynamic + # classes in worker processes (e.g. AssistedDumper_ class), as the 'forkserver' + # context does not inherit resources from the parent process. + # This is a quick fix to force using 'fork' context for ProcessPoolExecutor in 3.14, + # consistent with previous Python versions. + # REF: https://docs.python.org/3.14/library/multiprocessing.html#contexts-and-start-methods + # TODO: we should consider refactoring the code to be compatible with 'forkserver' context in the future. + try: + kwargs["mp_context"] = multiprocessing.get_context("fork") + except ValueError: + pass + return concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers, **kwargs) + def __init__( self, loop, @@ -185,7 +204,7 @@ def __init__( logger.debug("Adjusting number of worker to 1") self.num_workers = 1 self.num_threads = num_threads or self.num_workers - self.process_queue = process_queue or concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) + self.process_queue = process_queue or self._get_process_executor() # notes on fixing BPE (BrokenProcessPool Exception): # whenever a process exits unexpectedly, BPE is raised, and while that # all the processes in the pool gets a SIGTERM from the management @@ -255,7 +274,7 @@ async def do(): if recycling: # now replace logger.info("Replacing process queue with new one") - self.process_queue = concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) + self.process_queue = self._get_process_executor() else: self.process_queue = None except Exception as e: @@ -469,7 +488,7 @@ async def run(future, job_id): # we don't need to care about the remaining tasks because # they'd all be SIGTERM'd anyways. But ... 
logger.warning("Broken Process Pool: %s, restarting.", e) - self.process_queue = concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) + self.process_queue = self._get_process_executor() for stale_id in self._process_job_ids: self.jobs.pop(stale_id, None) # in the rare case that # somehow they de-sync diff --git a/biothings/utils/mongo.py b/biothings/utils/mongo.py index 1a6c1f069..bc7d06fe5 100644 --- a/biothings/utils/mongo.py +++ b/biothings/utils/mongo.py @@ -10,7 +10,7 @@ import bson import dateutil.parser as date_parser -from pymongo import DESCENDING, MongoClient +from pymongo import DESCENDING, AsyncMongoClient, MongoClient from pymongo.client_session import ClientSession from pymongo.collection import Collection as PymongoCollection from pymongo.database import Database as PymongoDatabase @@ -155,6 +155,10 @@ def __getitem__(self, name): return Database(self, name) +class AsyncDatabaseClient(AsyncMongoClient): + pass + + def requires_config(func): @wraps(func) def func_wrapper(*args, **kwargs): @@ -192,6 +196,12 @@ def get_hub_db_conn(): return conn +@requires_config +def get_hub_db_async_conn(): + conn = AsyncDatabaseClient(config.HUB_DB_BACKEND["uri"]) + return conn + + @requires_config def get_src_conn(): return get_conn(config.DATA_SRC_SERVER, getattr(config, "DATA_SRC_PORT", 27017)) @@ -221,6 +231,12 @@ def get_src_build(conn=None): return conn[config.DATA_HUB_DB_DATABASE][config.DATA_SRC_BUILD_COLLECTION] +@requires_config +def get_src_build_async(conn=None): + conn = conn or get_hub_db_async_conn() + return conn[config.DATA_HUB_DB_DATABASE][config.DATA_SRC_BUILD_COLLECTION] + + @requires_config def get_src_build_config(conn=None): conn = conn or get_hub_db_conn() diff --git a/biothings/utils/parsers.py b/biothings/utils/parsers.py index 0f8a5a12d..273135da6 100644 --- a/biothings/utils/parsers.py +++ b/biothings/utils/parsers.py @@ -3,7 +3,7 @@ from typing import Callable, Generator, Iterable, Optional from urllib.parse import 
parse_qsl, urlparse -import orjson +from biothings.utils import serializer def ndjson_parser( @@ -31,7 +31,7 @@ def ndjson_parser_func(data_folder): for filename in work_dir.glob(pattern): with open(filename, "rb") as f: for line in f: - doc = orjson.loads(line) + doc = serializer.load_json(line) yield doc return ndjson_parser_func @@ -60,7 +60,7 @@ def json_array_parser(data_folder): for pattern in patterns: for filename in work_dir.glob(pattern): with open(filename, "r") as f: - data = orjson.loads(f.read()) + data = serializer.load_json(f.read()) try: iterator = iter(data) except TypeError: diff --git a/biothings/utils/serializer.py b/biothings/utils/serializer.py index a17733390..a15932ead 100644 --- a/biothings/utils/serializer.py +++ b/biothings/utils/serializer.py @@ -28,7 +28,7 @@ def orjson_default(o): raise TypeError(f"Type {type(o)} not serializable") -def to_json(data, indent=False, sort_keys=False): +def to_json(data, indent=False, sort_keys=False, return_bytes=False): # default option: # OPT_NON_STR_KEYS: non string dictionary key, e.g. 
integer # OPT_NAIVE_UTC: use UTC as the timezone when it's missing @@ -37,7 +37,12 @@ def to_json(data, indent=False, sort_keys=False): option |= orjson.OPT_INDENT_2 if sort_keys: option |= orjson.OPT_SORT_KEYS - return orjson.dumps(data, default=orjson_default, option=option).decode() + + byte_dump = orjson.dumps(data, default=orjson_default, option=option) + if return_bytes: + return byte_dump + + return byte_dump.decode() def to_json_file(data, fobj, indent=False, sort_keys=False): diff --git a/biothings/web/analytics/channels.py b/biothings/web/analytics/channels.py index ad0c97ab7..1c52e31bb 100644 --- a/biothings/web/analytics/channels.py +++ b/biothings/web/analytics/channels.py @@ -1,10 +1,11 @@ -import aiohttp import asyncio -import certifi import logging -import orjson import ssl +import aiohttp +import certifi + +from biothings.utils import serializer from biothings.web.analytics.events import Event, Message @@ -34,30 +35,6 @@ async def send_request(self, session, url, event): pass -class GAChannel(Channel): - def __init__(self, tracking_id, uid_version=1): - self.tracking_id = tracking_id - self.uid_version = uid_version - self.url = "http://www.google-analytics.com/batch" - - async def handles(self, event): - return isinstance(event, Event) - - async def send(self, event): - events = event.to_GA_payload(self.tracking_id, self.uid_version) - async with aiohttp.ClientSession() as session: - # The pagination of 20 is defined according to the context of the current application - # Usually, each client request is going to make just 1 request to the GA API. - # However, it's possible to collect data to GA in other parts of the application. 
- for i in range(0, len(events), 20): - data = "\n".join(events[i : i + 20]) - await self.send_request(session, self.url, data) - - async def send_request(self, session, url, data): - async with session.post(url, data=data) as _: - pass - - class GA4Channel(Channel): def __init__(self, measurement_id, api_secret, uid_version=1): self.measurement_id = measurement_id @@ -81,7 +58,7 @@ async def send(self, event): "user_id": str(event._cid(1)), "events": events[i : i + 25], } - await self.send_request(session, self.url, orjson.dumps(data)) + await self.send_request(session, self.url, serializer.to_json(data, return_bytes=True)) async def send_request(self, session, url, data): retries = 0 diff --git a/biothings/web/analytics/events.py b/biothings/web/analytics/events.py index fb148bab9..79b58533b 100644 --- a/biothings/web/analytics/events.py +++ b/biothings/web/analytics/events.py @@ -1,15 +1,10 @@ import hashlib - -# import smtplib import uuid from collections import UserDict -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText from ipaddress import IPv4Address, IPv6Address, ip_address from pprint import pformat from random import randint from typing import Union -from urllib.parse import urlencode class Event(UserDict): @@ -74,41 +69,11 @@ def _cid_v2(self): def _cid(self, version): if version == 1: return self._cid_v1() - elif version == 2: + if version == 2: return self._cid_v2() - # this is a required GA field raise ValueError("CID Version.") - def to_GA_payload(self, tracking_id, cid_version=1): - # by default implements - # a GA PageView hit-type - - # In the future, consider adding additional - # keys as cutomized dimensions or metrics. 
- - payload = { - "v": 1, # protocol version - "t": "pageview", - "tid": tracking_id, - "cid": self._cid(cid_version), - "uip": self.user_ip, - "dh": self.host, - "dp": self.path, - } - - # add document referer - if isinstance(self.referer, str): - if len(self.referer) <= 2048: # GA Limit - payload["dr"] = self.referer - - # add user_agent - if self.user_agent: - payload["ua"] = self.user_agent - - # this also escapes payload vals - return [urlencode(payload)] - def to_GA4_payload(self, measurement_id, cid_version=1): # Document about page_view event: https://support.google.com/analytics/answer/9964640#pageviews&zippy=%2Cin-this-article # GA4 does not support [Document path as UA](https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters#dp) @@ -154,32 +119,6 @@ class GAEvent(Event): # "value": "60" # } - def to_GA_payload(self, tracking_id, cid_version=1): - payloads = super().to_GA_payload(tracking_id, cid_version) - if self.get("category") and self.get("action"): - payloads.append( - urlencode( - _clean( - { - "v": 1, # protocol version - "t": "event", - "tid": tracking_id, - "cid": self._cid(cid_version), - "ec": self["category"], - "ea": self["action"], - "el": self.get("label", ""), - "ev": self.get("value", ""), - } - ) - ) - ) - for event in self.get("__secondary__", []): - event["__request__"] = self["__request__"] - payloads.extend(event.to_GA_payload(tracking_id, cid_version)[1:]) - # ignore the first event (pageview) - # which is already generated once - return payloads - def to_GA4_payload(self, measurement_id, cid_version=1): payloads = super().to_GA4_payload(measurement_id, cid_version) if self.get("category") and self.get("action"): diff --git a/biothings/web/analytics/notifiers.py b/biothings/web/analytics/notifiers.py index 16fa410e7..ec103477a 100644 --- a/biothings/web/analytics/notifiers.py +++ b/biothings/web/analytics/notifiers.py @@ -1,8 +1,9 @@ import asyncio - from collections import defaultdict + from tornado.web 
import RequestHandler -from biothings.web.analytics.channels import GA4Channel, GAChannel, SlackChannel + +from biothings.web.analytics.channels import GA4Channel, SlackChannel class Notifier: @@ -11,13 +12,6 @@ def __init__(self, settings): if hasattr(settings, "SLACK_WEBHOOKS"): self.channels.append(SlackChannel(getattr(settings, "SLACK_WEBHOOKS"))) - if getattr(settings, "GA_ACCOUNT", None): - self.channels.append( - GAChannel( - getattr(settings, "GA_ACCOUNT"), - getattr(settings, "GA_UID_GENERATOR_VERSION", 1), - ) - ) if getattr(settings, "GA4_MEASUREMENT_ID", None): self.channels.append( GA4Channel( diff --git a/biothings/web/connections.py b/biothings/web/connections.py index 5738a83fd..f8103bd58 100644 --- a/biothings/web/connections.py +++ b/biothings/web/connections.py @@ -5,7 +5,6 @@ from functools import partial import elasticsearch -import elasticsearch_dsl import requests from tornado.ioloop import IOLoop @@ -27,7 +26,10 @@ def _log_pkg(): es_ver = elasticsearch.__version__ - es_dsl_ver = elasticsearch_dsl.__versionstr__ + + # since v8.18.0, the DSL is released as part of the main elasticsearch package. 
+ # dsl version is therefore aligned with base es version + es_dsl_ver = es_ver logger.info("Elasticsearch Package Version: %s", ".".join(map(str, es_ver))) logger.info("Elasticsearch DSL Package Version: %s", ".".join(map(str, es_dsl_ver))) diff --git a/biothings/web/handlers/base.py b/biothings/web/handlers/base.py index 0d7617c28..965174734 100644 --- a/biothings/web/handlers/base.py +++ b/biothings/web/handlers/base.py @@ -20,7 +20,6 @@ """ import logging -import orjson import yaml from tornado.web import HTTPError, RequestHandler @@ -105,8 +104,8 @@ def _parse_json(self): if not self.request.body: return {} try: - return orjson.loads(self.request.body) - except orjson.JSONDecodeError: + return serializer.load_json(self.request.body) + except serializer.JSONDecodeError: raise HTTPError(400, reason="Invalid JSON body.") def _parse_yaml(self): diff --git a/biothings/web/options/manager.py b/biothings/web/options/manager.py index c886599b5..d71c03227 100644 --- a/biothings/web/options/manager.py +++ b/biothings/web/options/manager.py @@ -10,7 +10,7 @@ from types import MappingProxyType import jmespath -import orjson +from biothings.utils import serializer try: from re import Pattern # py>=3.7 @@ -250,8 +250,8 @@ def __init__(self, **kwargs): def convert_to(self, value, to_type): if self.jsoninput: try: # attempt to load as json first - _value = orjson.loads(value) - except orjson.JSONDecodeError as exc: + _value = serializer.load_json(value) + except serializer.JSONDecodeError as exc: logging.debug(repr(exc)) else: # no more conversions if isinstance(_value, to_type): diff --git a/biothings/web/query/builder.py b/biothings/web/query/builder.py index 8fdcccb97..005efba3f 100644 --- a/biothings/web/query/builder.py +++ b/biothings/web/query/builder.py @@ -38,10 +38,10 @@ class implementations or not defined. 
import re from typing import Iterable, List, Set, Tuple, Union -from elasticsearch_dsl import MultiSearch, Q, Search -from elasticsearch_dsl.exceptions import IllegalOperation -import orjson +from elasticsearch.dsl import MultiSearch, Q, Search +from elasticsearch.dsl.exceptions import IllegalOperation +from biothings.utils import serializer from biothings.utils.common import dotdict from biothings.web.query.formatter import ESResultFormatter from biothings.web.services.metadata import BiothingsMetadata @@ -124,7 +124,7 @@ def _build_endpoint_metadata_fields(self, metadata: BiothingsMetadata) -> Set[st "url": "https://github.com/ericz1803/doid/tree/37c9bda7ba0e0569dad3181842ebc14d3af6c6a9/" }, "download_date": "2023-06-02T01:24:14.106000", - "licence": "Creative Commons \nPublic Domain Dedication CC0 \n1.0 Universal license", + "license": "Creative Commons \nPublic Domain Dedication CC0 \n1.0 Universal license", "license_url": "https://creativecommons.org/publicdomain/zero/1.0/", "stats": { "doid": 11314 @@ -159,7 +159,7 @@ def _build_endpoint_metadata_fields(self, metadata: BiothingsMetadata) -> Set[st "url": "https://github.com/ericz1803/doid/tree/37c9bda7ba0e0569dad3181842ebc14d3af6c6a9/" }, "download_date": "2023-06-02T01:24:14.106000", - "licence": "Creative Commons \nPublic Domain Dedication CC0 \n1.0 Universal license", + "license": "Creative Commons \nPublic Domain Dedication CC0 \n1.0 Universal license", "license_url": "https://creativecommons.org/publicdomain/zero/1.0/", "stats": { "doid": 11314 @@ -430,9 +430,9 @@ def __init__(self, path): ## alternative implementation # noqa: E266 # self._queries[os.path.basename(dirpath)] = text_file.read() ## - self._queries[os.path.basename(dirpath)] = orjson.loads(text_file.read()) + self._queries[os.path.basename(dirpath)] = serializer.load_json(text_file.read()) elif "filter" in filename: - self._filters[os.path.basename(dirpath)] = orjson.loads(text_file.read()) + self._filters[os.path.basename(dirpath)] = 
serializer.load_json(text_file.read()) except Exception: self.logger.exception("Error loading user queries.") diff --git a/biothings/web/query/engine.py b/biothings/web/query/engine.py index e1fe2023e..c3d5fe35e 100644 --- a/biothings/web/query/engine.py +++ b/biothings/web/query/engine.py @@ -11,7 +11,7 @@ >>> from biothings.web.query import ESQueryBackend >>> from elasticsearch import Elasticsearch ->>> from elasticsearch_dsl import Search +>>> from elasticsearch.dsl import Search >>> backend = ESQueryBackend(Elasticsearch()) >>> backend.execute(Search().query("match", _id="1017")) @@ -22,12 +22,15 @@ """ import asyncio +import logging from elasticsearch import NotFoundError, RequestError -from elasticsearch_dsl import MultiSearch, Search +from elasticsearch.dsl import MultiSearch, Search from biothings.web.query.builder import ESScrollID +logger = logging.getLogger(__name__) + class ResultInterrupt(Exception): def __init__(self, data): @@ -139,6 +142,13 @@ async def execute(self, query, **options): raise RawResultInterrupt(res) if not res["hits"]["hits"]: + scroll_id=query.data + try: + await self.client.clear_scroll(scroll_id=scroll_id) + logger.info("Scroll context cleared: %s", scroll_id) + except NotFoundError as e: + logger.warning("Scroll context not found (ID: %s): %s", scroll_id, str(e)) + # Always raise this exception regardless of whether clear_scroll succeeds raise EndScrollInterrupt() return res diff --git a/biothings/web/query/formatter.py b/biothings/web/query/formatter.py index 5d5810425..aa6f46a9c 100644 --- a/biothings/web/query/formatter.py +++ b/biothings/web/query/formatter.py @@ -84,13 +84,26 @@ class ESResultFormatter(ResultFormatter): class _Hits(Hits): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Check if this is an error response from Elasticsearch + if "error" in self.data: + logger.error("ES returned error response: %s", self.data) + raise ValueError("Invalid response format") + # make sure the document is 
coming from # elasticsearch at initialization time - assert "hits" in self.data - assert "total" in self.data["hits"] - assert "hits" in self.data["hits"] + if "hits" not in self.data: + logger.error("ES response missing 'hits' field. Response data: %s", self.data) + raise ValueError("Response missing 'hits' field") + if "total" not in self.data["hits"]: + logger.error("ES response missing 'hits.total' field. Response data: %s", self.data) + raise ValueError("Response missing 'hits.total' field") + if "hits" not in self.data["hits"]: + logger.error("ES response missing 'hits.hits' field. Response data: %s", self.data) + raise ValueError("Response missing 'hits.hits' field") for hit in self.data["hits"]["hits"]: - assert "_source" in hit + if "_source" not in hit: + logger.error("ES hit missing '_source' field. Hit data: %s", hit) + raise ValueError("Hit missing '_source' field") class _Doc(Doc): pass diff --git a/biothings/web/query/pipeline.py b/biothings/web/query/pipeline.py index 39bc3872c..73ceb04ba 100644 --- a/biothings/web/query/pipeline.py +++ b/biothings/web/query/pipeline.py @@ -79,7 +79,7 @@ def _simplify_ES_exception(exc, debug=False): root_cause = root_cause.replace('"', "'").split("\n") for index, cause in enumerate(root_cause): result["root_cause_line_" + f"{index:02}"] = cause - except IndexError: + except (IndexError, KeyError): pass # no root cause except Exception: logger.exception( @@ -147,6 +147,12 @@ async def _(*args, **kwargs): elif error_type == "index_not_found_exception": raise QueryPipelineException(500, error_type) + elif error_type == "es_rejected_execution_exception": + # ES cluster is overloaded, all thread pools at capacity + raise QueryPipelineException( + 503, "Service Unavailable", "Elasticsearch cluster overloaded" + ) from exc + else: # unexpected raise diff --git a/biothings/web/settings/default.py b/biothings/web/settings/default.py index 95636e0c7..211983986 100644 --- a/biothings/web/settings/default.py +++ 
b/biothings/web/settings/default.py @@ -209,9 +209,6 @@ # Sentry project address SENTRY_CLIENT_KEY = "" -# Google Analytics Account ID -GA_ACCOUNT = "" - # ***************************************************************************** # Endpoints Specifics & Others # ***************************************************************************** diff --git a/config.py.example b/config.py.example index 7c32bc221..e860cafce 100644 --- a/config.py.example +++ b/config.py.example @@ -110,10 +110,6 @@ GA4_UID_GENERATOR_VERSION = 1 # Analytics Settings # ***************************************************************************** -# Google Analytics Account ID - -GA_ACCOUNT = 'UA-123123-1' - # Google Measurement ID GA4_MEASUREMENT_ID = 'G-KXzzzzLBN' diff --git a/docs/tutorial/studio_guide.rst b/docs/tutorial/studio_guide.rst index f09694f78..5d2e2ecaa 100644 --- a/docs/tutorial/studio_guide.rst +++ b/docs/tutorial/studio_guide.rst @@ -298,7 +298,7 @@ A manifest file is defined like this: "__metadata__" : { # optional "url" : "", "license_url" : "", - "licence" : "", + "license" : "", "author" : { "name" : "", "url" : "" @@ -327,7 +327,7 @@ or with multiple uploader "__metadata__" : { # optional "url" : "", "license_url" : "", - "licence" : "", + "license" : "", "author" : { "name" : "", "url" : "" diff --git a/docs/tutorial/studio_tutorial.rst b/docs/tutorial/studio_tutorial.rst index 3168e1553..4668711ca 100644 --- a/docs/tutorial/studio_tutorial.rst +++ b/docs/tutorial/studio_tutorial.rst @@ -820,7 +820,7 @@ A ``tutorials`` folder can be found and contains the exported code: -rw-rw-r-- 1 biothings biothings 1190 Jan 22 19:32 parser.py -rw-rw-r-- 1 biothings biothings 2334 Jan 22 19:32 upload.py -Some files were copied from data plugin repository (``LICENCE``, ``README`` and ``parser.py``), the others are the exported ones: ``dump.py`` for the dumper, ``upload.py`` +Some files were copied from data plugin repository (``LICENSE``, ``README`` and ``parser.py``), the others 
are the exported ones: ``dump.py`` for the dumper, ``upload.py`` for the uploader and the mappings, and ``__init__.py`` so the **Hub** can find these components upon start. We'll go in further details later, specially when we'll add more uploaders. diff --git a/docs/tutorial/web.rst b/docs/tutorial/web.rst index e78a8d101..e4c63b789 100644 --- a/docs/tutorial/web.rst +++ b/docs/tutorial/web.rst @@ -295,7 +295,7 @@ with a few rules to increase result relevancy. Additionally add to the ``pipelin .. code-block:: python from biothings.web.query import ESQueryBuilder - from elasticsearch_dsl import Search + from elasticsearch.dsl import Search class MyQueryBuilder(ESQueryBuilder): diff --git a/pyproject.toml b/pyproject.toml index 9570a401a..dc5698b68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,11 +20,11 @@ dynamic = ["version"] # version is dynamically generated from setup.py authors = [ {name = "The BioThings Team", email="dev@biothings.io"}, ] -requires-python = ">=3.8" +requires-python = ">=3.9" description = "a toolkit for building high-performance data & knowledge APIs in biology" readme = "README.md" -# license = "Apache-2.0" # when we drop Python 3.8 support, we can use this new format -license = {text = "Apache-2.0"} # this is an old format, but works for Python 3.8+ +license = "Apache-2.0" # this new format works for Python 3.9+ +# license = {text = "Apache-2.0"} # this is an old format, but works for Python 3.8+ keywords = [ "biology", "medicine", @@ -38,12 +38,12 @@ keywords = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Development Status :: 5 - Production/Stable", "Operating System :: OS Independent", 
"Operating System :: POSIX", @@ -60,17 +60,18 @@ dependencies = [ "requests>=2.21.0", 'tornado==6.1.0; python_version < "3.7.0"', 'tornado==6.2.0; python_version == "3.7.0"', - 'tornado==6.4.2; python_version >= "3.8.0"', + 'tornado==6.4.2; python_version == "3.8.0"', + 'tornado==6.5.3; python_version >= "3.9.0"', "gitpython>=3.1.0", "elasticsearch[async]>=7, <8; python_version < '3.7.0'", - "elasticsearch-dsl>=7, <8; python_version < '3.7.0'", "elasticsearch[async]>=8, <9; python_version >= '3.7.0'", - "elasticsearch-dsl>=8, <9; python_version >= '3.7.0'", 'singledispatchmethod; python_version < "3.8.0"', 'dataclasses; python_version < "3.7.0"', "jmespath>=0.7.1,<2.0.0", # support jmespath query parameter "PyYAML>=5.1", - "orjson>=3.6.1", # a faster json lib supports inf/nan and datetime, v3.6.1 is the last version supports Python 3.6 + 'orjson>=3.10.16; python_version < "3.14.0"', # a faster json lib supports inf/nan and datetime, v3.10.16 is the first version requires Python 3.9+ + 'orjson==3.11.4; python_version >= "3.14.0"', # orjson 3.11.5 cannot be built on Python 3.14t for now + 'zstandard>=0.21.0; python_version<"3.14"', # we need zst library before 3.14 ] [project.optional-dependencies] @@ -86,11 +87,11 @@ opensearch = [ ] # minimal requirements for running biothings.hub, e.g. 
in CLI mode hubcore = [ - "pymongo>=4.1.0,<5.0", # support MongoDB 5.0 since v3.12.0 + "pymongo>=4.13.0,<5.0", # AsyncMongoClient stable since 4.13.0 ] # extra requirements to run a full biothings.hub hub = [ - "pymongo>=4.1.0,<5.0", + "pymongo>=4.13.0,<5.0", "beautifulsoup4", # used in dumper.GoogleDriveDumper "aiocron==1.8", # setup scheduled jobs # "aiohttp==3.8.4", # elasticsearch requires aiohttp>=3,<4 @@ -120,10 +121,10 @@ hub = [ ] # minimal requirements for to run biothings CLI cli = [ - "pymongo>=4.1.0,<5.0", # support MongoDB 5.0 since v3.12.0 + "pymongo>=4.13.0,<5.0", # AsyncMongoClient stable since 4.13.0 "psutil", "jsonschema>=2.6.0", - "typer>=0.12.1", # required for CLI, also installs rich package + "typer>=0.17.0", # required for CLI, also installs rich package ] # if DockerContainerDumper is used, requires this Docker SDK for Python docker = [ diff --git a/tests/conftest.py b/tests/conftest.py index 04118a1cc..b8ad7f7b0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,8 +13,8 @@ import types from pathlib import Path -import pytest import _pytest +import pytest from biothings.utils.loggers import setup_default_log @@ -90,7 +90,7 @@ def pytest_sessionstart(session: _pytest.main.Session): }, "HUB_ENV": "", "ACTIVE_DATASOURCES": [], - "DATA_HUB_DB_DATABASE": ".hubdb", + "DATA_HUB_DB_DATABASE": "biothings_hubdb", "DATA_PLUGIN_FOLDER": "/tmp/testhub/plugins", "DATA_ARCHIVE_ROOT": "/tmp/testhub/datasources", "DIFF_PATH": "/tmp/testhub/datasources/diff", diff --git a/tests/hub/config/data/base_configuration.py b/tests/hub/config/data/base_configuration.py index c1a144de6..2ef6b9ebc 100644 --- a/tests/hub/config/data/base_configuration.py +++ b/tests/hub/config/data/base_configuration.py @@ -90,7 +90,7 @@ S3_SNAPSHOT_BUCKET = "" S3_REGION = "" -DATA_HUB_DB_DATABASE = ".hubdb" +DATA_HUB_DB_DATABASE = "biothings_hubdb" APITEST_PATH = str(Path(__file__).parent.absolute().resolve()) # descONE diff --git a/tests/hub/config/data/deep_configuration.py 
b/tests/hub/config/data/deep_configuration.py index f0392df9e..04ff7774a 100644 --- a/tests/hub/config/data/deep_configuration.py +++ b/tests/hub/config/data/deep_configuration.py @@ -90,7 +90,7 @@ S3_SNAPSHOT_BUCKET = "" S3_REGION = "" -DATA_HUB_DB_DATABASE = ".hubdb" +DATA_HUB_DB_DATABASE = "biothings_hubdb" APITEST_PATH = str(Path(__file__).parent.absolute().resolve()) # redefine some params diff --git a/tests/hub/dataload/test_dump.py b/tests/hub/dataload/test_dump.py index 8d91aeb92..85c24c57e 100644 --- a/tests/hub/dataload/test_dump.py +++ b/tests/hub/dataload/test_dump.py @@ -2,6 +2,7 @@ Tests for the various dumper classes """ +import pathlib import tempfile import pytest @@ -61,7 +62,11 @@ def test_http_dumper_properties(): @pytest.mark.parametrize( "remoteurl,resolve_filepath", [ - ("https://github.com/biothings/biothings.api/archive/refs/tags/v0.12.5.zip", True), + # this URL contains a "content-disposition" header with a filename as "biothings.api-0.12.5.zip" + # if resolve_filepath is True, the dumper should resolve the filename from this header + ("https://codeload.github.com/biothings/biothings.api/zip/refs/tags/v0.12.5", True), + # this URL does not contain a "content-disposition" header, and resolve_filepath is False + ("https://biothings.io/static/img/transition.svg", False), ], ) def test_http_dumper_download(remoteurl: str, resolve_filepath: bool): @@ -71,9 +76,19 @@ def test_http_dumper_download(remoteurl: str, resolve_filepath: bool): with tempfile.NamedTemporaryFile() as temp_local_file: dumper_instance = HTTPDumper() HTTPDumper.RESOLVE_FILENAME = resolve_filepath - assert dumper_instance.remote_is_better(remoteurl, temp_local_file) + assert dumper_instance.remote_is_better(remoteurl, temp_local_file.name) download_headers = {} response = dumper_instance.download( remoteurl=remoteurl, localfile=temp_local_file.name, headers=download_headers ) assert isinstance(response, requests.models.Response) + if resolve_filepath: + assert 
response.headers.get("content-disposition") + assert response.headers["content-disposition"].endswith("biothings.api-0.12.5.zip") + local_file = pathlib.Path(pathlib.Path(temp_local_file.name).parent, "biothings.api-0.12.5.zip") + assert local_file.exists() + assert local_file.stat().st_size > 0 + else: + local_file = pathlib.Path(pathlib.Path(temp_local_file.name)) + assert local_file.exists() + assert local_file.stat().st_size > 0 diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest0.json b/tests/hub/dataplugin/data/manifests/malformed_manifest0.json index 5552267ce..3299004fa 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest0.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest0.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest1.json b/tests/hub/dataplugin/data/manifests/malformed_manifest1.json index e607e1ad2..5c273a469 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest1.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest1.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest2.json b/tests/hub/dataplugin/data/manifests/malformed_manifest2.json index 3c3f130b5..2ffed70cf 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest2.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest2.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git 
a/tests/hub/dataplugin/data/manifests/malformed_manifest3.json b/tests/hub/dataplugin/data/manifests/malformed_manifest3.json index de8a36080..6fd890b1a 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest3.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest3.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest4.json b/tests/hub/dataplugin/data/manifests/malformed_manifest4.json index fd7022e3d..598357b5c 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest4.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest4.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest5.json b/tests/hub/dataplugin/data/manifests/malformed_manifest5.json index 3dd44a904..06a12f615 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest5.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest5.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/dataplugin/data/manifests/malformed_manifest6.json b/tests/hub/dataplugin/data/manifests/malformed_manifest6.json index 8d13edc17..f75da7ced 100644 --- a/tests/hub/dataplugin/data/manifests/malformed_manifest6.json +++ b/tests/hub/dataplugin/data/manifests/malformed_manifest6.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git 
a/tests/hub/dataplugin/data/manifests/mock_manifest.json b/tests/hub/dataplugin/data/manifests/mock_manifest.json index 8f977d807..791d4595c 100644 --- a/tests/hub/dataplugin/data/manifests/mock_manifest.json +++ b/tests/hub/dataplugin/data/manifests/mock_manifest.json @@ -2,7 +2,7 @@ "version": "1.0", "__metadata__": { "license_url": "http://www.mock-license-url.gov", - "licence": "", + "license": "", "url": "http://www.mock-reference.org/" }, "requires": [ diff --git a/tests/hub/datarelease/conftest.py b/tests/hub/datarelease/conftest.py index 2810c4ef4..4a4d256c5 100644 --- a/tests/hub/datarelease/conftest.py +++ b/tests/hub/datarelease/conftest.py @@ -1,12 +1,11 @@ -from pathlib import Path import copy import logging import sys +from pathlib import Path import pytest from pytest_mock import MockerFixture - logger = logging.getLogger(__name__) @@ -14,7 +13,7 @@ def releasenote_configuration(root_configuration: "TestConfig"): releasenote_configuration = { "HUB_DB_BACKEND": {"module": "biothings.utils.sqlite3", "sqlite_db_folder": "./dummy_db"}, - "DATA_HUB_DB_DATABASE": "mock_releasenote.hubdb", + "DATA_HUB_DB_DATABASE": "mock_releasenote_hubdb", } root_configuration.override(releasenote_configuration) @@ -278,10 +277,7 @@ def mock_fn(col_name): def release_note_source( mock_get_source_fullname, cold_src_build_doc, hot_src_build_doc, old_cold_src_build_docs, old_hot_src_build_docs ): - from biothings.hub.datarelease.releasenote import ( - ReleaseNoteSource, - ReleaseNoteSrcBuildReader, - ) + from biothings.hub.datarelease.releasenote import ReleaseNoteSource, ReleaseNoteSrcBuildReader old_src_build_reader = ReleaseNoteSrcBuildReader(old_hot_src_build_docs) old_src_build_reader.attach_cold_src_build_reader(ReleaseNoteSrcBuildReader(old_cold_src_build_docs)) diff --git a/tests/utils/test_dataload.py b/tests/utils/test_dataload.py new file mode 100644 index 000000000..66a919b71 --- /dev/null +++ b/tests/utils/test_dataload.py @@ -0,0 +1,384 @@ +""" +Tests for 
dict_sweep and _val_to_delete in biothings.utils.dataload, +specifically the handling of NaN-like values. + +NaN-like values (float NaN, pandas.NA, pandas.NaT) are only removed +when explicitly included in the vals list (opt-in). +""" + +import math + +import pytest + +from biothings.utils.dataload import _val_to_delete, dict_sweep + +# --------------------------------------------------------------------------- +# Fake pandas-like NA / NaT types for testing without importing pandas. +# __module__ mirrors real pandas internals closely enough to exercise fallback +# detection without importing pandas. +# --------------------------------------------------------------------------- + + +class NAType: + """Mimics pandas.NA (pandas.core.arrays.masked.NAType).""" + + __module__ = "pandas._libs.missing" + + def __bool__(self): + raise TypeError("boolean value of NA is ambiguous") + + def __eq__(self, other): + raise TypeError("boolean value of NA is ambiguous") + + def __hash__(self): + return 0 + + def __repr__(self): + return "" + + +class NaTType: + """Mimics pandas.NaT (pandas._libs.tslibs.nattype.NaTType).""" + + __module__ = "pandas._libs.tslibs.nattype" + + def __bool__(self): + raise TypeError("boolean value of NaT is ambiguous") + + def __eq__(self, other): + raise TypeError("boolean value of NaT is ambiguous") + + def __hash__(self): + return 0 + + def __repr__(self): + return "NaT" + + +_NA = NAType() +_NaT = NaTType() + +_DEFAULT_VALS = [".", "-", "", "NA", "none", " ", "Not Available", "unknown"] +_VALS_WITH_NAN = _DEFAULT_VALS + [float("nan"), _NA, _NaT] + + +@pytest.fixture(scope="module") +def pandas(): + return pytest.importorskip("pandas") + + +@pytest.fixture(scope="module") +def numpy(): + return pytest.importorskip("numpy") + + +# --------------------------------------------------------------------------- +# _val_to_delete helper tests +# --------------------------------------------------------------------------- + + +class TestValToDelete: + # -- 
default vals (no NaN entries) ----------------------------------- + + def test_default_vals_matched(self): + for val in _DEFAULT_VALS: + assert _val_to_delete(val, _DEFAULT_VALS) is True, f"should delete {val!r}" + + def test_regular_values_not_deleted(self): + for val in [0, 1, -1, "hello", None, [], {}, 0.0, 1.5, True, False]: + assert _val_to_delete(val, _DEFAULT_VALS) is False, f"should keep {val!r}" + + def test_string_vals_matched(self): + assert _val_to_delete("NA", "NA") is True + + def test_float_nan_not_deleted_by_default(self): + """float NaN is kept when vals does not contain a NaN float.""" + assert _val_to_delete(float("nan"), _DEFAULT_VALS) is False + + def test_na_not_deleted_by_default(self): + """Mock pandas NA is kept when vals does not contain a pandas NA.""" + assert _val_to_delete(_NA, _DEFAULT_VALS) is False + + def test_nat_not_deleted_by_default(self): + """Mock pandas NaT is kept when vals does not contain a pandas NaT.""" + assert _val_to_delete(_NaT, _DEFAULT_VALS) is False + + # -- vals with NaN entries (opt-in) ---------------------------------- + + def test_float_nan_deleted_when_in_vals(self): + assert _val_to_delete(float("nan"), _VALS_WITH_NAN) is True + + def test_na_deleted_when_in_vals(self): + assert _val_to_delete(_NA, _VALS_WITH_NAN) is True + + def test_nat_deleted_when_in_vals(self): + assert _val_to_delete(_NaT, _VALS_WITH_NAN) is True + + def test_na_and_nat_are_not_interchangeable(self): + assert _val_to_delete(_NA, _DEFAULT_VALS + [_NaT]) is False + assert _val_to_delete(_NaT, _DEFAULT_VALS + [_NA]) is False + + def test_regular_match_after_na_like_value(self): + assert _val_to_delete("", [_NA, ""]) is True + + +# --------------------------------------------------------------------------- +# dict_sweep — default vals (NaN kept) +# --------------------------------------------------------------------------- + + +class TestDictSweepDefaultKeepsNan: + def test_float_nan_kept(self): + d = {"a": 1, "b": float("nan")} + result 
= dict_sweep(d) + assert "b" in result + assert math.isnan(result["b"]) + + def test_na_kept(self): + d = {"a": 1, "b": _NA} + result = dict_sweep(d) + assert "b" in result + assert result["b"] is _NA + + def test_nat_kept(self): + d = {"a": 1, "b": _NaT} + result = dict_sweep(d) + assert "b" in result + assert result["b"] is _NaT + + def test_nan_kept_in_list(self): + d = {"a": [1, float("nan"), 2]} + result = dict_sweep(d) + assert len(result["a"]) == 3 + + def test_default_vals_still_removed(self): + d = {"a": ".", "b": "-", "c": "", "d": "keep"} + result = dict_sweep(d) + assert result == {"d": "keep"} + + +# --------------------------------------------------------------------------- +# dict_sweep — opt-in NaN removal (NaN in vals) +# --------------------------------------------------------------------------- + + +class TestDictSweepOptInNanRemoval: + def test_float_nan_removed(self): + d = {"a": 1, "b": float("nan")} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": 1} + + def test_na_removed(self): + d = {"a": 1, "b": _NA} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": 1} + + def test_nat_removed(self): + d = {"a": 1, "b": _NaT} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": 1} + + def test_multiple_nan_types_removed(self): + d = {"a": float("nan"), "b": _NA, "c": _NaT, "d": "keep"} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"d": "keep"} + + +# --------------------------------------------------------------------------- +# dict_sweep — NaN inside lists (opt-in) +# --------------------------------------------------------------------------- + + +class TestDictSweepNanInList: + def test_nan_removed_from_list(self): + d = {"a": [1, float("nan"), 2]} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": [1, 2]} + + def test_na_removed_from_list(self): + d = {"a": [1, _NA, 2]} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": [1, 2]} + + 
def test_nat_removed_from_list(self): + d = {"a": [1, _NaT, 2]} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": [1, 2]} + + def test_list_becomes_empty_after_nan_removal(self): + d = {"a": [float("nan")]} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert "a" not in result + + def test_nan_in_list_with_remove_invalid_list(self): + d = {"a": [float("nan"), _NA, "valid"]} + result = dict_sweep(d, vals=_VALS_WITH_NAN, remove_invalid_list=True) + assert result == {"a": ["valid"]} + + def test_all_nan_list_removed_with_remove_invalid_list(self): + d = {"a": [float("nan"), _NA, _NaT]} + result = dict_sweep(d, vals=_VALS_WITH_NAN, remove_invalid_list=True) + assert "a" not in result + + def test_all_invalid_list_preserves_false_mode_behavior(self): + d = {"gene": [None, None], "site": ["Intron", None], "snp_build": 136} + result = dict_sweep(d, vals=[None], remove_invalid_list=False) + assert result == {"gene": [None], "site": ["Intron"], "snp_build": 136} + + def test_all_invalid_list_removed_with_remove_invalid_list(self): + d = {"gene": [None, None], "site": ["Intron", None], "snp_build": 136} + result = dict_sweep(d, vals=[None], remove_invalid_list=True) + assert result == {"site": ["Intron"], "snp_build": 136} + + +# --------------------------------------------------------------------------- +# dict_sweep — NaN in nested dicts (opt-in) +# --------------------------------------------------------------------------- + + +class TestDictSweepNanNested: + def test_nan_in_nested_dict(self): + d = {"a": {"b": float("nan"), "c": 1}} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": {"c": 1}} + + def test_na_in_nested_dict(self): + d = {"a": {"b": _NA, "c": 1}} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": {"c": 1}} + + def test_nested_dict_removed_when_empty_after_sweep(self): + d = {"a": {"b": float("nan")}} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert "a" not in result + + def 
test_nan_in_list_of_dicts(self): + d = {"a": [{"x": float("nan"), "y": 1}, {"x": _NA, "y": 2}]} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"a": [{"y": 1}, {"y": 2}]} + + +# --------------------------------------------------------------------------- +# dict_sweep — default vals behaviour unchanged +# --------------------------------------------------------------------------- + + +class TestDictSweepDefaultBehavior: + def test_normal_values_kept(self): + d = {"a": 1, "b": "hello", "c": [1, 2], "d": {"nested": True}} + result = dict_sweep(d) + assert result == {"a": 1, "b": "hello", "c": [1, 2], "d": {"nested": True}} + + def test_mixed_nan_and_default_vals_with_optin(self): + d = {"a": float("nan"), "b": ".", "c": _NA, "d": "keep", "e": ""} + result = dict_sweep(d, vals=_VALS_WITH_NAN) + assert result == {"d": "keep"} + + def test_string_vals_supported(self): + d = {"a": "NA", "b": "keep", "c": ["NA", "keep"]} + result = dict_sweep(d, vals="NA") + assert result == {"b": "keep", "c": ["keep"]} + + +# --------------------------------------------------------------------------- +# Tests using real pandas and numpy types, when the optional deps are installed. 
+# --------------------------------------------------------------------------- + + +class TestValToDeleteRealTypes: + def test_numpy_nan_not_deleted_by_default(self, numpy): + assert _val_to_delete(numpy.nan, _DEFAULT_VALS) is False + + def test_pandas_na_not_deleted_by_default(self, pandas): + assert _val_to_delete(pandas.NA, _DEFAULT_VALS) is False + + def test_pandas_nat_not_deleted_by_default(self, pandas): + assert _val_to_delete(pandas.NaT, _DEFAULT_VALS) is False + + def test_numpy_nan_deleted_when_in_vals(self, numpy): + vals_with_nan = _DEFAULT_VALS + [float("nan")] + assert _val_to_delete(numpy.nan, vals_with_nan) is True + + def test_pandas_na_deleted_when_in_vals(self, pandas): + vals_with_na = _DEFAULT_VALS + [pandas.NA] + assert _val_to_delete(pandas.NA, vals_with_na) is True + + def test_pandas_nat_deleted_when_in_vals(self, pandas): + vals_with_nat = _DEFAULT_VALS + [pandas.NaT] + assert _val_to_delete(pandas.NaT, vals_with_nat) is True + + def test_pandas_na_and_nat_are_not_interchangeable(self, pandas): + assert _val_to_delete(pandas.NA, _DEFAULT_VALS + [pandas.NaT]) is False + assert _val_to_delete(pandas.NaT, _DEFAULT_VALS + [pandas.NA]) is False + + def test_regular_match_after_pandas_na(self, pandas): + assert _val_to_delete("", [pandas.NA, ""]) is True + + def test_numpy_float32_nan_deleted_when_in_vals(self, numpy): + vals_with_nan = _DEFAULT_VALS + [float("nan")] + assert _val_to_delete(numpy.float32("nan"), vals_with_nan) is True + + def test_numpy_float64_nan_deleted_when_in_vals(self, numpy): + vals_with_nan = _DEFAULT_VALS + [float("nan")] + assert _val_to_delete(numpy.float64("nan"), vals_with_nan) is True + + +class TestDictSweepRealPandas: + """dict_sweep with real pandas/numpy NaN types and opt-in removal.""" + + def _vals_with(self, *extras): + return _DEFAULT_VALS + list(extras) + + def test_pandas_na_top_level(self, pandas): + d = {"a": 1, "b": pandas.NA} + result = dict_sweep(d, vals=self._vals_with(pandas.NA)) + assert result 
== {"a": 1} + + def test_pandas_nat_top_level(self, pandas): + d = {"a": 1, "b": pandas.NaT} + result = dict_sweep(d, vals=self._vals_with(pandas.NaT)) + assert result == {"a": 1} + + def test_numpy_nan_top_level(self, numpy): + d = {"a": 1, "b": numpy.nan} + result = dict_sweep(d, vals=self._vals_with(float("nan"))) + assert result == {"a": 1} + + def test_pandas_na_in_list(self, pandas): + d = {"a": [1, pandas.NA, 2]} + result = dict_sweep(d, vals=self._vals_with(pandas.NA)) + assert result == {"a": [1, 2]} + + def test_pandas_nat_in_list(self, pandas): + d = {"a": [1, pandas.NaT, 2]} + result = dict_sweep(d, vals=self._vals_with(pandas.NaT)) + assert result == {"a": [1, 2]} + + def test_numpy_nan_in_list(self, numpy): + d = {"a": [1, numpy.nan, 2]} + result = dict_sweep(d, vals=self._vals_with(float("nan"))) + assert result == {"a": [1, 2]} + + def test_pandas_na_in_nested_dict(self, pandas): + d = {"a": {"b": pandas.NA, "c": 1}} + result = dict_sweep(d, vals=self._vals_with(pandas.NA)) + assert result == {"a": {"c": 1}} + + def test_pandas_na_in_list_with_remove_invalid_list(self, pandas): + d = {"a": [pandas.NA, pandas.NaT, "valid"]} + result = dict_sweep(d, vals=self._vals_with(pandas.NA, pandas.NaT), remove_invalid_list=True) + assert result == {"a": ["valid"]} + + def test_all_real_nan_types_removed(self, pandas, numpy): + d = {"a": numpy.nan, "b": pandas.NA, "c": pandas.NaT, "d": "keep"} + result = dict_sweep(d, vals=self._vals_with(float("nan"), pandas.NA, pandas.NaT)) + assert result == {"d": "keep"} + + def test_real_nan_kept_by_default(self, pandas, numpy): + """Without opting in, real NaN types are preserved.""" + d = {"a": numpy.nan, "b": pandas.NA, "c": pandas.NaT, "d": "keep"} + result = dict_sweep(d) + assert "a" in result + assert "b" in result + assert "c" in result + assert result["d"] == "keep" diff --git a/tests/web/analytics/test_channels.py b/tests/web/analytics/test_channels.py index acc219947..a530df461 100644 --- 
a/tests/web/analytics/test_channels.py +++ b/tests/web/analytics/test_channels.py @@ -1,12 +1,13 @@ -import aiohttp import asyncio -import orjson -import pytest +from unittest.mock import patch +import aiohttp +import pytest from aioresponses import aioresponses -from biothings.web.analytics.channels import SlackChannel, GA4Channel, GAChannel + +from biothings.utils import serializer +from biothings.web.analytics.channels import GA4Channel, SlackChannel from biothings.web.analytics.events import GAEvent, Message -from unittest.mock import patch @pytest.mark.asyncio @@ -31,34 +32,6 @@ async def test_send_Slack(): await channel.send(message) -@pytest.mark.asyncio -async def test_send_GA(): - event = GAEvent( - { - "__request__": { - "user_agent": "Opera/9.60 (Windows NT 6.0; U; en) Presto/2.1.1", - "referer": None, - "user_ip": "127.0.0.1", - "host": "example.org", - "path": "/", - }, - "category": "test", - "action": "play", - "label": "sample.mp4", - "value": 60, - } - ) - channel = GAChannel("G-XXXXXX", 2) - assert await channel.handles(event) - - with aioresponses() as responses: - # Mock the URL to return a 200 OK response - responses.post(channel.url, status=200) - - # If the function completes without raising an exception, the test will pass - await channel.send(event) - - @pytest.mark.asyncio async def test_send_GA4(): event = GAEvent( @@ -91,7 +64,8 @@ async def test_send_GA4(): async def test_send_GA4_request_retries(): channel = GA4Channel("G-XXXXXX", "SECRET") url = channel.url - data = orjson.dumps({"test": "data"}) + # data = orjson.dumps({"test": "data"}) + data = serializer.to_json({"test": "data"}, return_bytes=True) async with aiohttp.ClientSession() as session: with aioresponses() as responses: @@ -109,7 +83,7 @@ async def test_send_GA4_request_retries(): async def test_send_GA4_request_max_retries(): channel = GA4Channel("G-XXXXXX", "SECRET") url = channel.url - data = orjson.dumps({"test": "data"}) + data = serializer.to_json({"test": "data"}, 
return_bytes=True) async with aiohttp.ClientSession() as session: with aioresponses() as responses: diff --git a/tests/web/analytics/test_events.py b/tests/web/analytics/test_events.py index 823584a8e..a1dc46c2c 100644 --- a/tests/web/analytics/test_events.py +++ b/tests/web/analytics/test_events.py @@ -2,77 +2,6 @@ from biothings.web.analytics.events import Event, GAEvent -# validator -# https://ga-dev-tools.web.app/hit-builder/ - - -def test_pageview_1(): - event = Event( - dict( - __request__={ - "user_agent": "Opera/9.60 (Windows NT 6.0; U; en) Presto/2.1.1", - "referer": "https://example.com/", - "user_ip": "127.0.0.1", - "host": "example.org", - "path": "/", - } - ) - ) - print(event.to_GA_payload("UA-000000-2")) - print(event.to_GA_payload("UA-000000-2", 2)) - - -def test_pageview_2(): - event = Event( - dict( - __request__={ - "user_agent": None, - "referer": None, - "user_ip": "127.0.0.1", - "host": "example.org", - "path": "/404.html", - } - ) - ) - print(event.to_GA_payload("UA-000000-2")) - print(event.to_GA_payload("UA-000000-2", 2)) - - -def test_event_1(): - event = GAEvent( - { - "__request__": { - "user_agent": "Opera/9.60 (Windows NT 6.0; U; en) Presto/2.1.1", - "referer": "https://example.com/", - "user_ip": "127.0.0.1", - "host": "example.org", - "path": "/", - }, - "category": "video", - "action": "play", - "label": "sample.mp4", - "value": 60, - } - ) - print(event.to_GA_payload("UA-000000-2")) - print(event.to_GA_payload("UA-000000-2", 2)) - - -def test_event_2(): - event = GAEvent( - { - "__request__": { - "user_agent": "Opera/9.60 (Windows NT 6.0; U; en) Presto/2.1.1", - "referer": "https://example.com/", - "user_ip": "127.0.0.1", - "host": "example.org", - "path": "/", - } - } - ) - print(event.to_GA_payload("UA-000000-2")) - print(event.to_GA_payload("UA-000000-2", 2)) - def test_pageview_ga4_1(): event = Event( diff --git a/tests/web/handlers/data/test_mapping.json b/tests/web/handlers/data/test_mapping.json index 9f8f7066a..ba41dcf5f 
100644 --- a/tests/web/handlers/data/test_mapping.json +++ b/tests/web/handlers/data/test_mapping.json @@ -111,7 +111,7 @@ "version": "68" }, "clingen": { - "licence": "CC0 1.0 Universal", + "license": "CC0 1.0 Universal", "code": { "file": "src/hub/dataload/sources/clingen/upload.py", "repo": "https://github.com/biothings/mygene.info.git", @@ -446,7 +446,7 @@ "version": "20191218" }, "pantherdb": { - "licence": "GNU General Public License Version 2", + "license": "GNU General Public License Version 2", "code": { "folder": "src/plugins/pantherdb", "repo": "https://github.com/biothings/mygene.info.git", diff --git a/tests/web/test_connections.py b/tests/web/test_connections.py index c250c6736..35bf29e67 100644 --- a/tests/web/test_connections.py +++ b/tests/web/test_connections.py @@ -13,8 +13,8 @@ def test_es_1(): def test_es_2(): # see if the client is reused client1 = connections.es.get_client("http://localhost:9200") - client2 = connections.es.get_client("http://localhost:9200", timeout=20) - client3 = connections.es.get_client("http://localhost:9200", timeout=20) + client2 = connections.es.get_client("http://localhost:9200", request_timeout=20) + client3 = connections.es.get_client("http://localhost:9200", request_timeout=20) print(id(client1)) print(id(client2)) print(id(client3)) diff --git a/tests/web/test_es_exceptions.py b/tests/web/test_es_exceptions.py index 36b61f5c5..14e51f88d 100644 --- a/tests/web/test_es_exceptions.py +++ b/tests/web/test_es_exceptions.py @@ -1,18 +1,20 @@ +from unittest.mock import Mock + import pytest from biothings.web.query.pipeline import ( - capturesESExceptions, - RawQueryInterrupt, - QueryPipelineInterrupt, + AuthenticationException, + AuthorizationException, + ConflictError, EndScrollInterrupt, - RawResultInterrupt, + NotFoundError, QueryPipelineException, + QueryPipelineInterrupt, + RawQueryInterrupt, + RawResultInterrupt, RequestError, - NotFoundError, - ConflictError, - AuthenticationException, - AuthorizationException, 
TransportError, + capturesESExceptions, ) @@ -25,7 +27,7 @@ async def func(): with pytest.raises(QueryPipelineInterrupt) as exc_info: await func() assert exc_info.value.code == 200 - assert exc_info.value.summary == None + assert exc_info.value.summary is None assert exc_info.value.details == {"error": "test_error"} @@ -38,7 +40,7 @@ async def func(): with pytest.raises(QueryPipelineInterrupt) as exc_info: await func() assert exc_info.value.code == 200 - assert exc_info.value.summary == None + assert exc_info.value.summary is None assert exc_info.value.details == {"success": False, "error": "No more results to return."} @@ -51,7 +53,7 @@ async def func(): with pytest.raises(QueryPipelineInterrupt) as exc_info: await func() assert exc_info.value.code == 200 - assert exc_info.value.summary == None + assert exc_info.value.summary is None assert exc_info.value.details == "test_body" @@ -65,7 +67,7 @@ async def func(): await func() assert exc_info.value.code == 500 assert exc_info.value.summary == "test_assertion_error" - assert exc_info.value.details == None + assert exc_info.value.details is None @pytest.mark.asyncio @@ -96,9 +98,12 @@ async def func(): @pytest.mark.asyncio async def test_request_error(): + _meta = Mock() + _meta.status = 400 + @capturesESExceptions async def func(): - raise RequestError(message="test_request_error", meta={}, body={}) + raise RequestError(message="test_request_error", meta=_meta, body={}) with pytest.raises(QueryPipelineException) as exc_info: await func() @@ -108,9 +113,12 @@ async def func(): @pytest.mark.asyncio async def test_not_found_error(): + _meta = Mock() + _meta.status = 404 + @capturesESExceptions async def func(): - raise NotFoundError(message="test_not_found_error", meta={}, body={}) + raise NotFoundError(message="test_not_found_error", meta=_meta, body={}) with pytest.raises(QueryPipelineException) as exc_info: await func() @@ -122,9 +130,12 @@ async def func(): @pytest.mark.asyncio async def test_conflict_error(): + 
_meta = Mock() + _meta.status = 409 + @capturesESExceptions async def func(): - raise ConflictError(message="test_conflict_error", meta={}, body={}) + raise ConflictError(message="test_conflict_error", meta=_meta, body={}) with pytest.raises(QueryPipelineException) as exc_info: await func() @@ -135,9 +146,12 @@ async def func(): @pytest.mark.asyncio async def test_authentication_exception(): + _meta = Mock() + _meta.status = 403 + @capturesESExceptions async def func(): - raise AuthenticationException(message="test_authentication_exception", meta={}, body={}) + raise AuthenticationException(message="test_authentication_exception", meta=_meta, body={}) with pytest.raises(QueryPipelineException) as exc_info: await func() @@ -148,9 +162,12 @@ async def func(): @pytest.mark.asyncio async def test_authorization_exception(): + _meta = Mock() + _meta.status = 403 + @capturesESExceptions async def func(): - raise AuthorizationException(message="test_authorization_exception", meta={}, body={}) + raise AuthorizationException(message="test_authorization_exception", meta=_meta, body={}) with pytest.raises(QueryPipelineException) as exc_info: await func() @@ -160,10 +177,10 @@ async def func(): @pytest.mark.asyncio -async def test_generic_exception(): +async def test_index_not_found_exception(): @capturesESExceptions async def func(): - exc = Exception(message="test_generic_exception", meta={}, body={}) + exc = Exception(message="test_index_not_found_exception", meta={}, body={}) exc.status_code = 500 exc.info = {"error": {"type": "index_not_found_exception", "reason": "test_reason"}} raise exc @@ -175,6 +192,22 @@ async def func(): assert exc_info.value.details == "Exception() takes no keyword arguments" +@pytest.mark.asyncio +async def test_es_rejected_execution_exception(): + @capturesESExceptions + async def func(): + exc = TransportError("test_es_rejected_execution_exception") + exc.status_code = 503 + exc.info = {"error": {"type": "es_rejected_execution_exception", 
"reason": "rejected execution of TimedRunnable..."}} + raise exc + + with pytest.raises(QueryPipelineException) as exc_info: + await func() + assert exc_info.value.code == 503 + assert exc_info.value.summary == "Service Unavailable" + assert exc_info.value.details == "Elasticsearch cluster overloaded" + + @pytest.mark.asyncio async def test_search_phase_execution_exception_rejected_execution(): @capturesESExceptions @@ -188,7 +221,7 @@ async def func(): await func() assert exc_info.value.code == 503 assert exc_info.value.summary == "" - assert exc_info.value.details == None + assert exc_info.value.details is None @pytest.mark.asyncio @@ -235,4 +268,4 @@ async def func(): assert exc_info.value.code == 503 assert exc_info.value.summary == "" - assert exc_info.value.details == None + assert exc_info.value.details is None