From 89c1dcb0282fb5c7bc77ab09446e7986c0b2b179 Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 6 May 2026 15:54:37 -0400 Subject: [PATCH 1/4] post rfc100 cleanup --- dags/import_base.py | 109 +++-- dags/import_clickhouse_base.py | 334 ---------------- dags/import_genie_clickhouse_dag.py | 75 ---- dags/import_genie_dag.py | 65 +-- dags/import_public_clickhouse_dag.py | 75 ---- dags/import_public_dag.py | 50 ++- dags/import_tempo_dag.py | 59 --- ...clickhouse_dag.py => import_triage_dag.py} | 4 +- .../airflow-create-derived-tables.sh | 6 +- import-scripts/airflow-import-clickhouse.sh | 92 ----- import-scripts/airflow-import-sql.sh | 40 +- import-scripts/airflow-setup-import.sh | 38 +- import-scripts/backup-eks-dbs.sh | 52 --- .../fetch-ddp-and-import-cmo-access-data.sh | 4 +- import-scripts/import-cmo-data-msk.sh | 4 +- import-scripts/import-dmp-impact-data.sh | 4 +- import-scripts/import-msk-extract-projects.sh | 79 ---- import-scripts/import-pdx-data.sh | 371 ------------------ import-scripts/import-temp-study.sh | 254 ------------ import-scripts/import-tempo-data.sh | 128 ------ ...ge_dremio_clinical_data_into_cmo_access.sh | 4 +- import-scripts/monitor-stalled-jobs.sh | 93 ----- import-scripts/rds_functions.sh | 142 ------- .../rsync_jenkins_test_properties.sh | 82 ---- import-scripts/scale-rds.sh | 110 ------ import-scripts/tempo-environment.sh | 7 - .../test_if_impact_has_lost_allele_count.sh | 53 --- import-scripts/update-msk-mind-cohort.sh | 100 ----- import-scripts/update-msk-spectrum-cohort.sh | 85 ---- 29 files changed, 167 insertions(+), 2352 deletions(-) delete mode 100644 dags/import_clickhouse_base.py delete mode 100644 dags/import_genie_clickhouse_dag.py delete mode 100644 dags/import_public_clickhouse_dag.py delete mode 100644 dags/import_tempo_dag.py rename dags/{import_triage_clickhouse_dag.py => import_triage_dag.py} (95%) delete mode 100755 import-scripts/airflow-import-clickhouse.sh delete mode 100755 import-scripts/backup-eks-dbs.sh delete mode 100755 import-scripts/import-msk-extract-projects.sh delete mode 100755 import-scripts/import-pdx-data.sh delete mode 100755 import-scripts/import-temp-study.sh delete mode 100755 import-scripts/import-tempo-data.sh delete mode 100755 import-scripts/monitor-stalled-jobs.sh delete mode 100755 import-scripts/rds_functions.sh delete mode 100755 import-scripts/rsync_jenkins_test_properties.sh delete mode 100755 import-scripts/scale-rds.sh delete mode 100644 import-scripts/tempo-environment.sh delete mode 100755 import-scripts/test_if_impact_has_lost_allele_count.sh delete mode 100755 import-scripts/update-msk-mind-cohort.sh delete mode 100755 import-scripts/update-msk-spectrum-cohort.sh diff --git a/dags/import_base.py b/dags/import_base.py index 8da622e0d..15aab4ad8 100644 --- a/dags/import_base.py +++ b/dags/import_base.py @@ -33,17 +33,17 @@ *DAG ID*: {{ dag.dag_id }} *Execution Time*: {{ execution_date }} """ -import_sql_failure_slack_msg = """ - :red_circle: Import SQL Failed. Please check the notification file in the Airflow logs. +import_ch_failure_slack_msg = """ + :red_circle: ClickHouse Import Failed. Please check the notification file in the Airflow logs. *DAG ID*: {{ dag.dag_id }} *Execution Time*: {{ execution_date }} - *Log Url*: {{ import_sql_log_url }} + *Log Url*: {{ import_ch_log_url }} """ -import_sql_success_slack_msg = """ - :large_green_circle: Import SQL Success! +import_ch_success_slack_msg = """ + :large_green_circle: ClickHouse Import Success! *DAG ID*: {{ dag.dag_id }} *Execution Time*: {{ execution_date }} - *Log Url*: {{ import_sql_log_url }} + *Log Url*: {{ import_ch_log_url }} """ dag_failure_slack_webhook_notification = send_slack_webhook_notification( slack_webhook_conn_id="slack_default", text=fail_slack_msg @@ -86,10 +86,13 @@ class ImporterConfig: schedule_interval: Optional[str] = None -def _script(scripts_dir: str, script_name: str, *args: object) -> str: +def _script(scripts_dir: str, script_name: str, *args: object, source_automation_env: bool = False) -> str: parts = [f"{scripts_dir}/{script_name}"] parts.extend(str(arg) for arg in args) - return " ".join(parts) + cmd = " ".join(parts) + if source_automation_env: + return f"source {scripts_dir}/automation-environment.sh && {cmd}" + return cmd def build_import_dag(config: ImporterConfig) -> DAG: @@ -124,33 +127,33 @@ def build_import_dag(config: ImporterConfig) -> DAG: @task def get_data_repos(repos: list[str]) -> str: return " ".join(repos) - - # run this task even if import_sql failed + + # run this task even if import_direct_to_clickhouse failed @task(trigger_rule=TriggerRule.ALL_DONE) def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> None: """ - Sends a Slack message to the #airflow-logs channel with a link to the import_sql logs URL. + Sends a Slack message to the #airflow-logs channel with a link to the import_direct_to_clickhouse logs URL. This tells the curators whether there were any studies that suceeded or failed to import during a given run. To avoid confusion -- we run this task towards the end of the DAG (eg. after the transfer_deployment step) because we don't want to send a success message before the entire import run completes. """ - # Get the log URL for the import_sql task + # Get the log URL for the import_direct_to_clickhouse task context = get_current_context() dag_run = context.get("dag_run") - import_sql_ti = None + import_ch_ti = None if dag_run is not None: - import_sql_ti = dag_run.get_task_instance("import_sql", map_index=0) - import_sql_log_url = import_sql_ti.log_url if import_sql_ti is not None else "" - if not import_sql_log_url: - logger.warning("Could not determine import_sql log url; skipping Slack notification.") + import_ch_ti = dag_run.get_task_instance("import_direct_to_clickhouse", map_index=0) + import_ch_log_url = import_ch_ti.log_url if import_ch_ti is not None else "" + if not import_ch_log_url: + logger.warning("Could not determine import_direct_to_clickhouse log url; skipping Slack notification.") raise AirflowSkipException() - import_sql_failed = ( - import_sql_ti is not None and import_sql_ti.state == State.FAILED + import_ch_failed = ( + import_ch_ti is not None and import_ch_ti.state == State.FAILED ) - if not import_sql_failed: + if not import_ch_failed: # Read the notification file from the remote node to check if any studies failed try: ssh_hook = SSHHook(ssh_conn_id=ssh_conn_id) @@ -160,11 +163,11 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No ) if exit_status != 0: logger.warning("Notification file not found at %s; treating as failure", notification_filepath) - import_sql_failed = True + import_ch_failed = True else: notification_content = notif_contents.decode("utf-8") ERROR_STRING = "The following studies had errors during import" - import_sql_failed = (ERROR_STRING in notification_content) + import_ch_failed = (ERROR_STRING in notification_content) except Exception as exc: logger.warning("Could not read notification file from remote node; skipping Slack notification") logger.warning("Stack trace:") @@ -172,9 +175,9 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No raise AirflowSkipException() from exc # Build the msg and send to Slack - msg_template = import_sql_failure_slack_msg if import_sql_failed else import_sql_success_slack_msg + msg_template = import_ch_failure_slack_msg if import_ch_failed else import_ch_success_slack_msg rendered_message = Template(msg_template).render( - import_sql_log_url=import_sql_log_url, + import_ch_log_url=import_ch_log_url, **context, ) SlackWebhookHook(slack_webhook_conn_id="slack_default").send(text=rendered_message) @@ -189,20 +192,27 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, color_swap_config_filepath, ), - "scale_up_rds_node": _script( + "clone_database": _script( scripts_dir, - "scale-rds.sh", - "up", + "airflow-clone-db.sh", importer, - color_swap_config_filepath, + scripts_dir, + db_properties_filepath, ), - "clone_database": _script( + "create_derived_tables": _script( scripts_dir, - "airflow-clone-db.sh", + "airflow-create-derived-tables.sh", importer, scripts_dir, db_properties_filepath, ), + "set_import_complete": _script( + scripts_dir, + "set_update_process_state.sh", + db_properties_filepath, + "complete", + source_automation_env=True, + ), "fetch_data": _script( scripts_dir, "data_source_repo_clone_manager.sh", @@ -210,6 +220,7 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No "pull", importer, data_repos, + source_automation_env=True, ), "setup_import": _script( scripts_dir, @@ -218,7 +229,8 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No scripts_dir, db_properties_filepath, ), - "import_sql": _script( + # reuse the old import-sql script for now + "import_direct_to_clickhouse": _script( scripts_dir, "airflow-import-sql.sh", importer, @@ -226,26 +238,6 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, notification_filepath, ), - "import_clickhouse": _script( - scripts_dir, - "airflow-import-clickhouse.sh", - importer, - scripts_dir, - db_properties_filepath, - ), - "scale_down_rds_node": _script( - scripts_dir, - "scale-rds.sh", - "down", - importer, - color_swap_config_filepath, - # Normally, we would verify that we are in a "scaled up" state before trying to scale down. - # However, if the DAG run failed before "scale_up_rds_node" completed successfully, - # we may still be in a "scaled down" state when we run the scale down task - # (which runs regardless of upstream failures). - # In those cases -- skip verifying that we're in a scaled down state - "{{ '' if (dag_run.get_task_instance('scale_up_rds_node', map_index=ti.map_index) and dag_run.get_task_instance('scale_up_rds_node', map_index=ti.map_index).state == 'success') else '--skip-pre-validation' }}", - ), "transfer_deployment": _script( scripts_dir, "airflow-transfer-deployment.sh", @@ -253,23 +245,19 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, color_swap_config_filepath, ), - "clear_persistence_caches": _script( - scripts_dir, - "airflow-clear-persistence-caches.sh", - importer, - scripts_dir, - ), "set_import_running": _script( scripts_dir, "set_update_process_state.sh", db_properties_filepath, "running", + source_automation_env=True, ), "set_import_abandoned": _script( scripts_dir, "set_update_process_state.sh", db_properties_filepath, "abandoned", + source_automation_env=True, ), "cleanup_data": _script( scripts_dir, @@ -278,6 +266,7 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No "cleanup", importer, data_repos, + source_automation_env=True, ), } @@ -312,9 +301,11 @@ def _build_task(name: str) -> object: return SSHOperator.partial(**params).expand(ssh_conn_id=list(ssh_targets)) - tasks: dict[str, object] = {"data_repos": data_repos} + tasks: dict[str, object] = {} for name in config.task_names: - if name == "send_update_notification": + if name == "data_repos": + tasks[name] = data_repos + elif name == "send_update_notification": tasks[name] = send_update_notification( notification_filepath=notification_filepath, ssh_conn_id=config.target_nodes[0], diff --git a/dags/import_clickhouse_base.py b/dags/import_clickhouse_base.py deleted file mode 100644 index 42e30eb01..000000000 --- a/dags/import_clickhouse_base.py +++ /dev/null @@ -1,334 +0,0 @@ -"""Shared builder for ClickHouse import DAGs.""" -from __future__ import annotations - -from dataclasses import dataclass -from datetime import datetime, timedelta -from typing import Callable, Mapping, Optional, Sequence -import logging -import shlex - -from airflow import DAG -from airflow.decorators import task -from airflow.exceptions import AirflowException, AirflowSkipException -from airflow.models.param import Param -from airflow.providers.ssh.operators.ssh import SSHOperator -from airflow.providers.ssh.hooks.ssh import SSHHook -from airflow.utils.trigger_rule import TriggerRule -from airflow.utils.state import State -from airflow.operators.python import get_current_context -from airflow.providers.slack.notifications.slack_webhook import send_slack_webhook_notification -from airflow.providers.slack.hooks.slack_webhook import SlackWebhookHook -from jinja2 import Template -from airflow.utils.dates import days_ago - -fail_slack_msg = """ - :red_circle: DAG Failed. - *DAG ID*: {{ dag.dag_id }} - *Task ID*: {{ task_instance.task_id }} - *Execution Time*: {{ execution_date }} - *Log Url*: {{ task_instance.log_url }} -""" -success_slack_msg = """ - :large_green_circle: DAG Success! - *DAG ID*: {{ dag.dag_id }} - *Execution Time*: {{ execution_date }} -""" -import_ch_failure_slack_msg = """ - :red_circle: ClickHouse Import Failed. Please check the notification file in the Airflow logs. - *DAG ID*: {{ dag.dag_id }} - *Execution Time*: {{ execution_date }} - *Log Url*: {{ import_ch_log_url }} -""" -import_ch_success_slack_msg = """ - :large_green_circle: ClickHouse Import Success! - *DAG ID*: {{ dag.dag_id }} - *Execution Time*: {{ execution_date }} - *Log Url*: {{ import_ch_log_url }} -""" -dag_failure_slack_webhook_notification = send_slack_webhook_notification( - slack_webhook_conn_id="slack_default", text=fail_slack_msg -) -dag_success_slack_webhook_notification = send_slack_webhook_notification( - slack_webhook_conn_id="slack_default", text=success_slack_msg -) - -_DEFAULT_ARGS = { - "owner": "airflow", - "depends_on_past": False, - "email_on_failure": True, - "email_on_retry": False, - "retries": 0, - "retry_delay": timedelta(minutes=5), - "on_failure_callback": [dag_failure_slack_webhook_notification], -} - -WireDependencies = Callable[[dict[str, object]], None] -logger = logging.getLogger(__name__) - - -@dataclass(frozen=True, kw_only=True) -class ClickhouseImporterConfig: - dag_id: str - description: str - importer: str - tags: Sequence[str] - target_nodes: Sequence[str] - data_nodes: Sequence[str] - task_names: Sequence[str] - scripts_dir: str = "/data/portal-cron/scripts" - creds_dir: str = "/data/portal-cron/pipelines-credentials" - db_properties_filename: str - color_swap_config_filename: str - data_source_properties_filename: str = "importer-data-source-manager-config.yaml" - params: Mapping[str, Param] - wire_dependencies: WireDependencies - pool: Optional[str] = None - schedule_interval: Optional[str] = None - - -def _script(scripts_dir: str, script_name: str, *args: object, source_automation_env: bool = False) -> str: - parts = [f"{scripts_dir}/{script_name}"] - parts.extend(str(arg) for arg in args) - cmd = " ".join(parts) - if source_automation_env: - return f"source {scripts_dir}/automation-environment.sh && {cmd}" - return cmd - - -def build_import_dag(config: ClickhouseImporterConfig) -> DAG: - params = dict(config.params) if config.params else {} - - dag = DAG( - dag_id=config.dag_id, - default_args=_DEFAULT_ARGS, - description=config.description, - max_active_runs=1, - start_date=days_ago(2), - schedule_interval=config.schedule_interval, - tags=list(config.tags), - render_template_as_native_obj=True, - on_success_callback=[dag_success_slack_webhook_notification], - params=params, - ) - - with dag: - importer = config.importer - scripts_dir = config.scripts_dir - creds_dir = config.creds_dir - db_properties_filepath = f"{creds_dir}/{config.db_properties_filename}" - color_swap_config_filepath = f"{creds_dir}/{config.color_swap_config_filename}" - data_source_properties_filepath = f"{creds_dir}/{config.data_source_properties_filename}" - if len(config.target_nodes) != 1: - raise ValueError( - f"Expected exactly one target node for importer '{importer}', got {len(config.target_nodes)}." - ) - notification_filepath = f"/tmp/airflow-notifications/{config.dag_id}/{{{{ ts_nodash }}}}.txt" - - @task - def get_data_repos(repos: list[str]) -> str: - return " ".join(repos) - - # run this task even if import_direct_to_clickhouse failed - @task(trigger_rule=TriggerRule.ALL_DONE) - def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> None: - """ - Sends a Slack message to the #airflow-logs channel with a link to the import_direct_to_clickhouse logs URL. - This tells the curators whether there were any studies that suceeded or failed to import during a given run. - To avoid confusion -- we run this task towards the end of the DAG - (eg. after the transfer_deployment step) because we don't want to - send a success message before the entire import run completes. - """ - - # Get the log URL for the import_direct_to_clickhouse task - context = get_current_context() - dag_run = context.get("dag_run") - import_ch_ti = None - if dag_run is not None: - import_ch_ti = dag_run.get_task_instance("import_direct_to_clickhouse", map_index=0) - import_ch_log_url = import_ch_ti.log_url if import_ch_ti is not None else "" - if not import_ch_log_url: - logger.warning("Could not determine import_direct_to_clickhouse log url; skipping Slack notification.") - raise AirflowSkipException() - - import_ch_failed = ( - import_ch_ti is not None and import_ch_ti.state == State.FAILED - ) - if not import_ch_failed: - # Read the notification file from the remote node to check if any studies failed - try: - ssh_hook = SSHHook(ssh_conn_id=ssh_conn_id) - ssh_client = ssh_hook.get_conn() - exit_status, notif_contents, _ = ssh_hook.exec_ssh_client_command( - ssh_client, f"cat {shlex.quote(notification_filepath)}", get_pty=False, environment=None - ) - if exit_status != 0: - logger.warning("Notification file not found at %s; treating as failure", notification_filepath) - import_ch_failed = True - else: - notification_content = notif_contents.decode("utf-8") - ERROR_STRING = "The following studies had errors during import" - import_ch_failed = (ERROR_STRING in notification_content) - except Exception as exc: - logger.warning("Could not read notification file from remote node; skipping Slack notification") - logger.warning("Stack trace:") - logger.warning(exc) - raise AirflowSkipException() from exc - - # Build the msg and send to Slack - msg_template = import_ch_failure_slack_msg if import_ch_failed else import_ch_success_slack_msg - rendered_message = Template(msg_template).render( - import_ch_log_url=import_ch_log_url, - **context, - ) - SlackWebhookHook(slack_webhook_conn_id="slack_default").send(text=rendered_message) - - data_repos = get_data_repos("{{ params.get('data_repos', []) }}") - - command_map = { - "verify_management_state": _script( - scripts_dir, - "airflow-verify-management.sh", - scripts_dir, - db_properties_filepath, - color_swap_config_filepath, - ), - "clone_database": _script( - scripts_dir, - "airflow-clone-db.sh", - importer, - scripts_dir, - db_properties_filepath, - ), - "create_derived_tables": _script( - scripts_dir, - "airflow-create-derived-tables.sh", - importer, - scripts_dir, - db_properties_filepath, - ), - "set_import_complete": _script( - scripts_dir, - "set_update_process_state.sh", - db_properties_filepath, - "complete", - source_automation_env=True, - ), - "fetch_data": _script( - scripts_dir, - "data_source_repo_clone_manager.sh", - data_source_properties_filepath, - "pull", - importer, - data_repos, - source_automation_env=True, - ), - "setup_import": _script( - scripts_dir, - "airflow-setup-import.sh", - importer, - scripts_dir, - db_properties_filepath, - ), - # reuse the old import-sql script for now - "import_direct_to_clickhouse": _script( - scripts_dir, - "airflow-import-sql.sh", - importer, - scripts_dir, - db_properties_filepath, - notification_filepath, - ), - "transfer_deployment": _script( - scripts_dir, - "airflow-transfer-deployment.sh", - scripts_dir, - db_properties_filepath, - color_swap_config_filepath, - ), - "set_import_running": _script( - scripts_dir, - "set_update_process_state.sh", - db_properties_filepath, - "running", - source_automation_env=True, - ), - "set_import_abandoned": _script( - scripts_dir, - "set_update_process_state.sh", - db_properties_filepath, - "abandoned", - source_automation_env=True, - ), - "cleanup_data": _script( - scripts_dir, - "data_source_repo_clone_manager.sh", - data_source_properties_filepath, - "cleanup", - importer, - data_repos, - source_automation_env=True, - ), - } - - def _build_task(name: str) -> object: - if name not in command_map: - raise ValueError(f"Unsupported task '{name}' for importer '{importer}'.") - - params: dict[str, object] = { - "task_id": name, - "command": command_map[name], - } - - if name == "set_import_abandoned": - params["trigger_rule"] = TriggerRule.ONE_FAILED - elif name == "cleanup_data": - params["trigger_rule"] = TriggerRule.ALL_DONE - elif name == "scale_up_rds_node": - # Use XCom to signal downstream that the scale up task completed successfully - params["do_xcom_push"] = True - elif name == "scale_down_rds_node": - # Run scale down task regardless of upstream failures during import - params["trigger_rule"] = TriggerRule.ALL_DONE - - if config.pool is not None: - params["pool"] = config.pool - - ssh_targets: Sequence[str] - if name in ("fetch_data", "cleanup_data"): - ssh_targets = config.data_nodes - else: - ssh_targets = config.target_nodes - - return SSHOperator.partial(**params).expand(ssh_conn_id=list(ssh_targets)) - - tasks: dict[str, object] = {} - for name in config.task_names: - if name == "data_repos": - tasks[name] = data_repos - elif name == "send_update_notification": - tasks[name] = send_update_notification( - notification_filepath=notification_filepath, - ssh_conn_id=config.target_nodes[0], - ) - else: - tasks[name] = _build_task(name) - - config.wire_dependencies(tasks) - - @task(trigger_rule=TriggerRule.ONE_FAILED, retries=0, on_failure_callback=None) - def watcher(): - raise AirflowException("Failing task because one or more upstream tasks failed.") - - list(dag.tasks) >> watcher() - - # set_import_abandoned needs to be directly downstream of all other DAG tasks in - # order for it to trigger if any one of them fails - if "set_import_abandoned" in config.task_names: - # make sure we don't create a cyclical dependency - other_tasks = [t for t in dag.tasks if t.task_id not in ("set_import_abandoned", "watcher")] - other_tasks >> tasks["set_import_abandoned"] - - return dag - - -__all__ = ["ClickhouseImporterConfig", "build_import_dag", "_script"] diff --git a/dags/import_genie_clickhouse_dag.py b/dags/import_genie_clickhouse_dag.py deleted file mode 100644 index d52c85a26..000000000 --- a/dags/import_genie_clickhouse_dag.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -import_genie_clickhouse_dag.py -Imports Genie study to ClickHouse database. -""" -import os -import sys - -from airflow.models.param import Param - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_clickhouse_base import ClickhouseImporterConfig, build_import_dag - - -def _wire(tasks: dict[str, object]) -> None: - - tasks["data_repos"] >> tasks["verify_management_state"] - - tasks["verify_management_state"] >> [ - tasks["fetch_data"], - tasks["clone_database"] - ] - - [ - tasks["fetch_data"], - tasks["clone_database"] - ] >> tasks["setup_import"] - - tasks["setup_import"] >> tasks["import_direct_to_clickhouse"] - - tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] - - tasks["create_derived_tables"] >> tasks["transfer_deployment"] - - tasks["transfer_deployment"] >> [ - tasks["cleanup_data"], - tasks["send_update_notification"] - ] - - -_GENIE_CLICKHOUSE_CONFIG = ClickhouseImporterConfig( - dag_id="import_genie_clickhouse_dag", - description="Imports Genie study to ClickHouse database", - importer="genie-clickhouse", - tags=["genie-clickhouse"], - target_nodes=("pipelines5_ssh",), - data_nodes=("pipelines5_ssh",), - task_names=( - "data_repos", - "verify_management_state", - "fetch_data", - "clone_database", - "setup_import", - "import_direct_to_clickhouse", - "create_derived_tables", - "transfer_deployment", - "send_update_notification", - "cleanup_data", - "set_import_abandoned", - ), - db_properties_filename="manage_genie_clickhouse_database_update_tools.properties", - # disabled on pipelines5 machine during testing phase - color_swap_config_filename="genie-db-color-swap-config.yaml", - params={ - "data_repos": Param( - ["genie"], - type="array", - description="Comma-separated list of data repositories to pull updates from/cleanup.", - title="Data Repositories", - examples=["genie"], - ), - }, - wire_dependencies=_wire, -) - -globals()[_GENIE_CLICKHOUSE_CONFIG.dag_id] = build_import_dag(_GENIE_CLICKHOUSE_CONFIG) diff --git a/dags/import_genie_dag.py b/dags/import_genie_dag.py index fa6307ba9..70a81fafd 100644 --- a/dags/import_genie_dag.py +++ b/dags/import_genie_dag.py @@ -1,9 +1,10 @@ """ -import_genie_dag.py -Imports Genie study to MySQL and ClickHouse databases using blue/green deployment strategy. +import_genie_clickhouse_dag.py +Imports Genie study to ClickHouse database. """ import os import sys + from airflow.models.param import Param sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -11,44 +12,64 @@ def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["verify_management_state"] >> [tasks["fetch_data"], tasks["scale_up_rds_node"]] - tasks["scale_up_rds_node"] >> tasks["clone_database"] - [tasks["fetch_data"], tasks["clone_database"]] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_sql"] >> tasks["import_clickhouse"] >> tasks["transfer_deployment"] >> tasks["scale_down_rds_node"] >> tasks["send_update_notification"] >> tasks["cleanup_data"] - -_GENIE_CONFIG = ImporterConfig( - dag_id="import_genie_dag", - description="Imports Genie study to MySQL and ClickHouse databases using blue/green deployment strategy", - importer="genie", - tags=["genie"], - target_nodes=("importer_ssh",), - data_nodes=("importer_ssh",), + + tasks["data_repos"] >> tasks["verify_management_state"] + + tasks["verify_management_state"] >> [ + tasks["fetch_data"], + tasks["clone_database"] + ] + + [ + tasks["fetch_data"], + tasks["clone_database"] + ] >> tasks["setup_import"] + + tasks["setup_import"] >> tasks["import_direct_to_clickhouse"] + + tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] + + tasks["create_derived_tables"] >> tasks["transfer_deployment"] + + tasks["transfer_deployment"] >> [ + tasks["cleanup_data"], + tasks["send_update_notification"] + ] + + +_GENIE_CLICKHOUSE_CONFIG = ImporterConfig( + dag_id="import_genie_clickhouse_dag", + description="Imports Genie study to ClickHouse database", + importer="genie-clickhouse", + tags=["genie-clickhouse"], + target_nodes=("pipelines5_ssh",), + data_nodes=("pipelines5_ssh",), task_names=( + "data_repos", "verify_management_state", - "scale_up_rds_node", - "clone_database", "fetch_data", + "clone_database", "setup_import", - "import_sql", - "import_clickhouse", + "import_direct_to_clickhouse", + "create_derived_tables", "transfer_deployment", - "scale_down_rds_node", "send_update_notification", "cleanup_data", "set_import_abandoned", ), - db_properties_filename="manage_genie_database_update_tools.properties", + db_properties_filename="manage_genie_clickhouse_database_update_tools.properties", + # disabled on pipelines5 machine during testing phase color_swap_config_filename="genie-db-color-swap-config.yaml", params={ "data_repos": Param( ["genie"], type="array", + description="Comma-separated list of data repositories to pull updates from/cleanup.", title="Data Repositories", - description="List of GENIE data repositories to clean up after import.", examples=["genie"], ), }, wire_dependencies=_wire, ) -globals()[_GENIE_CONFIG.dag_id] = build_import_dag(_GENIE_CONFIG) +globals()[_GENIE_CLICKHOUSE_CONFIG.dag_id] = build_import_dag(_GENIE_CLICKHOUSE_CONFIG) diff --git a/dags/import_public_clickhouse_dag.py b/dags/import_public_clickhouse_dag.py deleted file mode 100644 index b7bf6ac8a..000000000 --- a/dags/import_public_clickhouse_dag.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -import_public_clickhouse_dag.py -Imports to Public cBioPortal ClickHouse database. -""" -import os -import sys - -from airflow.models.param import Param - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_clickhouse_base import ClickhouseImporterConfig, build_import_dag - - -def _wire(tasks: dict[str, object]) -> None: - - tasks["data_repos"] >> tasks["verify_management_state"] - - tasks["verify_management_state"] >> [ - tasks["fetch_data"], - tasks["clone_database"] - ] - - [ - tasks["fetch_data"], - tasks["clone_database"] - ] >> tasks["setup_import"] - - tasks["setup_import"] >> tasks["import_direct_to_clickhouse"] - - tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] - - tasks["create_derived_tables"] >> tasks["transfer_deployment"] - - tasks["transfer_deployment"] >> [ - tasks["cleanup_data"], - tasks["send_update_notification"] - ] - - -_PUBLIC_CLICKHOUSE_CONFIG = ClickhouseImporterConfig( - dag_id="import_public_clickhouse_dag", - description="Imports to Public cBioPortal ClickHouse database", - importer="public-clickhouse", - tags=["public-clickhouse"], - target_nodes=("pipelines5_ssh",), - data_nodes=("pipelines5_ssh",), - task_names=( - "data_repos", - "verify_management_state", - "fetch_data", - "clone_database", - "setup_import", - "import_direct_to_clickhouse", - "create_derived_tables", - "transfer_deployment", - "send_update_notification", - "cleanup_data", - "set_import_abandoned", - ), - db_properties_filename="manage_public_clickhouse_database_update_tools.properties", - # disabled on pipelines5 machine during testing phase - color_swap_config_filename="public-db-color-swap-config.yaml", - params={ - "data_repos": Param( - ["datahub"], - type="array", - description="Comma-separated list of data repositories to pull updates from/cleanup.", - title="Data Repositories", - examples=["datahub", "impact", "private"], - ), - }, - wire_dependencies=_wire, -) - -globals()[_PUBLIC_CLICKHOUSE_CONFIG.dag_id] = build_import_dag(_PUBLIC_CLICKHOUSE_CONFIG) diff --git a/dags/import_public_dag.py b/dags/import_public_dag.py index 6c5c12846..a8b769471 100644 --- a/dags/import_public_dag.py +++ b/dags/import_public_dag.py @@ -1,6 +1,6 @@ """ -import_public_dag.py -Imports to Public cBioPortal MySQL and ClickHouse databases using blue/green deployment strategy. +import_public_clickhouse_dag.py +Imports to Public cBioPortal ClickHouse database. """ import os import sys @@ -12,33 +12,53 @@ def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["verify_management_state"] >> [tasks["fetch_data"], tasks["scale_up_rds_node"]] - tasks["scale_up_rds_node"] >> tasks["clone_database"] - [tasks["fetch_data"], tasks["clone_database"]] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_sql"] >> tasks["import_clickhouse"] >> tasks["transfer_deployment"] >> tasks["scale_down_rds_node"] >> tasks["send_update_notification"] >> tasks["cleanup_data"] + + tasks["data_repos"] >> tasks["verify_management_state"] + + tasks["verify_management_state"] >> [ + tasks["fetch_data"], + tasks["clone_database"] + ] + + [ + tasks["fetch_data"], + tasks["clone_database"] + ] >> tasks["setup_import"] + + tasks["setup_import"] >> tasks["import_direct_to_clickhouse"] + + tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] + + tasks["create_derived_tables"] >> tasks["transfer_deployment"] + + tasks["transfer_deployment"] >> [ + tasks["cleanup_data"], + tasks["send_update_notification"] + ] + _PUBLIC_CONFIG = ImporterConfig( dag_id="import_public_dag", - description="Imports to Public cBioPortal MySQL and ClickHouse databases using blue/green deployment strategy", + description="Imports to Public cBioPortal ClickHouse database", importer="public", tags=["public"], - target_nodes=("importer_ssh",), - data_nodes=("importer_ssh", "pipelines3_ssh"), + target_nodes=("pipelines5_ssh",), + data_nodes=("pipelines5_ssh",), task_names=( + "data_repos", "verify_management_state", - "scale_up_rds_node", - "clone_database", "fetch_data", + "clone_database", "setup_import", - "import_sql", - "import_clickhouse", + "import_direct_to_clickhouse", + "create_derived_tables", "transfer_deployment", - "scale_down_rds_node", "send_update_notification", "cleanup_data", "set_import_abandoned", ), - db_properties_filename="manage_public_database_update_tools.properties", + db_properties_filename="manage_public_clickhouse_database_update_tools.properties", + # disabled on pipelines5 machine during testing phase color_swap_config_filename="public-db-color-swap-config.yaml", params={ "data_repos": Param( diff --git a/dags/import_tempo_dag.py b/dags/import_tempo_dag.py deleted file mode 100644 index d5f5a16a1..000000000 --- a/dags/import_tempo_dag.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -import_tempo_dag.py -Imports TEMPO dataset into triage using default import method -""" -from datetime import timedelta, datetime -from airflow import DAG -from airflow.decorators import task -from airflow.exceptions import AirflowException -from airflow.models.param import Param -from airflow.providers.ssh.operators.ssh import SSHOperator -from airflow.utils.trigger_rule import TriggerRule - -args = { - "owner": "airflow", - "depends_on_past": False, - "email_on_failure": True, - "email_on_retry": False, - "retries": 0, - "retry_delay": timedelta(minutes=5), -} - -""" -If any upstream tasks failed, this task will propagate the "Failed" status to the Dag Run. -""" -@task(trigger_rule=TriggerRule.ONE_FAILED, retries=0) -def watcher(): - raise AirflowException("Failing task because one or more upstream tasks failed.") - -with DAG( - dag_id="import_tempo_dag", - default_args=args, - description="Imports TEMPO dataset into triage using default import method", - max_active_runs=1, - start_date=datetime(2025, 8, 8), - schedule_interval=None, - tags=["tempo"], - render_template_as_native_obj=True, - params={ - "importer": Param("triage", type="string", enum=["triage"], title="Import Pipeline", description="Determines which importer to use."), - } -) as dag: - - pipelines5_conn_id = "pipelines5_ssh" - import_scripts_path = "/data/portal-cron/scripts" - creds_dir = "/data/portal-cron/pipelines-credentials" - - """ - Clean up data repos within MSK network - """ - import_tempo_data = SSHOperator( - task_id="import_tempo_data", - ssh_conn_id=pipelines5_conn_id, - command=f"{import_scripts_path}/import-tempo-data.sh {{ params.importer }}", - pool="triage_import_pool", - dag=dag, - ) - - import_tempo_data - list(dag.tasks) >> watcher() diff --git a/dags/import_triage_clickhouse_dag.py b/dags/import_triage_dag.py similarity index 95% rename from dags/import_triage_clickhouse_dag.py rename to dags/import_triage_dag.py index 13c94c1b1..12e71ed53 100644 --- a/dags/import_triage_clickhouse_dag.py +++ b/dags/import_triage_dag.py @@ -8,7 +8,7 @@ from airflow.models.param import Param sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_clickhouse_base import ClickhouseImporterConfig, build_import_dag +from dags.import_base import ImporterConfig, build_import_dag def _wire(tasks: dict[str, object]) -> None: @@ -37,7 +37,7 @@ def _wire(tasks: dict[str, object]) -> None: # for now, all the 'finally' handlers are taken care of by the parent class -_TRIAGE_CONFIG = ClickhouseImporterConfig( +_TRIAGE_CONFIG = ImporterConfig( dag_id="import_triage_clickhouse_dag", description="Imports Triage study to Clickhouse database", importer="triage-clickhouse", diff --git a/import-scripts/airflow-create-derived-tables.sh b/import-scripts/airflow-create-derived-tables.sh index 8eb1612de..0b968b2d6 100644 --- a/import-scripts/airflow-create-derived-tables.sh +++ b/import-scripts/airflow-create-derived-tables.sh @@ -54,13 +54,13 @@ fi # Attempt to download the derived table SQL files from github clickhouse_schema_branch_name="master" # default -if [ "$PORTAL_DATABASE" == "public" -o "$PORTAL_DATABASE" == "public-clickhouse" ] ; then +if [ "$PORTAL_DATABASE" == "public" ] ; then clickhouse_schema_branch_name="public-portal-db-clickhouse-sql-for-import" fi -if [ "$PORTAL_DATABASE" == "genie" -o "$PORTAL_DATABASE" == "genie-clickhouse" ] ; then +if [ "$PORTAL_DATABASE" == "genie" ] ; then clickhouse_schema_branch_name="genie-portal-db-clickhouse-sql-for-import" fi -if [ "$PORTAL_DATABASE" == "triage-clickhouse" ]; then +if [ "$PORTAL_DATABASE" == "triage" ]; then clickhouse_schema_branch_name="triage-portal-db-clickhouse-sql-for-import" fi if ! $DOWNLOAD_DERVIED_TABLE_SQL_FILES_SCRIPT_FILEPATH --github_branch_name "$clickhouse_schema_branch_name" "$derived_table_sql_script_dirpath" ; then diff --git a/import-scripts/airflow-import-clickhouse.sh b/import-scripts/airflow-import-clickhouse.sh deleted file mode 100755 index 1e84eef13..000000000 --- a/import-scripts/airflow-import-clickhouse.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash - -# Script for updating ClickHouse DB -# Consists of the following: -# - Drop ClickHouse tables -# - Copy MySQL tables to ClickHouse -# - Create derived ClickHouse tables - -PORTAL_DATABASE=$1 -PORTAL_SCRIPTS_DIRECTORY=$2 -MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH=$3 -if [ -z $PORTAL_SCRIPTS_DIRECTORY ]; then - PORTAL_SCRIPTS_DIRECTORY="/data/portal-cron/scripts" -fi -AUTOMATION_ENV_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/automation-environment.sh" -if [ ! -f $AUTOMATION_ENV_SCRIPT_FILEPATH ] ; then - echo "`date`: Unable to locate $AUTOMATION_ENV_SCRIPT_FILEPATH, exiting..." - exit 1 -fi -source $AUTOMATION_ENV_SCRIPT_FILEPATH - -tmp=$PORTAL_HOME/tmp/import-cron-$PORTAL_DATABASE -GET_DB_IN_PROD_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/get_database_currently_in_production.sh" -DROP_TABLES_FROM_CLICKHOUSE_DATABASE_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/drop_tables_in_clickhouse_database.sh" -COPY_TABLES_FROM_MYSQL_TO_CLICKHOUSE_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/copy_mysql_database_tables_to_clickhouse.sh" -DOWNLOAD_DERVIED_TABLE_SQL_FILES_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/download_clickhouse_sql_scripts_py3.py" -CREATE_DERIVED_TABLES_IN_CLICKHOUSE_SCRIPT_FILEPATH="$PORTAL_SCRIPTS_DIRECTORY/create_derived_tables_in_clickhouse_database.sh" - -# Get the current production database color -current_production_database_color=$(sh $GET_DB_IN_PROD_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH) -destination_database_color="unset" -if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" -fi -if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" -fi -if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 -fi - -echo "Destination DB color: $destination_database_color" - -# Drop tables from non-production ClickHouse DB to make room for incoming copy -echo "dropping tables from clickhouse database $destination_database_color to make room for incoming copy" -if ! $DROP_TABLES_FROM_CLICKHOUSE_DATABASE_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH $destination_database_color ; then - echo "Error during dropping of tables from clickhouse database $destination_database_color" >&2 - exit 1 -fi - -# Use Sling to copy data from non-production MySQL DB to non-production ClickHouse DB -echo "copying tables from mysql database $destination_database_color to clickhouse database $destination_database_color" -if ! $COPY_TABLES_FROM_MYSQL_TO_CLICKHOUSE_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH $destination_database_color ; then - echo "Error during copying of tables from mysql database $destination_database_color to clickhouse database $destination_database_color" >&2 - exit 1 -fi - -# Check if derived table sql script dirpath exists -# If not, try to create it -derived_table_sql_script_dirpath="$tmp/create_derived_clickhouse_tables" -if ! [ -e "$derived_table_sql_script_dirpath" ] ; then - if ! mkdir -p "$derived_table_sql_script_dirpath" ; then - echo "Error: could not create target directory '$derived_table_sql_script_dirpath'" >&2 - exit 1 - fi -fi - -# Remove any scripts currently in the derived table sql script dirpath -if [[ -d "$derived_table_sql_script_dirpath" && "$derived_table_sql_script_dirpath" != "/" ]]; then - rm -rf "$derived_table_sql_script_dirpath"/* -fi - -# Attempt to download the derived table SQL files from github -clickhouse_schema_branch_name="master" # default -if [ "$PORTAL_DATABASE" == "public" ] ; then - clickhouse_schema_branch_name="public-portal-db-clickhouse-sql-for-import" -fi -if [ "$PORTAL_DATABASE" == "genie" ] ; then - clickhouse_schema_branch_name="genie-portal-db-clickhouse-sql-for-import" -fi -if ! $DOWNLOAD_DERVIED_TABLE_SQL_FILES_SCRIPT_FILEPATH --github_branch_name "$clickhouse_schema_branch_name" "$derived_table_sql_script_dirpath" ; then - echo "Error during download of derived table construction .sql files from github" >&2 - exit 1 -fi - -# Create the additional derived tables inside of non-production Clickhouse DB -echo "creating derived tables in clickhouse database $destination_database_color" -if ! $CREATE_DERIVED_TABLES_IN_CLICKHOUSE_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH $destination_database_color "$derived_table_sql_script_dirpath"/* ; then - echo "Error during derivation of clickhouse tables in clickhouse database $destination_database_color" >&2 - exit 1 -fi diff --git a/import-scripts/airflow-import-sql.sh b/import-scripts/airflow-import-sql.sh index d51114c0c..10a265433 100755 --- a/import-scripts/airflow-import-sql.sh +++ b/import-scripts/airflow-import-sql.sh @@ -21,38 +21,24 @@ source "$AUTOMATION_ENV_SCRIPT_FILEPATH" # Set needed paths/filenames for import case "$PORTAL_DATABASE" in - genie) - TMP_DIR_NAME="import-cron-genie" - IMPORTER_NAME="genie-aws" - LOG_FILE_NAME="genie-aws-importer.log" - PORTAL_NAME="genie-portal" - ONCOTREE_VERSION="oncotree_2019_12_01" - ;; - public) - TMP_DIR_NAME="import-cron-public-data" - IMPORTER_NAME="public" - LOG_FILE_NAME="public-data-importer.log" - PORTAL_NAME="public-portal" - ONCOTREE_VERSION="oncotree_latest_stable" - ;; - triage-clickhouse) - TMP_DIR_NAME="import-cron-triage-clickhouse" - IMPORTER_NAME="triage-clickhouse" - LOG_FILE_NAME="triage-clickhouse-importer.log" + triage) + TMP_DIR_NAME="import-cron-triage" + IMPORTER_NAME="triage" + LOG_FILE_NAME="triage-importer.log" PORTAL_NAME="triage-portal" ONCOTREE_VERSION="oncotree_candidate_release" ;; - public-clickhouse) - TMP_DIR_NAME="import-cron-public-clickhouse" - IMPORTER_NAME="public-clickhouse" - LOG_FILE_NAME="public-clickhouse-importer.log" + public) + TMP_DIR_NAME="import-cron-public" + IMPORTER_NAME="public" + LOG_FILE_NAME="public-importer.log" PORTAL_NAME="public-portal" ONCOTREE_VERSION="oncotree_latest_stable" ;; - genie-clickhouse) - TMP_DIR_NAME="import-cron-genie-clickhouse" - IMPORTER_NAME="genie-clickhouse" - LOG_FILE_NAME="genie-clickhouse-importer.log" + genie) + TMP_DIR_NAME="import-cron-genie" + IMPORTER_NAME="genie" + LOG_FILE_NAME="genie-importer.log" PORTAL_NAME="genie-portal" ONCOTREE_VERSION="oncotree_2019_12_01" ;; @@ -81,7 +67,7 @@ fi IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" tmp="${PORTAL_HOME}/tmp/${TMP_DIR_NAME}" -JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" +JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -Dlog4j.appender.a.File=/data/portal-cron/logs/$LOG_FILE_NAME -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" # Set up a temp notification file # After the importer runs with --notification-file, it will write to this file describing the number of studies updated / removed / had import errors diff --git a/import-scripts/airflow-setup-import.sh b/import-scripts/airflow-setup-import.sh index 75b5984ad..3cb3cf5f5 100755 --- a/import-scripts/airflow-setup-import.sh +++ b/import-scripts/airflow-setup-import.sh @@ -22,36 +22,24 @@ source "$AUTOMATION_ENV_SCRIPT_FILEPATH" # Configure names/paths based on portal database case "$PORTAL_DATABASE" in - genie) - TMP_DIR_NAME="import-cron-genie" - IMPORTER_NAME="genie-aws" - LOG_FILE_NAME="genie-aws-importer.log" - PORTAL_NAME="genie-portal" - ;; - public) - TMP_DIR_NAME="import-cron-public-data" - IMPORTER_NAME="public" - LOG_FILE_NAME="public-data-importer.log" - PORTAL_NAME="public-portal" - ;; - triage-clickhouse) - TMP_DIR_NAME="import-cron-triage-clickhouse" - IMPORTER_NAME="triage-clickhouse" - LOG_FILE_NAME="triage-clickhouse-importer.log" + triage) + TMP_DIR_NAME="import-cron-triage" + IMPORTER_NAME="triage" + LOG_FILE_NAME="triage-importer.log" PORTAL_NAME="triage-portal" ONCOTREE_VERSION="oncotree_candidate_release" ;; - public-clickhouse) - TMP_DIR_NAME="import-cron-public-clickhouse" - IMPORTER_NAME="public-clickhouse" - LOG_FILE_NAME="public-clickhouse-importer.log" + public) + TMP_DIR_NAME="import-cron-public" + IMPORTER_NAME="public" + LOG_FILE_NAME="public-importer.log" PORTAL_NAME="public-portal" ONCOTREE_VERSION="oncotree_latest_stable" ;; - genie-clickhouse) - TMP_DIR_NAME="import-cron-genie-clickhouse" - IMPORTER_NAME="genie-clickhouse" - LOG_FILE_NAME="genie-clickhouse-importer.log" + genie) + TMP_DIR_NAME="import-cron-genie" + IMPORTER_NAME="genie" + LOG_FILE_NAME="genie-importer.log" PORTAL_NAME="genie-portal" ONCOTREE_VERSION="oncotree_2019_12_01" ;; @@ -80,7 +68,7 @@ fi IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" tmp="$PORTAL_HOME/tmp/$TMP_DIR_NAME" -JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" +JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -Dlog4j.appender.a.File=/data/portal-cron/logs/$LOG_FILE_NAME -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" # Direct importer logs to stdout # Make sure to kill the tail process on exit so we don't hang the script diff --git a/import-scripts/backup-eks-dbs.sh b/import-scripts/backup-eks-dbs.sh deleted file mode 100755 index f1238563f..000000000 --- a/import-scripts/backup-eks-dbs.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -DBS=("cgds_triage" "keycloak" "redcap") -LOCAL_BACKUP_DIR=/data/mysql-dumps - -if [ -z "$PORTAL_HOME" ] ; then - export PORTAL_HOME=/data/portal-cron -fi -source "$PORTAL_HOME/scripts/slack-message-functions.sh" - -for db in ${DBS[@]}; do - DUMP_FAILURE=0 - echo "Backing up: '${db}'" - - PORTAL_INFO_TABLE=$(mysql --login-path=mysql_localhost -e "SHOW TABLES LIKE 'info';" $db | tail -n1) - if [ ! -z "$PORTAL_INFO_TABLE" ]; then - DB_SCHEMA_VERSION=".v$(mysql --login-path=mysql_localhost -e "SELECT db_schema_version FROM info;" $db | tail -n1)" - else - DB_SCHEMA_VERSION="" - fi - SQLDUMP_FILENAME=${db}.$(date +%Y%m%d)${DB_SCHEMA_VERSION}.sql.gz - SQLDUMP_FULLPATH=${LOCAL_BACKUP_DIR}/$SQLDUMP_FILENAME - - # The return status of a pipeline is the exit status of the last command, unless the pipefail option is enabled. If pipefail is enabled, the pipeline's return status is the value of the last (rightmost) command to exit with a non-zero status, or zero if all commands exit successfully. - $(set -o pipefail && mysqldump --login-path=mysql_localhost --quick $db | gzip > $SQLDUMP_FULLPATH) - if [ $? -eq 0 ]; then - echo "Successfully dumped: '${SQLDUMP_FILENAME}'" - /data/portal-cron/git-repos/portal-configuration/eks-cluster/pipelines/authenticate_service_account.sh eks - aws s3 cp ${SQLDUMP_FULLPATH} s3://cbioportal-backups/${SQLDUMP_FILENAME} --profile automation_eks - if [ $? -ne 0 ]; then - echo "ERROR: failed to cp '${SQLDUMP_FILENAME}' to S3" - DUMP_FAILURE=1 - fi - - # delete files older than 14 days that match this pattern - echo "Deleting files in '$LOCAL_BACKUP_DIR' that match '$db.[0-9]{8}*.sql.gz' and are >= 14 days old." - find $LOCAL_BACKUP_DIR -type f -mtime +13 -name "$db.[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]*.sql.gz" -delete - else - echo "ERROR: failed to dump '${SQLDUMP_FILENAME}'" - DUMP_FAILURE=1 - echo "Deleting invalid dump file '${SQLDUMP_FILENAME}'" - rm -r "${SQLDUMP_FULLPATH}" - fi - - if [ $DUMP_FAILURE -eq 0 ]; then - echo "Dump was successful" - send_slack_message_to_channel "#mskcc-sysadmin" "string" "eks-pipelines backed up local db (${db})" - else - echo "Dump failed" - send_slack_message_to_channel "#mskcc-sysadmin" "string" "ERROR: eks-pipelines failed to back up local db :fire: (${db})" - fi -done diff --git a/import-scripts/fetch-ddp-and-import-cmo-access-data.sh b/import-scripts/fetch-ddp-and-import-cmo-access-data.sh index 03fbbafaf..43187c827 100755 --- a/import-scripts/fetch-ddp-and-import-cmo-access-data.sh +++ b/import-scripts/fetch-ddp-and-import-cmo-access-data.sh @@ -47,9 +47,9 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-ddp-and-import-cmo-access-d echo "Error during determination of the destination database color" >&2 exit 1 fi - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" + IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-importer-$destination_database_color.jar" ENABLE_DEBUGGING=0 - JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$CMO_ACCESS_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-clickhouse-importer.log -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" + JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$CMO_ACCESS_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-importer.log -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" cmo_access_notification_file=$(mktemp $CMO_ACCESS_TMPDIR/cmo-access-portal-update-notification.$now.XXXXXX) ONCOTREE_VERSION_TO_USE="oncotree_candidate_release" cmo_access_dmp_pids_filepath=$CMO_ACCESS_TMPDIR/cmo_access_patient_list.txt diff --git a/import-scripts/import-cmo-data-msk.sh b/import-scripts/import-cmo-data-msk.sh index 663deac7d..5656e9efb 100755 --- a/import-scripts/import-cmo-data-msk.sh +++ b/import-scripts/import-cmo-data-msk.sh @@ -61,8 +61,8 @@ FLOCK_FILEPATH="${FLOCK_FILEPATH:-/data/portal-cron/cron-lock/import-cmo-data-ms fi DATA_SOURCE_MANAGER_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/data_source_repo_clone_manager.sh" DATA_SOURCE_MANAGER_CONFIG_FILEPATH="$PORTAL_HOME/pipelines-credentials/importer-data-source-manager-config.yaml" - CMO_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" - CMO_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-clickhouse-importer.log -ea -cp $CMO_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" + CMO_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-importer-$destination_database_color.jar" + CMO_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-importer.log -ea -cp $CMO_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" DATA_SOURCE_REPO_FETCH_FAIL=0 if ! $DATA_SOURCE_MANAGER_SCRIPT_FILEPATH $DATA_SOURCE_MANAGER_CONFIG_FILEPATH pull msk $DATA_SOURCES_TO_BE_FETCHED ; then diff --git a/import-scripts/import-dmp-impact-data.sh b/import-scripts/import-dmp-impact-data.sh index 1a76f6aac..492835a47 100755 --- a/import-scripts/import-dmp-impact-data.sh +++ b/import-scripts/import-dmp-impact-data.sh @@ -94,8 +94,8 @@ if [ "$destination_database_color" == "unset" ] ; then exit 1 fi -MSK_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" -MSK_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS $JAVA_DD_AGENT_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$MSK_DMP_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-dmp-clickhouse-importer.log -ea -cp $MSK_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" +MSK_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-importer-$destination_database_color.jar" +MSK_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS $JAVA_DD_AGENT_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$MSK_DMP_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-dmp-importer.log -ea -cp $MSK_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" VALIDATE_BLUE_GREEN_STUDY_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/validate_blue_green_study.py" # Runs --update-study-data for a portal, then validates the named study against diff --git a/import-scripts/import-msk-extract-projects.sh b/import-scripts/import-msk-extract-projects.sh deleted file mode 100755 index 205be544d..000000000 --- a/import-scripts/import-msk-extract-projects.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash - -# echo $(date) - -# source $PORTAL_HOME/scripts/dmp-import-vars-functions.sh -# source $PORTAL_HOME/scripts/set-data-source-environment-vars.sh - -# if [ -z $JAVA_BINARY ] | [ -z $GIT_BINARY ] | [ -z $PORTAL_HOME ] ; then -# message="import-msk-extract-projects.sh cannot be run without setting JAVA_BINARY, GIT_BINARY, PORTAL_HOME...)" -# echo $message -# echo -e "$message" | mail -s "import-msk-extract-projects.sh failed to run." $PIPELINES_EMAIL_LIST -# sendPreImportFailureMessageMskPipelineLogsSlack "$message" -# exit 2 -# fi - -# tmp=$PORTAL_HOME/tmp/import-msk-extract-projects -# if ! [ -d "$tmp" ] ; then -# if ! mkdir -p "$tmp" ; then -# echo "Error : could not create tmp directory '$tmp'" >&2 -# exit 1 -# fi -# fi -# if [[ -d "$tmp" && "$tmp" != "/" ]]; then -# rm -rf "$tmp"/* -# fi - -# source $PORTAL_HOME/scripts/clear-persistence-cache-shell-functions.sh - -# # Get the current production database color -# GET_DB_IN_PROD_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/get_database_currently_in_production.sh" -# MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH="/data/portal-cron/pipelines-credentials/manage_msk_clickhouse_database_update_tools.properties" -# current_production_database_color=$($GET_DB_IN_PROD_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH) -# destination_database_color="unset" -# if [ ${current_production_database_color:0:4} == "blue" ] ; then -# destination_database_color="green" -# fi -# if [ ${current_production_database_color:0:5} == "green" ] ; then -# destination_database_color="blue" -# fi -# if [ "$destination_database_color" == "unset" ] ; then -# echo "Error during determination of the destination database color" >&2 -# exit 1 -# fi - -# DATA_SOURCE_MANAGER_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/data_source_repo_clone_manager.sh" -# DATA_SOURCE_MANAGER_CONFIG_FILEPATH="$PORTAL_HOME/pipelines-credentials/importer-data-source-manager-config.yaml" -# MSK_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" -# MSK_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS $JAVA_DD_AGENT_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$MSK_DMP_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-dmp-clickhouse-importer.log -ea -cp $MSK_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" -# # ROB : TODO : refactor the next line so that repo fetching and cleaning is independent of the importer jar (which requires the embedded database name to exist in order to function) -# IMPORT_FAIL=0 -# ONCOTREE_VERSION_TO_USE="oncotree_candidate_release" -# DATA_SOURCES_TO_BE_FETCHED="extract-projects" -# extract_projects_notification_file=$(mktemp $tmp/import-msk-extract-projects-notification.$now.XXXXXX) - -# # grab data from extract-projects repos -# $DATA_SOURCE_MANAGER_SCRIPT_FILEPATH $DATA_SOURCE_MANAGER_CONFIG_FILEPATH pull msk $DATA_SOURCES_TO_BE_FETCHED - -# # import data into msk portal -# $JAVA_BINARY -Xmx64g $MSK_JAVA_IMPORTER_ARGS --update-study-data --portal extract-projects-to-msk-portal --notification-file $extract_projects_notification_file --oncotree-version $ONCOTREE_VERSION_TO_USE --transcript-overrides-source mskcc --disable-redcap-export -# if [ $? -gt 0 ]; then -# echo "MSK Extract projects import failed!" -# IMPORT_FAIL=1 -# EMAIL_BODY="Extract projects import failed" -# echo -e "Sending email $EMAIL_BODY" -# echo -e "$EMAIL_BODY" | mail -s "Update failure: MSK Extract Projects" $PIPELINES_EMAIL_LIST -# fi - -# # get num studies updated -# if [[ $IMPORT_FAIL -eq 0 && -f "$TMP_DIRECTORY/num_studies_updated.txt" ]]; then -# num_studies_updated=`cat $TMP_DIRECTORY/num_studies_updated.txt` -# else -# num_studies_updated=0 -# fi - -# # clean up extract-projects repo and send notification file -# $DATA_SOURCE_MANAGER_SCRIPT_FILEPATH $DATA_SOURCE_MANAGER_CONFIG_FILEPATH cleanup msk $DATA_SOURCES_TO_BE_FETCHED -# EMAIL_BODY=`cat $extract_projects_notification_file` -# echo -e "The following Extract projects have been added or updated in the MSK cBioPortal:\n\n$EMAIL_BODY" | mail -r "cbioportal-pipelines@cbioportal.org" -s "Updates to MSK cBioPortal (Extract Projects)" $PIPELINES_EMAIL_LIST -# exit $IMPORT_FAIL diff --git a/import-scripts/import-pdx-data.sh b/import-scripts/import-pdx-data.sh deleted file mode 100755 index 7a846136a..000000000 --- a/import-scripts/import-pdx-data.sh +++ /dev/null @@ -1,371 +0,0 @@ -#!/bin/bash - -echo $(date) - -PATH_TO_AUTOMATION_SCRIPT=/data/portal-cron/scripts/automation-environment.sh -# PIPELINES_EMAIL_LIST receives low level emails (fail to clear persistence cache, ...) -PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" -# PDX_EMAIL_LIST receives a daily summary email of import statistics and problems -PDX_EMAIL_LIST="cbioportal-pdx-importer@cbioportal.org" -CRDB_PDX_TMPDIR=/data/portal-cron/tmp/import-cron-pdx-msk -ONCOTREE_VERSION_TO_USE=oncotree_candidate_release -shopt -s nullglob -declare -a study_list - -# Functions - -source "$PORTAL_HOME/scripts/slack-message-functions.sh" - -function sendFailureMessageMskPipelineLogsSlack { - MESSAGE=$1 - send_slack_message_to_channel "#msk-pipeline-logs" "string" "$MESSAGE :tired_face:" -} - -# Function for alerting slack channel of successful imports -function sendSuccessMessageMskPipelineLogsSlack { - MESSAGE=$1 - send_slack_message_to_channel "#msk-pipeline-logs" "string" "$MESSAGE" -} - -function purgeOrigFilesUnderDirectory { - search_dir=$1 - find "$search_dir" -name "*.orig" -delete -} - -function addRemoveFilesUnderDirectory { - search_dir=$1 - purgeOrigFilesUnderDirectory "$search_dir" - ( cd $search_dir ; $GIT_BINARY add -A . ) -} - -function commitAllRepositoryChanges { - repository_subdirectory=$1 - changeset_log_message=$2 - $GIT_BINARY -C "$repository_subdirectory" commit -m "$changeset_log_message" -} - -function pushAllChangesets { - repository_subdirectory=$1 - $GIT_BINARY -C "$repository_subdirectory" push -} - - -function purgeAllFileModifications { - repository_subdirectory=$1 - $GIT_BINARY -C "$repository_subdirectory" reset HEAD --hard -} - -function cleanAllUntrackedFiles { - repository_subdirectory=$1 - $GIT_BINARY -C "$repository_subdirectory" clean -fd -} - -function cleanUpEntireRepository { - repository_subdirectory=$1 - purgeAllFileModifications "$repo_subdirectory" - cleanAllUntrackedFiles "$repo_subdirectory" -} - -function revertModifiedFilesUnderDirectory { - repository_subdirectory=$1 - $GIT_BINARY -C "$repository_subdirectory" checkout -- . -} - -function find_trigger_files_for_existing_studies { - suffix=$1 - suffix_length=${#suffix} - unset study_list - study_list_index=0 - for filepath in $CRDB_PDX_TMPDIR/*${suffix} ; do - filename="${filepath##*/}" - filename_length=${#filename} - study_directory_length=$(( $filename_length - $suffix_length )) - study_directory=${filename:0:$study_directory_length} - if [ -d $PDX_DATA_HOME/$study_directory ] ; then - study_list[$study_list_index]=$study_directory - study_list_index=$(( $study_list_index + 1 )) - else - echo "error : trigger file $filename found for non-existent study : $PDX_HOME/$study_directory" - fi - done -} - -function find_studies_to_be_committed { - find_trigger_files_for_existing_studies "_commit_triggerfile" -} - -function find_studies_to_be_reverted { - find_trigger_files_for_existing_studies "_revert_triggerfile" -} - -# set up enivornment variables and temp directory -if ! [ -f $PATH_TO_AUTOMATION_SCRIPT ] ; then - message="automation-environment.sh could not be found, exiting..." - echo ${message} - echo -e "${message}" | mail -s "import-pdx-data failed to run." $PIPELINES_EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "CRDB PDX Pipeline Failure" - exit 2 -fi - -source $PATH_TO_AUTOMATION_SCRIPT - -if [ -z "$PORTAL_HOME" ] | [ -z "$BIC_LEGACY_DATA_HOME" ] | [ -z "$CMO_ARGOS_DATA_HOME" ] | [ -z "$PRIVATE_DATA_HOME" ] | [ -z "$PDX_DATA_HOME" ] | [ -z "$GIT_BINARY" ] | [ -z "$PYTHON_BINARY" ] | [ -z "$DATAHUB_DATA_HOME" ] | [ -z "$ANNOTATOR_JAR" ] | [ -z "$CASE_LIST_CONFIG_FILE" ] ; then - message="could not run import-pdx-data.sh: automation-environment.sh script must be run in order to set needed environment variables (like BIC_LEGACY_DATA_HOME, PDX_DATA_HOME, ANNOTATOR_JAR, CASE_LIST_CONFIG_FILE,...)" - echo ${message} - echo -e "${message}" | mail -s "import-pdx-data failed to run." $PIPELINES_EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "CRDB PDX Pipeline Failure" - exit 2 -fi - -source $PORTAL_HOME/scripts/clear-persistence-cache-shell-functions.sh - -if [ ! -d $CRDB_PDX_TMPDIR ] ; then - mkdir $CRDB_PDX_TMPDIR - if [ $? -ne 0 ] ; then - message="error : required temp directory does not exist and could not be created : $CRDB_PDX_TMPDIR" - echo ${message} - echo -e "${message}" | mail -s "import-pdx-data failed to run." $PIPELINES_EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "CRDB PDX Pipeline Failure" - exit 2 - fi -fi -if [[ -d "$CRDB_PDX_TMPDIR" && "$CRDB_PDX_TMPDIR" != "/" ]] ; then - rm -rf "$CRDB_PDX_TMPDIR"/* -fi - -GET_DB_IN_PROD_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/get_database_currently_in_production.sh" -MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH="/data/portal-cron/pipelines-credentials/manage_msk_clickhouse_database_update_tools.properties" -current_production_database_color=$($GET_DB_IN_PROD_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH) -destination_database_color="unset" -if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" -fi -if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" -fi -if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 -fi -IMPORTER_JAR_LABEL=CMO -IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" -IMPORTER_DEBUG_PORT=27182 -CRDB_FETCHER_JAR_FILENAME="$PORTAL_HOME/lib/crdb_fetcher.jar" -importer_notification_file=$(mktemp $CRDB_PDX_TMPDIR/importer-update-notification.$now.XXXXXX) -ENABLE_DEBUGGING=0 -java_debug_args="" -if [ $ENABLE_DEBUGGING != "0" ] ; then - java_debug_args="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$IMPORTER_DEBUG_PORT" -fi -JAVA_CRDB_FETCHER_ARGS="--add-opens java.base/java.lang=ALL-UNNAMED -jar $CRDB_FETCHER_JAR_FILENAME" -JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$CRDB_PDX_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-clickhouse-importer.log -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" -SUBSET_AND_MERGE_WARNINGS_FILENAME="subset_and_merge_pdx_studies_warnings.txt" -# status flags (set to 1 when each stage is successfully completed) -CRDB_PDX_FETCH_SUCCESS=0 -CRDB_PDX_SUBSET_AND_MERGE_SUCCESS=0 -BIC_MSKCC_LEGACY_DATA_SOURCE_FETCH_SUCCESS=0 -CMO_ARGOS_DATA_SOURCE_FETCH_SUCCESS=0 -PRIVATE_DATA_SOURCE_FETCH_SUCESS=0 -PDX_DATA_SOURCE_FETCH_SUCCESS=0 -DATAHUB_DATA_SOURCE_FETCH_SUCCESS=0 -ALL_DATA_SOURCE_FETCH_SUCCESS=0 -IMPORT_SUCCESS=0 -CLEAR_PERSISTENCE_CACHE_SUCCESS=0 - -DB_VERSION_FAIL=0 -# check database version before importing anything -echo "Checking if database version is compatible" -$JAVA_BINARY $JAVA_IMPORTER_ARGS --check-db-version -if [ $? -gt 0 ] -then - echo "Database version expected by portal does not match version in database!" - DB_VERSION_FAIL=1 -fi - -# importer data source fetch step -echo "fetching updates from bic-mskcc-legacy repository..." -$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source bic-mskcc-legacy --run-date latest --update-worksheet -if [ $? -gt 0 ] ; then - sendFailureMessageMskPipelineLogsSlack "Fetch BIC-MSKCC-LEGACY Studies From Git Failure" -else - BIC_MSKCC_LEGACY_DATA_SOURCE_FETCH_SUCCESS=1 -fi - -echo "fetching updates from cmo-argos repository..." -$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source cmo-argos --run-date latest --update-worksheet -if [ $? -gt 0 ] ; then - sendFailureMessageMskPipelineLogsSlack "Fetch CMO Argos Studies From Git Failure" -else - CMO_ARGOS_DATA_SOURCE_FETCH_SUCCESS=1 -fi - -echo "fetching updates from private repository..." -$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source private --run-date latest -if [ $? -gt 0 ] ; then - sendFailureMessageMskPipelineLogsSlack "Fetch Private Studies From Git Failure" -else - PRIVATE_DATA_SOURCE_FETCH_SUCESS=1 -fi - -echo "fetching updates from datahub repository..." -$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source datahub --run-date latest -if [ $? -gt 0 ] ; then - sendFailureMessageMskPipelineLogsSlack "Fetch Datahub Studies From Git Failure" -else - DATAHUB_DATA_SOURCE_FETCH_SUCCESS=1 -fi - -echo "fetching updates from pdx repository..." -$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source pdx --run-date latest -if [ $? -gt 0 ] ; then - sendFailureMessageMskPipelineLogsSlack "Fetch PDX Studies From Git Failure" -else - PDX_DATA_SOURCE_FETCH_SUCCESS=1 -fi - - -if [[ $BIC_MSKCC_LEGACY_DATA_SOURCE_FETCH_SUCCESS -eq 1 && $PRIVATE_DATA_SOURCE_FETCH_SUCESS -eq 1 && $PDX_DATA_SOURCE_FETCH_SUCCESS -eq 1 && $DATAHUB_DATA_SOURCE_FETCH_SUCCESS -eq 1 && $CMO_ARGOS_DATA_SOURCE_FETCH_SUCCESS -eq 1 ]] ; then - # udpate status for email - ALL_DATA_SOURCE_FETCH_SUCCESS=1 - echo "fetching pdx data fom crdb" - $JAVA_BINARY $JAVA_CRDB_FETCHER_ARGS --pdx --directory $CRDB_FETCHER_PDX_HOME - if [ $? -ne 0 ] ; then - echo "error: crdb_pdx_fetch failed" - sendFailureMessageMskPipelineLogsSlack "Fetch CRDB PDX Failure" - cleanUpEntireRepository $CRDB_FETCHER_PDX_HOME - else - addRemoveFilesUnderDirectory $CRDB_FETCHER_PDX_HOME - commitAllRepositoryChanges $CRDB_FETCHER_PDX_HOME "CRDB PDX Fetch" - CRDB_PDX_FETCH_SUCCESS=1 - fi -fi - -# TEMP (done): transform project_ids in test data to stable study ids (add tranform of stable id) - -# construct destination studies from source studies -# call subsetting/constuction python script (add touch a trigger file for successful subset/merge) (add subroutine which creates process command) (touch needed meta files for the generated data files) -if [ $CRDB_PDX_FETCH_SUCCESS -ne 0 ] ; then - mapping_filename="source_to_destination_mappings.txt" - clinical_annotation_mapping_filename="clinical_annotations_mappings.txt" - scripts_directory="$PORTAL_HOME/scripts" - $PYTHON_BINARY $PORTAL_HOME/scripts/subset_and_merge_crdb_pdx_studies.py --mapping-file $mapping_filename --root-directory $PDX_DATA_HOME --lib $scripts_directory --data-source-directories $DATAHUB_DATA_HOME,$BIC_LEGACY_DATA_HOME,$CMO_ARGOS_DATA_HOME,$PRIVATE_DATA_HOME,$DMP_DATA_HOME --fetch-directory $CRDB_FETCHER_PDX_HOME --temp-directory $CRDB_PDX_TMPDIR --warning-file $SUBSET_AND_MERGE_WARNINGS_FILENAME --clinical-annotation-mapping-file $clinical_annotation_mapping_filename --annotator $ANNOTATOR_JAR --sample-lists-config $CASE_LIST_CONFIG_FILE - if [ $? -ne 0 ] ; then - echo "error: subset_and_merge_crdb_pdx_studies.py exited with non zero status" - sendFailureMessageMskPipelineLogsSlack "CRDB PDX Subset-And-Merge Script Failure" - cleanUpEntireRepository $CRDB_FETCHER_PDX_HOME - else - CRDB_PDX_SUBSET_AND_MERGE_SUCCESS=1 - fi -fi - -if [ $CRDB_PDX_SUBSET_AND_MERGE_SUCCESS -ne 0 ] ; then - # check trigger files and do appropriate data source operations - find_studies_to_be_reverted - index=0 - while [ $index -lt ${#study_list} ] ; do - revertModifiedFilesUnderDirectory "$PDX_DATA_HOME/${study_list[$index]}" - index=$(( $index + 1 )) - done - find_studies_to_be_committed - index=0 - while [ $index -lt ${#study_list} ] ; do - addRemoveFilesUnderDirectory "$PDX_DATA_HOME/${study_list[$index]}" - index=$(( $index + 1 )) - done - commitAllRepositoryChanges $CRDB_FETCHER_PDX_HOME "CRDB PDX Subset and Merge" -fi - -# push changesets to data source - this will commit to them regardless of whether import succeeds, or partially succeeds, or fails -pushAllChangesets $CRDB_FETCHER_PDX_HOME - -#TODO : make this smarter .. to only import if the destination study has changed (i.e. alter the spreadsheet checkmarks) -#TODO : check if we can reuse the pdx-portal column -if [ $CRDB_PDX_SUBSET_AND_MERGE_SUCCESS -ne 0 ] ; then - # import if all went well (only if trigger file is present) - # if the database version is correct and ALL fetches succeed, then import - if [[ $DB_VERSION_FAIL -eq 0 ]] ; then - echo "importing study data to database using $IMPORTER_JAR_FILENAME ..." - $JAVA_BINARY -Xmx16g $JAVA_IMPORTER_ARGS --update-study-data --portal crdb-pdx-portal --use-never-import --update-worksheet --notification-file "$importer_notification_file" --oncotree-version ${ONCOTREE_VERSION_TO_USE} --transcript-overrides-source mskcc - if [ $? -ne 0 ]; then - echo "$IMPORTER_JAR_LABEL import failed!" - EMAIL_BODY="$IMPORTER_JAR_LABEL import failed" - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "Import failure: $IMPORTER_JAR_LABEL" $PIPELINES_EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "CRDB PDX Failure During Import" - else - IMPORT_SUCCESS=1 - fi - num_studies_updated=`cat $CRDB_PDX_TMPDIR/num_studies_updated.txt` -#### ROB : 2025_08_17 - persistence cache reset will now happen at the color transition instead -#### # clear persistence cache (note : this script is constructing studies for the msk portal, including mskimpact sample data - that is why the msk portal cache is cleared) -#### if [[ $IMPORT_SUCCESS -ne 0 && $num_studies_updated -gt 0 ]]; then -#### echo "'$num_studies_updated' studies have been updated, clearing persistence cache for msk portal ..." -#### if ! clearPersistenceCachesForMskPortals ; then -#### sendClearCacheFailureMessage msk import-pdx-data.sh -#### else -#### CLEAR_PERSISTENCE_CACHE_SUCCESS=1 -#### fi -#### else -#### echo "No studies have been updated, not clearing persistence cache for msk portal..." -#### CLEAR_PERSISTENCE_CACHE_SUCCESS=1 -#### fi - fi -fi - -# send appropriate email message to PDX_EMAIL_LIST -echo "sending notification email.." -EMAIL_MESSAGE_FILE="$CRDB_PDX_TMPDIR/pdx_summary_email_body.txt" -EMAIL_SUBJECT="CRDB PDX cBioPortal import failure" -rm -f $EMAIL_MESSAGE_FILE -if [ $ALL_DATA_SOURCE_FETCH_SUCCESS -eq 0 ] ; then - echo -e "The import of CRDB PDX studies did not occur today due to a failure to update the git repositories used to hold study data." >> "$EMAIL_MESSAGE_FILE" -else - if [ $CRDB_PDX_FETCH_SUCCESS -eq 0 ] ; then - echo -e "The import of CRDB PDX studies did not occur today due to a failure to download PDX data from the CRDB database server." >> "$EMAIL_MESSAGE_FILE" - else - if [ $CRDB_PDX_SUBSET_AND_MERGE_SUCCESS -eq 0 ] ; then - echo -e "The import of CRDB PDX studies did not occur today due to a failure during the subsetting and merging of source study data according to the source_to_destination_mappings.txt file downloaded from CRDB database server." >> "$EMAIL_MESSAGE_FILE" - else - if [ $IMPORT_SUCCESS -eq 0 ] ; then - echo -e "The import of CRDB PDX studies was attempted but failed during the importing process." >> "$EMAIL_MESSAGE_FILE" - else - if [ $CLEAR_PERSISTENCE_CACHE_SUCCESS -eq 0 ] ; then - echo -e "The import of CRDB PDX studies completed successfully, however due to a technical problem the website may not display the latest data." >> "$EMAIL_MESSAGE_FILE" - else - echo -e "The import of CRDB PDX studies completed successfully." >> "$EMAIL_MESSAGE_FILE" - EMAIL_SUBJECT="CRDB PDX cBioPortal nightly import status" - sendSuccessMessageMskPipelineLogsSlack "CRDB PDX Pipeline Success" - fi - fi - fi - fi -fi - -# append any warnings from the subset and merge script -if [ -s "$CRDB_PDX_TMPDIR/$SUBSET_AND_MERGE_WARNINGS_FILENAME" ] ; then - echo -e "\n" >> "$EMAIL_MESSAGE_FILE" - echo -e "Warnings generated by the subset and merge of pdx cmo studies:" >> "$EMAIL_MESSAGE_FILE" - echo -e "--------------------------------------------------------------" >> "$EMAIL_MESSAGE_FILE" - cat "$CRDB_PDX_TMPDIR/$SUBSET_AND_MERGE_WARNINGS_FILENAME" >> "$EMAIL_MESSAGE_FILE" -fi -# append any warnings from the importer notification file -if [ -s "$importer_notification_file" ] ; then - echo -e "\n" >> "$EMAIL_MESSAGE_FILE" - echo -e "Output generated by the cBioPortal importer:" >> "$EMAIL_MESSAGE_FILE" - echo -e "--------------------------------------------" >> "$EMAIL_MESSAGE_FILE" - cat "$importer_notification_file" >> "$EMAIL_MESSAGE_FILE" -fi - -validation_report_attachments="" -for validation_report in $(find $CRDB_PDX_TMPDIR -name "*-validation.html"); do - validation_report_attachments+=" -a $validation_report" -done - -echo -e "Sending email:" -cat "$EMAIL_MESSAGE_FILE" -cat "$EMAIL_MESSAGE_FILE" | mailx -s "$EMAIL_SUBJECT" $validation_report_attachments $PDX_EMAIL_LIST - -echo "Cleaning up any untracked files from MSK-PDX import..." -bash $PORTAL_HOME/scripts/datasource-repo-cleanup.sh $PORTAL_DATA_HOME/bic-mskcc-legacy $PORTAL_DATA_HOME/cmo-argos $PORTAL_DATA_HOME/private $PORTAL_DATA_HOME/datahub $PORTAL_DATA_HOME/crdb_pdx - -exit $(( 1 - IMPORT_SUCCESS )) diff --git a/import-scripts/import-temp-study.sh b/import-scripts/import-temp-study.sh deleted file mode 100755 index 37fd5d15f..000000000 --- a/import-scripts/import-temp-study.sh +++ /dev/null @@ -1,254 +0,0 @@ -#!/bin/bash - -# Temp study importer arguments -# (1): cancer study id [ mskimpact | mskarcher | msk_kingscounty | msk_lehighvalley | msk_queenscancercenter | msk_miamicancerinstitute | msk_hartfordhealthcare | msk_ralphlauren | msk_rikengenesisjapan | mskimpact_ped | sclc_mskimpact_2017 | lymphoma_super_cohort_fmi_msk ] -# (2): temp study id [ temporary_mskimpact | temporary_mskarcher | temporary_msk_kingscounty | temporary_msk_lehighvalley | temporary_msk_queenscancercenter | temporary_msk_miamicancerinstitute | temporary_msk_hartfordhealthcare | temporary_msk_ralphlauren | temporary_msk_rikengenesisjapan | temporary_mskimpact_ped | temporary_sclc_mskimpact_2017 | temporary_lymphoma_super_cohort_fmi_msk] -# (3): backup study id [ yesterday_mskimpact | yesterday_mskarcher | yesterday_msk_kingscounty | yesterday_msk_lehighvalley | yesterday_msk_queenscancercenter | yesterday_msk_miamicancerinstitute | yesterday_msk_hartfordhealthcare | yesterday_msk_ralphlauren | yesterday_msk_rikengenesisjapan | yesterday_mskimpact_ped | yesterday_sclc_mskimpact_2017 | yesterday_lymphoma_super_cohort_fmi_msk] -# (4): portal name [ msk-solid-heme-portal | mskarcher-portal | msk-kingscounty-portal | msk-lehighvalley-portal | msk-queenscancercenter-portal | msk-mci-portal | msk-hartford-portal | msk-ralphlauren-portal | msk-tailormedjapan-portal | msk-ped-portal | msk-sclc-portal | msk-fmi-lymphoma-portal ] -# (5): study path [ $MSK_SOLID_HEME_DATA_HOME | $MSK_ARCHER_DATA_HOME | $MSK_KINGS_DATA_HOME | $MSK_LEHIGH_DATA_HOME | $MSK_QUEENS_DATA_HOME | $MSK_MCI_DATA_HOME | $MSK_HARTFORD_DATA_HOME | $MSK_RALPHLAUREN_DATA_HOME | $MSK_RIKENGENESISJAPAN_DATA_HOME | $MSKIMPACT_PED_DATA_HOME | $MSK_SCLC_DATA_HOME | $LYMPHOMA_SUPER_COHORT_DATA_HOME ] -# (6): notification file [ $msk_solid_heme_notification_file | $kingscounty_notification_file | $lehighvalley_notification_file | $queenscancercenter_notification_file | $miamicancerinstitute_notification_file | $hartfordhealthcare_notification_file | $ralphlauren_notification_file | $rikengenesisjapan_notification_file | $mskimpact_ped_notification_file | $sclc_mskimpact_notification_file | $lymphoma_super_cohort_notification_file ] -# (7): tmp directory -# (8): email list -# (9): oncotree version [ oncotree_candidate_release | oncotree_latest_stable ] -# (10): importer jar -# (11): transcript overrides source [ uniprot | mskcc ] - -# Non-zero exit code status indication -# There are several flags that are checked during the execution of the temporary study import. If any flags are non-zero at the end -# of execution then an email is sent out to the email list provided for each non-zero flag and the script exits with a non-zero status. -# Flags: -# IMPORT_FAIL if non-zero indicates that the study failed to import under a temporary id -# VALIDATION_FAIL if non-zero indicates that the temporary study failed validation against the original study -# DELETE_FAIL if non-zero indicates that the backup study failed to delete -# RENAME_BACKUP_FAIL if non-zero indicates that the original study failed to rename to the backup study id -# RENAME_FAIL if non-zero indicates that the temporary study failed to rename to the original study id - -source "$PORTAL_HOME/scripts/slack-message-functions.sh" - -function usage { - echo "import-temp-study.sh" - echo -e "\t-i | --study-id cancer study identifier" - echo -e "\t-t | --temp-study-id temp study identifier" - echo -e "\t-b | --backup-study-id backup study identifier" - echo -e "\t-p | --portal-name portal name" - echo -e "\t-s | --study-path study path" - echo -e "\t-n | --notification-file notification file" - echo -e "\t-d | --tmp-directory tmp directory" - echo -e "\t-e | --email-list email list" - echo -e "\t-o | --oncotree-version oncotree version" - echo -e "\t-j | --importer-jar importer jar" - echo -e "\t-r | --transcript-overrides-source transcript overrides source" - echo -e "\t-a | --allow-redcap-export allow redcap export during import (overrides --disable-redcap-export default)" -} - -function sendFailureMessageMskPipelineLogsSlack { - MESSAGE=$1 - send_slack_message_to_channel "#msk-pipeline-logs" "string" "MSK temporary study import process failed :tired_face: : $MESSAGE" -} - -# set default value(s) -DISABLE_REDCAP_EXPORT_TERM="--disable-redcap-export" -ONCOTREE_VERSION_TERM="" - -echo "Input arguments:" -for i in "$@"; do -case $i in - -i=*|--study-id=*) - CANCER_STUDY_IDENTIFIER="${i#*=}" - echo -e "\tstudy id=$CANCER_STUDY_IDENTIFIER" - shift - ;; - -t=*|--temp-study-id=*) - TEMP_CANCER_STUDY_IDENTIFIER="${i#*=}" - echo -e "\ttemp id=$TEMP_CANCER_STUDY_IDENTIFIER" - shift - ;; - -b=*|--backup-study-id=*) - BACKUP_CANCER_STUDY_IDENTIFIER="${i#*=}" - echo -e "\tbackup id=$BACKUP_CANCER_STUDY_IDENTIFIER" - shift - ;; - -p=*|--portal-name=*) - PORTAL_NAME="${i#*=}" - echo -e "\tportal name=$PORTAL_NAME" - shift - ;; - -s=*|--study-path=*) - STUDY_PATH="${i#*=}" - echo -e "\tstudy path=$STUDY_PATH" - shift - ;; - -n=*|--notification-file=*) - NOTIFICATION_FILE="${i#*=}" - echo -e "\tnotifcation file=$NOTIFICATION_FILE" - shift - ;; - -d=*|--tmp-directory=*) - TMP_DIRECTORY="${i#*=}" - echo -e "\ttmp dir=$TMP_DIRECTORY" - shift - ;; - -e=*|--email-list=*) - EMAIL_LIST="${i#*=}" - echo -e "\temail list=$EMAIL_LIST" - shift - ;; - -o=*|--oncotree-version=*) - ONCOTREE_VERSION_TERM="--oncotree-version ${i#*=}" - echo -e "\toncotree version=${i#*=}" - shift - ;; - -j=*|--importer-jar=*) - IMPORTER_JAR_FILENAME="${i#*=}" - echo -e "\timporter jar=$IMPORTER_JAR_FILENAME" - shift - ;; - -r=*|--transcript-overrides-source=*) - TRANSCRIPT_OVERRIDES_SOURCE="${i#*=}" - echo -e "\ttranscript overrides source=$TRANSCRIPT_OVERRIDES_SOURCE" - shift - ;; - -a|--allow-redcap-export) - DISABLE_REDCAP_EXPORT_TERM="" - echo -e "\tallow redcap export=true" - shift - ;; - *) - ;; -esac -done - -if [[ -z $CANCER_STUDY_IDENTIFIER || -z $TEMP_CANCER_STUDY_IDENTIFIER || -z $BACKUP_CANCER_STUDY_IDENTIFIER || -z $PORTAL_NAME || -z $STUDY_PATH || -z $NOTIFICATION_FILE || -z $TMP_DIRECTORY || -z $EMAIL_LIST || -z $IMPORTER_JAR_FILENAME || -z $TRANSCRIPT_OVERRIDES_SOURCE ]]; then - usage - exit 1 -fi - -ENABLE_DEBUGGING=0 -java_debug_args="" -if [ $ENABLE_DEBUGGING != "0" ] ; then - java_debug_args="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=27182" -fi -JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$TMP_DIRECTORY -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" -SLACK_PIPELINES_MONITOR_URL=`cat $SLACK_URL_FILE` - -# define validator notification filename based on cancer study id, remove if already exists, touch new file -now=$(date "+%Y-%m-%d-%H-%M-%S") -VALIDATION_NOTIFICATION_FILENAME="$(mktemp $TMP_DIRECTORY/validation_$CANCER_STUDY_IDENTIFIER.$now.XXXXXX)" -if [ -f $VALIDATION_NOTIFICATION_FILENAME ]; then - rm $VALIDATION_NOTIFICATION_FILENAME -fi -touch $VALIDATION_NOTIFICATION_FILENAME - -# variables for import temp study status -IMPORT_FAIL=0 -VALIDATION_FAIL=0 -DELETE_FAIL=0 -RENAME_BACKUP_FAIL=0 -RENAME_FAIL=0 - -# import study using temp id -echo "Importing study '$CANCER_STUDY_IDENTIFIER' as temporary study '$TEMP_CANCER_STUDY_IDENTIFIER'" -$JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --update-study-data --portal $PORTAL_NAME --notification-file $NOTIFICATION_FILE --temporary-id $TEMP_CANCER_STUDY_IDENTIFIER $ONCOTREE_VERSION_TERM --transcript-overrides-source $TRANSCRIPT_OVERRIDES_SOURCE $DISABLE_REDCAP_EXPORT_TERM -# we do not have to check the exit status here because if num_studies_updated != 1 we consider the import to have failed (we check num_studies_updated next) - -# check number of studies updated before continuing -if [[ $? -eq 0 && -f "$TMP_DIRECTORY/num_studies_updated.txt" ]]; then - num_studies_updated=`cat $TMP_DIRECTORY/num_studies_updated.txt` -else - num_studies_updated=0 -fi - -if [ "$num_studies_updated" -ne 1 ]; then - echo "Failed to import study '$CANCER_STUDY_IDENTIFIER'" - IMPORT_FAIL=1 -else - # validate - echo "Validating import..." - $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --validate-temp-study --temp-study-id $TEMP_CANCER_STUDY_IDENTIFIER --original-study-id $CANCER_STUDY_IDENTIFIER --notification-file $VALIDATION_NOTIFICATION_FILENAME - if [ $? -gt 0 ]; then - echo "Failed to validate - deleting temp study '$TEMP_CANCER_STUDY_IDENTIFIER'" - $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --delete-cancer-study --cancer-study-ids $TEMP_CANCER_STUDY_IDENTIFIER - VALIDATION_FAIL=1 - else - echo "Successful validation - renaming '$CANCER_STUDY_IDENTIFIER' and temp study '$TEMP_CANCER_STUDY_IDENTIFIER'" - $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --delete-cancer-study --cancer-study-ids $BACKUP_CANCER_STUDY_IDENTIFIER - if [ $? -gt 0 ]; then - echo "Failed to delete backup study '$BACKUP_CANCER_STUDY_IDENTIFIER'!" - DELETE_FAIL=1 - else - echo "Renaming '$CANCER_STUDY_IDENTIFIER' to '$BACKUP_CANCER_STUDY_IDENTIFIER'" - $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --rename-cancer-study --new-study-id $BACKUP_CANCER_STUDY_IDENTIFIER --original-study-id $CANCER_STUDY_IDENTIFIER - if [ $? -gt 0 ]; then - echo "Failed to rename existing '$CANCER_STUDY_IDENTIFIER' to backup study '$BACKUP_CANCER_STUDY_IDENTIFIER'!" - RENAME_BACKUP_FAIL=1 - else - echo "Renaming temporary study '$TEMP_CANCER_STUDY_IDENTIFIER' to '$CANCER_STUDY_IDENTIFIER'" - $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --rename-cancer-study --new-study-id $CANCER_STUDY_IDENTIFIER --original-study-id $TEMP_CANCER_STUDY_IDENTIFIER - if [ $? -gt 0 ]; then - echo "Failed to rename temporary study '$TEMP_CANCER_STUDY_IDENTIFIER' to '$CANCER_STUDY_IDENTIFIER'!" - RENAME_FAIL=1 - fi - fi - fi - fi -fi - -### FAILURE EMAIL ### - -EMAIL_BODY="The $CANCER_STUDY_IDENTIFIER study failed import. The original study will remain on the portal." -# send email if import fails -if [ $IMPORT_FAIL -gt 0 ]; then - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "$CANCER_STUDY_IDENTIFIER Update Failure: Import" $EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "$CANCER_STUDY_IDENTIFIER import as temp study" -fi - -# send email if validation fails -if [ $VALIDATION_FAIL -gt 0 ]; then - if [ $(wc -l < $VALIDATION_NOTIFICATION_FILENAME) -eq 0 ]; then - EMAIL_BODY="The $CANCER_STUDY_IDENTIFIER study failed to pass the validation step in import process for some unknown reason. No data was saved to validation notification file $VALIDATION_NOTIFICATION_FILENAME. The original study will remain on the portal." - else - EMAIL_BODY=`cat $VALIDATION_NOTIFICATION_FILENAME` - fi - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "$CANCER_STUDY_IDENTIFIER Update Failure: Validation" $EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "$CANCER_STUDY_IDENTIFIER temp study validation" -fi - -EMAIL_BODY="The $BACKUP_CANCER_STUDY_IDENTIFIER study failed to delete. $CANCER_STUDY_IDENTIFIER study did not finish updating." -if [ $DELETE_FAIL -gt 0 ]; then - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "$CANCER_STUDY_IDENTIFIER Update Failure: Deletion" $EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "$BACKUP_CANCER_STUDY_IDENTIFIER deletion" -fi - -EMAIL_BODY="Failed to backup $CANCER_STUDY_IDENTIFIER to $BACKUP_CANCER_STUDY_IDENTIFIER via renaming. $CANCER_STUDY_IDENTIFIER study did not finish updating." -if [ $RENAME_BACKUP_FAIL -gt 0 ]; then - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "$CANCER_STUDY_IDENTIFIER Update Failure: Renaming backup" $EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "$CANCER_STUDY_IDENTIFIER rename to $BACKUP_CANCER_STUDY_IDENTIFIER" -fi - -EMAIL_BODY="Failed to rename temp study $TEMP_CANCER_STUDY_IDENTIFIER to $CANCER_STUDY_IDENTIFIER. $CANCER_STUDY_IDENTIFIER study did not finish updating." -if [ $RENAME_FAIL -gt 0 ]; then - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "$CANCER_STUDY_IDENTIFIER Update Failure: CRITICAL!! Renaming" $EMAIL_LIST - sendFailureMessageMskPipelineLogsSlack "CRITICAL FAILURE: $TEMP_CANCER_STUDY_IDENTIFIER rename to $CANCER_STUDY_IDENTIFIER" -fi - -# send notification file -# this contains the error or success message from import -# we only want to send the email on import failure -# or if everything succeeds -if [[ $IMPORT_FAIL -ne 0 || ($VALIDATION_FAIL -eq 0 && $DELETE_FAIL -eq 0 && $RENAME_BACKUP_FAIL -eq 0 && $RENAME_FAIL -eq 0) ]]; then - EMAIL_NOTIFICATION_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/email-import-notification-after-import.sh" - $EMAIL_NOTIFICATION_SCRIPT_FILEPATH $PORTAL_NAME "$NOTIFICATION_FILE" -fi - -# determine if we need to exit with error code -if [[ $IMPORT_FAIL -ne 0 || $VALIDATION_FAIL -ne 0 || $DELETE_FAIL -ne 0 || $RENAME_BACKUP_FAIL -ne 0 || $RENAME_FAIL -ne 0 ]]; then - echo "Update failed for study '$CANCER_STUDY_IDENTIFIER'" - exit 1 -else - echo "Update successful for study '$CANCER_STUDY_IDENTIFIER'" -fi diff --git a/import-scripts/import-tempo-data.sh b/import-scripts/import-tempo-data.sh deleted file mode 100755 index 2e195d4f1..000000000 --- a/import-scripts/import-tempo-data.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash - -echo $(date) - -# setting up the environment on the machine -# ------------------------- -PATH_TO_AUTOMATION_SCRIPT=/data/portal-cron/scripts/automation-environment.sh -PATH_TO_TEMPO_JOB_CONFIG=/data/portal-cron/scripts/tempo-environment.sh - -# set up enivornment variables and temp directory -if ! [ -f $PATH_TO_AUTOMATION_SCRIPT ] || ! [ -f $PATH_TO_TEMPO_JOB_CONFIG ]; then - message="automation-environment.sh and/or tempo-environment.sh could not be found, exiting..." - echo ${message} - exit 2 -fi - -source $PATH_TO_AUTOMATION_SCRIPT -source $PATH_TO_TEMPO_JOB_CONFIG -source $PORTAL_HOME/scripts/clear-persistence-cache-shell-functions.sh -# ------------------------- - -# TODO: to be set by Airflow DAG -# ------------------------- -IMPORTER_JAR_FILENAME=$PORTAL_HOME/lib/triage-cmo-importer.jar -#IMPORTER_JAR_FILENAME=$1 -# ------------------------- - -# Additional env vars -# ------------------------- -importer_notification_file=$(mktemp $TEMPO_TMPDIR/importer-update-notification.$now.XXXXXX) -TEMPO_TMPDIR=/data/portal-cron/tmp/import-cron-tempo -JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$TEMPO_TMPDIR -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" - -# temporary workaround to enable importing a study from arbitrary path -# needed because importer looks for "data sources" under PORTAL_DATA_HOME -# not checking in files into an actual data source -# to be redone with whole refactoring of how we integrate datasources -PORTAL_DATA_HOME=/data/portal-cron/tmp/import-tempo/ -ONCOTREE_VERSION_TO_USE=oncotree_candidate_release -# ------------------------- - -# Flags for individual steps required for successful import -CDD_RECACHE_FAIL=0 -DB_VERSION_FAIL=0 -DATABRICKS_EXPORT_FAIL=0 -IMPORT_SUCCESS=0 -CLEAR_PERSISTENCE_CACHE_SUCCESS=0 - -# Refresh CDD -if ! [ -z $INHIBIT_RECACHING_FROM_TOPBRAID ] ; then - # refresh cdd cache - bash $PORTAL_HOME/scripts/refresh-cdd-cache.sh - if [ $? -gt 0 ]; then - CDD_RECACHE_FAIL=1 - message="Failed to refresh CDD cache during CRDB PDX import!" - echo $message - fi -fi - -# Database Check -echo "Checking if database version is compatible" -$JAVA_BINARY $JAVA_IMPORTER_ARGS --check-db-version -if [ $? -eq 0 ] -then - echo "Database version expected by portal does not match version in database!" - DB_VERSION_FAIL=1 -fi - -# Create temp directory needed for import -if [ ! -d $TEMPO_TMPDIR ] ; then - mkdir $TEMPO_TMPDIR - if [ $? -ne 0 ] ; then - message="error : required temp directory does not exist and could not be created : $TEMPO_TMPDIR" - echo ${message} - exit 2 - fi -fi -if [[ -d "$TEMPO_TMPDIR" && "$TEMPO_TMPDIR" != "/" ]] ; then - rm -rf "$TEMPO_TMPDIR"/* -fi - -# Pull TEMPO dataset from Databricks -echo "Pulling TEMPO dataset from Databricks" -$PORTAL_HOME/scripts/cbioportal-databricks-gateway --catalog=$DATABRICKS_CATALOG --directory=$STUDY_DIRECTORY --host=$DATABRICKS_HOST --path=$DATABRICKS_PATH --port=$DATABRICKS_PORT --schema=$DATABRICKS_SCHEMA --token=$DATABRICKS_TOKEN -if [ $? -gt 0 ]; then - DATABRICKS_EXPORT_FAIL=1 - message="Failed to export study from Databricks" - echo $message - exit 2 -fi - -# Additional processing -# Add clinical data metadata headers -# Filter data issues - to be removed -# Generate caselists -# ------------------------- -python /data/portal-cron/scripts/add_clinical_attribute_metadata_headers.py -f /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo/data_clinical_sample.txt -python /data/portal-cron/scripts/add_clinical_attribute_metadata_headers.py -f /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo/data_clinical_patient.txt -$PYTHON_BINARY /data/portal-cron/scripts/merge.py -e /home/cbioportal_importer/tempo_subset_list --output-directory /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo --study-id msk_tempo --cancer-type mixed /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo -python /data/portal-cron/scripts/generate_case_lists.py -i msk_tempo -s /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo -c /data/portal-cron/scripts/case_list_config.tsv -d /data/portal-cron/tmp/import-tempo/pipelines-testing/studies/msk_tempo/case_lists -# ------------------------- - -if [[ $DB_VERSION_FAIL -eq 0 && $CDD_RECACHE_FAIL -eq 0 && $DATABRICKS_EXPORT_FAIL -eq 0 ]] ; then - echo "importing study data to database using $IMPORTER_JAR_FILENAME ..." - $JAVA_BINARY -Xmx16g $JAVA_IMPORTER_ARGS --update-study-data --portal hot-deploy --use-never-import --update-worksheet --notification-file "$importer_notification_file" --oncotree-version ${ONCOTREE_VERSION_TO_USE} --transcript-overrides-source mskcc - if [ $? -ne 0 ]; then - echo "$IMPORTER_JAR_LABEL import failed!" - else - echo "This was a success" - IMPORT_SUCCESS=1 - fi - num_studies_updated=`cat $TEMPO_TMPDIR/num_studies_updated.txt` - # clear persistence cache (note : this script is constructing studies for the msk portal, including mskimpact sample data - that is why the msk portal cache is cleared) - # # TODO: configure pipelines5 server to handle cache resets - if [[ $IMPORT_SUCCESS -ne 0 && $num_studies_updated -gt 0 ]]; then - echo "'$num_studies_updated' studies have been updated, clearing persistence cache for a portal ..." - exit 0 - if ! clearPersistenceCachesForMskPortals ; then - sendClearCacheFailureMessage msk import-pdx-data.sh - else - CLEAR_PERSISTENCE_CACHE_SUCCESS=1 - fi - else - echo "No studies have been updated, not clearing persistence cache for msk portal..." - CLEAR_PERSISTENCE_CACHE_SUCCESS=1 - fi -fi -exit 0 diff --git a/import-scripts/merge_dremio_clinical_data_into_cmo_access.sh b/import-scripts/merge_dremio_clinical_data_into_cmo_access.sh index 5f8eff564..e5c372ac7 100755 --- a/import-scripts/merge_dremio_clinical_data_into_cmo_access.sh +++ b/import-scripts/merge_dremio_clinical_data_into_cmo_access.sh @@ -74,9 +74,9 @@ FLOCK_FILEPATH="/data/portal-cron/cron-lock/merge_dremio_clinical_data_into_cmo_ echo "Error during determination of the destination database color" >&2 exit 1 fi - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" + IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-importer-$destination_database_color.jar" ENABLE_DEBUGGING=0 - JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$CMO_ACCESS_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-clickhouse-importer.log -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" + JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$CMO_ACCESS_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-cmo-importer.log -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" ONCOTREE_VERSION_TO_USE="oncotree_candidate_release" RUNMODE_PROD="runmode_prod" RUNMODE_DEV="runmode_dev" diff --git a/import-scripts/monitor-stalled-jobs.sh b/import-scripts/monitor-stalled-jobs.sh deleted file mode 100755 index 62aa390be..000000000 --- a/import-scripts/monitor-stalled-jobs.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -source /data/portal-cron/scripts/slack-message-functions.sh - -# converts timestamp (D:H:M:S) to seconds -function convert_to_seconds () { - elapsed_time=$1 - elapsed_time_in_seconds=`echo "$elapsed_time" | awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}'` - echo $elapsed_time_in_seconds -} - -# function for sending notification emails -function send_email_notification () { - process_name=$1 - hostname=`hostname` - ### FAILURE EMAIL ### - EMAIL_BODY="Following processes appear to be stalled.\nHostname: ${hostname}\ndate: ${now}\nrunning processes: see below\n\nCMD\tPID\tSTART_TIME\tETIME\n${process_name}\n" - echo -e "Sending email\n$EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "Alert: Import jobs stalled on ${hostname}" cbioportal-pipelines@cbioportal.org -} - -# Function for alerting slack channel of stalled jobs -function send_slack_warning_message () { - send_slack_message_to_channel "#msk-pipeline-logs" "string" "A stalled nightly import process has been detected. :tired_face:" -} - -# Array of process names being checked -checked_process_list=( - 'import_portal_users_genie.sh' - 'importUsers.py' - 'import-dmp-impact-data.sh' - 'import-temp-study.sh' - 'oncokb-annotator.sh' -) - -# Stalled times -mt_users_genie=$(( 5 * 60 )) # import_portal_users_genie.sh: 5 minutes -mt_import_users=$(( 30 * 60 )) # importUsers.py: 30 minutes -mt_import_dmp=$(( 10 * 60 * 60 )) # import-dmp-impact-data.sh: 10 hours -mt_import_temp_study=$(( 3 * 60 * 60 )) # import-temp-study.sh: 3 hours -mt_oncokb_annotator=$(( 4 * 60 * 60 )) # oncokb-annotator.sh: 4 hours -max_time=($mt_users_genie $mt_import_users $mt_import_dmp $mt_import_temp_study $mt_oncokb_annotator) -email_times=(0 0 0 0 0) - -while : -do - now=$(date "+%Y-%m-%d-%H-%M-%S") - echo date: ${now} : scanning for long running import jobs - myuidoutput=`id` - myuid=-1 - if [[ ${myuidoutput} =~ uid=([[:digit:]]+).* ]] ; then - myuid=${BASH_REMATCH[1]} - fi - if [ $myuid -eq -1 ] ; then - echo Error : could not determine uid - exit 1 - fi - - ex1="CMD" #exclude the header from ps - ex2="grep\|ps\|tail\|vim" #exclude grep, ps, tail, and vim commands - ex3="triage\|hot-deploy" #exclude triage or hot-deploy imports - ex4="scan-for-stalled-import-jobs\.sh" #exclude this command (scan-for-stalled-import-jobs.sh) - ex5="\.log" #exclude accesses to log files (less, cat, grep) - - for (( i=0; i<${#checked_process_list[@]}; i++ )); - do - # get ps for user and sort by elapsed time (take longest running process) and convert to seconds - ps_output=`export COLUMNS=24000 ; ps --user $myuid -o cmd:100,pid:15,start_time:15,etime:15 --sort=etime | grep "${checked_process_list[i]}" | grep -ve "$ex1\|$ex2\|$ex3\|$ex4\|$ex5" | sed 's/\s\s\s*/\t/g' | head -1` - ps_etime=`echo "$ps_output" | cut -f4 | sed 's/-/:/g'` - ps_etime_seconds=$(convert_to_seconds $ps_etime) - - # if process is not stalled then set date to current time so next break triggers email - if [ $ps_etime_seconds -le ${max_time[i]} ] ; then - email_times[i]="$(date +%H%M)" - else - # if process is stalled and current time is greater than 'email time' - send email and set email time to current time plus 3 hours - if [ $(date +%H%M) -gt ${email_times[i]} ] ; then - send_email_notification "$ps_output" - send_slack_warning_message - email_times[i]="$((10#$(date -d '+3 hours' +"%H%M")))" - fi - fi - done - - # kill script at 23:30 - restart at midnight in crontab - if [ $(date +"%H%M") -gt 2330 ] ; then - echo "Exiting monitoring script... script will restart at midnight" - exit 0 - fi - - # sleep 10 minutes before trying again - sleep 600 -done diff --git a/import-scripts/rds_functions.sh b/import-scripts/rds_functions.sh deleted file mode 100755 index eae503e25..000000000 --- a/import-scripts/rds_functions.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env bash - -DEFAULT_WAIT_TIMEOUT_FOR_RDS_INSTANCE_SCALING=3600 # seconds - -# Get current instance class -rds_current_class() { - local id="$1" - local profile="$2" - - aws rds describe-db-instances \ - --db-instance-identifier "$id" \ - --query 'DBInstances[0].DBInstanceClass' \ - --output text \ - --profile "$profile" -} - -# Get current instance status -rds_current_status() { - local id="$1" - local profile="$2" - - aws rds describe-db-instances \ - --db-instance-identifier "$id" \ - --query 'DBInstances[0].DBInstanceStatus' \ - --output text \ - --profile "$profile" -} - -# Start instance -rds_start() { - local id="$1" - local profile="$2" - - aws rds start-db-instance \ - --db-instance-identifier "$id" \ - --no-cli-pager \ - --profile "$profile" - aws rds wait db-instance-available \ - --db-instance-identifier "$id" \ - --profile "$profile" -} - -# Stop instance -rds_stop() { - local id="$1" - local profile="$2" - - aws rds stop-db-instance \ - --db-instance-identifier "$id" \ - --no-cli-pager \ - --profile "$profile" - - # The functionality to check whether an RDS instance is stopped doesn't yet exist from the AWS CLI; - # we have to emulate it ourselves manually - # aws rds wait db-instance-stopped --db-instance-identifier "$id" - - while true; do - STATUS=$(rds_current_status "$id" "$profile") - if [ "$STATUS" = "stopped" ]; then - # instance is stopped - break - fi - sleep 5 - done -} - -# Validate that a class is orderable for the instance's engine/version -rds_validate_class() { - local id="$1" - local profile="$2" - local new_class="$3" - local engine engine_version - - read -r engine engine_version <<<"$(aws rds describe-db-instances \ - --db-instance-identifier "$id" \ - --query 'DBInstances[0].[Engine,EngineVersion]' \ - --output text \ - --profile "$profile")" - - if [[ -z "$engine" || -z "$engine_version" ]]; then - echo "Unable to determine engine/version for '$id'" >&2 - return 1 - fi - - aws rds describe-orderable-db-instance-options \ - --engine "$engine" \ - --engine-version "$engine_version" \ - --query "OrderableDBInstanceOptions[?DBInstanceClass=='$new_class'].DBInstanceClass" \ - --output text \ - --profile "$profile" | grep -qw "$new_class" -} - -# Explicitly set class -rds_set_class() { - local id="$1" - local profile="$2" - local new_class="$3" - local timeout_seconds="${4:-$DEFAULT_WAIT_TIMEOUT_FOR_RDS_INSTANCE_SCALING}" - - if ! rds_validate_class "$id" "$profile" "$new_class"; then - echo "Invalid DB instance class '$new_class' for '$id'" >&2 - return 1 - fi - - aws rds modify-db-instance \ - --db-instance-identifier "$id" \ - --db-instance-class "$new_class" \ - --apply-immediately \ - --no-cli-pager \ - --profile "$profile" - - wait_for_class "$id" "$profile" "$new_class" "$timeout_seconds" -} - -# Wait for the instance class change to complete and become available -wait_for_class() { - local id="$1" - local profile="$2" - local new_class="$3" - local timeout="${4:-$DEFAULT_WAIT_TIMEOUT_FOR_RDS_INSTANCE_SCALING}" - local start now class status pending - - start=$(date +%s) - while true; do - read -r class status pending <<<"$(aws rds describe-db-instances \ - --db-instance-identifier "$id" \ - --query 'DBInstances[0].[DBInstanceClass,DBInstanceStatus,PendingModifiedValues.DBInstanceClass]' \ - --output text \ - --profile "$profile")" - - # Succeed when the new class is applied, instance is available, and no pending modification remains - if [[ "$class" == "$new_class" && "$status" == "available" && ( "$pending" == "None" || -z "$pending" ) ]]; then - return 0 - fi - now=$(date +%s) - if (( now - start >= timeout )); then - echo "Timed out waiting for $id to become $new_class (status=$status, pending=$pending, current=$class)" >&2 - return 1 - fi - sleep 10 - done -} diff --git a/import-scripts/rsync_jenkins_test_properties.sh b/import-scripts/rsync_jenkins_test_properties.sh deleted file mode 100755 index 8eeb9b7ba..000000000 --- a/import-scripts/rsync_jenkins_test_properties.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -PATH_TO_AUTOMATION_SCRIPT=/data/portal-cron/scripts/automation-environment.sh - -if ! [ -f "$PATH_TO_AUTOMATION_SCRIPT" ] ; then - echo "automation-environment.sh could not be found, exiting..." - exit 2 -fi - -source "$PATH_TO_AUTOMATION_SCRIPT" - -# jenkins server paths -JENKINS_SRV_HOSTNAME=jenkins_hostname_prod -JENKINS_SRV_HOME_DIRECTORY=/var/lib/jenkins -JENKINS_SRV_PROPERTIES_DIRECTORY=$JENKINS_SRV_HOME_DIRECTORY/pipelines-configuration/properties -JENKINS_SRV_SCRIPTS_DIRECTORY=$JENKINS_SRV_HOME_DIRECTORY/pipelines-configuration/jenkins -JENKINS_SRV_PIPELINES_CREDENTIALS=$JENKINS_SRV_HOME_DIRECTORY/pipelines-credentials -JENKINS_SRV_GIT_CREDENTIALS=$JENKINS_SRV_HOME_DIRECTORY/git-credentials -# local jenkins staging paths -LOCAL_PROPERTIES_DIRECTORY=$PIPELINES_CONFIG_HOME/properties/ -LOCAL_JENKINS_DIRECTORY=$PIPELINES_CONFIG_HOME/jenkins/ -LOCAL_PIPELINES_CREDENTIALS=$PORTAL_HOME/pipelines-credentials/ -LOCAL_GIT_CREDENTIALS=$PIPELINES_CONFIG_HOME/git/git-credentials - -source "$PORTAL_HOME/scripts/slack-message-functions.sh" - -# Function for alerting slack channel that something failed -function sendFailureMessageMskPipelineLogsSlack { - MESSAGE="$1" - send_slack_message_to_channel "#msk-pipeline-logs" "string" "$MESSAGE :boom:" -} - -cd $LOCAL_PROPERTIES_DIRECTORY -git pull -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when pulling pipelines-configuration git repo" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -rsync -a --delete $LOCAL_PROPERTIES_DIRECTORY $JENKINS_SRV_HOSTNAME:$JENKINS_SRV_PROPERTIES_DIRECTORY -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when rsync-ing properties to jenkins machine" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -rsync -a --delete $LOCAL_JENKINS_DIRECTORY $JENKINS_SRV_HOSTNAME:$JENKINS_SRV_SCRIPTS_DIRECTORY -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when rsync-ing jenkins scripts to jenkins machine" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -rsync -a --delete $LOCAL_PIPELINES_CREDENTIALS $JENKINS_SRV_HOSTNAME:$JENKINS_SRV_PIPELINES_CREDENTIALS -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when rsync-ing pipelines-credentials to jenkins machine" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -rsync -a --delete $LOCAL_GIT_CREDENTIALS $JENKINS_SRV_HOSTNAME:$JENKINS_SRV_HOME_DIRECTORY -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when rsync-ing git-credentials to jenkins machine" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -ssh $JENKINS_SRV_HOSTNAME /bin/bash << EOF - chmod -R 700 $JENKINS_SRV_PROPERTIES_DIRECTORY $JENKINS_SRV_PIPELINES_CREDENTIALS && - find $JENKINS_SRV_PROPERTIES_DIRECTORY -type f -exec chmod 600 {} \; && - find $JENKINS_SRV_PIPELINES_CREDENTIALS -type f -exec chmod 600 {} \; && - chmod 600 $JENKINS_SRV_GIT_CREDENTIALS; -EOF - -if [ $? -ne 0 ] ; then - FAILURE_MESSAGE="Something went wrong when setting permissions for properties/credentials" - sendFailureMessageMskPipelineLogsSlack "$FAILURE_MESSAGE" - exit 1 -fi - -echo "Successfully synced properties/credentials to jenkins machine" diff --git a/import-scripts/scale-rds.sh b/import-scripts/scale-rds.sh deleted file mode 100755 index 96c7f26b9..000000000 --- a/import-scripts/scale-rds.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env bash -set -eEuo pipefail - -DIRECTION="$1" -PORTAL_DATABASE="$2" -COLOR_SWAP_CONFIG_FILEPATH="$3" -SKIP_PRE_VALIDATION="${4:-}" - -[[ "$DIRECTION" == "up" || "$DIRECTION" == "down" ]] -[[ "$PORTAL_DATABASE" == "genie" || "$PORTAL_DATABASE" == "public" || "$PORTAL_DATABASE" == "msk" ]] -[[ -f "$COLOR_SWAP_CONFIG_FILEPATH" ]] - -source /data/portal-cron/scripts/automation-environment.sh -source /data/portal-cron/scripts/color-config-parsing-functions.sh -source /data/portal-cron/scripts/rds_functions.sh - -# Authenticate with AWS -# We choose the automation profile based on portal to ensure the RDS helpers hit the right account. -if [[ "$PORTAL_DATABASE" == "msk" ]]; then - /data/portal-cron/scripts/authenticate_service_account.sh eks - aws_profile="automation_eks" -else - /data/portal-cron/scripts/authenticate_service_account.sh public - aws_profile="automation_public" -fi - -get_node_id() { - # NOTE: the color names in these RDS nodes do *not* have anything to do - # with the blue-green switching that's deployed in production today. - # they are relics of old times. - case "$PORTAL_DATABASE" in - public) - echo "cbioportal-public-db-green" - ;; - genie) - echo "cbioportal-genie-db-blue" - ;; - msk) - echo "mskdb-cbioportal-blue" - ;; - *) - echo "Unsupported portal: $PORTAL_DATABASE" >&2 - exit 1 - ;; - esac -} - -err_mismatched_instance_class() { - echo "ERROR: trying to scale $DIRECTION when $rds_node_id node is already scaled $DIRECTION" >&2 - exit 1 -} - -err_failed_to_change_instance_class() { - echo "ERROR: failed to scale $rds_node_id node $DIRECTION" >&2 - exit 1 -} - -function warn_already_at_desired_class_and_exit() { - desired_class="$1" - scaling_direction="$2" - echo "WARN: instance class was already at desired class $desired_class when we would have expected to scale the node in $scaling_direction direction." >&2 - exit 0 -} - -# Get the scale up / scale down classes for this portal -echo "Reading configuration knobs from $COLOR_SWAP_CONFIG_FILEPATH" -scale_up_class=$(read_scalar_from_yaml "$COLOR_SWAP_CONFIG_FILEPATH" '.rds_scale_up_class') -scale_down_class=$(read_scalar_from_yaml "$COLOR_SWAP_CONFIG_FILEPATH" '.rds_scale_down_class') -rds_node_id=$(get_node_id) -current_class=$(rds_current_class "$rds_node_id" "$aws_profile") - -# If we are already at the desired size, log warning but exit without error -if [[ "$DIRECTION" == "up" ]]; then - if [[ "$current_class" == "$scale_up_class" ]] ; then - warn_already_at_desired_class_and_exit "$current_class" "$DIRECTION" - fi -else - if [[ "$current_class" == "$scale_down_class" ]] ; then - warn_already_at_desired_class_and_exit "$current_class" "$DIRECTION" - fi -fi - -if [[ "$SKIP_PRE_VALIDATION" != "--skip-pre-validation" ]]; then - # Validate the current class for the given direction - # We should be scaling up from a downsized node, and scaling down from an upsized node - echo "Validating RDS instance class pre-scaling" - - if [[ "$DIRECTION" == "up" ]]; then - [[ "$current_class" == "$scale_down_class" ]] || err_mismatched_instance_class - else - [[ "$current_class" == "$scale_up_class" ]] || err_mismatched_instance_class - fi -fi - -# Do the scaling -echo "Scaling node $DIRECTION" -if [[ "$DIRECTION" == "up" ]]; then - rds_set_class "$rds_node_id" "$aws_profile" "$scale_up_class" -else - rds_set_class "$rds_node_id" "$aws_profile" "$scale_down_class" -fi - -# After scaling: validate that the instance class was changed successfully -echo "Validating RDS instance class post-scaling" -new_class=$(rds_current_class "$rds_node_id" "$aws_profile") -if [[ "$DIRECTION" == "up" ]]; then - [[ "$new_class" == "$scale_up_class" ]] || err_failed_to_change_instance_class -else - [[ "$new_class" == "$scale_down_class" ]] || err_failed_to_change_instance_class -fi diff --git a/import-scripts/tempo-environment.sh b/import-scripts/tempo-environment.sh deleted file mode 100644 index 76234247b..000000000 --- a/import-scripts/tempo-environment.sh +++ /dev/null @@ -1,7 +0,0 @@ -DATABRICKS_PORT=443 -DATABRICKS_PATH= -DATABRICKS_HOST= -DATABRICKS_CATALOG=cdsi_public -DATABRICKS_SCHEMA=tempo -DATABRICKS_TOKEN= -STUDY_DIRECTORY= diff --git a/import-scripts/test_if_impact_has_lost_allele_count.sh b/import-scripts/test_if_impact_has_lost_allele_count.sh deleted file mode 100755 index 9e7607b18..000000000 --- a/import-scripts/test_if_impact_has_lost_allele_count.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" -export tab=$'\t' -tmp=$PORTAL_HOME/tmp/import-cron-dmp-msk -sourcefilename=$MSK_IMPACT_DATA_HOME/data_mutations_extended.txt - -#find header field indices -tempfilename=$(mktemp $tmp/mskimpact_mut_filed_test.XXXXXX) -head ${sourcefilename} | - grep Mutation_Status | - grep t_ref_count | - grep t_alt_count | - grep n_ref_count | - grep n_alt_count > ${tempfilename} -headerlinecount=$(cat ${tempfilename} | wc -l) -if [ ${headerlinecount} -ne 1 ] ; then - message="failed to complete mskimpact missing tumor/normal-ref/alt counts test because no header was found" - echo ${message} - echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST - rm -f ${tempfilename} - exit 1 -fi -mut_stat_index=$(awk -F '\t' -v col='Mutation_Status' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) -t_ref_index=$(awk -F '\t' -v col='t_ref_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) -t_alt_index=$(awk -F '\t' -v col='t_alt_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) -n_ref_index=$(awk -F '\t' -v col='n_ref_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) -n_alt_index=$(awk -F '\t' -v col='n_alt_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) -rm -f ${tempfilename}.fields - -# sanity check for empty indices -if [ -z ${mut_stat_index} ] || [ -z ${t_ref_index} ] || [ -z ${t_alt_index} ] || [ -z ${n_ref_index} ] || [ -z ${n_alt_index} ] ; then - message="failed to complete mskimpact missing tumor/normal-ref/alt counts test because header was missing a necessary field" - echo ${message} - echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST - rm ${tempfilename} - exit 2 -fi -neededfields=${mut_stat_index},${t_ref_index},${t_alt_index},${n_ref_index},${n_alt_index} -headerfilter="grep -v \"T01-IM3\"" -#count only records which are not germline (also ignore empty fields) -germlinefilter="grep SOMATIC\|UNKNOWN" -cut -f ${neededfields} ${sourcefilename} | ${headerfilter} | ${germlinefilter} | cut -f 2-5 > ${tempfilename} -sed "s/${tab}/,/g" ${tempfilename} | grep ",," > ${tempfilename}.blanks -recordswithblanks=$(cat ${tempfilename}.blanks | wc -l) -rm -f ${tempfilename} ${tempfilename}.blanks -if [ ${recordswithblanks} -gt 0 ] ; then - message="mskimpact missing tumor/normal-ref/alt counts test failed : ${recordswithblanks} (somatic/unknown) records with blanks found in field t_ref_count, t_alt_count, n_ref_count, or n_alt_count" - echo ${message} - echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST - exit 3 -fi -#test passed - no blanks found in somatic/unknown records -exit 0 diff --git a/import-scripts/update-msk-mind-cohort.sh b/import-scripts/update-msk-mind-cohort.sh deleted file mode 100755 index 5bb39336e..000000000 --- a/import-scripts/update-msk-mind-cohort.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash - -source $PORTAL_HOME/scripts/dmp-import-vars-functions.sh -source $PORTAL_HOME/scripts/set-data-source-environment-vars.sh - -echo $(date) - -if ! [ -d "$MSK_DMP_TMPDIR" ] ; then - if ! mkdir -p "$MSK_DMP_TMPDIR" ; then - echo "Error : could not create tmp directory '$MSK_DMP_TMPDIR'" >&2 - exit 1 - fi -fi -if [[ -d "$MSK_DMP_TMPDIR" && "$MSK_DMP_TMPDIR" != "/" ]] ; then - rm -rf "$MSK_DMP_TMPDIR"/* -fi - -if [ -z $JAVA_BINARY ] | [ -z $GIT_BINARY ] | [ -z $PORTAL_HOME ] | [ -z $MSK_MIND_DATA_HOME ] | [ -z $MSK_EXTRACT_COHORT_DATA_HOME ] ; then - message="test could not run update-msk-mind-cohort.sh: automation-environment.sh script must be run in order to set needed environment variables (like MSK_MIND_DATA_HOME, ...)" - echo $message - echo -e "$message" | mail -s "update-msk-mind-cohort failed to run." $PIPELINES_EMAIL_LIST - sendPreImportFailureMessageMskPipelineLogsSlack "$message" - exit 2 -fi - -source $PORTAL_HOME/scripts/clear-persistence-cache-shell-functions.sh - -# Get the current production database color -GET_DB_IN_PROD_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/get_database_currently_in_production.sh" -MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH="/data/portal-cron/pipelines-credentials/manage_msk_clickhouse_database_update_tools.properties" -current_production_database_color=$($GET_DB_IN_PROD_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH) -destination_database_color="unset" -if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" -fi -if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" -fi -if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 -fi -MSK_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" -MSK_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS $JAVA_DD_AGENT_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$MSK_DMP_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-dmp-clickhouse-importer.log -ea -cp $MSK_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" -EMAIL_NOTIFICATION_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/email-import-notification-after-import.sh" - -IMPORT_FAIL=0 -mskextract_notification_file=$(mktemp $MSK_DMP_TMPDIR/mskextract-portal-update-notification.$now.XXXXXX) -# update msk-mind github repo -fetch_updates_in_data_sources "msk-mind-datahub" - -# fetch ddp timeline data -printTimeStampedDataProcessingStepMessage "DDP demographics fetch for MSKEXTRACT" -mskextract_dmp_pids_file=$MSK_DMP_TMPDIR/mskextract_patient_list.txt -grep -v "^#" $MSK_EXTRACT_COHORT_DATA_HOME/data_clinical_patient.txt | cut -f1 | grep -v "PATIENT_ID" | sort | uniq > $mskextract_dmp_pids_file -MSKEXTRACT_DDP_DEMOGRAPHICS_RECORD_COUNT=$(wc -l < $mskextract_dmp_pids_file) -if [ $MSKEXTRACT_DDP_DEMOGRAPHICS_RECORD_COUNT -le $DEFAULT_DDP_DEMOGRAPHICS_ROW_COUNT ] ; then - MSKEXTRACT_DDP_DEMOGRAPHICS_RECORD_COUNT=$DEFAULT_DDP_DEMOGRAPHICS_ROW_COUNT -fi - -$JAVA_BINARY $JAVA_DDP_FETCHER_ARGS -c mskextract -p $mskextract_dmp_pids_file -f diagnosis,radiation,chemotherapy,surgery,survival -o $MSK_EXTRACT_COHORT_DATA_HOME -r $MSKEXTRACT_DDP_DEMOGRAPHICS_RECORD_COUNT -if [ $? -gt 0 ] ; then - bash $PORTAL_HOME/scripts/datasource-repo-cleanup.sh $PORTAL_DATA_HOME/msk-mind - sendPreImportFailureMessageMskPipelineLogsSlack "MSKEXTRACT DDP Timeline Fetch" -else - echo "commit ddp timeline data for MSKEXTRACT" - cd $MSK_MIND_DATA_HOME ; rm -f $MSK_EXTRACT_COHORT_DATA_HOME/data_clinical_ddp.txt ; $GIT_BINARY add $MSK_EXTRACT_COHORT_DATA_HOME/data_timeline* ; $GIT_BINARY commit -m "Latest MSKEXTRACT DDP timeline data" ; $GIT_BINARY push origin -fi - -# update mskextract cohort in portal -$JAVA_BINARY -Xmx64g $MSK_JAVA_IMPORTER_ARGS --update-study-data --portal msk-mind-portal --notification-file $mskextract_notification_file --oncotree-version $ONCOTREE_VERSION_TO_USE --transcript-overrides-source mskcc --disable-redcap-export -if [ $? -gt 0 ]; then - echo "MSKEXTRACT update failed!" - IMPORT_FAIL=1 - EMAIL_BODY="MSKEXTRACT update failed" - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "Update failure: MSKEXTRACT" $CMO_EMAIL_LIST -fi - -# get num studies updated -if [[ $? -eq 0 && -f "$TMP_DIRECTORY/num_studies_updated.txt" ]]; then - num_studies_updated=`cat $TMP_DIRECTORY/num_studies_updated.txt` -else - num_studies_updated=0 -fi - -#### ROB : 2025_08_17 - persistence cache reset will now happen at the color transition instead -##### clear persistence cache -####if [[ $IMPORT_FAIL -eq 0 && $num_studies_updated -gt 0 ]]; then -#### echo "'$num_studies_updated' studies have been updated, clearing persistence cache for msk portals..." -#### if ! clearPersistenceCachesForMskPortals ; then -#### sendClearCacheFailureMessage msk update-msk-mind-cohort.sh -#### fi -####else -#### echo "No studies have been updated, not clearing persistence cache for msk portals..." -####fi - -# clean up msk-mind repo and send notification file -bash $PORTAL_HOME/scripts/datasource-repo-cleanup.sh $PORTAL_DATA_HOME/msk-mind -$EMAIL_NOTIFICATION_SCRIPT_FILEPATH msk-mind-portal "$mskextract_notification_file" diff --git a/import-scripts/update-msk-spectrum-cohort.sh b/import-scripts/update-msk-spectrum-cohort.sh deleted file mode 100755 index ddfa435cb..000000000 --- a/import-scripts/update-msk-spectrum-cohort.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -source $PORTAL_HOME/scripts/dmp-import-vars-functions.sh -source $PORTAL_HOME/scripts/set-data-source-environment-vars.sh - -echo $(date) - -if ! [ -d "$MSK_DMP_TMPDIR" ] ; then - if ! mkdir -p "$MSK_DMP_TMPDIR" ; then - echo "Error : could not create tmp directory '$MSK_DMP_TMPDIR'" >&2 - exit 1 - fi -fi -if [[ -d "$MSK_DMP_TMPDIR" && "$MSK_DMP_TMPDIR" != "/" ]] ; then - rm -rf "$MSK_DMP_TMPDIR"/* -fi - -if [ -z $JAVA_BINARY ] | [ -z $GIT_BINARY ] | [ -z $PORTAL_HOME ] | [ -z $MSK_SHAHLAB_DATA_HOME ] | [ -z $MSK_SPECTRUM_COHORT_DATA_HOME ] ; then - message="test could not run update-msk-spectrum-cohort.sh: automation-environment.sh script must be run in order to set needed environment variables (like MSK_SHAHLAB_DATA_HOME, ...)" - echo $message - echo -e "$message" | mail -s "update-msk-spectrum-cohort failed to run." $PIPELINES_EMAIL_LIST - sendPreImportFailureMessageMskPipelineLogsSlack "$message" - exit 2 -fi - -source $PORTAL_HOME/scripts/clear-persistence-cache-shell-functions.sh - -# Get the current production database color -GET_DB_IN_PROD_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/get_database_currently_in_production.sh" -MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH="/data/portal-cron/pipelines-credentials/manage_msk_clickhouse_database_update_tools.properties" -current_production_database_color=$($GET_DB_IN_PROD_SCRIPT_FILEPATH $MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH) -destination_database_color="unset" -if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" -fi -if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" -fi -if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 -fi - -MSK_IMPORTER_JAR_FILENAME="/data/portal-cron/lib/msk-clickhouse-importer-$destination_database_color.jar" -MSK_JAVA_IMPORTER_ARGS="$JAVA_PROXY_ARGS $java_debug_args $JAVA_SSL_ARGS $JAVA_DD_AGENT_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$MSK_DMP_TMPDIR -Dlog4j.appender.a.File=/data/portal-cron/logs/msk-dmp-clickhouse-importer.log -ea -cp $MSK_IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" - -IMPORT_FAIL=0 -mskspectrum_notification_file=$(mktemp $MSK_DMP_TMPDIR/mskspectrum-portal-update-notification.$now.XXXXXX) -# update msk-spectrum github repo -fetch_updates_in_data_sources "datahub_shahlab" - -# update mskspectrum cohort in portal -$JAVA_BINARY -Xmx64g $MSK_JAVA_IMPORTER_ARGS --update-study-data --portal msk-spectrum-portal --notification-file $mskspectrum_notification_file --oncotree-version $ONCOTREE_VERSION_TO_USE --transcript-overrides-source mskcc --disable-redcap-export -if [ $? -gt 0 ]; then - echo "MSKSPECTRUM update failed!" - IMPORT_FAIL=1 - EMAIL_BODY="MSKSPECTRUM update failed" - echo -e "Sending email $EMAIL_BODY" - echo -e "$EMAIL_BODY" | mail -s "Update failure: MSKSPECTRUM" $CMO_EMAIL_LIST -fi - -# get num studies updated -if [[ $? -eq 0 && -f "$TMP_DIRECTORY/num_studies_updated.txt" ]]; then - num_studies_updated=`cat $TMP_DIRECTORY/num_studies_updated.txt` -else - num_studies_updated=0 -fi - -#### ROB : 2025_08_17 - persistence cache reset will now happen at the color transition instead -##### clear persistence cache -####if [[ $IMPORT_FAIL -eq 0 && $num_studies_updated -gt 0 ]]; then -#### echo "'$num_studies_updated' studies have been updated, clearing persistence cache for msk portals..." -#### if ! clearPersistenceCachesForMskPortals ; then -#### sendClearCacheFailureMessage msk update-msk-spectrum-cohort.sh -#### fi -####else -#### echo "No studies have been updated, not clearing persistence cache for msk portals..." -####fi - -# clean up msk-spectrum repo and send notification file -bash $PORTAL_HOME/scripts/datasource-repo-cleanup.sh $PORTAL_DATA_HOME/datahub_shahlab -EMAIL_NOTIFICATION_SCRIPT_FILEPATH="$PORTAL_HOME/scripts/email-import-notification-after-import.sh" -# $EMAIL_NOTIFICATION_SCRIPT_FILEPATH msk-spectrum-portal "$mskspectrum_notification_file" - -exit $IMPORT_FAIL From 0f7ea9695dfa0687d20badec9163b873892b70e6 Mon Sep 17 00:00:00 2001 From: James Ko Date: Mon, 11 May 2026 13:49:58 -0400 Subject: [PATCH 2/4] delete importer config --- .../automation-environment.sh | 155 ------------------ .../knowledgesystems-importer/mycrontab | 18 -- 2 files changed, 173 deletions(-) delete mode 100755 import-scripts/knowledgesystems-importer/automation-environment.sh delete mode 100644 import-scripts/knowledgesystems-importer/mycrontab diff --git a/import-scripts/knowledgesystems-importer/automation-environment.sh b/import-scripts/knowledgesystems-importer/automation-environment.sh deleted file mode 100755 index 4fb84558b..000000000 --- a/import-scripts/knowledgesystems-importer/automation-environment.sh +++ /dev/null @@ -1,155 +0,0 @@ -#!/bin/bash - -####################### -# general paths/options for system executables -####################### -export JAVA_PROXY_ARGS="-Dhttp.proxyPort=8080 -Dhttp.nonProxyHosts=draco.mskcc.org|pidvudb1.mskcc.org|phcrdbd2.mskcc.org|dashi-dev.cbio.mskcc.org|pipelines.cbioportal.mskcc.org|localhost" -export JAVA_HOME="/usr/lib/jdk-21.0.2" -export JAVA_BINARY="$JAVA_HOME/bin/java" -export PYTHON_BINARY=/usr/bin/python -export PYTHON3_BINARY=/usr/bin/python3 -export MAVEN_BINARY=/opt/apache-maven-3.8.6/bin/mvn -export HG_BINARY=/usr/bin/hg -export GIT_BINARY=/usr/bin/git -export YQ_BINARY=/home/cbioportal_importer/bin/yq -export PATH="/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/usr/local/bin:/opt/apache-maven-3.8.6/bin:/usr/local/go/bin:/home/cbioportal_importer/.local/bin:/home/cbioportal_importer/bin:/home/cbioportal_importer/tools/go/bin:/home/cbioportal_importer/tools/sling-cli:/home/cbioportal_importer/tools/clickhouse/bin" - -####################### -# environment variables for top-level data repositories / code bases -####################### -export PORTAL_HOME=/data/portal-cron -export PORTAL_DATA_HOME=$PORTAL_HOME/cbio-portal-data - -####################### -# environment variables for our git code bases -####################### -export PORTAL_GIT_HOME=$PORTAL_HOME/git-repos -export CMO_PIPELINES_HOME=$PORTAL_GIT_HOME/cmo-pipelines -export PIPELINES_HOME=$PORTAL_GIT_HOME/pipelines -export CBIOPORTAL_HOME=$PORTAL_GIT_HOME/cbioportal -export GENOME_NEXUS_ANNOTATOR_HOME=$PORTAL_GIT_HOME/genome-nexus-annotation-pipeline -export ANNOTATOR_JAR=$PORTAL_HOME/lib/annotationPipeline.jar -export ONCO_HOME=$PORTAL_GIT_HOME/oncotree -export ONCOKB_ANNOTATOR_HOME=$PORTAL_GIT_HOME/oncokb-annotator -export CDD_HOME=$PORTAL_GIT_HOME/clinical-data-dictionary -export DDP_CREDENTIALS_FILE=$PORTAL_HOME/pipelines-credentials/application-secure.properties -export AWS_SSL_TRUSTSTORE=$PORTAL_HOME/pipelines-credentials/AwsSsl.truststore -export AWS_SSL_TRUSTSTORE_PASSWORD_FILE=$PORTAL_HOME/pipelines-credentials/AwsSsl.truststore.password -export SLACK_URL_FILE=$PORTAL_HOME/pipelines-credentials/slack.url -export GMAIL_CREDS_FILE=$PORTAL_HOME/pipelines-credentials/gmail.credentials -export MAIL_SMTP_SERVER=$PORTAL_HOME/pipelines-credentials/mail.smtp.server -export PUBLIC_CLUSTER_KUBECONFIG=$PORTAL_HOME/pipelines-credentials/public-cluster-kubeconfig -export PUBLICARGOCD_CLUSTER_KUBECONFIG=$PORTAL_HOME/pipelines-credentials/publicargocd-cluster-kubeconfig -export EKS_CLUSTER_KUBECONFIG=$PORTAL_HOME/pipelines-credentials/eks-cluster-kubeconfig -export EKSARGOCD_CLUSTER_KUBECONFIG=$PORTAL_HOME/pipelines-credentials/eksargocd-cluster-kubeconfig - -####################### -# SSL args (for AWS + redcap) -####################### -export JAVA_SSL_ARGS="-Djavax.net.ssl.trustStore=$AWS_SSL_TRUSTSTORE -Djavax.net.ssl.trustStorePassword=`cat $AWS_SSL_TRUSTSTORE_PASSWORD_FILE`" - -####################### -# environment variables for configuration / properties files -####################### -export PORTAL_CONFIG_HOME=$PORTAL_GIT_HOME/portal-configuration -export PIPELINES_CONFIG_HOME=$PORTAL_GIT_HOME/pipelines-configuration -export GITHUB_CRONTAB_URL="https://api.github.com/repos/knowledgesystems/cmo-pipelines/contents/import-scripts/knowledgesystems-importer/mycrontab" - -####################### -# environment variables for top level data repositories -####################### -export BIC_LEGACY_DATA_HOME=$PORTAL_DATA_HOME/bic-mskcc-legacy -export CMO_ARGOS_DATA_HOME="$PORTAL_DATA_HOME/cmo-argos" -export PDX_DATA_HOME=$PORTAL_DATA_HOME/crdb_pdx -export PRIVATE_DATA_HOME=$PORTAL_DATA_HOME/private -export DMP_DATA_HOME=$PORTAL_DATA_HOME/dmp -export DMP_PRIVATE_DATA_HOME=$PORTAL_DATA_HOME/dmp-private -export FOUNDATION_DATA_HOME=$PORTAL_DATA_HOME/foundation -export IMPACT_DATA_HOME=$PORTAL_DATA_HOME/impact -export DATAHUB_DATA_HOME=$PORTAL_DATA_HOME/datahub/public -export MSK_MIND_DATA_HOME=$PORTAL_DATA_HOME/msk-mind -export MSK_SHAHLAB_DATA_HOME=$PORTAL_DATA_HOME/datahub_shahlab - -####################### -# environment variables used across import scripts -####################### -#export INHIBIT_RECACHING_FROM_TOPBRAID=true -export CASE_LIST_CONFIG_FILE=$PIPELINES_CONFIG_HOME/resources/case_list_config.tsv -export SKIP_VERIFICATION_OF_GENETIC_ALTERATION_COPIES="yes" -export SLING_GENETIC_ALTERATION_DATA_IN_CHUNKS="yes" - -####################### -# environment variables used in the fetch-and-import-dmp-impact-data script -####################### -# trigger files to communicate between fetch-dmp-data and import-dmp-data -export MSK_DMP_TMPDIR=$PORTAL_HOME/tmp/import-cron-dmp-msk -export MSK_IMPACT_CONSUME_TRIGGER=$MSK_DMP_TMPDIR/mskimpact_consume_trigger.txt -export MSK_HEMEPACT_CONSUME_TRIGGER=$MSK_DMP_TMPDIR/mskimpact_heme_consume_trigger.txt -export MSK_ARCHER_CONSUME_TRIGGER=$MSK_DMP_TMPDIR/mskarcher_consume_trigger.txt -export MSK_ACCESS_CONSUME_TRIGGER=$MSK_DMP_TMPDIR/mskaccess_consume_trigger.txt -export MSK_ARCHER_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/mskarcher_import_trigger.txt -export MSK_SOLID_HEME_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/msk_solid_heme_import_trigger.txt -export MSK_KINGS_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/kingscounty_import_trigger.txt -export MSK_LEHIGH_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/lehighvalley_import_trigger.txt -export MSK_QUEENS_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/queenscancercenter_import_trigger.txt -export MSK_MCI_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/miamicancerinstitute_import_trigger.txt -export MSK_HARTFORD_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/hartfordhealthcare_import_trigger.txt -export MSK_RALPHLAUREN_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/ralphlauren_import_trigger.txt -export MSK_RIKENGENESISJAPAN_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/msk_rikengenesisjapan_import_trigger.txt -export MSK_SCLC_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/sclc_mskimpact_import_trigger.txt -export MSKIMPACT_PED_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/mskimpact_ped_import_trigger.txt -export LYMPHOMA_SUPER_COHORT_IMPORT_TRIGGER=$MSK_DMP_TMPDIR/lymphoma_super_cohort_fmi_msk_import_trigger.txt -# data directories -export MSK_IMPACT_DATA_HOME=$DMP_DATA_HOME/mskimpact -export MSK_RAINDANCE_DATA_HOME=$DMP_DATA_HOME/mskraindance -export MSK_HEMEPACT_DATA_HOME=$DMP_DATA_HOME/mskimpact_heme -export MSK_ARCHER_DATA_HOME=$DMP_DATA_HOME/mskarcher -export MSK_ARCHER_UNFILTERED_DATA_HOME=$DMP_DATA_HOME/mskarcher_unfiltered -export MSK_ACCESS_DATA_HOME=$DMP_DATA_HOME/mskaccess -export MSK_IMPACT_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskimpact_private -export MSK_RAINDANCE_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskraindance_private -export MSK_HEMEPACT_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskimpact_heme_private -export MSK_ARCHER_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskarcher_private -export MSK_ARCHER_UNFILTERED_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskarcher_unfiltered_private -export MSK_ACCESS_PRIVATE_DATA_HOME=$DMP_PRIVATE_DATA_HOME/mskaccess_private -export MSK_MIXEDPACT_DATA_HOME=$DMP_DATA_HOME/mixedpact -export MSK_SOLID_HEME_DATA_HOME=$DMP_DATA_HOME/msk_solid_heme -export MSK_KINGS_DATA_HOME=$DMP_DATA_HOME/msk_kingscounty -export MSK_LEHIGH_DATA_HOME=$DMP_DATA_HOME/msk_lehighvalley -export MSK_QUEENS_DATA_HOME=$DMP_DATA_HOME/msk_queenscancercenter -export MSK_MCI_DATA_HOME=$DMP_DATA_HOME/msk_miamicancerinstitute -export MSK_HARTFORD_DATA_HOME=$DMP_DATA_HOME/msk_hartfordhealthcare -export MSK_RALPHLAUREN_DATA_HOME=$DMP_DATA_HOME/msk_ralphlauren -export MSK_RIKENGENESISJAPAN_DATA_HOME=$DMP_DATA_HOME/msk_rikengenesisjapan -export MSK_SCLC_DATA_HOME=$DMP_DATA_HOME/sclc_mskimpact_2017 -export MSKIMPACT_PED_DATA_HOME=$DMP_DATA_HOME/mskimpact_ped -export LYMPHOMA_SUPER_COHORT_DATA_HOME=$DMP_DATA_HOME/lymphoma_super_cohort_fmi_msk -export MSK_EXTRACT_COHORT_DATA_HOME=$MSK_MIND_DATA_HOME/datahub/msk_extract_cohort2_2019 -export MSK_SPECTRUM_COHORT_DATA_HOME=$MSK_SHAHLAB_DATA_HOME/msk_spectrum -# read-only data directories -export FMI_BATLEVI_DATA_HOME=$FOUNDATION_DATA_HOME/mixed/lymphoma/mskcc/foundation/lymph_landscape_fmi_201611 - -####################### -# environment variables used in the backup-redcap-data.sh script -####################### -export REDCAP_BACKUP_DATA_HOME=$PORTAL_DATA_HOME/redcap-snapshot -export MSKIMPACT_REDCAP_BACKUP=$REDCAP_BACKUP_DATA_HOME/mskimpact -export HEMEPACT_REDCAP_BACKUP=$REDCAP_BACKUP_DATA_HOME/mskimpact_heme -export ARCHER_REDCAP_BACKUP=$REDCAP_BACKUP_DATA_HOME/mskarcher -export ACCESS_REDCAP_BACKUP=$REDCAP_BACKUP_DATA_HOME/mskaccess - -####################### -# environment variables used in the import-pdx-data script -####################### -export CRDB_FETCHER_PDX_HOME=$PDX_DATA_HOME/crdb_pdx_raw_data - -####################### -# environment variables used for oncokb annotator script -####################### -export ONCOKB_TOKEN_FILE=$PORTAL_HOME/pipelines-credentials/oncokb.token - -####################### -# environment variables needed for openssl certificate verification with self-signed certificate -####################### -export SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" # needed to use the clickhouse CLI -export GIT_SSL_CAINFO="/etc/pki/tls/certs/ca-bundle.crt" # needed to use git diff --git a/import-scripts/knowledgesystems-importer/mycrontab b/import-scripts/knowledgesystems-importer/mycrontab deleted file mode 100644 index 0328fc591..000000000 --- a/import-scripts/knowledgesystems-importer/mycrontab +++ /dev/null @@ -1,18 +0,0 @@ -MAILTO=cbioportal-pipelines@cbioportal.org - -########################## -# Import Scripts (Comment these out when upgrading) -########################## - -########################## -# Monitors -########################## -0 0 * * * . /data/portal-cron/scripts/automation-environment.sh;/data/portal-cron/scripts/monitor-stalled-jobs.sh >> /data/portal-cron/logs/monitor-stalled-jobs.log 2>&1 || echo "Failure in crontab ($HOSTNAME): monitor-stalled-jobs.sh exited with non-zero exit status" | mail -r "cbioportal-pipelines@cbioportal.org" -s "Failure in crontab ($HOSTNAME)" cbioportal-pipelines@cbioportal.org -0 0 * * * . /data/portal-cron/scripts/automation-environment.sh;/data/portal-cron/scripts/monitor-crontab-version.sh >> /data/portal-cron/logs/monitor-crontab-version.log 2>&1 || echo "Failure in crontab ($HOSTNAME): monitor-crontab-version.sh exited with non-zero exit status" | mail -r "cbioportal-pipelines@cbioportal.org" -s "Failure in crontab ($HOSTNAME)" cbioportal-pipelines@cbioportal.org - -########################## -# Miscellaneous -########################## -0 0 * * * /usr/sbin/logrotate -f -s /data/portal-cron/logrotate.status /data/portal-cron/portal-cron-logrotate -# make sure we can execute all scripts, execute daily at midnight -55 23 * * * chmod u+x /data/portal-cron/scripts/*.sh /data/portal-cron/scripts/*.py || echo "Failure in crontab ($HOSTNAME): chmod exited with non-zero exit status" | mail -r "cbioportal-pipelines@cbioportal.org" -s "Failure in crontab ($HOSTNAME)" cbioportal-pipelines@cbioportal.org From 116f56b6d12a0772a133e49be7ec17116cba3ddf Mon Sep 17 00:00:00 2001 From: James Ko Date: Mon, 11 May 2026 13:51:47 -0400 Subject: [PATCH 3/4] add clickhouse_optimize_backoff_secs --- import-scripts/pipelines_eks/automation-environment.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/import-scripts/pipelines_eks/automation-environment.sh b/import-scripts/pipelines_eks/automation-environment.sh index ab78f0f6d..ebaf660c4 100755 --- a/import-scripts/pipelines_eks/automation-environment.sh +++ b/import-scripts/pipelines_eks/automation-environment.sh @@ -169,3 +169,7 @@ export DATABRICKS_CREDS_FILE=$PORTAL_HOME/pipelines-credentials/databricks.crede ####################### export SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" # needed to use the clickhouse CLI export GIT_SSL_CAINFO="/etc/pki/tls/certs/ca-bundle.crt" # needed to use git + + +###### needed for clickhouse derived table construction +export CLICKHOUSE_OPTIMIZE_BACKOFF_SECS=90 From 550217824dc7a6a719b7c2ec08a5f5663b277a09 Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 12 Jun 2026 19:27:24 +0000 Subject: [PATCH 4/4] fix: address post-RFC100 cleanup PR feedback - Rename airflow-import-sql.sh to airflow-import-direct-to-clickhouse.sh - Fix docstrings in genie/public DAGs (remove ClickHouse references) - Collapse extra blank lines in genie DAG - Remove stale 'disabled on pipelines5' comment - Restore monitor-stalled-jobs.sh and test_if_impact_has_lost_allele_count.sh --- dags/import_base.py | 3 +- dags/import_genie_dag.py | 19 +--- dags/import_public_dag.py | 4 +- ...=> airflow-import-direct-to-clickhouse.sh} | 0 import-scripts/monitor-stalled-jobs.sh | 93 +++++++++++++++++++ .../test_if_impact_has_lost_allele_count.sh | 53 +++++++++++ 6 files changed, 154 insertions(+), 18 deletions(-) rename import-scripts/{airflow-import-sql.sh => airflow-import-direct-to-clickhouse.sh} (100%) create mode 100755 import-scripts/monitor-stalled-jobs.sh create mode 100755 import-scripts/test_if_impact_has_lost_allele_count.sh diff --git a/dags/import_base.py b/dags/import_base.py index 15aab4ad8..5c34120ea 100644 --- a/dags/import_base.py +++ b/dags/import_base.py @@ -229,10 +229,9 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No scripts_dir, db_properties_filepath, ), - # reuse the old import-sql script for now "import_direct_to_clickhouse": _script( scripts_dir, - "airflow-import-sql.sh", + "airflow-import-direct-to-clickhouse.sh", importer, scripts_dir, db_properties_filepath, diff --git a/dags/import_genie_dag.py b/dags/import_genie_dag.py index 70a81fafd..43a402837 100644 --- a/dags/import_genie_dag.py +++ b/dags/import_genie_dag.py @@ -1,6 +1,6 @@ """ -import_genie_clickhouse_dag.py -Imports Genie study to ClickHouse database. +import_genie_dag.py +Imports Genie study. """ import os import sys @@ -12,31 +12,23 @@ def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["verify_management_state"] - tasks["verify_management_state"] >> [ tasks["fetch_data"], - tasks["clone_database"] + tasks["clone_database"], ] - [ tasks["fetch_data"], - tasks["clone_database"] + tasks["clone_database"], ] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_direct_to_clickhouse"] - tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] - tasks["create_derived_tables"] >> tasks["transfer_deployment"] - tasks["transfer_deployment"] >> [ tasks["cleanup_data"], - tasks["send_update_notification"] + tasks["send_update_notification"], ] - _GENIE_CLICKHOUSE_CONFIG = ImporterConfig( dag_id="import_genie_clickhouse_dag", description="Imports Genie study to ClickHouse database", @@ -58,7 +50,6 @@ def _wire(tasks: dict[str, object]) -> None: "set_import_abandoned", ), db_properties_filename="manage_genie_clickhouse_database_update_tools.properties", - # disabled on pipelines5 machine during testing phase color_swap_config_filename="genie-db-color-swap-config.yaml", params={ "data_repos": Param( diff --git a/dags/import_public_dag.py b/dags/import_public_dag.py index a8b769471..003264781 100644 --- a/dags/import_public_dag.py +++ b/dags/import_public_dag.py @@ -1,6 +1,6 @@ """ -import_public_clickhouse_dag.py -Imports to Public cBioPortal ClickHouse database. +import_public_dag.py +Imports to Public cBioPortal database. """ import os import sys diff --git a/import-scripts/airflow-import-sql.sh b/import-scripts/airflow-import-direct-to-clickhouse.sh similarity index 100% rename from import-scripts/airflow-import-sql.sh rename to import-scripts/airflow-import-direct-to-clickhouse.sh diff --git a/import-scripts/monitor-stalled-jobs.sh b/import-scripts/monitor-stalled-jobs.sh new file mode 100755 index 000000000..62aa390be --- /dev/null +++ b/import-scripts/monitor-stalled-jobs.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +source /data/portal-cron/scripts/slack-message-functions.sh + +# converts timestamp (D:H:M:S) to seconds +function convert_to_seconds () { + elapsed_time=$1 + elapsed_time_in_seconds=`echo "$elapsed_time" | awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}'` + echo $elapsed_time_in_seconds +} + +# function for sending notification emails +function send_email_notification () { + process_name=$1 + hostname=`hostname` + ### FAILURE EMAIL ### + EMAIL_BODY="Following processes appear to be stalled.\nHostname: ${hostname}\ndate: ${now}\nrunning processes: see below\n\nCMD\tPID\tSTART_TIME\tETIME\n${process_name}\n" + echo -e "Sending email\n$EMAIL_BODY" + echo -e "$EMAIL_BODY" | mail -s "Alert: Import jobs stalled on ${hostname}" cbioportal-pipelines@cbioportal.org +} + +# Function for alerting slack channel of stalled jobs +function send_slack_warning_message () { + send_slack_message_to_channel "#msk-pipeline-logs" "string" "A stalled nightly import process has been detected. :tired_face:" +} + +# Array of process names being checked +checked_process_list=( + 'import_portal_users_genie.sh' + 'importUsers.py' + 'import-dmp-impact-data.sh' + 'import-temp-study.sh' + 'oncokb-annotator.sh' +) + +# Stalled times +mt_users_genie=$(( 5 * 60 )) # import_portal_users_genie.sh: 5 minutes +mt_import_users=$(( 30 * 60 )) # importUsers.py: 30 minutes +mt_import_dmp=$(( 10 * 60 * 60 )) # import-dmp-impact-data.sh: 10 hours +mt_import_temp_study=$(( 3 * 60 * 60 )) # import-temp-study.sh: 3 hours +mt_oncokb_annotator=$(( 4 * 60 * 60 )) # oncokb-annotator.sh: 4 hours +max_time=($mt_users_genie $mt_import_users $mt_import_dmp $mt_import_temp_study $mt_oncokb_annotator) +email_times=(0 0 0 0 0) + +while : +do + now=$(date "+%Y-%m-%d-%H-%M-%S") + echo date: ${now} : scanning for long running import jobs + myuidoutput=`id` + myuid=-1 + if [[ ${myuidoutput} =~ uid=([[:digit:]]+).* ]] ; then + myuid=${BASH_REMATCH[1]} + fi + if [ $myuid -eq -1 ] ; then + echo Error : could not determine uid + exit 1 + fi + + ex1="CMD" #exclude the header from ps + ex2="grep\|ps\|tail\|vim" #exclude grep, ps, tail, and vim commands + ex3="triage\|hot-deploy" #exclude triage or hot-deploy imports + ex4="scan-for-stalled-import-jobs\.sh" #exclude this command (scan-for-stalled-import-jobs.sh) + ex5="\.log" #exclude accesses to log files (less, cat, grep) + + for (( i=0; i<${#checked_process_list[@]}; i++ )); + do + # get ps for user and sort by elapsed time (take longest running process) and convert to seconds + ps_output=`export COLUMNS=24000 ; ps --user $myuid -o cmd:100,pid:15,start_time:15,etime:15 --sort=etime | grep "${checked_process_list[i]}" | grep -ve "$ex1\|$ex2\|$ex3\|$ex4\|$ex5" | sed 's/\s\s\s*/\t/g' | head -1` + ps_etime=`echo "$ps_output" | cut -f4 | sed 's/-/:/g'` + ps_etime_seconds=$(convert_to_seconds $ps_etime) + + # if process is not stalled then set date to current time so next break triggers email + if [ $ps_etime_seconds -le ${max_time[i]} ] ; then + email_times[i]="$(date +%H%M)" + else + # if process is stalled and current time is greater than 'email time' - send email and set email time to current time plus 3 hours + if [ $(date +%H%M) -gt ${email_times[i]} ] ; then + send_email_notification "$ps_output" + send_slack_warning_message + email_times[i]="$((10#$(date -d '+3 hours' +"%H%M")))" + fi + fi + done + + # kill script at 23:30 - restart at midnight in crontab + if [ $(date +"%H%M") -gt 2330 ] ; then + echo "Exiting monitoring script... script will restart at midnight" + exit 0 + fi + + # sleep 10 minutes before trying again + sleep 600 +done diff --git a/import-scripts/test_if_impact_has_lost_allele_count.sh b/import-scripts/test_if_impact_has_lost_allele_count.sh new file mode 100755 index 000000000..9e7607b18 --- /dev/null +++ b/import-scripts/test_if_impact_has_lost_allele_count.sh @@ -0,0 +1,53 @@ +#!/bin/bash +PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" +export tab=$'\t' +tmp=$PORTAL_HOME/tmp/import-cron-dmp-msk +sourcefilename=$MSK_IMPACT_DATA_HOME/data_mutations_extended.txt + +#find header field indices +tempfilename=$(mktemp $tmp/mskimpact_mut_filed_test.XXXXXX) +head ${sourcefilename} | + grep Mutation_Status | + grep t_ref_count | + grep t_alt_count | + grep n_ref_count | + grep n_alt_count > ${tempfilename} +headerlinecount=$(cat ${tempfilename} | wc -l) +if [ ${headerlinecount} -ne 1 ] ; then + message="failed to complete mskimpact missing tumor/normal-ref/alt counts test because no header was found" + echo ${message} + echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST + rm -f ${tempfilename} + exit 1 +fi +mut_stat_index=$(awk -F '\t' -v col='Mutation_Status' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) +t_ref_index=$(awk -F '\t' -v col='t_ref_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) +t_alt_index=$(awk -F '\t' -v col='t_alt_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) +n_ref_index=$(awk -F '\t' -v col='n_ref_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) +n_alt_index=$(awk -F '\t' -v col='n_alt_count' 'NR==1{for (i=1; i<=NF; i++) if ($i==col) {print i;exit}}' $tempfilename) +rm -f ${tempfilename}.fields + +# sanity check for empty indices +if [ -z ${mut_stat_index} ] || [ -z ${t_ref_index} ] || [ -z ${t_alt_index} ] || [ -z ${n_ref_index} ] || [ -z ${n_alt_index} ] ; then + message="failed to complete mskimpact missing tumor/normal-ref/alt counts test because header was missing a necessary field" + echo ${message} + echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST + rm ${tempfilename} + exit 2 +fi +neededfields=${mut_stat_index},${t_ref_index},${t_alt_index},${n_ref_index},${n_alt_index} +headerfilter="grep -v \"T01-IM3\"" +#count only records which are not germline (also ignore empty fields) +germlinefilter="grep SOMATIC\|UNKNOWN" +cut -f ${neededfields} ${sourcefilename} | ${headerfilter} | ${germlinefilter} | cut -f 2-5 > ${tempfilename} +sed "s/${tab}/,/g" ${tempfilename} | grep ",," > ${tempfilename}.blanks +recordswithblanks=$(cat ${tempfilename}.blanks | wc -l) +rm -f ${tempfilename} ${tempfilename}.blanks +if [ ${recordswithblanks} -gt 0 ] ; then + message="mskimpact missing tumor/normal-ref/alt counts test failed : ${recordswithblanks} (somatic/unknown) records with blanks found in field t_ref_count, t_alt_count, n_ref_count, or n_alt_count" + echo ${message} + echo -e "${message}" | mail -s "MSKIMPACT scan for missing tumor/normal-ref/alt counts failure" $PIPELINES_EMAIL_LIST + exit 3 +fi +#test passed - no blanks found in somatic/unknown records +exit 0