From 322ab2bf82bf28dbd42a9900c711a2ab4b409804 Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:09:59 -0400 Subject: [PATCH 1/6] remove unused dags --- dags/import_msk_dag.py | 56 -------------------------------- dags/import_review_dag.py | 47 --------------------------- dags/import_triage_dag.py | 67 --------------------------------------- 3 files changed, 170 deletions(-) delete mode 100644 dags/import_msk_dag.py delete mode 100644 dags/import_review_dag.py delete mode 100644 dags/import_triage_dag.py diff --git a/dags/import_msk_dag.py b/dags/import_msk_dag.py deleted file mode 100644 index 1409214da..000000000 --- a/dags/import_msk_dag.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -import_msk_dag.py -Imports MSK study to MySQL and ClickHouse databases using blue/green deployment strategy. -This DAG is commented out because further work is needed to migrate the full MSK import process to Airflow. -""" - -""" -import os -import sys - -from airflow.models.param import Param - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_base import ImporterConfig, build_import_dag - - -def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["verify_management_state"] >> [tasks["fetch_data"], tasks["clone_database"]] - [tasks["fetch_data"], tasks["clone_database"]] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_sql"] >> tasks["import_clickhouse"] >> tasks["transfer_deployment"] >> tasks["set_import_abandoned"] >> tasks["cleanup_data"] - -_MSK_CONFIG = ImporterConfig( - dag_id="import_msk_dag", - description="Imports MSK study to MySQL and ClickHouse databases using blue/green deployment strategy", - importer="msk", - tags=["msk"], - target_nodes=("pipelines3_ssh",), - data_nodes=("pipelines3_ssh",), - task_names=( - "verify_management_state", - "clone_database", - "fetch_data", - "setup_import", - "import_sql", - "import_clickhouse", - "transfer_deployment", - "set_import_abandoned", 
- "cleanup_data", - ), - db_properties_filename="manage_msk_database_update_tools.properties", - color_swap_config_filename="msk-db-color-swap-config.yaml", - params={ - "data_repos": Param( - ["datahub"], - type="array", - description="Comma-separated list of data repositories to pull updates from/cleanup.", - title="Data Repositories", - examples=["datahub", "impact", "private"], - ), - }, - wire_dependencies=_wire, -) - -globals()[_MSK_CONFIG.dag_id] = build_import_dag(_MSK_CONFIG) -""" -pass diff --git a/dags/import_review_dag.py b/dags/import_review_dag.py deleted file mode 100644 index 7f99a2c3a..000000000 --- a/dags/import_review_dag.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -import_review_dag.py -Imports Review study to MySQL database. -""" -import os -import sys - -from airflow.models.param import Param - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_base import ImporterConfig, build_import_dag - -def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["fetch_data"] - tasks["fetch_data"] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_sql"] >> tasks["send_update_notification"] >> tasks["cleanup_data"] - -_REVIEW_CONFIG = ImporterConfig( - dag_id="import_review_dag", - description="Imports Review study to MySQL database", - importer="review", - tags=["review"], - target_nodes=("importer_ssh",), - data_nodes=("importer_ssh",), - task_names=( - "fetch_data", - "setup_import", - "import_sql", - # The review portal doesn't have a persistence cache like triage, hence no clear_persistence_caches here - "send_update_notification", - "cleanup_data", - ), - db_properties_filename="manage_review_database_update_tools.properties", - color_swap_config_filename=None, # Not used for MySQL - params={ - "data_repos": Param( - ["datahub-publicdbv7"], - type="array", - description="Comma-separated list of data repositories to pull updates from/cleanup.", - title="Data Repositories", - 
examples=["datahub-publicdbv7"], - ), - }, - wire_dependencies=_wire, -) - -globals()[_REVIEW_CONFIG.dag_id] = build_import_dag(_REVIEW_CONFIG) diff --git a/dags/import_triage_dag.py b/dags/import_triage_dag.py deleted file mode 100644 index 2d9e2ae2b..000000000 --- a/dags/import_triage_dag.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -import_triage_dag.py -Imports Triage study to MySQL database. -""" -import os -import sys - -from airflow.models.param import Param - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from dags.import_base import ImporterConfig, build_import_dag - -def _wire(tasks: dict[str, object]) -> None: - tasks["data_repos"] >> tasks["fetch_data"] - tasks["fetch_data"] >> tasks["setup_import"] - tasks["setup_import"] >> tasks["import_sql"] >> tasks["clear_persistence_caches"] >> tasks["send_update_notification"] >> tasks["cleanup_data"] - -_TRIAGE_CONFIG = ImporterConfig( - dag_id="import_triage_dag", - description="Imports Triage study to MySQL database", - importer="triage", - tags=["triage"], - target_nodes=("pipelines3_ssh",), - data_nodes=("pipelines3_ssh",), - task_names=( - "fetch_data", - "setup_import", - "import_sql", - "clear_persistence_caches", - "send_update_notification", - "cleanup_data", - ), - db_properties_filename="manage_triage_database_update_tools.properties", - color_swap_config_filename=None, # Not used for MySQL - params={ - "data_repos": Param( - [ - "datahub", - "cmo-argos", - "private", - "impact" - ], - type="array", - description="Comma-separated list of data repositories to pull updates from/cleanup.", - title="Data Repositories", - examples=[ - "datahub", - "bic-mskcc-legacy", - "cmo-argos", - "private", - "impact", - "knowledge-systems-curated-studies", - "datahub_shahlab", - "msk-mind-datahub", - "pipelines-testing", - "genie", - "extract-projects", - "cmo-access" - ], - ), - }, - wire_dependencies=_wire, - pool="triage_import_pool", - schedule_interval="0 0 * * *", -) - 
-globals()[_TRIAGE_CONFIG.dag_id] = build_import_dag(_TRIAGE_CONFIG) From ee0c8cd9a000ab3ce5880e40d9b8769162f32ac3 Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:15:05 -0400 Subject: [PATCH 2/6] remove unused code --- .../airflow-clear-persistence-caches.sh | 23 ------- import-scripts/airflow-import-sql.sh | 66 +++++-------------- import-scripts/airflow-setup-import.sh | 65 ++++++------------ 3 files changed, 37 insertions(+), 117 deletions(-) delete mode 100644 import-scripts/airflow-clear-persistence-caches.sh diff --git a/import-scripts/airflow-clear-persistence-caches.sh b/import-scripts/airflow-clear-persistence-caches.sh deleted file mode 100644 index edb445b8e..000000000 --- a/import-scripts/airflow-clear-persistence-caches.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Script for clearing cBioPortal persistence cache from Airflow -# This script is only called for the triage portal, which still uses the MySQL database. -# For the ClickHouse portals, clearing the persistence cache is done as part of the transfer deployment step - -PORTAL_DATABASE=$1 -PORTAL_SCRIPTS_DIRECTORY=$2 - -source "$PORTAL_SCRIPTS_DIRECTORY/automation-environment.sh" -source "$PORTAL_SCRIPTS_DIRECTORY/clear-persistence-cache-shell-functions.sh" - -echo "Clearing persistence caches for $PORTAL_DATABASE portal..." -case "$PORTAL_DATABASE" in - triage) - clearPersistenceCachesForTriagePortals - ;; - *) - echo "Unrecognized portal database: $PORTAL_DATABASE" >&2 - exit 1 - ;; -esac -echo "Persistence caches for $PORTAL_DATABASE cleared successfully" diff --git a/import-scripts/airflow-import-sql.sh b/import-scripts/airflow-import-sql.sh index fc3763c1f..1440f2c34 100755 --- a/import-scripts/airflow-import-sql.sh +++ b/import-scripts/airflow-import-sql.sh @@ -19,10 +19,6 @@ if [ ! 
-f "$AUTOMATION_ENV_SCRIPT_FILEPATH" ] ; then fi source "$AUTOMATION_ENV_SCRIPT_FILEPATH" -function is_mysql_import() { - [[ "$PORTAL_DATABASE" = "triage" || "$PORTAL_DATABASE" = "review" ]] -} - # Set needed paths/filenames for import case "$PORTAL_DATABASE" in genie) @@ -39,13 +35,6 @@ case "$PORTAL_DATABASE" in PORTAL_NAME="public-portal" ONCOTREE_VERSION="oncotree_latest_stable" ;; - triage) - TMP_DIR_NAME="import-cron-triage" - IMPORTER_NAME="triage-cmo" - LOG_FILE_NAME="triage-cmo-importer.log" - PORTAL_NAME="triage-portal" - ONCOTREE_VERSION="oncotree_candidate_release" - ;; triage-clickhouse) TMP_DIR_NAME="import-cron-triage-clickhouse" IMPORTER_NAME="triage-clickhouse" @@ -53,49 +42,30 @@ case "$PORTAL_DATABASE" in PORTAL_NAME="triage-portal" ONCOTREE_VERSION="oncotree_candidate_release" ;; - review) - TMP_DIR_NAME="import-cron-review" - IMPORTER_NAME="review" - LOG_FILE_NAME="review-importer.log" - PORTAL_NAME="hgnc-portal" - ONCOTREE_VERSION="oncotree_latest_stable" - # Need to set a different PORTAL_DATA_HOME instead of pulling from the same Datahub clone as public - export PORTAL_DATA_HOME="/data2/portal-cron/cbio-portal-data-publicdbv7-rebuild" - ;; -# msk) -# TMP_DIR_NAME="import-cron-msk" -# IMPORTER_NAME="msk-cmo" -# LOG_FILE_NAME="msk-cmo-importer.log" -# PORTAL_NAME="msk-automation-portal" -# ;; *) echo "Unsupported portal database: $PORTAL_DATABASE" >&2 exit 1 ;; esac -if ! 
is_mysql_import; then - # Get the current production database color - GET_DB_IN_PROD_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/get_database_currently_in_production.sh" - current_production_database_color=$(sh "$GET_DB_IN_PROD_SCRIPT_FILEPATH" "$MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH") - destination_database_color="unset" - if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" - fi - if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" - fi - if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 - fi - - # eg. genie-aws-importer-blue.jar - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" -else - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer.jar" +# Get the current production database color +GET_DB_IN_PROD_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/get_database_currently_in_production.sh" +current_production_database_color=$(sh "$GET_DB_IN_PROD_SCRIPT_FILEPATH" "$MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH") +destination_database_color="unset" +if [ ${current_production_database_color:0:4} == "blue" ] ; then + destination_database_color="green" +fi +if [ ${current_production_database_color:0:5} == "green" ] ; then + destination_database_color="blue" +fi +if [ "$destination_database_color" == "unset" ] ; then + echo "Error during determination of the destination database color" >&2 + exit 1 fi +# eg. genie-aws-importer-blue.jar +IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" + tmp="${PORTAL_HOME}/tmp/${TMP_DIR_NAME}" JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" @@ -125,9 +95,7 @@ tail -f "$PORTAL_HOME/logs/$LOG_FILE_NAME" & TAIL_PID=$! 
trap 'kill "$TAIL_PID" 2>/dev/null; wait "$TAIL_PID" 2>/dev/null' EXIT INT TERM -if ! is_mysql_import; then - echo "Destination DB color: $destination_database_color" -fi +echo "Destination DB color: $destination_database_color" echo "Importing with $IMPORTER_JAR_FILENAME" echo "Importing cancer type updates into $destination_database_color mysql database" $JAVA_BINARY -Xmx16g $JAVA_IMPORTER_ARGS --import-types-of-cancer --oncotree-version $ONCOTREE_VERSION diff --git a/import-scripts/airflow-setup-import.sh b/import-scripts/airflow-setup-import.sh index 8d1567a80..2ca217990 100755 --- a/import-scripts/airflow-setup-import.sh +++ b/import-scripts/airflow-setup-import.sh @@ -20,11 +20,6 @@ if [ ! -f "$AUTOMATION_ENV_SCRIPT_FILEPATH" ] ; then fi source "$AUTOMATION_ENV_SCRIPT_FILEPATH" -# Helper: returns success for MySQL-style imports (no blue/green), otherwise failure -is_mysql_import() { - [[ "$PORTAL_DATABASE" == "triage" || "$PORTAL_DATABASE" == "review" ]] -} - # Configure names/paths based on portal database case "$PORTAL_DATABASE" in genie) @@ -39,12 +34,6 @@ case "$PORTAL_DATABASE" in LOG_FILE_NAME="public-data-importer.log" PORTAL_NAME="public-portal" ;; - triage) - TMP_DIR_NAME="import-cron-triage" - IMPORTER_NAME="triage-cmo" - LOG_FILE_NAME="triage-cmo-importer.log" - PORTAL_NAME="triage-portal" - ;; triage-clickhouse) TMP_DIR_NAME="import-cron-triage-clickhouse" IMPORTER_NAME="triage-clickhouse" @@ -52,14 +41,6 @@ case "$PORTAL_DATABASE" in PORTAL_NAME="triage-portal" ONCOTREE_VERSION="oncotree_candidate_release" ;; - review) - TMP_DIR_NAME="import-cron-review" - IMPORTER_NAME="review" - LOG_FILE_NAME="review-importer.log" - PORTAL_NAME="hgnc-portal" - # Need to set a different PORTAL_DATA_HOME instead of pulling from the same Datahub clone as public - export PORTAL_DATA_HOME="/data2/portal-cron/cbio-portal-data-publicdbv7-rebuild" - ;; msk) TMP_DIR_NAME="import-cron-msk" IMPORTER_NAME="msk-cmo" @@ -72,31 +53,27 @@ case "$PORTAL_DATABASE" in ;; esac -if 
! is_mysql_import; then - # Get the current production database color - GET_DB_IN_PROD_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/get_database_currently_in_production.sh" - current_production_database_color=$(sh "$GET_DB_IN_PROD_SCRIPT_FILEPATH" "$MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH") - destination_database_color="unset" - if [ ${current_production_database_color:0:4} == "blue" ] ; then - destination_database_color="green" - fi - if [ ${current_production_database_color:0:5} == "green" ] ; then - destination_database_color="blue" - fi - if [ "$destination_database_color" == "unset" ] ; then - echo "Error during determination of the destination database color" >&2 - exit 1 - fi +# Get the current production database color +GET_DB_IN_PROD_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/get_database_currently_in_production.sh" +current_production_database_color=$(sh "$GET_DB_IN_PROD_SCRIPT_FILEPATH" "$MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH") +destination_database_color="unset" +if [ ${current_production_database_color:0:4} == "blue" ] ; then + destination_database_color="green" +fi +if [ ${current_production_database_color:0:5} == "green" ] ; then + destination_database_color="blue" +fi +if [ "$destination_database_color" == "unset" ] ; then + echo "Error during determination of the destination database color" >&2 + exit 1 +fi - if [ "$PORTAL_DATABASE" != "msk" ]; then - # eg. genie-aws-importer-blue.jar - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" - else - # msk importer follows different naming convention (why??), eg. msk-cmo-blue-importer.jar - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-${destination_database_color}-importer.jar" - fi +if [ "$PORTAL_DATABASE" != "msk" ]; then + # eg. 
genie-aws-importer-blue.jar + IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" else - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer.jar" + # msk importer follows different naming convention (why??), eg. msk-cmo-blue-importer.jar + IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-${destination_database_color}-importer.jar" fi tmp="$PORTAL_HOME/tmp/$TMP_DIR_NAME" @@ -108,9 +85,7 @@ tail -f "$PORTAL_HOME/logs/$LOG_FILE_NAME" & TAIL_PID=$! trap 'kill "$TAIL_PID" 2>/dev/null; wait "$TAIL_PID" 2>/dev/null' EXIT INT TERM -if ! is_mysql_import; then - echo "Destination DB color: $destination_database_color" -fi +echo "Destination DB color: $destination_database_color" echo "Using importer JAR: $IMPORTER_JAR_FILENAME" # Database check From 442100ab9546381d6324c44a92d6a65df3847ede Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:20:03 -0400 Subject: [PATCH 3/6] remove airflow task --- dags/import_clickhouse_base.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dags/import_clickhouse_base.py b/dags/import_clickhouse_base.py index 808ae40bc..959d87e09 100644 --- a/dags/import_clickhouse_base.py +++ b/dags/import_clickhouse_base.py @@ -225,7 +225,6 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, ), # reuse the old import-sql script for now - # TODO: we might need to update the send_update_notification code here "import_direct_to_clickhouse": _script( scripts_dir, "airflow-import-sql.sh", @@ -241,12 +240,6 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, color_swap_config_filepath, ), - "clear_persistence_caches": _script( - scripts_dir, - "airflow-clear-persistence-caches.sh", - importer, - scripts_dir, - ), "set_import_running": _script( scripts_dir, "set_update_process_state.sh", From 28bdbf049fce1f254762bc5d9cf8f2c77d73c9f3 Mon Sep 17 
00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:22:02 -0400 Subject: [PATCH 4/6] more dead code --- .../clear-persistence-cache-shell-functions.sh | 15 --------------- .../clear_cbioportal_persistence_cache.sh | 8 -------- 2 files changed, 23 deletions(-) diff --git a/import-scripts/clear-persistence-cache-shell-functions.sh b/import-scripts/clear-persistence-cache-shell-functions.sh index eae53e6d1..93ed11cdb 100755 --- a/import-scripts/clear-persistence-cache-shell-functions.sh +++ b/import-scripts/clear-persistence-cache-shell-functions.sh @@ -49,16 +49,6 @@ function clearPersistenceCachesForMskGreenPortals() { fi } -function clearPersistenceCachesForTriagePortals() { - all_triage_portals="triage" - clearPersistenceCachesForPortals "$all_triage_portals" -} - -function clearPersistenceCachesForHgncPortals() { - all_hgnc_portals="hgnc" - clearPersistenceCachesForPortals "$all_hgnc_portals" -} - function clearPersistenceCachesForPublicPortals() { all_public_portals="public" clearPersistenceCachesForPortals "$all_public_portals" @@ -84,11 +74,6 @@ function clearPersistenceCachesForGenieGreenPortals() { clearPersistenceCachesForPortals "$all_genie_portals" } -function clearPersistenceCachesForCrdcPortals() { - all_crdc_portals="crdc" - clearPersistenceCachesForPortals "$all_crdc_portals" -} - function clearPersistenceCachesForTriageBluePortals() { all_triage_portals="triage-blue" clearPersistenceCachesForPortals "$all_triage_portals" diff --git a/import-scripts/clear_cbioportal_persistence_cache.sh b/import-scripts/clear_cbioportal_persistence_cache.sh index 0717f3700..fe979758a 100755 --- a/import-scripts/clear_cbioportal_persistence_cache.sh +++ b/import-scripts/clear_cbioportal_persistence_cache.sh @@ -42,10 +42,8 @@ portal_to_cluster_map["genie-private-blue"]="$CLUSTER_ID_PUBLICARGOCD" portal_to_cluster_map["genie-private-green"]="$CLUSTER_ID_PUBLICARGOCD" # pipelines3 -portal_to_cluster_map["triage"]="$CLUSTER_ID_EKSARGOCD" 
portal_to_cluster_map["triage-blue"]="$CLUSTER_ID_EKSARGOCD" portal_to_cluster_map["triage-green"]="$CLUSTER_ID_EKSARGOCD" -portal_to_cluster_map["hgnc"]="$CLUSTER_ID_EKSARGOCD" portal_to_cluster_map["msk-beta-blue"]="$CLUSTER_ID_EKSARGOCD" portal_to_cluster_map["msk-beta-green"]="$CLUSTER_ID_EKSARGOCD" portal_to_cluster_map["msk-blue"]="$CLUSTER_ID_EKSARGOCD" @@ -71,10 +69,8 @@ portal_to_deployment_map["genie-public-green"]="cbioportal-backend-genie-public- portal_to_deployment_map["genie-private-blue"]="cbioportal-backend-genie-private-blue" portal_to_deployment_map["genie-private-green"]="cbioportal-backend-genie-private-green" # pipelines3 -portal_to_deployment_map["triage"]="eks-triage" portal_to_deployment_map["triage-blue"]="eks-triage-blue" portal_to_deployment_map["triage-green"]="eks-triage-green" -portal_to_deployment_map["hgnc"]="eks-hgnc" portal_to_deployment_map["msk-beta-blue"]="eks-msk-beta-blue" portal_to_deployment_map["msk-beta-green"]="eks-msk-beta-green" portal_to_deployment_map["msk-blue"]="eks-msk-blue" @@ -102,10 +98,8 @@ portal_to_cache_service_basename["genie-public-green"]="cbioportal-genie-persist portal_to_cache_service_basename["genie-private-blue"]="cbioportal-genie-persistence-redis" portal_to_cache_service_basename["genie-private-green"]="cbioportal-genie-persistence-redis" # pipelines3 -portal_to_cache_service_basename["triage"]="triage-cbioportal-persistence-redis" portal_to_cache_service_basename["triage-blue"]="triage-cbioportal-persistence-redis" portal_to_cache_service_basename["triage-green"]="triage-cbioportal-persistence-redis" -portal_to_cache_service_basename["hgnc"]="" portal_to_cache_service_basename["msk-beta-blue"]="eks-msk-cbioportal-persistence-redis" portal_to_cache_service_basename["msk-beta-green"]="eks-msk-cbioportal-persistence-redis" portal_to_cache_service_basename["msk-blue"]="eks-msk-cbioportal-persistence-redis" @@ -131,10 +125,8 @@ portal_to_cache_database_number["genie-public-green"]="2" 
portal_to_cache_database_number["genie-private-blue"]="3" portal_to_cache_database_number["genie-private-green"]="4" # pipelines3 -portal_to_cache_database_number["triage"]="1" portal_to_cache_database_number["triage-blue"]="10" portal_to_cache_database_number["triage-green"]="11" -portal_to_cache_database_number["hgnc"]="unassigned" portal_to_cache_database_number["msk-beta-blue"]="6" portal_to_cache_database_number["msk-beta-green"]="7" portal_to_cache_database_number["msk-blue"]="8" From 6d80dae5cf00e95211595f93cd5a1fb790095f87 Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:23:56 -0400 Subject: [PATCH 5/6] remove dead code branch (msk doesn't run from airflow) --- import-scripts/airflow-create-derived-tables.sh | 3 --- import-scripts/airflow-import-clickhouse.sh | 3 --- import-scripts/airflow-setup-import.sh | 15 ++------------- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/import-scripts/airflow-create-derived-tables.sh b/import-scripts/airflow-create-derived-tables.sh index e19bcab1e..29735a5b3 100644 --- a/import-scripts/airflow-create-derived-tables.sh +++ b/import-scripts/airflow-create-derived-tables.sh @@ -60,9 +60,6 @@ fi if [ "$PORTAL_DATABASE" == "genie" ] ; then clickhouse_schema_branch_name="genie-portal-db-clickhouse-sql-for-import" fi -if [ "$PORTAL_DATABASE" == "msk" ] ; then - clickhouse_schema_branch_name="msk-portal-db-clickhouse-sql-for-import" -fi if [ "$PORTAL_DATABASE" == "triage-clickhouse" ]; then clickhouse_schema_branch_name="triage-portal-db-clickhouse-sql-for-import" fi diff --git a/import-scripts/airflow-import-clickhouse.sh b/import-scripts/airflow-import-clickhouse.sh index e87a283d0..1e84eef13 100755 --- a/import-scripts/airflow-import-clickhouse.sh +++ b/import-scripts/airflow-import-clickhouse.sh @@ -79,9 +79,6 @@ fi if [ "$PORTAL_DATABASE" == "genie" ] ; then clickhouse_schema_branch_name="genie-portal-db-clickhouse-sql-for-import" fi -if [ "$PORTAL_DATABASE" == "msk" ] ; then - 
clickhouse_schema_branch_name="msk-portal-db-clickhouse-sql-for-import" -fi if ! $DOWNLOAD_DERVIED_TABLE_SQL_FILES_SCRIPT_FILEPATH --github_branch_name "$clickhouse_schema_branch_name" "$derived_table_sql_script_dirpath" ; then echo "Error during download of derived table construction .sql files from github" >&2 exit 1 diff --git a/import-scripts/airflow-setup-import.sh b/import-scripts/airflow-setup-import.sh index 2ca217990..f23b55bda 100755 --- a/import-scripts/airflow-setup-import.sh +++ b/import-scripts/airflow-setup-import.sh @@ -41,12 +41,6 @@ case "$PORTAL_DATABASE" in PORTAL_NAME="triage-portal" ONCOTREE_VERSION="oncotree_candidate_release" ;; - msk) - TMP_DIR_NAME="import-cron-msk" - IMPORTER_NAME="msk-cmo" - LOG_FILE_NAME="msk-cmo-importer.log" - PORTAL_NAME="msk-automation-portal" - ;; *) echo "Unsupported portal database: $PORTAL_DATABASE" >&2 exit 1 @@ -68,13 +62,8 @@ if [ "$destination_database_color" == "unset" ] ; then exit 1 fi -if [ "$PORTAL_DATABASE" != "msk" ]; then - # eg. genie-aws-importer-blue.jar - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" -else - # msk importer follows different naming convention (why??), eg. msk-cmo-blue-importer.jar - IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-${destination_database_color}-importer.jar" -fi +# eg. 
genie-aws-importer-blue.jar +IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" tmp="$PORTAL_HOME/tmp/$TMP_DIR_NAME" JAVA_IMPORTER_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" From 70e4aa6e1ebd8e28135b84715c0e50e8225d6e44 Mon Sep 17 00:00:00 2001 From: James Ko Date: Fri, 27 Mar 2026 11:44:58 -0400 Subject: [PATCH 6/6] add script for checking data integrity --- dags/import_clickhouse_base.py | 33 ++++++-------- dags/import_triage_clickhouse_dag.py | 5 ++- .../airflow-check-data-integrity.sh | 43 +++++++++++++++++++ import-scripts/airflow-import-sql.sh | 4 +- import-scripts/airflow-setup-import.sh | 2 +- 5 files changed, 64 insertions(+), 23 deletions(-) create mode 100644 import-scripts/airflow-check-data-integrity.sh diff --git a/dags/import_clickhouse_base.py b/dags/import_clickhouse_base.py index 959d87e09..61ef89016 100644 --- a/dags/import_clickhouse_base.py +++ b/dags/import_clickhouse_base.py @@ -196,19 +196,6 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No scripts_dir, db_properties_filepath, ), - "create_derived_tables": _script( - scripts_dir, - "airflow-create-derived-tables.sh", - importer, - scripts_dir, - db_properties_filepath, - ), - "set_import_complete": _script( - scripts_dir, - "set_update_process_state.sh", - db_properties_filepath, - "complete", - ), "fetch_data": _script( scripts_dir, "data_source_repo_clone_manager.sh", @@ -233,18 +220,26 @@ def send_update_notification(notification_filepath: str, ssh_conn_id: str) -> No db_properties_filepath, notification_filepath, ), - "transfer_deployment": _script( + "create_derived_tables": _script( scripts_dir, - "airflow-transfer-deployment.sh", + "airflow-create-derived-tables.sh", + importer, scripts_dir, db_properties_filepath, - color_swap_config_filepath, ), - "set_import_running": _script( + "check_data_integrity": 
_script( + scripts_dir, + "airflow-check-data-integrity.sh", + importer, scripts_dir, - "set_update_process_state.sh", db_properties_filepath, - "running", + ), + "transfer_deployment": _script( + scripts_dir, + "airflow-transfer-deployment.sh", + scripts_dir, + db_properties_filepath, + color_swap_config_filepath, ), "set_import_abandoned": _script( scripts_dir, diff --git a/dags/import_triage_clickhouse_dag.py b/dags/import_triage_clickhouse_dag.py index 13c94c1b1..7586afef0 100644 --- a/dags/import_triage_clickhouse_dag.py +++ b/dags/import_triage_clickhouse_dag.py @@ -28,7 +28,9 @@ def _wire(tasks: dict[str, object]) -> None: tasks["import_direct_to_clickhouse"] >> tasks["create_derived_tables"] - tasks["create_derived_tables"] >> tasks["transfer_deployment"] + tasks["create_derived_tables"] >> tasks["check_data_integrity"] + + tasks["check_data_integrity"] >> tasks["transfer_deployment"] tasks["transfer_deployment"] >> [ tasks["cleanup_data"], @@ -53,6 +55,7 @@ def _wire(tasks: dict[str, object]) -> None: "setup_import", "import_direct_to_clickhouse", "create_derived_tables", + "check_data_integrity", "transfer_deployment", #"clear_persistence_caches", "send_update_notification", diff --git a/import-scripts/airflow-check-data-integrity.sh b/import-scripts/airflow-check-data-integrity.sh new file mode 100644 index 000000000..539f093c5 --- /dev/null +++ b/import-scripts/airflow-check-data-integrity.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +PORTAL_DATABASE=$1 +PORTAL_SCRIPTS_DIRECTORY=$2 +MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH=$3 + +if [ -z "$PORTAL_SCRIPTS_DIRECTORY" ]; then + PORTAL_SCRIPTS_DIRECTORY="/data/portal-cron/scripts" +fi +AUTOMATION_ENV_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/automation-environment.sh" +if [ ! -f "$AUTOMATION_ENV_SCRIPT_FILEPATH" ] ; then + echo "$(date): Unable to locate $AUTOMATION_ENV_SCRIPT_FILEPATH, exiting..." 
+ exit 1 +fi +source "$AUTOMATION_ENV_SCRIPT_FILEPATH" + +# Get the current production database color +GET_DB_IN_PROD_SCRIPT_FILEPATH="${PORTAL_SCRIPTS_DIRECTORY}/get_database_currently_in_production.sh" +current_production_database_color=$(sh "$GET_DB_IN_PROD_SCRIPT_FILEPATH" "$MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH") +destination_database_color="unset" +if [ ${current_production_database_color:0:4} == "blue" ] ; then + destination_database_color="green" +fi +if [ ${current_production_database_color:0:5} == "green" ] ; then + destination_database_color="blue" +fi +if [ "$destination_database_color" == "unset" ] ; then + echo "Error during determination of the destination database color" >&2 + exit 1 +fi + +# eg. genie-aws-importer-blue.jar +IMPORTER_JAR_FILENAME="/data/portal-cron/lib/${IMPORTER_NAME}-importer-${destination_database_color}.jar" + +tmp="${PORTAL_HOME}/tmp/${TMP_DIR_NAME}" +#INTEGRITY_CHECK_ARGS="$JAVA_SSL_ARGS -Dspring.profiles.active=dbcp -Djava.io.tmpdir=$tmp -ea -cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.importer.Admin" +INTEGRITY_CHECK_ARGS="-cp $IMPORTER_JAR_FILENAME org.mskcc.cbio.portal.scripts.CheckClickHouseConstraints" + +"$JAVA_BINARY" $INTEGRITY_CHECK_ARGS +if [ $? -gt 0 ]; then + echo "Error: Integrity check failed! 
Will not transfer deployment" >&2 + exit 1 +fi diff --git a/import-scripts/airflow-import-sql.sh b/import-scripts/airflow-import-sql.sh index 1440f2c34..a1ceee6d4 100755 --- a/import-scripts/airflow-import-sql.sh +++ b/import-scripts/airflow-import-sql.sh @@ -97,14 +97,14 @@ trap 'kill "$TAIL_PID" 2>/dev/null; wait "$TAIL_PID" 2>/dev/null' EXIT INT TERM echo "Destination DB color: $destination_database_color" echo "Importing with $IMPORTER_JAR_FILENAME" -echo "Importing cancer type updates into $destination_database_color mysql database" +echo "Importing cancer type updates into $destination_database_color database" $JAVA_BINARY -Xmx16g $JAVA_IMPORTER_ARGS --import-types-of-cancer --oncotree-version $ONCOTREE_VERSION if [ $? -gt 0 ]; then echo "Error: Cancer type import failed!" >&2 exit 1 fi -echo "Importing $PORTAL_DATABASE study data into $destination_database_color mysql database" +echo "Importing $PORTAL_DATABASE study data into $destination_database_color database" $JAVA_BINARY -Xmx64g $JAVA_IMPORTER_ARGS --update-study-data --portal $PORTAL_NAME --update-worksheet --oncotree-version $ONCOTREE_VERSION --transcript-overrides-source uniprot --disable-redcap-export --notification-file="$notification_file" exitcode=$? diff --git a/import-scripts/airflow-setup-import.sh b/import-scripts/airflow-setup-import.sh index f23b55bda..3ba6be5cb 100755 --- a/import-scripts/airflow-setup-import.sh +++ b/import-scripts/airflow-setup-import.sh @@ -78,7 +78,7 @@ echo "Destination DB color: $destination_database_color" echo "Using importer JAR: $IMPORTER_JAR_FILENAME" # Database check -echo "Checking if mysql database version is compatible" +echo "Checking if database version is compatible" "$JAVA_BINARY" $JAVA_IMPORTER_ARGS --check-db-version if [ $? -gt 0 ]; then echo "Error: Database version expected by portal does not match version in database!" >&2