From 0689a96d5010091131db80c47c44eb331a4d0bc6 Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 11:54:51 -0400 Subject: [PATCH 1/7] change import-users script to be clickhouse only --- import-scripts/import-portal-users.sh | 2 +- import-scripts/importUsers.py | 235 +++++++++++--------------- 2 files changed, 103 insertions(+), 134 deletions(-) diff --git a/import-scripts/import-portal-users.sh b/import-scripts/import-portal-users.sh index d8dcd8755..d3bc6cd8a 100755 --- a/import-scripts/import-portal-users.sh +++ b/import-scripts/import-portal-users.sh @@ -41,7 +41,7 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" echo "### Starting import" >> "$USERSGENIELOGFILENAME" date >> "$USERSGENIELOGFILENAME" - $PYTHON_BINARY $PORTAL_HOME/scripts/importUsers.py --port 3306 --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --ssl-ca $PORTAL_HOME/pipelines-credentials/pipelines-genie-db-aws-rds-combined-ca-bundle.pem --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 + $PYTHON_BINARY $PORTAL_HOME/scripts/importUsers.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 CGDS_GENIE_IMPORT_STATUS=$? clickhouse_commands_filepath=$(ls ${CLICKHOUSE_COMMANDS_PENDING_DIRPATH}/*.sql 2>/dev/null | head -n 1) if [ $CGDS_GENIE_IMPORT_STATUS -eq 0 ] && [ -f "$clickhouse_commands_filepath" ] ; then diff --git a/import-scripts/importUsers.py b/import-scripts/importUsers.py index 5b78b4648..3e35f2050 100755 --- a/import-scripts/importUsers.py +++ b/import-scripts/importUsers.py @@ -25,7 +25,7 @@ import sys import time import getopt -import MySQLdb +import clickhouse_connect import re import datetime @@ -57,10 +57,11 @@ OUTPUT_FILE = sys.stdout # fields in portal.properties -CGDS_DATABASE_HOST = 'db.host' -CGDS_DATABASE_NAME = 'db.portal_db_name' -CGDS_DATABASE_USER = 'db.user' -CGDS_DATABASE_PW = 'db.password' +CLICKHOUSE_HOST = 'clickhouse.host' +CLICKHOUSE_PORT = 'clickhouse.port' +CLICKHOUSE_USER = 'clickhouse.user' +CLICKHOUSE_PW = 'clickhouse.password' +CLICKHOUSE_DB = 'clickhouse.db' GOOGLE_ID = 'google.id' GOOGLE_PW = 'google.pw' CGDS_USERS_SPREADSHEET = 'users.spreadsheet' @@ -112,12 +113,13 @@ class PortalProperties(object): def __init__(self, - cgds_database_host, cgds_database_name, cgds_database_user, cgds_database_pw, - google_id, google_pw, google_spreadsheet, google_worksheet,google_importer_spreadsheet): - self.cgds_database_host = cgds_database_host - self.cgds_database_name = cgds_database_name - self.cgds_database_user = cgds_database_user - self.cgds_database_pw = cgds_database_pw + clickhouse_host, clickhouse_port, clickhouse_user, clickhouse_pw, clickhouse_db, + google_id, google_pw, google_spreadsheet, google_worksheet, google_importer_spreadsheet): + self.clickhouse_host = clickhouse_host + self.clickhouse_port = clickhouse_port + self.clickhouse_user = clickhouse_user + self.clickhouse_pw = clickhouse_pw + self.clickhouse_db = clickhouse_db self.google_id = google_id self.google_pw = google_pw self.google_spreadsheet = google_spreadsheet @@ -245,32 +247,18 @@ def append_command_to_clickhouse_file(sql_command_string): with open(CLICKHOUSE_COMMANDS_FILEPATH, "a") as clickhouse_sql_file: clickhouse_sql_file.write(sql_command_string) -def insert_new_users(cursor, new_user_list): - # list of emails for users which returned an error when inserting into database - emails_to_remove = [] +def insert_new_users(new_user_list): added_user_triples = [] added_user_authority_duples = [] for user in new_user_list: - print >> OUTPUT_FILE, "new user: %s" % user.google_email; - try: - user_name = user.name - if isinstance(user_name, unicode): - user_name = user_name.encode('utf-8') - user_email_escaped=user.google_email.lower().replace('\'', '\\\'') - user_triple = (user_email_escaped, user_name, user.enabled) - sql_command_string = "insert into users values ('%s', '%s', '%s')" % user_triple - cursor.execute(sql_command_string) - added_user_triples.append(user_triple) - # authorities is semicolon delimited - authorities = user.authorities - cursor.executemany("insert into authorities values (%s, %s)", [(user_email_escaped, authority) for authority in authorities]) - added_user_authority_duples += ([(user_email_escaped, authority) for authority in authorities]) - except MySQLdb.Error, msg: - print >> OUTPUT_FILE, msg - print >> OUTPUT_FILE, "Removing user: %s" % user_name - print >> ERROR_FILE, msg - emails_to_remove.append(user.google_email.lower()) - # output commands to be executed on clickhouse sever + print >> OUTPUT_FILE, "new user: %s" % user.google_email + user_name = user.name + if isinstance(user_name, unicode): + user_name = user_name.encode('utf-8') + user_email_escaped = user.google_email.lower().replace('\'', '\\\'') + added_user_triples.append((user_email_escaped, user_name, user.enabled)) + added_user_authority_duples += [(user_email_escaped, authority) for authority in user.authorities] + # output commands to be executed on clickhouse server if len(added_user_triples) > 0: values_string_items = ["('%s','%s','%s')" % triple for triple in added_user_triples] sql_command_string = "INSERT INTO users VALUES %s;" % ",".join(values_string_items) @@ -279,12 +267,11 @@ def insert_new_users(cursor, new_user_list): values_string_items = ["('%s','%s')" % duple for duple in added_user_authority_duples] sql_command_string = "INSERT INTO authorities VALUES %s;" % ",".join(values_string_items) append_command_to_clickhouse_file(sql_command_string) - return emails_to_remove # ------------------------------------------------------------------------------ # get current users from database -def get_current_user_map(cursor): +def get_current_user_map(ch_client): # map that we are returning # key is the email address of the user (primary key) and value is a User object @@ -293,10 +280,10 @@ def get_current_user_map(cursor): # recall each tuple in user table is ['EMAIL', 'NAME', 'ENABLED'] & # no tuple can contain nulls try: - cursor.execute('select * from users') - for row in cursor.fetchall(): + result = ch_client.query('SELECT * FROM users') + for row in result.result_rows: to_return[row[0].lower()] = User(row[0].lower(), row[0].lower(), row[1], row[2], 'not_used_here') - except MySQLdb.Error, msg: + except Exception as msg: print >> ERROR_FILE, msg return None @@ -305,22 +292,23 @@ def get_current_user_map(cursor): # ------------------------------------------------------------------------------ # get current user authorities -def get_user_authorities(cursor, google_email): +def get_user_authorities(ch_client, google_email): - # list of authorities (cancer studies) we are returning -- as a set - to_return = [] + # list of authorities (cancer studies) we are returning -- as a set + to_return = [] - # recall each tuple in authorities table is ['EMAIL', 'AUTHORITY'] - # no tuple can contain nulls - try: - cursor.execute('select * from authorities where email = (%s)', [google_email]) - for row in cursor.fetchall(): - to_return.append(row[1]) - except MySQLdb.Error, msg: - print >> ERROR_FILE, msg - return None + # recall each tuple in authorities table is ['EMAIL', 'AUTHORITY'] + # no tuple can contain nulls + try: + result = ch_client.query('SELECT * FROM authorities WHERE email = {email:String}', + parameters={'email': google_email}) + for row in result.result_rows: + to_return.append(row[1]) + except Exception as msg: + print >> ERROR_FILE, msg + return None - return to_return + return to_return # ------------------------------------------------------------------------------ # get current users from google spreadsheet @@ -403,29 +391,21 @@ def get_rejected_user_map(spreadsheet, sheet_records, current_user_map, portal_n return to_return # ------------------------------------------------------------------------------ -# get db connection - -def get_db_connection(portal_properties, port, ssl_ca_filename=None): +# get clickhouse client - # try and create a connection to the db +def get_clickhouse_client(portal_properties): try: - if ssl_ca_filename: - connection = MySQLdb.connect(host=portal_properties.cgds_database_host, port=int(port), - user=portal_properties.cgds_database_user, - passwd=portal_properties.cgds_database_pw, - db=portal_properties.cgds_database_name, - ssl={'ca': ssl_ca_filename}) - else: - connection = MySQLdb.connect(host=portal_properties.cgds_database_host, port=int(port), - user=portal_properties.cgds_database_user, - passwd=portal_properties.cgds_database_pw, - db=portal_properties.cgds_database_name) - except MySQLdb.Error, msg: + return clickhouse_connect.get_client( + host=portal_properties.clickhouse_host, + port=int(portal_properties.clickhouse_port), + username=portal_properties.clickhouse_user, + password=portal_properties.clickhouse_pw, + database=portal_properties.clickhouse_db, + ) + except Exception as msg: print >> ERROR_FILE, msg return None - return connection - # ------------------------------------------------------------------------------ # parse portal.properties @@ -451,10 +431,11 @@ def get_portal_properties(portal_properties_filename): portal_properties_file.close() # error check - if (CGDS_DATABASE_HOST not in properties or len(properties[CGDS_DATABASE_HOST]) == 0 or - CGDS_DATABASE_NAME not in properties or len(properties[CGDS_DATABASE_NAME]) == 0 or - CGDS_DATABASE_USER not in properties or len(properties[CGDS_DATABASE_USER]) == 0 or - CGDS_DATABASE_PW not in properties or len(properties[CGDS_DATABASE_PW]) == 0 or + if (CLICKHOUSE_HOST not in properties or len(properties[CLICKHOUSE_HOST]) == 0 or + CLICKHOUSE_PORT not in properties or len(properties[CLICKHOUSE_PORT]) == 0 or + CLICKHOUSE_USER not in properties or len(properties[CLICKHOUSE_USER]) == 0 or + CLICKHOUSE_PW not in properties or len(properties[CLICKHOUSE_PW]) == 0 or + CLICKHOUSE_DB not in properties or len(properties[CLICKHOUSE_DB]) == 0 or GOOGLE_ID not in properties or len(properties[GOOGLE_ID]) == 0 or GOOGLE_PW not in properties or len(properties[GOOGLE_PW]) == 0 or CGDS_USERS_SPREADSHEET not in properties or len(properties[CGDS_USERS_SPREADSHEET]) == 0 or @@ -464,10 +445,11 @@ def get_portal_properties(portal_properties_filename): return None # return an instance of PortalProperties - return PortalProperties(properties[CGDS_DATABASE_HOST], - properties[CGDS_DATABASE_NAME], - properties[CGDS_DATABASE_USER], - properties[CGDS_DATABASE_PW], + return PortalProperties(properties[CLICKHOUSE_HOST], + properties[CLICKHOUSE_PORT], + properties[CLICKHOUSE_USER], + properties[CLICKHOUSE_PW], + properties[CLICKHOUSE_DB], properties[GOOGLE_ID], properties[GOOGLE_PW], properties[CGDS_USERS_SPREADSHEET], @@ -478,49 +460,50 @@ def get_portal_properties(portal_properties_filename): # adds new users from the google spreadsheet into the cgds portal database # returns new user map if users have been inserted, None otherwise -def manage_users(client, spreadsheet, cursor, sheet_records, portal_name): +def manage_users(client, spreadsheet, ch_client, sheet_records, portal_name): # get map of current portal users print >> OUTPUT_FILE, 'Getting list of current portal users' - current_user_map = get_current_user_map(cursor) + current_user_map = get_current_user_map(ch_client) if current_user_map is not None: print >> OUTPUT_FILE, 'We have found %s current portal users' % len(current_user_map) else: print >> OUTPUT_FILE, 'Error reading user table' - return None, None, None + return None, None # get list of new users and insert print >> OUTPUT_FILE, 'Checking for new users' new_user_map = get_new_user_map(spreadsheet, sheet_records, current_user_map, portal_name) rejected_user_map = get_rejected_user_map(spreadsheet, sheet_records, current_user_map, portal_name) - + if (len(new_user_map) > 0): print >> OUTPUT_FILE, 'We have %s new user(s) to add' % len(new_user_map) - emails_to_remove = insert_new_users(cursor, new_user_map.values()) - return new_user_map, rejected_user_map, emails_to_remove + insert_new_users(new_user_map.values()) + return new_user_map, rejected_user_map else: print >> OUTPUT_FILE, 'No new users to insert, exiting' - return None, rejected_user_map, None + return None, rejected_user_map # ------------------------------------------------------------------------------ # updates user study access -def update_user_authorities(spreadsheet, cursor, sheet_records, portal_name): - - # get map of current portal users - print >> OUTPUT_FILE, 'Getting list of current portal users from spreadsheet' - all_user_map = get_new_user_map(spreadsheet, sheet_records, {}, portal_name) - if all_user_map is None: - return None; - print >> OUTPUT_FILE, 'Updating authorities for each user in current portal user list' - for user in all_user_map.values(): - sheet_authorities = set(user.authorities) - db_authorities = set(get_user_authorities(cursor, user.google_email)) - try: - cursor.executemany("insert into authorities values(%s, %s)", - [(user.google_email, authority) for authority in sheet_authorities - db_authorities]) - except MySQLdb.Error, msg: - print >> ERROR_FILE, msg +def update_user_authorities(spreadsheet, ch_client, sheet_records, portal_name): + + # get map of current portal users + print >> OUTPUT_FILE, 'Getting list of current portal users from spreadsheet' + all_user_map = get_new_user_map(spreadsheet, sheet_records, {}, portal_name) + if all_user_map is None: + return None + print >> OUTPUT_FILE, 'Updating authorities for each user in current portal user list' + new_authority_pairs = [] + for user in all_user_map.values(): + sheet_authorities = set(user.authorities) + db_authorities = set(get_user_authorities(ch_client, user.google_email)) + new_authority_pairs += [(user.google_email, authority) for authority in sheet_authorities - db_authorities] + # single batched write to ClickHouse file + if new_authority_pairs: + values_string_items = ["('%s','%s')" % pair for pair in new_authority_pairs] + append_command_to_clickhouse_file("INSERT INTO authorities VALUES %s;" % ",".join(values_string_items)) # ------------------------------------------------------------------------------ # adds rejected user emails to rejected_users worksheet in an idempotent fashion @@ -632,23 +615,20 @@ def get_portal_name_map(google_spreadsheet,client): return portal_name -def establish_new_db_connection(portal_properties, port, ssl_ca_filename): - # get db connection & create cursor - print >> OUTPUT_FILE, 'Connecting to database: ' + portal_properties.cgds_database_name - connection = get_db_connection(portal_properties, port, ssl_ca_filename) - if connection is not None: - cursor = connection.cursor() - else: - print >> OUTPUT_FILE, 'Error connecting to database, exiting' +def establish_clickhouse_client(portal_properties): + print >> OUTPUT_FILE, 'Connecting to ClickHouse: ' + portal_properties.clickhouse_host + ch_client = get_clickhouse_client(portal_properties) + if ch_client is None: + print >> OUTPUT_FILE, 'Error connecting to ClickHouse, exiting' sys.exit(2) - return (connection, cursor) + return ch_client # ------------------------------------------------------------------------------ # displays program usage (invalid args) def usage(): - print >> OUTPUT_FILE, 'importUsers.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --port [mysql port number] --sender [sender identifier - optional] --ssl-ca [ssl certificate file - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]' + print >> OUTPUT_FILE, 'importUsers.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --sender [sender identifier - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]' # ------------------------------------------------------------------------------ # the big deal main. @@ -657,7 +637,7 @@ def main(): # parse command line options try: - opts, args = getopt.getopt(sys.argv[1:], '', ['secrets-file=', 'creds-file=', 'properties-file=', 'ssl-ca=', 'send-email-confirm=', 'use-institutional-id=', 'port=', 'sender=', 'gmail-username=', 'gmail-password=', 'smtp-server=']) + opts, args = getopt.getopt(sys.argv[1:], '', ['secrets-file=', 'creds-file=', 'properties-file=', 'send-email-confirm=', 'use-institutional-id=', 'sender=', 'gmail-username=', 'gmail-password=', 'smtp-server=']) except getopt.error, msg: print >> ERROR_FILE, msg usage() @@ -668,9 +648,7 @@ def main(): creds_filename = '' properties_filename = '' send_email_confirm = '' - port = '' sender = '' - ssl_ca_filename = '' # not required gmail_username = '' gmail_password = '' smtp_server = '' @@ -686,18 +664,14 @@ def main(): gmail_password = a elif o == '--properties-file': properties_filename = a - elif o == '--ssl-ca': - ssl_ca_filename = a elif o == '--send-email-confirm': send_email_confirm = a elif o == '--sender': sender = a - elif o == '--port': - port = a elif o == '--smtp-server': smtp_server = a - if (secrets_filename == '' or creds_filename == '' or properties_filename == '' or send_email_confirm == '' or port == '' or + if (secrets_filename == '' or creds_filename == '' or properties_filename == '' or send_email_confirm == '' or (send_email_confirm != 'true' and send_email_confirm != 'false') or (send_email_confirm == 'true' and (gmail_username == '' or gmail_password == '' or smtp_server == ''))): usage() @@ -720,31 +694,26 @@ def main(): # connect to importer configuration spreadsheet and get mapping of spreadsheet to portal name portal_name_map = get_portal_name_map(portal_properties.google_importer_spreadsheet,client) + ch_client = establish_clickhouse_client(portal_properties) + google_spreadsheets = portal_properties.google_spreadsheet.split(';') for google_spreadsheet in google_spreadsheets: if not google_spreadsheet == '': - (connection, cursor) = establish_new_db_connection(portal_properties, port, ssl_ca_filename) - sheet_records = get_sheet_records(client, google_spreadsheet, - portal_properties.google_worksheet) + portal_properties.google_worksheet) spreadsheet_title = get_spreadsheet_title(client, google_spreadsheet) - + print >> OUTPUT_FILE, 'Importing ' + spreadsheet_title + ' ...' app_name = portal_name_map[spreadsheet_title] - + # the 'guts' of the script # note: original script depended on one to one mapping of spreadsheet to app name - and lookup was by spreadsheet # with a now decommissioned app (genie-archive) we wanted to be able to do one to many mapping (one spreadsheet to multiple apps) # to fit this logic would have to rework how we specify properties or introduce new column (db name) as index but might have other effects - new_user_map, rejected_user_map, emails_to_remove = manage_users(client, google_spreadsheet, cursor, sheet_records, app_name) - - # update user authorities - update_user_authorities(google_spreadsheet, cursor, sheet_records, app_name) + new_user_map, rejected_user_map = manage_users(client, google_spreadsheet, ch_client, sheet_records, app_name) - # commit changes before moving on to next spreadsheet - cursor.close() - connection.commit() - connection.close() + # update user authorities + update_user_authorities(google_spreadsheet, ch_client, sheet_records, app_name) # add the emails from rejected_user_map to rejected_users worksheet in an idempotent fashion # also remove any emails from rejected_user_map that already exist in the worksheet-- @@ -754,7 +723,7 @@ def main(): # sending emails if send_email_confirm == 'true': print >> OUTPUT_FILE, "Sending confirmation emails to new users" - send_emails(new_user_map, google_spreadsheet, client, IMPORT_EMAIL_WORKSHEET, gmail_username, gmail_password, sender, smtp_server, emails_to_remove) + send_emails(new_user_map, google_spreadsheet, client, IMPORT_EMAIL_WORKSHEET, gmail_username, gmail_password, sender, smtp_server) print >> OUTPUT_FILE, "Sending rejection emails to newly rejected users" send_emails(rejected_user_map, google_spreadsheet, client, REJECT_EMAIL_WORKSHEET, gmail_username, gmail_password, sender, smtp_server) From 4643c85769cf5b3d2f932b3bf9c0158ce89ae488 Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 11:56:01 -0400 Subject: [PATCH 2/7] replace usage of clickhouse CLI with the clickhouse-connect library --- import-scripts/import-portal-users.sh | 19 ------------- import-scripts/importUsers.py | 40 ++++++++------------------- 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/import-scripts/import-portal-users.sh b/import-scripts/import-portal-users.sh index d3bc6cd8a..9831c5610 100755 --- a/import-scripts/import-portal-users.sh +++ b/import-scripts/import-portal-users.sh @@ -13,11 +13,6 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" SMTP_SERVER=`cat $MAIL_SMTP_SERVER` GENIE_BLUE_DATABASE_PROPERTIES_FILENAME="portal.properties.genie.blue" GENIE_GREEN_DATABASE_PROPERTIES_FILENAME="portal.properties.genie.green" - GENIE_BLUE_CLICKHOUSE_CLIENT_CONFIG_FILEPATH="/data/portal-cron/pipelines-credentials/clickhouse_client_genie_blue_config.yaml" - GENIE_GREEN_CLICKHOUSE_CLIENT_CONFIG_FILEPATH="/data/portal-cron/pipelines-credentials/clickhouse_client_genie_green_config.yaml" - CLICKHOUSE_COMMANDS_PENDING_DIRPATH="/data/portal-cron/tmp/import-users-genie/pending_clickhouse_commands" - CLICKHOUSE_COMMANDS_COMPLETED_DIRPATH="/data/portal-cron/tmp/import-users-genie/completed_clickhouse_commands" - CLICKHOUSE_BINARY_FILEPATH="/home/cbioportal_importer/tools/clickhouse/bin/clickhouse" GET_DATABASE_CURRENTLY_IN_PRODUCTION_SCRIPT="$PORTAL_HOME/scripts/get_database_currently_in_production.sh" MANAGE_DATABASE_TOOL_PROPERTIES_FILEPATH="$PORTAL_HOME/pipelines-credentials/manage_genie_database_update_tools.properties" CURRENT_DATABASE_OUTPUT_FILEPATH=$(mktemp $PORTAL_HOME/tmp/import-portal-users/get_current_database_output.txt.XXXXXX) @@ -29,11 +24,9 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" rm -f $CURRENT_DATABASE_OUTPUT_FILEPATH if [ ${current_production_database_color:0:5} == "green" ] ; then GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME="$GENIE_GREEN_DATABASE_PROPERTIES_FILENAME" - GENIE_CLICKHOUSE_CLIENT_CONFIG_FILEPATH="$GENIE_GREEN_CLICKHOUSE_CLIENT_CONFIG_FILEPATH" else if [ ${current_production_database_color:0:4} == "blue" ] ; then GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME="$GENIE_BLUE_DATABASE_PROPERTIES_FILENAME" - GENIE_CLICKHOUSE_CLIENT_CONFIG_FILEPATH="$GENIE_BLUE_CLICKHOUSE_CLIENT_CONFIG_FILEPATH" else echo "error : Failed to properly detect the current production database for genie" >> "$USERSGENIELOGFILENAME" fi @@ -43,18 +36,6 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" date >> "$USERSGENIELOGFILENAME" $PYTHON_BINARY $PORTAL_HOME/scripts/importUsers.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 CGDS_GENIE_IMPORT_STATUS=$? - clickhouse_commands_filepath=$(ls ${CLICKHOUSE_COMMANDS_PENDING_DIRPATH}/*.sql 2>/dev/null | head -n 1) - if [ $CGDS_GENIE_IMPORT_STATUS -eq 0 ] && [ -f "$clickhouse_commands_filepath" ] ; then - echo "executing clickhouse commands from $clickhouse_commands_filepath" >> "$USERSGENIELOGFILENAME" - date >> "$USERSGENIELOGFILENAME" - "$CLICKHOUSE_BINARY_FILEPATH" client --config-file="$GENIE_CLICKHOUSE_CLIENT_CONFIG_FILEPATH" --multiquery < "$clickhouse_commands_filepath" - CLICKHOUSE_IMPORT_STATUS=$? - if [ $CLICKHOUSE_IMPORT_STATUS -eq 0 ] ; then - mv "$clickhouse_commands_filepath" "$CLICKHOUSE_COMMANDS_COMPLETED_DIRPATH" - else - $CGDS_GENIE_IMPORT_STATUS=2 # failure flag - fi - fi PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" FAILED_DATABASES="" diff --git a/import-scripts/importUsers.py b/import-scripts/importUsers.py index 3e35f2050..2efc13208 100755 --- a/import-scripts/importUsers.py +++ b/import-scripts/importUsers.py @@ -23,7 +23,6 @@ # imports import os import sys -import time import getopt import clickhouse_connect import re @@ -48,10 +47,6 @@ # ------------------------------------------------------------------------------ # globals -# hardcoded file path for clickhouse -UNIX_EPOC_SECONDS = str(int(time.time())) -CLICKHOUSE_COMMANDS_FILEPATH = '/data/portal-cron/tmp/import-users-genie/pending_clickhouse_commands/%s.sql' % UNIX_EPOC_SECONDS - # some file descriptors ERROR_FILE = sys.stderr OUTPUT_FILE = sys.stdout @@ -243,30 +238,21 @@ def get_spreadsheet_title(client, ss): # ------------------------------------------------------------------------------ # insert new users into table - this list does not contain users already in table -def append_command_to_clickhouse_file(sql_command_string): - with open(CLICKHOUSE_COMMANDS_FILEPATH, "a") as clickhouse_sql_file: - clickhouse_sql_file.write(sql_command_string) - -def insert_new_users(new_user_list): - added_user_triples = [] - added_user_authority_duples = [] +def insert_new_users(ch_client, new_user_list): + added_user_rows = [] + added_authority_rows = [] for user in new_user_list: print >> OUTPUT_FILE, "new user: %s" % user.google_email user_name = user.name if isinstance(user_name, unicode): user_name = user_name.encode('utf-8') - user_email_escaped = user.google_email.lower().replace('\'', '\\\'') - added_user_triples.append((user_email_escaped, user_name, user.enabled)) - added_user_authority_duples += [(user_email_escaped, authority) for authority in user.authorities] - # output commands to be executed on clickhouse server - if len(added_user_triples) > 0: - values_string_items = ["('%s','%s','%s')" % triple for triple in added_user_triples] - sql_command_string = "INSERT INTO users VALUES %s;" % ",".join(values_string_items) - append_command_to_clickhouse_file(sql_command_string) - if len(added_user_authority_duples) > 0: - values_string_items = ["('%s','%s')" % duple for duple in added_user_authority_duples] - sql_command_string = "INSERT INTO authorities VALUES %s;" % ",".join(values_string_items) - append_command_to_clickhouse_file(sql_command_string) + user_email = user.google_email.lower() + added_user_rows.append([user_email, user_name, user.enabled]) + added_authority_rows += [[user_email, authority] for authority in user.authorities] + if added_user_rows: + ch_client.insert('users', added_user_rows, column_names=['email', 'name', 'enabled']) + if added_authority_rows: + ch_client.insert('authorities', added_authority_rows, column_names=['email', 'authority']) # ------------------------------------------------------------------------------ # get current users from database @@ -478,7 +464,7 @@ def manage_users(client, spreadsheet, ch_client, sheet_records, portal_name): if (len(new_user_map) > 0): print >> OUTPUT_FILE, 'We have %s new user(s) to add' % len(new_user_map) - insert_new_users(new_user_map.values()) + insert_new_users(ch_client, new_user_map.values()) return new_user_map, rejected_user_map else: print >> OUTPUT_FILE, 'No new users to insert, exiting' @@ -500,10 +486,8 @@ def update_user_authorities(spreadsheet, ch_client, sheet_records, portal_name): sheet_authorities = set(user.authorities) db_authorities = set(get_user_authorities(ch_client, user.google_email)) new_authority_pairs += [(user.google_email, authority) for authority in sheet_authorities - db_authorities] - # single batched write to ClickHouse file if new_authority_pairs: - values_string_items = ["('%s','%s')" % pair for pair in new_authority_pairs] - append_command_to_clickhouse_file("INSERT INTO authorities VALUES %s;" % ",".join(values_string_items)) + ch_client.insert('authorities', new_authority_pairs, column_names=['email', 'authority']) # ------------------------------------------------------------------------------ # adds rejected user emails to rejected_users worksheet in an idempotent fashion From 664a6305edcf736da187860e3ce1d0f782d0099c Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 15:20:04 -0400 Subject: [PATCH 3/7] wip --- ...import-portal-users.sh => import-portal-users-clickhouse.sh} | 2 +- import-scripts/{importUsers.py => importUsersClickhouse.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename import-scripts/{import-portal-users.sh => import-portal-users-clickhouse.sh} (84%) mode change 100755 => 100644 rename import-scripts/{importUsers.py => importUsersClickhouse.py} (100%) diff --git a/import-scripts/import-portal-users.sh b/import-scripts/import-portal-users-clickhouse.sh old mode 100755 new mode 100644 similarity index 84% rename from import-scripts/import-portal-users.sh rename to import-scripts/import-portal-users-clickhouse.sh index 9831c5610..37595095d --- a/import-scripts/import-portal-users.sh +++ b/import-scripts/import-portal-users-clickhouse.sh @@ -34,7 +34,7 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" echo "### Starting import" >> "$USERSGENIELOGFILENAME" date >> "$USERSGENIELOGFILENAME" - $PYTHON_BINARY $PORTAL_HOME/scripts/importUsers.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 + $PYTHON3_BINARY $PORTAL_HOME/scripts/importUsersClickhouse.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 CGDS_GENIE_IMPORT_STATUS=$? PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" diff --git a/import-scripts/importUsers.py b/import-scripts/importUsersClickhouse.py similarity index 100% rename from import-scripts/importUsers.py rename to import-scripts/importUsersClickhouse.py From 8f19f8adfc2c72ff4477bdce5f195e11e849f5ef Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 15:34:28 -0400 Subject: [PATCH 4/7] convert script from python2 to python3 --- import-scripts/importUsersClickhouse.py | 105 +++++++++++------------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/import-scripts/importUsersClickhouse.py b/import-scripts/importUsersClickhouse.py index 2efc13208..61f970b41 100755 --- a/import-scripts/importUsersClickhouse.py +++ b/import-scripts/importUsersClickhouse.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python3 # ------------------------------------------------------------------------------ # Script which adds new users from google spreadsheet into the the cgds # user table. The following properties must be specified in portal.properties: @@ -36,12 +36,10 @@ from oauth2client.client import flow_from_clientsecrets from oauth2client.tools import run_flow, argparser -from email.MIMEMultipart import MIMEMultipart -from email.MIMEBase import MIMEBase -from email.MIMEText import MIMEText -from email.Header import Header -from email.Utils import COMMASPACE, formatdate -from email import Encoders +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.header import Header +from email.utils import COMMASPACE, formatdate from googleapiclient.discovery import build # ------------------------------------------------------------------------------ @@ -138,7 +136,7 @@ def __init__(self, inst_email, google_email, name, enabled, authorities): def send_mail(to, subject, body, gmail_username, gmail_password, sender=MESSAGE_FROM_CMO, bcc=MESSAGE_BCC_CMO, server=None): if server is None: - print >> ERROR_FILE, 'smtp server must be specified' + print('smtp server must be specified', file=ERROR_FILE) sys.exit(2) assert type(to)==list @@ -213,10 +211,10 @@ def get_sheet_records(client, ss, ws): new_record[header[index]] = None sheet_records.append(new_record) except Exception as e: - print >> ERROR_FILE, "There was an error connecting to google." - print >> ERROR_FILE, e + print("There was an error connecting to google.", file=ERROR_FILE) + print(e, file=ERROR_FILE) exit(0) - + return sheet_records # ------------------------------------------------------------------------------ @@ -230,8 +228,8 @@ def get_spreadsheet_title(client, ss): data = response.get('properties', {}) spreadsheet_title = data["title"] except Exception as e: - print >> ERROR_FILE, "There was an error connecting to google." - print >> ERROR_FILE, e + print("There was an error connecting to google.", file=ERROR_FILE) + print(e, file=ERROR_FILE) exit(0) return spreadsheet_title @@ -242,10 +240,8 @@ def insert_new_users(ch_client, new_user_list): added_user_rows = [] added_authority_rows = [] for user in new_user_list: - print >> OUTPUT_FILE, "new user: %s" % user.google_email + print("new user: %s" % user.google_email, file=OUTPUT_FILE) user_name = user.name - if isinstance(user_name, unicode): - user_name = user_name.encode('utf-8') user_email = user.google_email.lower() added_user_rows.append([user_email, user_name, user.enabled]) added_authority_rows += [[user_email, authority] for authority in user.authorities] @@ -270,7 +266,7 @@ def get_current_user_map(ch_client): for row in result.result_rows: to_return[row[0].lower()] = User(row[0].lower(), row[0].lower(), row[1], row[2], 'not_used_here') except Exception as msg: - print >> ERROR_FILE, msg + print(msg, file=ERROR_FILE) return None return to_return @@ -291,7 +287,7 @@ def get_user_authorities(ch_client, google_email): for row in result.result_rows: to_return.append(row[1]) except Exception as msg: - print >> ERROR_FILE, msg + print(msg, file=ERROR_FILE) return None return to_return @@ -373,7 +369,7 @@ def get_rejected_user_map(spreadsheet, sheet_records, current_user_map, portal_n else: to_return[google_email.lower()] = User(inst_email, google_email, name, 0, [portal_name + ':' + au for au in authorities.split(';')]) - print >> OUTPUT_FILE, 'Rejected user added to list: %s' % google_email.lower() + print('Rejected user added to list: %s' % google_email.lower(), file=OUTPUT_FILE) return to_return # ------------------------------------------------------------------------------ @@ -389,7 +385,7 @@ def get_clickhouse_client(portal_properties): database=portal_properties.clickhouse_db, ) except Exception as msg: - print >> ERROR_FILE, msg + print(msg, file=ERROR_FILE) return None @@ -407,11 +403,11 @@ def get_portal_properties(portal_properties_filename): continue # store name/value property = line.split('=') - # spreadsheet url contains an '=' sign - if line.startswith(CGDS_USERS_SPREADSHEET): + # spreadsheet url / db pw may contain an '=' sign + if line.startswith(CGDS_USERS_SPREADSHEET) or line.startswith(CLICKHOUSE_PW): property = [property[0], line[line.index('=')+1:len(line)]] if (len(property) != 2): - print >> ERROR_FILE, 'Skipping invalid entry in property file: ' + line + print('Skipping invalid entry in property file: ' + line, file=ERROR_FILE) continue properties[property[0]] = property[1].strip() portal_properties_file.close() @@ -427,7 +423,7 @@ def get_portal_properties(portal_properties_filename): CGDS_USERS_SPREADSHEET not in properties or len(properties[CGDS_USERS_SPREADSHEET]) == 0 or CGDS_USERS_WORKSHEET not in properties or len(properties[CGDS_USERS_WORKSHEET]) == 0 or IMPORTER_SPREADSHEET not in properties or len(properties[IMPORTER_SPREADSHEET]) == 0): - print >> ERROR_FILE, 'Missing one or more required properties, please check property file' + print('Missing one or more required properties, please check property file', file=ERROR_FILE) return None # return an instance of PortalProperties @@ -449,25 +445,25 @@ def get_portal_properties(portal_properties_filename): def manage_users(client, spreadsheet, ch_client, sheet_records, portal_name): # get map of current portal users - print >> OUTPUT_FILE, 'Getting list of current portal users' + print('Getting list of current portal users', file=OUTPUT_FILE) current_user_map = get_current_user_map(ch_client) if current_user_map is not None: - print >> OUTPUT_FILE, 'We have found %s current portal users' % len(current_user_map) + print('We have found %s current portal users' % len(current_user_map), file=OUTPUT_FILE) else: - print >> OUTPUT_FILE, 'Error reading user table' + print('Error reading user table', file=OUTPUT_FILE) return None, None # get list of new users and insert - print >> OUTPUT_FILE, 'Checking for new users' + print('Checking for new users', file=OUTPUT_FILE) new_user_map = get_new_user_map(spreadsheet, sheet_records, current_user_map, portal_name) rejected_user_map = get_rejected_user_map(spreadsheet, sheet_records, current_user_map, portal_name) if (len(new_user_map) > 0): - print >> OUTPUT_FILE, 'We have %s new user(s) to add' % len(new_user_map) + print('We have %s new user(s) to add' % len(new_user_map), file=OUTPUT_FILE) insert_new_users(ch_client, new_user_map.values()) return new_user_map, rejected_user_map else: - print >> OUTPUT_FILE, 'No new users to insert, exiting' + print('No new users to insert, exiting', file=OUTPUT_FILE) return None, rejected_user_map # ------------------------------------------------------------------------------ @@ -476,11 +472,11 @@ def manage_users(client, spreadsheet, ch_client, sheet_records, portal_name): def update_user_authorities(spreadsheet, ch_client, sheet_records, portal_name): # get map of current portal users - print >> OUTPUT_FILE, 'Getting list of current portal users from spreadsheet' + print('Getting list of current portal users from spreadsheet', file=OUTPUT_FILE) all_user_map = get_new_user_map(spreadsheet, sheet_records, {}, portal_name) if all_user_map is None: return None - print >> OUTPUT_FILE, 'Updating authorities for each user in current portal user list' + print('Updating authorities for each user in current portal user list', file=OUTPUT_FILE) new_authority_pairs = [] for user in all_user_map.values(): sheet_authorities = set(user.authorities) @@ -496,7 +492,7 @@ def add_rejected_users_to_worksheet(rejected_user_map, google_spreadsheet, clien if rejected_user_map is None or len(rejected_user_map) == 0: return - print >> OUTPUT_FILE, 'Adding rejected users to rejected_users worksheet' + print('Adding rejected users to rejected_users worksheet', file=OUTPUT_FILE) # get existing records from the rejected_users worksheet # Note: get_sheet_records converts column names to lowercase with special chars removed @@ -510,23 +506,22 @@ def add_rejected_users_to_worksheet(rejected_user_map, google_spreadsheet, clien existing_emails.add(record[REJECTED_EMAIL_KEY].strip().lower()) except Exception as e: # worksheet might not exist or be empty, start with empty set - print >> OUTPUT_FILE, 'Creating new rejected_users worksheet or worksheet is empty' + print('Creating new rejected_users worksheet or worksheet is empty', file=OUTPUT_FILE) existing_emails = set() # remove any rejected users that are already in the worksheet - for user_email in rejected_user_map.keys(): + for user_email in list(rejected_user_map.keys()): if user_email.lower() in existing_emails: rejected_user_map.pop(user_email, None) # prepare new rows to append (only users not already in the worksheet) new_rows = [] - import datetime current_utc_time = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') for user_email in rejected_user_map.keys(): if user_email.lower() not in existing_emails: new_rows.append([user_email, current_utc_time]) - print >> OUTPUT_FILE, 'Adding rejected user to worksheet: %s' % user_email + print('Adding rejected user to worksheet: %s' % user_email, file=OUTPUT_FILE) # append new rows to the worksheet if there are any if len(new_rows) > 0: @@ -542,11 +537,11 @@ def add_rejected_users_to_worksheet(rejected_user_map, google_spreadsheet, clien valueInputOption='RAW', body=body ).execute() - print >> OUTPUT_FILE, 'Added %s rejected user(s) to worksheet' % len(new_rows) + print('Added %s rejected user(s) to worksheet' % len(new_rows), file=OUTPUT_FILE) except Exception as e: - print >> ERROR_FILE, 'Error adding rejected users to worksheet: %s' % str(e) + print('Error adding rejected users to worksheet: %s' % str(e), file=ERROR_FILE) else: - print >> OUTPUT_FILE, 'No new rejected users to add to worksheet' + print('No new rejected users to add to worksheet', file=OUTPUT_FILE) # ------------------------------------------------------------------------------ # sends emails to users from a user map @@ -569,8 +564,8 @@ def send_emails(user_map, google_spreadsheet, client, worksheet, gmail_username, error_subject = ERROR_EMAIL_SUBJECT_CMO error_body = ERROR_EMAIL_BODY_CMO if emails_to_remove is None or user_key not in emails_to_remove: - print >> OUTPUT_FILE, ('Sending confirmation or rejection email to user: %s at %s' % - (user.name, user.inst_email)) + print('Sending confirmation or rejection email to user: %s at %s' % + (user.name, user.inst_email), file=OUTPUT_FILE) send_mail([user.inst_email], subject, body, gmail_username, gmail_password, sender=from_field, bcc=bcc_field, server=smtp_server) else: send_mail([user_key], error_subject, error_body, gmail_username, gmail_password, sender=from_field, bcc=bcc_field, server=smtp_server) @@ -581,7 +576,7 @@ def send_emails(user_map, google_spreadsheet, client, worksheet, gmail_username, def get_email_parameters(google_spreadsheet,client,worksheet): subject = '' body = '' - print >> OUTPUT_FILE, 'Getting email parameters from google spreadsheet' + print('Getting email parameters from google spreadsheet', file=OUTPUT_FILE) email_sheet_records = get_sheet_records(client, google_spreadsheet, worksheet) for record in email_sheet_records: if record[SUBJECT_KEY] is not None and record[BODY_KEY] is not None: @@ -591,7 +586,7 @@ def get_email_parameters(google_spreadsheet,client,worksheet): def get_portal_name_map(google_spreadsheet,client): portal_name = {} - print >> OUTPUT_FILE, 'Getting access control parameter from google spreadsheet' + print('Getting access control parameter from google spreadsheet', file=OUTPUT_FILE) access_control_sheet = get_sheet_records(client,google_spreadsheet,ACCESS_CONTROL_WORKSHEET) for row in access_control_sheet: if row[PORTAL_NAME_KEY] is not None and row[SPREADSHEET_NAME_KEY] is not None: @@ -600,10 +595,10 @@ def get_portal_name_map(google_spreadsheet,client): def establish_clickhouse_client(portal_properties): - print >> OUTPUT_FILE, 'Connecting to ClickHouse: ' + portal_properties.clickhouse_host + print('Connecting to ClickHouse: ' + portal_properties.clickhouse_host, file=OUTPUT_FILE) ch_client = get_clickhouse_client(portal_properties) if ch_client is None: - print >> OUTPUT_FILE, 'Error connecting to ClickHouse, exiting' + print('Error connecting to ClickHouse, exiting', file=OUTPUT_FILE) sys.exit(2) return ch_client @@ -612,7 +607,7 @@ def establish_clickhouse_client(portal_properties): # displays program usage (invalid args) def usage(): - print >> OUTPUT_FILE, 'importUsers.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --sender [sender identifier - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]' + print('importUsersClickhouse.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --sender [sender identifier - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]', file=OUTPUT_FILE) # ------------------------------------------------------------------------------ # the big deal main. @@ -622,8 +617,8 @@ def main(): # parse command line options try: opts, args = getopt.getopt(sys.argv[1:], '', ['secrets-file=', 'creds-file=', 'properties-file=', 'send-email-confirm=', 'use-institutional-id=', 'sender=', 'gmail-username=', 'gmail-password=', 'smtp-server=']) - except getopt.error, msg: - print >> ERROR_FILE, msg + except getopt.error as msg: + print(msg, file=ERROR_FILE) usage() sys.exit(2) @@ -663,14 +658,14 @@ def main(): # check existence of file if not os.path.exists(properties_filename): - print >> ERROR_FILE, 'properties file cannot be found: ' + properties_filename + print('properties file cannot be found: ' + properties_filename, file=ERROR_FILE) sys.exit(2) # parse/get relevant portal properties - print >> OUTPUT_FILE, 'Reading portal properties file: ' + properties_filename + print('Reading portal properties file: ' + properties_filename, file=OUTPUT_FILE) portal_properties = get_portal_properties(properties_filename) if not portal_properties: - print >> OUTPUT_FILE, 'Error reading %s, exiting' % properties_filename + print('Error reading %s, exiting' % properties_filename, file=OUTPUT_FILE) return # create client for interacting with google sheets api @@ -687,7 +682,7 @@ def main(): portal_properties.google_worksheet) spreadsheet_title = get_spreadsheet_title(client, google_spreadsheet) - print >> OUTPUT_FILE, 'Importing ' + spreadsheet_title + ' ...' + print('Importing ' + spreadsheet_title + ' ...', file=OUTPUT_FILE) app_name = portal_name_map[spreadsheet_title] # the 'guts' of the script @@ -706,9 +701,9 @@ def main(): # sending emails if send_email_confirm == 'true': - print >> OUTPUT_FILE, "Sending confirmation emails to new users" + print("Sending confirmation emails to new users", file=OUTPUT_FILE) send_emails(new_user_map, google_spreadsheet, client, IMPORT_EMAIL_WORKSHEET, gmail_username, gmail_password, sender, smtp_server) - print >> OUTPUT_FILE, "Sending rejection emails to newly rejected users" + print("Sending rejection emails to newly rejected users", file=OUTPUT_FILE) send_emails(rejected_user_map, google_spreadsheet, client, REJECT_EMAIL_WORKSHEET, gmail_username, gmail_password, sender, smtp_server) # ------------------------------------------------------------------------------ From 8abfae63faf8cbf2cf5a93d3230e29145f2c0206 Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 15:37:18 -0400 Subject: [PATCH 5/7] more details on authorities --- import-scripts/importUsersClickhouse.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/import-scripts/importUsersClickhouse.py b/import-scripts/importUsersClickhouse.py index 61f970b41..9d045da91 100755 --- a/import-scripts/importUsersClickhouse.py +++ b/import-scripts/importUsersClickhouse.py @@ -476,14 +476,22 @@ def update_user_authorities(spreadsheet, ch_client, sheet_records, portal_name): all_user_map = get_new_user_map(spreadsheet, sheet_records, {}, portal_name) if all_user_map is None: return None - print('Updating authorities for each user in current portal user list', file=OUTPUT_FILE) + total = len(all_user_map) + print('Updating authorities for %d user(s) in current portal user list' % total, file=OUTPUT_FILE) new_authority_pairs = [] - for user in all_user_map.values(): + for i, user in enumerate(all_user_map.values(), 1): + print(' [%d/%d] checking authorities for %s' % (i, total, user.google_email), file=OUTPUT_FILE) sheet_authorities = set(user.authorities) db_authorities = set(get_user_authorities(ch_client, user.google_email)) - new_authority_pairs += [(user.google_email, authority) for authority in sheet_authorities - db_authorities] + added = [(user.google_email, authority) for authority in sheet_authorities - db_authorities] + if added: + print(' -> adding %d new authority(s): %s' % (len(added), [a for _, a in added]), file=OUTPUT_FILE) + new_authority_pairs += added if new_authority_pairs: + print('Inserting %d new authority pair(s) into ClickHouse' % len(new_authority_pairs), file=OUTPUT_FILE) ch_client.insert('authorities', new_authority_pairs, column_names=['email', 'authority']) + else: + print('No new authorities to insert', file=OUTPUT_FILE) # ------------------------------------------------------------------------------ # adds rejected user emails to rejected_users worksheet in an idempotent fashion From 239c95e09e0dbe3fd37f871cae5e2b58e7dd73ee Mon Sep 17 00:00:00 2001 From: James Ko Date: Wed, 8 Apr 2026 15:48:10 -0400 Subject: [PATCH 6/7] batch query for authorities --- import-scripts/importUsersClickhouse.py | 37 ++++++++----------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/import-scripts/importUsersClickhouse.py b/import-scripts/importUsersClickhouse.py index 9d045da91..832f310bd 100755 --- a/import-scripts/importUsersClickhouse.py +++ b/import-scripts/importUsersClickhouse.py @@ -271,26 +271,6 @@ def get_current_user_map(ch_client): return to_return -# ------------------------------------------------------------------------------ -# get current user authorities - -def get_user_authorities(ch_client, google_email): - - # list of authorities (cancer studies) we are returning -- as a set - to_return = [] - - # recall each tuple in authorities table is ['EMAIL', 'AUTHORITY'] - # no tuple can contain nulls - try: - result = ch_client.query('SELECT * FROM authorities WHERE email = {email:String}', - parameters={'email': google_email}) - for row in result.result_rows: - to_return.append(row[1]) - except Exception as msg: - print(msg, file=ERROR_FILE) - return None - - return to_return # ------------------------------------------------------------------------------ # get current users from google spreadsheet @@ -369,7 +349,6 @@ def get_rejected_user_map(spreadsheet, sheet_records, current_user_map, portal_n else: to_return[google_email.lower()] = User(inst_email, google_email, name, 0, [portal_name + ':' + au for au in authorities.split(';')]) - print('Rejected user added to list: %s' % google_email.lower(), file=OUTPUT_FILE) return to_return # ------------------------------------------------------------------------------ @@ -477,15 +456,21 @@ def update_user_authorities(spreadsheet, ch_client, sheet_records, portal_name): if all_user_map is None: return None total = len(all_user_map) - print('Updating authorities for %d user(s) in current portal user list' % total, file=OUTPUT_FILE) + print('Fetching existing authorities for %d user(s) in batch' % total, file=OUTPUT_FILE) + emails = list(all_user_map.keys()) + result = ch_client.query('SELECT email, authority FROM authorities WHERE email IN {emails:Array(String)}', + parameters={'emails': emails}) + db_authorities_map = {} + for row in result.result_rows: + db_authorities_map.setdefault(row[0].lower(), set()).add(row[1]) + new_authority_pairs = [] - for i, user in enumerate(all_user_map.values(), 1): - print(' [%d/%d] checking authorities for %s' % (i, total, user.google_email), file=OUTPUT_FILE) + for user in all_user_map.values(): sheet_authorities = set(user.authorities) - db_authorities = set(get_user_authorities(ch_client, user.google_email)) + db_authorities = db_authorities_map.get(user.google_email, set()) added = [(user.google_email, authority) for authority in sheet_authorities - db_authorities] if added: - print(' -> adding %d new authority(s): %s' % (len(added), [a for _, a in added]), file=OUTPUT_FILE) + print(' %s: adding %d new authority(s)' % (user.google_email, len(added)), file=OUTPUT_FILE) new_authority_pairs += added if new_authority_pairs: print('Inserting %d new authority pair(s) into ClickHouse' % len(new_authority_pairs), file=OUTPUT_FILE) From e942ce79f3242fc2e198d8d20da16df4cf37e3f1 Mon Sep 17 00:00:00 2001 From: James Ko Date: Thu, 23 Apr 2026 10:51:51 -0400 Subject: [PATCH 7/7] remove the clickhouse suffix --- ...import-portal-users-clickhouse.sh => import-portal-users.sh} | 2 +- import-scripts/{importUsersClickhouse.py => importUsers.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename import-scripts/{import-portal-users-clickhouse.sh => import-portal-users.sh} (84%) rename import-scripts/{importUsersClickhouse.py => importUsers.py} (98%) diff --git a/import-scripts/import-portal-users-clickhouse.sh b/import-scripts/import-portal-users.sh similarity index 84% rename from import-scripts/import-portal-users-clickhouse.sh rename to import-scripts/import-portal-users.sh index 37595095d..ff0342c91 100644 --- a/import-scripts/import-portal-users-clickhouse.sh +++ b/import-scripts/import-portal-users.sh @@ -34,7 +34,7 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/import-portal-users.lock" echo "### Starting import" >> "$USERSGENIELOGFILENAME" date >> "$USERSGENIELOGFILENAME" - $PYTHON3_BINARY $PORTAL_HOME/scripts/importUsersClickhouse.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 + $PYTHON3_BINARY $PORTAL_HOME/scripts/importUsers.py --secrets-file $PIPELINES_CONFIG_HOME/google-docs/client_secrets.json --creds-file $PIPELINES_CONFIG_HOME/google-docs/creds.dat --properties-file $PIPELINES_CONFIG_HOME/properties/import-users/${GENIE_PRODUCTION_DATABASE_PROPERTIES_FILENAME} --send-email-confirm true --sender GENIE --gmail-username $GMAIL_USERNAME --gmail-password $GMAIL_PASSWORD --smtp-server $SMTP_SERVER >> "$USERSGENIELOGFILENAME" 2>&1 CGDS_GENIE_IMPORT_STATUS=$? PIPELINES_EMAIL_LIST="cbioportal-pipelines@cbioportal.org" diff --git a/import-scripts/importUsersClickhouse.py b/import-scripts/importUsers.py similarity index 98% rename from import-scripts/importUsersClickhouse.py rename to import-scripts/importUsers.py index 832f310bd..75d76c16c 100755 --- a/import-scripts/importUsersClickhouse.py +++ b/import-scripts/importUsers.py @@ -600,7 +600,7 @@ def establish_clickhouse_client(portal_properties): # displays program usage (invalid args) def usage(): - print('importUsersClickhouse.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --sender [sender identifier - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]', file=OUTPUT_FILE) + print('importUsers.py --secrets-file [google secrets.json] --creds-file [oauth creds filename] --properties-file [properties file] --send-email-confirm [true or false] --use-institutional-id [true or false] --sender [sender identifier - optional] --smtp-server [smtp server hostname - required when send-email-confirm is true]', file=OUTPUT_FILE) # ------------------------------------------------------------------------------ # the big deal main.