diff --git a/usaspending_api/common/tests/integration/test_spark_jobs.py b/usaspending_api/common/tests/integration/test_spark_jobs.py index 389d41371e..063de13a8a 100644 --- a/usaspending_api/common/tests/integration/test_spark_jobs.py +++ b/usaspending_api/common/tests/integration/test_spark_jobs.py @@ -5,7 +5,7 @@ def test_local_spark_jobs_strategy(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): expected_table_name = "award_search" delta_table_spec = TABLE_SPEC[expected_table_name] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database spark_jobs = SparkJobs(LocalStrategy()) spark_jobs.start( diff --git a/usaspending_api/etl/management/commands/archive_table_in_delta.py b/usaspending_api/etl/management/commands/archive_table_in_delta.py index b26c5d242a..439d6cb864 100644 --- a/usaspending_api/etl/management/commands/archive_table_in_delta.py +++ b/usaspending_api/etl/management/commands/archive_table_in_delta.py @@ -1,10 +1,9 @@ import logging -import psycopg2 - from datetime import datetime, timedelta -from django.core.management.base import BaseCommand -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +import psycopg2 +from django.core.management.base import BaseCommand, CommandParser + from usaspending_api.common.etl.spark import load_delta_table from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, @@ -12,19 +11,25 @@ get_jdbc_connection_properties, get_usas_jdbc_url, ) -from usaspending_api.download.delta_models.download_job import download_job_create_sql_string +from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +from usaspending_api.download.delta_models.download_job import ( + download_job_create_sql_string, +) +from usaspending_api.etl.table_specs import ArchiveTableSpec logger = logging.getLogger(__name__) TABLE_SPEC = { - "download_job": { - "destination_database": "arc", - "destination_table": "download_job", - "archive_date_field": "update_date", - "source_table": "download_job", - "source_database": "public", - "delta_table_create_sql": download_job_create_sql_string, - } + "download_job": ArchiveTableSpec( + **{ + "destination_database": "arc", + "destination_table": "download_job", + "archive_date_field": "update_date", + "source_table": "download_job", + "source_database": "public", + "delta_table_create_sql": download_job_create_sql_string, + } + ) } @@ -35,7 +40,8 @@ class Command(BaseCommand): those records from Postgres. """ - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -54,7 +60,8 @@ def add_arguments(self, parser): "--alt-db", type=str, required=False, - help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's destination_database", + help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's" + " destination_database", ) parser.add_argument( "--alt-name", @@ -63,7 +70,7 @@ def add_arguments(self, parser): help="An alternate Delta Table name which to archive this table, overriding the destination_table", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -86,12 +93,12 @@ def handle(self, *args, **options): archive_period = options["archive_period"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table - source_table = table_spec["source_table"] - source_database = table_spec["source_database"] + source_table = table_spec.source_table + source_database = table_spec.source_database qualified_source_table = f"{source_database}.{source_table}" - archive_date_field = table_spec["archive_date_field"] + archive_date_field = table_spec.archive_date_field archive_date = datetime.now() - timedelta(days=archive_period) archive_date_string = archive_date.strftime("%Y-%m-%d") @@ -104,13 +111,18 @@ def handle(self, *args, **options): # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not jdbc_url: - raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.") + raise RuntimeError( + "Couldn't find JDBC url, please properly configure your CONFIG." + ) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." + ) # Retrieve data from Postgres query_with_predicate = ( - f"(SELECT * FROM {qualified_source_table} WHERE {archive_date_field} < '{archive_date_string}') AS tmp" + f"(SELECT * FROM {qualified_source_table} " + f"WHERE {archive_date_field} < '{archive_date_string}') AS tmp" ) df = spark.read.jdbc( @@ -122,7 +134,9 @@ def handle(self, *args, **options): # Write data to Delta Lake in Append Mode load_delta_table(spark, df, destination_table_name, overwrite=False) archived_count = df.count() - logger.info(f"Archived {archived_count} records from the {qualified_source_table}") + logger.info( + f"Archived {archived_count} records from the {qualified_source_table}" + ) # Delete data from with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -132,7 +146,9 @@ def handle(self, *args, **options): ) deleted_count = cursor.rowcount - logger.info(f"Deleted {deleted_count} records from the {qualified_source_table} table") + logger.info( + f"Deleted {deleted_count} records from the {qualified_source_table} table" + ) # Shut down spark if spark_created_by_command: diff --git a/usaspending_api/etl/management/commands/create_delta_table.py b/usaspending_api/etl/management/commands/create_delta_table.py index cbdfe84f40..19978107dc 100644 --- a/usaspending_api/etl/management/commands/create_delta_table.py +++ b/usaspending_api/etl/management/commands/create_delta_table.py @@ -1,6 +1,6 @@ import logging -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql.types import StructType from usaspending_api.awards.delta_models.award_id_lookup import AWARD_ID_LOOKUP_SCHEMA @@ -10,23 +10,36 @@ ) from usaspending_api.common.spark.configs import DEFAULT_EXTRA_CONF from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.archive_table_in_delta import TABLE_SPEC as ARCHIVE_TABLE_SPEC -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as 
LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC -from usaspending_api.transactions.delta_models.transaction_id_lookup import TRANSACTION_ID_LOOKUP_SCHEMA +from usaspending_api.etl.management.commands.archive_table_in_delta import ( + TABLE_SPEC as ARCHIVE_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_query_to_delta import ( + TABLE_SPEC as LOAD_QUERY_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_table_to_delta import ( + TABLE_SPEC as LOAD_TABLE_TABLE_SPEC, +) +from usaspending_api.etl.table_specs import TableSpec +from usaspending_api.transactions.delta_models.transaction_id_lookup import ( + TRANSACTION_ID_LOOKUP_SCHEMA, +) TABLE_SPEC = { **ARCHIVE_TABLE_SPEC, **LOAD_TABLE_TABLE_SPEC, **LOAD_QUERY_TABLE_SPEC, - "award_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, - }, - "transaction_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, - }, + "award_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, + } + ), + "transaction_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, + } + ), } logger = logging.getLogger(__name__) @@ -37,7 +50,7 @@ class Command(BaseCommand): This command creates an empty Delta Table based on the provided --destination-table argument. """ - def add_arguments(self, parser): + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -66,7 +79,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: spark = get_active_spark_session() spark_created_by_command = False if not spark: @@ -78,27 +91,27 @@ def handle(self, *args, **options): spark_s3_bucket = options["spark_s3_bucket"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") spark.sql(f"create database if not exists {destination_database};") spark.sql(f"use {destination_database};") - if isinstance(table_spec["delta_table_create_sql"], str): + if isinstance(table_spec.delta_table_create_sql, str): # Define Schema Using CREATE TABLE AS command spark.sql( - TABLE_SPEC[destination_table]["delta_table_create_sql"].format( + table_spec.delta_table_create_sql.format( DESTINATION_TABLE=destination_table_name, DESTINATION_DATABASE=destination_database, SPARK_S3_BUCKET=spark_s3_bucket, DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, ) ) - elif isinstance(table_spec["delta_table_create_sql"], StructType): - schema = table_spec["delta_table_create_sql"] - additional_options = table_spec.get("delta_table_create_options") or {} - partition_cols = table_spec.get("delta_table_create_partitions") or [] + elif isinstance(table_spec.delta_table_create_sql, StructType): + schema = table_spec.delta_table_create_sql + additional_options = table_spec.delta_table_create_options or {} + partition_cols = table_spec.delta_table_create_partitions or [] df = spark.createDataFrame([], schema) 
default_options = { diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index aa58341cd9..ff609a79e2 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -2,7 +2,7 @@ from argparse import ArgumentTypeError from typing import Callable -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql import SparkSession from usaspending_api.common.etl.spark import create_ref_temp_views @@ -35,7 +35,10 @@ load_object_class_program_activity_incremental, object_class_program_activity_schema, ) -from usaspending_api.download.delta_models.transaction_download import transaction_download_schema +from usaspending_api.download.delta_models.transaction_download import ( + transaction_download_schema, +) +from usaspending_api.etl.table_specs import QueryTableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -57,7 +60,10 @@ AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, award_search_create_sql_string, ) -from usaspending_api.search.delta_models.dataframes.award_search import load_award_search, load_award_search_incremental +from usaspending_api.search.delta_models.dataframes.award_search import ( + load_award_search, + load_award_search_incremental, +) from usaspending_api.search.delta_models.dataframes.transaction_search import ( load_transaction_search, load_transaction_search_incremental, @@ -69,7 +75,12 @@ subaward_search_create_sql_string, subaward_search_load_sql_string, ) -from usaspending_api.search.models import AwardSearch, SubawardSearch, SummaryStateView, TransactionSearch +from usaspending_api.search.models import ( + AwardSearch, + SubawardSearch, + SummaryStateView, + TransactionSearch, +) from usaspending_api.settings import HOST from usaspending_api.transactions.delta_models import ( SUMMARY_STATE_VIEW_COLUMNS, @@ -89,360 +100,262 @@ logger = logging.getLogger(__name__) TABLE_SPEC = { - "award_search": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "award_search_gold": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": 
AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_lookup": { - "model": RecipientLookup, - "is_from_broker": False, - "source_query": recipient_lookup_load_sql_string_list, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "recipient_lookup", - "swap_schema": "rpt", - "partition_column": "recipient_hash", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), - "postgres_seq_name": "recipient_lookup_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_profile": { - "model": RecipientProfile, - "is_from_broker": False, - "source_query": recipient_profile_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "recipient_profile", - "swap_schema": "rpt", - "partition_column": "recipient_hash", # This isn't used for anything - "partition_column_type": "string", - "is_partition_column_unique": False, - "delta_table_create_sql": recipient_profile_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), - "postgres_seq_name": "recipient_profile_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "summary_state_view": { - "model": SummaryStateView, - "is_from_broker": False, - "source_query": summary_state_view_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "summary_state_view", - "swap_schema": "rpt", - "partition_column": "duh", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": summary_state_view_create_sql_string, - "delta_table_create_options": None, - "source_schema": SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, - "custom_schema": "duh STRING", - "column_names": list(SUMMARY_STATE_VIEW_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "sam_recipient": { - "model": None, - "is_from_broker": True, - "source_query": sam_recipient_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "duns", - "swap_schema": "int", - "partition_column": "broker_duns_id", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": sam_recipient_create_sql_string, - "delta_table_create_options": None, - "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(SAM_RECIPIENT_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - 
"postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search": { - "model": TransactionSearch, - "is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search_gold": { - "model": TransactionSearch, - "is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": { - "partition_keys": ["is_fpds"], - "partitioning_form": "LIST", - "partitions": [ - {"table_suffix": "_fpds", "partitioning_clause": "FOR VALUES IN (TRUE)"}, - {"table_suffix": "_fabs", "partitioning_clause": "FOR VALUES IN (FALSE)"}, + "award_search": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_COLUMNS), + } + ), + "award_search_gold": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "source_schema": AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), + } + ), + "recipient_lookup": QueryTableSpec( + **{ + "model": RecipientLookup, + "source_query": recipient_lookup_load_sql_string_list, + "destination_database": "rpt", + 
"swap_table": "recipient_lookup", + "swap_schema": "rpt", + "partition_column": "recipient_hash", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, + "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), + "postgres_seq_name": "recipient_lookup_id_seq", + } + ), + "recipient_profile": QueryTableSpec( + **{ + "model": RecipientProfile, + "source_query": recipient_profile_load_sql_strings, + "destination_database": "rpt", + "swap_table": "recipient_profile", + "swap_schema": "rpt", + "partition_column": "recipient_hash", # This isn't used for anything + "partition_column_type": "string", + "delta_table_create_sql": recipient_profile_create_sql_string, + "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), + "postgres_seq_name": "recipient_profile_id_seq", + } + ), + "summary_state_view": QueryTableSpec( + **{ + "model": SummaryStateView, + "source_query": summary_state_view_load_sql_string, + "destination_database": "rpt", + "swap_table": "summary_state_view", + "swap_schema": "rpt", + "partition_column": "duh", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": summary_state_view_create_sql_string, + "source_schema": SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, + "custom_schema": "duh STRING", + "column_names": list(SUMMARY_STATE_VIEW_COLUMNS), + } + ), + "sam_recipient": QueryTableSpec( + **{ + "is_from_broker": True, + "source_query": sam_recipient_load_sql_string, + "destination_database": "int", + "swap_table": "duns", + "swap_schema": "int", + "partition_column": "broker_duns_id", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": sam_recipient_create_sql_string, + "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, + "column_names": list(SAM_RECIPIENT_COLUMNS), + } + ), + "transaction_search": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + } + ), + "transaction_search_gold": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), + "postgres_partition_spec": { + "partition_keys": ["is_fpds"], + 
"partitioning_form": "LIST", + "partitions": [ + { + "table_suffix": "_fpds", + "partitioning_clause": "FOR VALUES IN (TRUE)", + }, + { + "table_suffix": "_fabs", + "partitioning_clause": "FOR VALUES IN (FALSE)", + }, + ], + }, + } + ), + "transaction_current_cd_lookup": QueryTableSpec( + **{ + "source_query": transaction_current_cd_lookup_load_sql_string, + "destination_database": "int", + "swap_table": "transaction_current_cd_lookup", + "swap_schema": "int", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, + "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, + "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), + } + ), + "subaward_search": QueryTableSpec( + **{ + "model": SubawardSearch, + "source_query": subaward_search_load_sql_string, + "destination_database": "rpt", + "swap_table": "subaward_search", + "swap_schema": "rpt", + "partition_column": "broker_subaward_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": subaward_search_create_sql_string, + "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "treasury_account_identifiers ARRAY", + "column_names": list(SUBAWARD_SEARCH_COLUMNS), + "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, + } + ), + "covid_faba_spending": QueryTableSpec( + **{ + "model": CovidFABASpending, + "source_query": covid_faba_spending_load_sql_strings, + "destination_database": "rpt", + "swap_table": "covid_faba_spending", + "swap_schema": "rpt", + "partition_column": "id", + "partition_column_type": "numeric", + "delta_table_create_sql": covid_faba_spending_create_sql_string, + "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, + "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), + } + ), + "account_balances_download": QueryTableSpec( + **{ + "source_query": load_account_balances, + "source_query_incremental": load_account_balances_incremental, + "destination_database": "rpt", + "partition_column": "appropriation_account_balances_id", + "partition_column_type": "numeric", + "delta_table_create_sql": account_balances_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", ], - }, - "delta_table_create_partitions": None, - }, - "transaction_current_cd_lookup": { - "model": None, - "is_from_broker": False, - "source_query": transaction_current_cd_lookup_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "transaction_current_cd_lookup", - "swap_schema": "int", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, - "custom_schema": "", - "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "subaward_search": { - "model": SubawardSearch, - "is_from_broker": False, - "source_query": subaward_search_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - 
"destination_database": "rpt", - "swap_table": "subaward_search", - "swap_schema": "rpt", - "partition_column": "broker_subaward_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": subaward_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "treasury_account_identifiers ARRAY", - "column_names": list(SUBAWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "covid_faba_spending": { - "model": CovidFABASpending, - "is_from_broker": False, - "source_query": covid_faba_spending_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "covid_faba_spending", - "swap_schema": "rpt", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": covid_faba_spending_create_sql_string, - "delta_table_create_options": None, - "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "account_balances_download": { - "model": None, - "is_from_broker": False, - "source_query": load_account_balances, - "source_query_incremental": load_account_balances_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "appropriation_account_balances_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": account_balances_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "award_financial_download": { - "model": None, - "is_from_broker": False, - "source_query": load_award_financial, - "source_query_incremental": load_award_financial_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": award_financial_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "object_class_program_activity_download": { - "model": None, - "is_from_broker": False, - "source_query": load_object_class_program_activity, - "source_query_incremental": load_object_class_program_activity_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_program_activity_object_class_id", - "partition_column_type": "numeric", - 
"is_partition_column_unique": False, - "delta_table_create_sql": object_class_program_activity_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "transaction_download": { - "model": None, - "is_from_broker": False, - "source_query": None, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": transaction_download_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["awarding_agency_code", "is_fpds", "action_date_fiscal_year"], - }, + } + ), + "award_financial_download": QueryTableSpec( + **{ + "source_query": load_award_financial, + "source_query_incremental": load_award_financial_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "delta_table_create_sql": award_financial_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], + } + ), + "object_class_program_activity_download": QueryTableSpec( + **{ + "source_query": load_object_class_program_activity, + "source_query_incremental": load_object_class_program_activity_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_program_activity_object_class_id", + "partition_column_type": "numeric", + "delta_table_create_sql": object_class_program_activity_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], + } + ), + "transaction_download": QueryTableSpec( + **{ + "destination_database": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "delta_table_create_sql": transaction_download_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "awarding_agency_code", + "is_fpds", + "action_date_fiscal_year", + ], + } + ), } @@ -458,7 +371,8 @@ class Command(BaseCommand): destination_table_name: str spark: SparkSession - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -486,7 +400,7 @@ def add_arguments(self, parser): help="Whether or not the table will be updated incrementally", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -501,32 +415,37 @@ def handle(self, *args, **options): spark_created_by_command = False if not self.spark: spark_created_by_command = True - self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) # type: SparkSession + self.spark = configure_spark_session( + **extra_conf, spark_context=self.spark + ) # type: SparkSession # Resolve Parameters destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] - self.destination_database = options["alt_db"] or table_spec["destination_database"] - self.destination_table_name = options["alt_name"] or destination_table.split(".")[-1] - source_query_key = "source_query_incremental" if options["incremental"] else "source_query" - load_query = table_spec.get(source_query_key) + self.destination_database = options["alt_db"] or table_spec.destination_database + self.destination_table_name = ( + options["alt_name"] or destination_table.split(".")[-1] + ) + source_query_key = ( + "source_query_incremental" if options["incremental"] else "source_query" + ) + load_query = getattr(table_spec, source_query_key) if load_query is None: - raise ArgumentTypeError(f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC.") + raise ArgumentTypeError( + f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC." + ) # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {self.destination_database}") self.spark.sql(f"use {self.destination_database};") - # Create User Defined Functions if needed - if table_spec.get("user_defined_functions"): - for udf_args in table_spec["user_defined_functions"]: - self.spark.udf.register(**udf_args) - create_ref_temp_views(self.spark, create_broker_views=True) if isinstance(load_query, list): for index, query in enumerate(load_query): - logger.info(f"Running query number: {index + 1}\nPreview of query: {query[:100]}") + logger.info( + f"Running query number: {index + 1}\nPreview of query: {query[:100]}" + ) self.run_spark_sql(query) else: self.run_spark_sql(load_query) @@ -534,7 +453,9 @@ def handle(self, *args, **options): if spark_created_by_command: self.spark.stop() - def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): + def run_spark_sql( + self, query: str | Callable[[SparkSession, str, str], None] + ) -> None: if isinstance(query, str): jdbc_conn_props = get_jdbc_connection_properties() self.spark.sql( @@ -551,4 +472,6 @@ def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): elif isinstance(query, Callable): query(self.spark, self.destination_database, self.destination_table_name) else: - raise ArgumentTypeError(f"Invalid query. `{query}` must be a string or a Callable.") + raise ArgumentTypeError( + f"Invalid query. `{query}` must be a string or a Callable." 
+ ) diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 79892c5dce..05ca3e78cd 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -1,40 +1,46 @@ import itertools import logging +from datetime import datetime +from math import ceil +from typing import Dict, Optional import boto3 import numpy as np import psycopg2 - from django import db +from django.core.management import CommandParser from django.core.management.base import BaseCommand from django.db.models import Model -from math import ceil -from pyspark.sql import SparkSession, DataFrame -from typing import Dict, Optional, List -from datetime import datetime +from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.csv_stream_s3_to_pg import copy_csvs_from_s3_to_pg from usaspending_api.common.etl.spark import convert_array_cols_to_string -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, get_jdbc_connection_properties, get_usas_jdbc_url, ) +from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config import CONFIG -from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG - from usaspending_api.etl.management.commands.create_delta_table import TABLE_SPEC +from usaspending_api.etl.table_specs import QueryTableSpec +from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG logger = logging.getLogger(__name__) # Note: the `delta` type is not actually in Spark SQL. It's how we're temporarily storing the data before converting it # to the proper postgres type, since pySpark doesn't automatically support this conversion. SPECIAL_TYPES_MAPPING = { - db.models.UUIDField: {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, + db.models.UUIDField: { + "postgres": "UUID USING {column_name}::UUID", + "delta": "TEXT", + }, "UUID": {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, - db.models.JSONField: {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, + db.models.JSONField: { + "postgres": "JSONB using {column_name}::JSON", + "delta": "TEXT", + }, "JSONB": {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, } @@ -46,7 +52,6 @@ class Command(BaseCommand): - help = """ This command reads data from a Delta table and copies it into a corresponding Postgres database table (under a temp name). As of now, it only supports a full reload of a table. If the table with the chosen temp name already @@ -55,7 +60,20 @@ class Command(BaseCommand): if a new table has been made. 
""" - def add_arguments(self, parser): + delta_table: str + delta_table_name: str + destination_database: str + column_names: list + + postgres_table: str + postgres_table_name: str + postgres_schema: str + postgres_cols: dict + + temp_table: str + temp_table_name: str + + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--delta-table", type=str, @@ -73,7 +91,7 @@ def add_arguments(self, parser): "--alt-delta-name", type=str, required=False, - help="An alternate delta table name to load, overriding the TABLE_SPEC destination_table" "name", + help="An alternate delta table name to load, overriding the TABLE_SPEC destination_tablename", ) parser.add_argument( "--jdbc-inserts", @@ -111,7 +129,8 @@ def add_arguments(self, parser): "If the job fails for some unexpected reason then the sequence will be reset to the previous value.", ) - def _split_dfs(self, df, special_columns): + @staticmethod + def _split_dfs(df: DataFrame, special_columns: str | Column) -> [DataFrame]: """Split a DataFrame into DataFrame subsets based on presence of NULL values in certain special columns Unfortunately, pySpark with the JDBC doesn't handle UUIDs/JSON well. @@ -129,13 +148,18 @@ def _split_dfs(self, df, special_columns): # Figure all the possible combos of filters filter_batches = [] for subset in itertools.product([True, False], repeat=len(special_columns)): - filter_batches.append({col: subset[i] for i, col in enumerate(special_columns)}) + filter_batches.append( + {col: subset[i] for i, col in enumerate(special_columns)} + ) # Generate all the split dfs based on the filter batches split_dfs = [] for filter_batch in filter_batches: # Apply the filters (True = null column, drop it. False = not null column, keep it) - modified_filters = [df[col].isNull() if val else df[col].isNotNull() for col, val in filter_batch.items()] + modified_filters = [ + df[col].isNull() if val else df[col].isNotNull() + for col, val in filter_batch.items() + ] split_df = df.filter(np.bitwise_and.reduce(modified_filters)) # Drop the columns where it's null **after filtering them out** @@ -145,7 +169,7 @@ def _split_dfs(self, df, special_columns): split_dfs.append(split_df) return split_dfs - def handle(self, *args, **options): + def _get_spark_session(self) -> SparkSession: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -155,202 +179,269 @@ def handle(self, *args, **options): "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json } - spark = get_active_spark_session() spark_created_by_command = False if not spark: spark_created_by_command = True - spark = configure_spark_session(**extra_conf, spark_context=spark) # type: SparkSession + spark = configure_spark_session( + **extra_conf, spark_context=spark + ) # type: SparkSession + return spark, spark_created_by_command + + def handle(self, *args, **options) -> None: + spark, spark_created_by_command = self._get_spark_session() # Resolve Parameters - delta_table = options["delta_table"] + self.delta_table = options["delta_table"] recreate = options["recreate"] - - table_spec = TABLE_SPEC[delta_table] + table_spec = TABLE_SPEC[self.delta_table] # Delta side - destination_database = options["alt_delta_db"] or table_spec["destination_database"] - delta_table_name = options["alt_delta_name"] or delta_table - delta_table = f"{destination_database}.{delta_table_name}" if destination_database else delta_table_name + self.destination_database = ( + options["alt_delta_db"] or table_spec.destination_database + ) + self.delta_table_name = options["alt_delta_name"] or self.delta_table + self.delta_table = ( + f"{self.destination_database}.{self.delta_table_name}" + if self.destination_database + else self.delta_table_name + ) # Postgres side - source - postgres_table = None - postgres_model = table_spec["model"] - postgres_schema = table_spec["source_database"] or table_spec["swap_schema"] - postgres_table_name = table_spec["source_table"] or table_spec["swap_table"] - postgres_cols = table_spec["source_schema"] - column_names = table_spec.get("column_names") - tsvectors = table_spec.get("tsvectors") or {} - if postgres_table_name: - postgres_table = f"{postgres_schema}.{postgres_table_name}" if postgres_schema else postgres_table_name + self.postgres_schema = table_spec.source_database or table_spec.swap_schema + self.postgres_table_name = table_spec.source_table or table_spec.swap_table + self.postgres_cols = table_spec.source_schema + self.column_names = table_spec.column_names + if self.postgres_table_name: + self.postgres_table = ( + f"{self.postgres_schema}.{self.postgres_table_name}" + if self.postgres_schema + else self.postgres_table_name + ) # Postgres side - temp temp_schema = "temp" temp_table_suffix = "temp" - temp_table_suffix_appendage = f"_{temp_table_suffix}" if {temp_table_suffix} else "" - if postgres_table: - temp_table_name = f"{postgres_table_name}{temp_table_suffix_appendage}" - else: - temp_table_name = f"{delta_table_name}{temp_table_suffix_appendage}" - temp_table = f"{temp_schema}.{temp_table_name}" + temp_table_suffix_appendage = ( + f"_{temp_table_suffix}" if {temp_table_suffix} else "" + ) + self.temp_table_name = ( + f"{self.postgres_table_name}{temp_table_suffix_appendage}" + if self.postgres_table + else f"{self.delta_table_name}{temp_table_suffix_appendage}" + ) + self.temp_table = f"{temp_schema}.{self.temp_table_name}" - summary_msg = f"Copying delta table {delta_table} to a Postgres temp table {temp_table}." 
- if postgres_table: - summary_msg = f"{summary_msg} The temp table will be based on the postgres table {postgres_table}" + summary_msg = f"Copying delta table {self.delta_table} to a Postgres temp table {self.temp_table}." + if self.postgres_table: + summary_msg = f"{summary_msg} The temp table will be based on the postgres table {self.postgres_table}" logger.info(summary_msg) - # Checking if the temp destination table already exists - temp_dest_table_exists_sql = f""" - SELECT EXISTS ( - SELECT 1 - FROM information_schema.tables - WHERE table_schema = '{temp_schema}' - AND table_name = '{temp_table_name}') - """ - with db.connection.cursor() as cursor: - cursor.execute(temp_dest_table_exists_sql) - temp_dest_table_exists = cursor.fetchone()[0] + temp_dest_table_exists = self._temp_table_exists( + temp_schema, self.temp_table_name + ) # If it does, and we're recreating it, drop it first if temp_dest_table_exists and recreate: - logger.info(f"{temp_table} exists and recreate argument provided. Dropping first.") - # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it - clear_table_sql = f"DROP TABLE {temp_table}" - with db.connection.cursor() as cursor: - cursor.execute(clear_table_sql) - logger.info(f"{temp_table} dropped.") + self._drop_temp_table() temp_dest_table_exists = False make_new_table = not temp_dest_table_exists - is_postgres_table_partitioned = table_spec.get("postgres_partition_spec") is not None - - if postgres_table or postgres_cols: - # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for - # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. - # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table - if make_new_table: - partition_clause = "" - storage_parameters = "WITH (autovacuum_enabled=FALSE)" - partitions_sql = [] - if is_postgres_table_partitioned: - partition_clause = ( - f"PARTITION BY {table_spec['postgres_partition_spec']['partitioning_form']}" - f"({', '.join(table_spec['postgres_partition_spec']['partition_keys'])})" - ) - storage_parameters = "" - partitions_sql = [ - ( - f"CREATE TABLE " - # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp - f"{temp_table[:-len(temp_table_suffix_appendage)]}{pt['table_suffix']}{temp_table_suffix_appendage} " - f"PARTITION OF {temp_table} {pt['partitioning_clause']} " - f"{storage_parameters}" - ) - for pt in table_spec["postgres_partition_spec"]["partitions"] - ] - if postgres_table: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - LIKE {postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY - ) {partition_clause} {storage_parameters} - """ - elif postgres_cols: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - {", ".join([f'{key} {val}' for key, val in postgres_cols.items()])} - ) {partition_clause} {storage_parameters} - """ - else: - raise RuntimeError( - "make_new_table=True but neither a postgres_table or postgres_cols are " - "populated for the target delta table in the TABLE_SPEC" - ) - with db.connection.cursor() as cursor: - logger.info(f"Creating {temp_table}") - cursor.execute(create_temp_sql) - logger.info(f"{temp_table} created.") - - if is_postgres_table_partitioned and partitions_sql: - for create_partition in partitions_sql: - logger.info(f"Creating partition of {temp_table} with SQL:\n{create_partition}") - cursor.execute(create_partition) - logger.info("Partition created.") - - # If there are vectors, add the triggers that will populate them based on other calls - # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, - # at the top-level virtual/partitioned table (versus having to apply on each partition) - for tsvector_name, derived_from_cols in tsvectors.items(): - logger.info( - f"To prevent any confusion or duplicates, dropping the trigger" - f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." - ) - cursor.execute(f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}") - - logger.info( - f"Adding tsvector trigger for column {tsvector_name}" - f" based on the following columns: {derived_from_cols}" - ) - derived_from_cols_str = ", ".join(derived_from_cols) - tsvector_trigger_sql = f""" - CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE - ON {temp_table} FOR EACH ROW EXECUTE PROCEDURE - tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', - {derived_from_cols_str}) - """ - cursor.execute(tsvector_trigger_sql) - logger.info(f"tsvector trigger for column {tsvector_name} added.") + if self.postgres_table or self.postgres_cols: + self._recreate_table( + make_new_table=make_new_table, + table_spec=table_spec, + temp_table_suffix_appendage=temp_table_suffix_appendage, + ) # Read from Delta - df = spark.table(delta_table) + df = spark.table(self.delta_table) # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. 
- if column_names: - df = df.select(column_names) + if self.column_names: + df = df.select(self.column_names) # If we're working off an existing table, truncate before loading in all the data if not make_new_table: - logger.info(f"Truncating existing table {temp_table}") + logger.info(f"Truncating existing table {self.temp_table}") with db.connection.cursor() as cursor: - cursor.execute(f"TRUNCATE {temp_table}") - logger.info(f"{temp_table} truncated.") + cursor.execute(f"TRUNCATE {self.temp_table}") + logger.info(f"{self.temp_table} truncated.") # Reset the sequence before load for a table if it exists - if options["reset_sequence"] and table_spec.get("postgres_seq_name"): - postgres_seq_last_value = self._set_sequence_value(table_spec["postgres_seq_name"]) - else: - postgres_seq_last_value = None + postgres_seq_last_value = ( + self._set_sequence_value(table_spec.postgres_seq_name) + if options["reset_sequence"] + and hasattr(table_spec, "postgres_seq_name") + and table_spec.postgres_seq_name + else None + ) + + self._write_df( + spark=spark, + df=df, + options=options, + postgres_seq_last_value=postgres_seq_last_value, + table_spec=table_spec, + ) + + self._finish( + options=options, + spark_created_by_command=spark_created_by_command, + spark=spark, + ) + + @staticmethod + def _temp_table_exists(temp_schema: str, temp_table_name: str) -> bool: + # Checking if the temp destination table already exists + temp_dest_table_exists_sql = f""" + SELECT EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = '{temp_schema}' + AND table_name = '{temp_table_name}') + """ + with db.connection.cursor() as cursor: + cursor.execute(temp_dest_table_exists_sql) + return bool(cursor.fetchone()[0]) + + def _drop_temp_table(self) -> None: + logger.info( + f"{self.temp_table} exists and recreate argument provided. Dropping first." + ) + # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it + clear_table_sql = f"DROP TABLE {self.temp_table}" + with db.connection.cursor() as cursor: + cursor.execute(clear_table_sql) + logger.info(f"{self.temp_table} dropped.") + + def _recreate_table( + self, + make_new_table: bool, + table_spec: QueryTableSpec, + temp_table_suffix_appendage: str, + ) -> None: + # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for + # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. + # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table + is_postgres_table_partitioned = ( + hasattr(table_spec, "postgres_partition_spec") + and table_spec.postgres_partition_spec is not None + ) + tsvectors = table_spec.tsvectors or {} + + if make_new_table: + partition_clause = "" + storage_parameters = "WITH (autovacuum_enabled=FALSE)" + partitions_sql = [] + if is_postgres_table_partitioned: + partition_clause = ( + f"PARTITION BY {table_spec.postgres_partition_spec['partitioning_form']}" + f"({', '.join(table_spec.postgres_partition_spec['partition_keys'])})" + ) + storage_parameters = "" + partitions_sql = [ + ( + f"CREATE TABLE " + # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp + f"{self.temp_table[: -len(temp_table_suffix_appendage)]}" + f"{pt['table_suffix']}{temp_table_suffix_appendage} " + f"PARTITION OF {self.temp_table} {pt['partitioning_clause']} " + f"{storage_parameters}" + ) + for pt in table_spec.postgres_partition_spec["partitions"] + ] + if self.postgres_table: + create_temp_sql = f""" + CREATE TABLE {self.temp_table} ( + LIKE {self.postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY + ) {partition_clause} {storage_parameters} + """ + elif self.postgres_cols: + create_temp_sql = f""" + CREATE TABLE {self.temp_table} ( + {", ".join([f"{key} {val}" for key, val in self.postgres_cols.items()])} + ) {partition_clause} {storage_parameters} + """ + else: + raise RuntimeError( + "make_new_table=True but neither a postgres_table or postgres_cols are " + "populated for the target delta table in the TABLE_SPEC" + ) + with db.connection.cursor() as cursor: + logger.info(f"Creating {self.temp_table}") + cursor.execute(create_temp_sql) + logger.info(f"{self.temp_table} created.") + + if is_postgres_table_partitioned and partitions_sql: + for create_partition in partitions_sql: + logger.info( + f"Creating partition of {self.temp_table} with SQL:\n{create_partition}" + ) + cursor.execute(create_partition) + logger.info("Partition created.") + + # If there are vectors, add the triggers that will populate them based on other calls + # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, + # at the top-level virtual/partitioned table (versus having to apply on each partition) + for tsvector_name, derived_from_cols in tsvectors.items(): + logger.info( + f"To prevent any confusion or duplicates, dropping the trigger" + f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." 
+ ) + cursor.execute( + f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {self.temp_table}" + ) - # Write to Postgres + logger.info( + f"Adding tsvector trigger for column {tsvector_name}" + f" based on the following columns: {derived_from_cols}" + ) + derived_from_cols_str = ", ".join(derived_from_cols) + tsvector_trigger_sql = f""" + CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE + ON {self.temp_table} FOR EACH ROW EXECUTE PROCEDURE + tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', + {derived_from_cols_str}) + """ + cursor.execute(tsvector_trigger_sql) + logger.info(f"tsvector trigger for column {tsvector_name} added.") + + def _write_df( + self, + spark: SparkSession, + df: DataFrame, + options: dict, + postgres_seq_last_value: int | bool, + table_spec: QueryTableSpec, + ) -> None: use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( - f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" + f"LOAD (START): Loading data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" ) - try: if use_jdbc_inserts: self._write_with_jdbc_inserts( - spark, df, - temp_table, + self.temp_table, split_df_by_special_cols=True, - postgres_model=postgres_model, - postgres_cols=postgres_cols, + postgres_model=table_spec.model, + postgres_cols=self.postgres_cols, overwrite=False, ) else: - if not column_names: - raise RuntimeError("column_names None or empty, but are required to map CSV cols to table cols") + if not self.column_names: + raise RuntimeError( + "column_names None or empty, but are required to map CSV cols to table cols" + ) spark_s3_bucket_name = options["spark_s3_bucket"] self._write_with_sql_bulk_copy_csv( spark, df, - delta_db=destination_database, - delta_table_name=delta_table_name, - temp_table=temp_table, - ordered_col_names=column_names, spark_s3_bucket_name=spark_s3_bucket_name, keep_csv_files=True if options["keep_csv_files"] else False, ) @@ -359,24 +450,10 @@ def handle(self, *args, **options): logger.error( f"Command failed unexpectedly; resetting the sequence to previous value: {postgres_seq_last_value}" ) - self._set_sequence_value(table_spec["postgres_seq_name"], postgres_seq_last_value) - raise Exception(exc) - - logger.info( - f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" - ) - - # We're done with spark at this point - if spark_created_by_command: - spark.stop() - - if postgres_table: - logger.info( - f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" - f" metadata portion of the table download to a separate script. If not already done so," - f" please run the following additional command to complete the process: " - f" 'copy_table_metadata --source-table {postgres_table} --dest-table {temp_table}'." 
- ) + self._set_sequence_value( + table_spec.postgres_seq_name, postgres_seq_last_value + ) + raise exc def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: """ @@ -391,20 +468,18 @@ def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: with db.connection.cursor() as cursor: cursor.execute(f"SELECT last_value FROM {seq_name}") last_value = cursor.fetchone()[0] - cursor.execute(f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}") + cursor.execute( + f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}" + ) return last_value def _write_with_sql_bulk_copy_csv( self, spark: SparkSession, df: DataFrame, - delta_db: str, - delta_table_name: str, - temp_table: str, - ordered_col_names: List[str], spark_s3_bucket_name: str, - keep_csv_files=False, - ): + keep_csv_files: bool = False, + ) -> None: """ Write-from-delta-to-postgres strategy that relies on SQL bulk COPY of CSV files to Postgres. It uses the SQL COPY command on CSV files, which are created from the Delta table's underlying parquet files. @@ -448,10 +523,10 @@ def _write_with_sql_bulk_copy_csv( sub-folder of a "temp" folder. Be mindful of cleaning these up if setting to True. If False, the same output path is used for each write and nukes-and-paves the files in that output path. """ - csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{delta_db}/{delta_table_name}/" + csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{self.destination_database}/{self.delta_table_name}/" if keep_csv_files: csv_path = ( - f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{delta_db}/{delta_table_name}/" + f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{self.destination_database}/{self.delta_table_name}/" f"{datetime.strftime(datetime.utcnow(), '%Y%m%d%H%M%S')}/" ) s3_bucket_with_csv_path = f"s3a://{spark_s3_bucket_name}/{csv_path}" @@ -464,11 +539,15 @@ def _write_with_sql_bulk_copy_csv( aws_secret_access_key=CONFIG.AWS_SECRET_KEY.get_secret_value(), ) s3_resource = boto3_session.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}", ) else: s3_resource = boto3.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}", ) s3_bucket_name = spark_s3_bucket_name s3_bucket = s3_resource.Bucket(s3_bucket_name) @@ -476,15 +555,25 @@ def _write_with_sql_bulk_copy_csv( initial_size = sum(1 for _ in objs_collection) if initial_size > 0: - logger.info(f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}" + ) objs_collection.delete() post_delete_size = sum(1 for _ in objs_collection) - logger.info(f"LOAD: Finished deleting. {post_delete_size} objects remain in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished deleting. 
{post_delete_size} objects remain in {s3_bucket_with_csv_path}" + ) else: - logger.info(f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created") + logger.info( + f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created" + ) - logger.info(f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}") - df_no_arrays = convert_array_cols_to_string(df, is_postgres_array_format=True, is_for_csv_export=True) + logger.info( + f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}" + ) + df_no_arrays = convert_array_cols_to_string( + df, is_postgres_array_format=True, is_for_csv_export=True + ) df_no_arrays.write.options( maxRecordsPerFile=_SPARK_CSV_WRITE_TO_PG_MAX_RECORDS_PER_FILE, compression="gzip", @@ -493,18 +582,28 @@ def _write_with_sql_bulk_copy_csv( ignoreLeadingWhiteSpace=False, # must set for CSV write, as it defaults to true ignoreTrailingWhiteSpace=False, # must set for CSV write, as it defaults to true timestampFormat=CONFIG.SPARK_CSV_TIMEZONE_FORMAT, - ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv(s3_bucket_with_csv_path) + ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv( + s3_bucket_with_csv_path + ) logger.debug( f"Connecting to S3 at endpoint_url={CONFIG.AWS_S3_ENDPOINT}, region_name={CONFIG.AWS_REGION} to " f"get listing of contents of Bucket={spark_s3_bucket_name} with Prefix={csv_path}" ) - gzipped_csv_files = [f.key for f in s3_bucket.objects.filter(Prefix=csv_path) if f.key.endswith(".csv.gz")] + gzipped_csv_files = [ + f.key + for f in s3_bucket.objects.filter(Prefix=csv_path) + if f.key.endswith(".csv.gz") + ] file_count = len(gzipped_csv_files) - logger.info(f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}" + ) - logger.info(f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {self.temp_table} table" + ) db_dsn = get_database_dsn_string() with psycopg2.connect(dsn=db_dsn) as connection: @@ -518,7 +617,10 @@ def _write_with_sql_bulk_copy_csv( # fraction less than 1.0. The final value will be the greater of that or # SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS partitions = max( - ceil(max_parallel_workers * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER), + ceil( + max_parallel_workers + * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER + ), CONFIG.SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS, ) @@ -534,6 +636,8 @@ def _write_with_sql_bulk_copy_csv( # into the mapped function, its module, or an arg of it ... that is not pickle-able, this will throw an error. 
# One way to help is to resolve all arguments to primitive types (int, string) that can be passed # to the mapped function + temp_table = self.temp_table + ordered_col_names = self.column_names rdd.mapPartitionsWithIndex( lambda partition_idx, s3_obj_keys: copy_csvs_from_s3_to_pg( batch_num=partition_idx, @@ -547,18 +651,19 @@ def _write_with_sql_bulk_copy_csv( ), ).collect() - logger.info(f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {self.temp_table} table" + ) def _write_with_jdbc_inserts( self, - spark: SparkSession, df: DataFrame, temp_table: str, split_df_by_special_cols: bool = False, postgres_model: Optional[Model] = None, postgres_cols: Optional[Dict[str, str]] = None, overwrite: bool = False, - ): + ) -> None: """ Write-from-delta-to-postgres strategy that leverages the native Spark ``DataFrame.write.jdbc`` approach. This will issue a series of individual INSERT statements over a JDBC connection-per-executor. @@ -594,7 +699,10 @@ def _write_with_jdbc_inserts( # special handling. Get those columns and handle each. if split_df_by_special_cols: if postgres_model: - col_type_mapping = [(column.name, type(column)) for column in postgres_model._meta.get_fields()] + col_type_mapping = [ + (column.name, type(column)) + for column in postgres_model._meta.get_fields() + ] else: col_type_mapping = list(postgres_cols.items()) for column_name, column_type in col_type_mapping: @@ -609,14 +717,18 @@ def _write_with_jdbc_inserts( ) for i, split_df in enumerate(split_dfs): # Note: we're only appending here as we don't want to re-truncate or overwrite with multiple dataframes - logger.info(f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)") + logger.info( + f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)" + ) split_df.write.jdbc( url=get_usas_jdbc_url(), table=temp_table, mode=save_mode, properties=get_jdbc_connection_properties(), ) - logger.info(f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)") + logger.info( + f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)" + ) else: # Do it in one shot df.write.jdbc( @@ -625,3 +737,28 @@ def _write_with_jdbc_inserts( mode=save_mode, properties=get_jdbc_connection_properties(), ) + + def _finish( + self, + options: dict, + spark_created_by_command: bool, + spark: SparkSession, + ) -> None: + use_jdbc_inserts = options["jdbc_inserts"] + strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" + logger.info( + f"LOAD (FINISH): Loaded data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" + ) + + # We're done with spark at this point + if spark_created_by_command: + spark.stop() + + if self.postgres_table: + logger.info( + f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" + f" metadata portion of the table download to a separate script. If not already done so," + f" please run the following additional command to complete the process: " + f" 'copy_table_metadata --source-table {self.postgres_table} --dest-table {self.temp_table}'." 
+ ) diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index dd031c7115..13d9d425a0 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -1,307 +1,264 @@ import logging -from django.core.management import BaseCommand +from django.core.management import BaseCommand, CommandParser from usaspending_api.awards.delta_models import ( AWARDS_COLUMNS, - awards_sql_string, - FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, - financial_accounts_by_awards_sql_string, BROKER_SUBAWARDS_COLUMNS, + FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, + awards_sql_string, broker_subawards_sql_string, + financial_accounts_by_awards_sql_string, +) +from usaspending_api.awards.models import ( + Award, + FinancialAccountsByAwards, + TransactionFABS, + TransactionFPDS, + TransactionNormalized, +) +from usaspending_api.broker.delta_models.broker_zips import ( + ZIPS_COLUMNS, + zips_sql_string, +) +from usaspending_api.common.etl.spark import ( + extract_db_data_frame, + get_partition_bounds_sql, + load_delta_table, ) -from usaspending_api.broker.delta_models.broker_zips import ZIPS_COLUMNS, zips_sql_string -from usaspending_api.common.etl.spark import extract_db_data_frame, get_partition_bounds_sql, load_delta_table from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, + get_broker_jdbc_url, get_jdbc_connection_properties, get_usas_jdbc_url, - get_broker_jdbc_url, ) from usaspending_api.config import CONFIG +from usaspending_api.etl.table_specs import TableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_COLUMNS, - recipient_lookup_create_sql_string, - recipient_profile_create_sql_string, RECIPIENT_PROFILE_DELTA_COLUMNS, SAM_RECIPIENT_COLUMNS, + recipient_lookup_create_sql_string, + recipient_profile_create_sql_string, sam_recipient_create_sql_string, ) -from usaspending_api.search.models import TransactionSearch, AwardSearch +from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile +from usaspending_api.search.delta_models.award_search import ( + AWARD_SEARCH_COLUMNS, + award_search_create_sql_string, +) +from usaspending_api.search.models import AwardSearch, TransactionSearch from usaspending_api.transactions.delta_models import ( DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, - detached_award_procurement_create_sql_string, + PUBLISHED_FABS_COLUMNS, TRANSACTION_FABS_VIEW_COLUMNS, - transaction_fabs_sql_string, TRANSACTION_FPDS_VIEW_COLUMNS, - transaction_fpds_sql_string, TRANSACTION_NORMALIZED_COLUMNS, - transaction_normalized_sql_string, TRANSACTION_SEARCH_POSTGRES_COLUMNS, - transaction_search_create_sql_string, - PUBLISHED_FABS_COLUMNS, + detached_award_procurement_create_sql_string, published_fabs_create_sql_string, + transaction_fabs_sql_string, + transaction_fpds_sql_string, + transaction_normalized_sql_string, + transaction_search_create_sql_string, ) -from usaspending_api.transactions.models import SourceAssistanceTransaction -from usaspending_api.transactions.models import SourceProcurementTransaction -from usaspending_api.search.delta_models.award_search import award_search_create_sql_string, AWARD_SEARCH_COLUMNS - -from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile -from usaspending_api.awards.models import ( - Award, - FinancialAccountsByAwards, - TransactionFABS, - TransactionFPDS, - 
TransactionNormalized, +from usaspending_api.transactions.models import ( + SourceAssistanceTransaction, + SourceProcurementTransaction, ) logger = logging.getLogger(__name__) + TABLE_SPEC = { - "awards": { - "model": Award, - "is_from_broker": False, - "source_table": "vw_awards", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": awards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(AWARDS_COLUMNS), - "tsvectors": None, - }, - "detached_award_procurement": { - "model": SourceProcurementTransaction, - "is_from_broker": False, - "source_table": "source_procurement_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "detached_award_procurement_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": detached_award_procurement_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), - "tsvectors": None, - }, - "financial_accounts_by_awards": { - "model": FinancialAccountsByAwards, - "is_from_broker": False, - "source_table": "financial_accounts_by_awards", - "source_database": "public", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": financial_accounts_by_awards_sql_string, - "source_schema": None, - "custom_schema": "award_id LONG", - "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), - "tsvectors": None, - }, - "transaction_fabs": { - "model": TransactionFABS, - "is_from_broker": False, - "source_table": "vw_transaction_fabs", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fabs_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FABS_VIEW_COLUMNS, - "tsvectors": None, - }, - "published_fabs": { - "model": SourceAssistanceTransaction, - "is_from_broker": False, - "source_table": "source_assistance_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "published_fabs_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": published_fabs_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(PUBLISHED_FABS_COLUMNS), - "tsvectors": None, - }, - "transaction_fpds": { - "model": TransactionFPDS, - "is_from_broker": False, - "source_table": "vw_transaction_fpds", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fpds_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, - "tsvectors": None, - }, - "transaction_normalized": { - "model": TransactionNormalized, - "is_from_broker": False, 
- "source_table": "vw_transaction_normalized", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_normalized_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), - "tsvectors": None, - }, + "awards": TableSpec( + **{ + "model": Award, + "source_table": "vw_awards", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": awards_sql_string, + "column_names": list(AWARDS_COLUMNS), + } + ), + "detached_award_procurement": TableSpec( + **{ + "model": SourceProcurementTransaction, + "source_table": "source_procurement_transaction", + "source_database": "raw", + "destination_database": "raw", + "partition_column": "detached_award_procurement_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": detached_award_procurement_create_sql_string, + "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), + } + ), + "financial_accounts_by_awards": TableSpec( + **{ + "model": FinancialAccountsByAwards, + "source_table": "financial_accounts_by_awards", + "source_database": "public", + "destination_database": "raw", + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": financial_accounts_by_awards_sql_string, + "custom_schema": "award_id LONG", + "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), + } + ), + "transaction_fabs": TableSpec( + **{ + "model": TransactionFABS, + "source_table": "vw_transaction_fabs", + "source_database": "int", + "destination_database": "raw", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_fabs_sql_string, + "column_names": TRANSACTION_FABS_VIEW_COLUMNS, + } + ), + "published_fabs": TableSpec( + **{ + "model": SourceAssistanceTransaction, + "source_table": "source_assistance_transaction", + "source_database": "raw", + "destination_database": "raw", + "partition_column": "published_fabs_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": published_fabs_create_sql_string, + "column_names": list(PUBLISHED_FABS_COLUMNS), + } + ), + "transaction_fpds": TableSpec( + **{ + "model": TransactionFPDS, + "source_table": "vw_transaction_fpds", + "source_database": "int", + "destination_database": "raw", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_fpds_sql_string, + "custom_schema": "", + "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, + } + ), + "transaction_normalized": TableSpec( + **{ + "model": TransactionNormalized, + "source_table": "vw_transaction_normalized", + "source_database": "int", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_normalized_sql_string, + "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), + } + ), # Tables loaded in from the Broker - "subaward": { - "model": None, - "is_from_broker": True, - 
"source_table": "subaward", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": broker_subawards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(BROKER_SUBAWARDS_COLUMNS), - "tsvectors": None, - }, - "zips": { - "model": None, - "is_from_broker": True, - "source_table": "zips", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "zips_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": zips_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(ZIPS_COLUMNS), - "tsvectors": None, - }, + "subaward": TableSpec( + **{ + "is_from_broker": True, + "source_table": "subaward", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": broker_subawards_sql_string, + "column_names": list(BROKER_SUBAWARDS_COLUMNS), + } + ), + "zips": TableSpec( + **{ + "is_from_broker": True, + "source_table": "zips", + "destination_database": "raw", + "partition_column": "zips_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": zips_sql_string, + "column_names": list(ZIPS_COLUMNS), + } + ), # Additional definitions for use in testing; # These are copies of Views / Materialized Views / Tables from Postgres to Spark to aid in # data comparison between current Postgres data and the data transformed via Spark. - "award_search_testing": { - "model": AwardSearch, - "is_from_broker": False, - "source_table": "award_search", - "source_database": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "source_schema": None, - "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " - "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "tsvectors": None, - }, - "recipient_lookup_testing": { - "model": RecipientLookup, - "is_from_broker": False, - "source_table": "recipient_lookup", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": recipient_lookup_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_LOOKUP_COLUMNS), - "tsvectors": None, - }, - "recipient_profile_testing": { - "model": RecipientProfile, - "is_from_broker": False, - "source_table": "recipient_profile", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "delta_table_create_sql": recipient_profile_create_sql_string, - "is_partition_column_unique": True, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), - "tsvectors": None, - }, - "sam_recipient_testing": { - "model": DUNS, 
- "is_from_broker": False, - "source_table": "duns", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": None, - "partition_column_type": None, - "is_partition_column_unique": False, - "delta_table_create_sql": sam_recipient_create_sql_string, - "source_schema": None, - "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", - "column_names": list(SAM_RECIPIENT_COLUMNS), - "tsvectors": None, - }, - "transaction_search_testing": { - "model": TransactionSearch, - "is_from_broker": False, - "source_table": "transaction_search", - "source_database": None, - "destination_database": "test", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "tsvectors": None, - }, + "award_search_testing": TableSpec( + **{ + "model": AwardSearch, + "source_table": "award_search", + "destination_database": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " + "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", + "column_names": list(AWARD_SEARCH_COLUMNS), + } + ), + "recipient_lookup_testing": TableSpec( + **{ + "model": RecipientLookup, + "source_table": "recipient_lookup", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": recipient_lookup_create_sql_string, + "custom_schema": "recipient_hash STRING", + "column_names": list(RECIPIENT_LOOKUP_COLUMNS), + } + ), + "recipient_profile_testing": TableSpec( + **{ + "model": RecipientProfile, + "source_table": "recipient_profile", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "delta_table_create_sql": recipient_profile_create_sql_string, + "is_partition_column_unique": True, + "custom_schema": "recipient_hash STRING", + "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), + } + ), + "sam_recipient_testing": TableSpec( + **{ + "model": DUNS, + "source_table": "duns", + "source_database": "int", + "destination_database": "raw", + "delta_table_create_sql": sam_recipient_create_sql_string, + "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", + "column_names": list(SAM_RECIPIENT_COLUMNS), + } + ), + "transaction_search_testing": TableSpec( + **{ + "model": TransactionSearch, + "source_table": "transaction_search", + "destination_database": "test", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + } + ), } SPARK_PARTITION_ROWS = CONFIG.SPARK_PARTITION_ROWS @@ -315,7 +272,8 @@ class Command(BaseCommand): before new data is written. 
""" - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -337,7 +295,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -358,14 +316,14 @@ def handle(self, *args, **options): destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] - is_from_broker = table_spec["is_from_broker"] - destination_database = options["alt_db"] or table_spec["destination_database"] + is_from_broker = table_spec.is_from_broker + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table - source_table = table_spec["source_table"] - partition_column = table_spec["partition_column"] - partition_column_type = table_spec["partition_column_type"] - is_partition_column_unique = table_spec["is_partition_column_unique"] - custom_schema = table_spec["custom_schema"] + source_table = table_spec.source_table + partition_column = table_spec.partition_column + partition_column_type = table_spec.partition_column_type + is_partition_column_unique = table_spec.is_partition_column_unique + custom_schema = table_spec.custom_schema # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") @@ -374,9 +332,13 @@ def handle(self, *args, **options): # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not is_from_broker else get_broker_jdbc_url() if not jdbc_url: - raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.") + raise RuntimeError( + "Couldn't find JDBC url, please properly configure your CONFIG." + ) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." + ) # If a partition_column is present, read from jdbc using partitioning if partition_column: @@ -387,7 +349,9 @@ def handle(self, *args, **options): is_numeric_partitioning_col = False is_date_partitioning_col = True else: - raise ValueError("partition_column_type should be either 'numeric' or 'date'") + raise ValueError( + "partition_column_type should be either 'numeric' or 'date'" + ) # Read from table or view df = extract_db_data_frame( @@ -417,8 +381,8 @@ def handle(self, *args, **options): # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. 
- if table_spec.get("column_names"): - df = df.select(table_spec.get("column_names")) + if table_spec.column_names: + df = df.select(table_spec.column_names) # Write to S3 load_delta_table(spark, df, destination_table_name, True) diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py new file mode 100644 index 0000000000..d3fc405e7c --- /dev/null +++ b/usaspending_api/etl/table_specs.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass +from typing import Any, Callable, Literal + +from django.db import models +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType + + +@dataclass(kw_only=True) +class TableSpec: + destination_database: Literal["arc", "int", "raw", "rpt", "test"] + delta_table_create_sql: str | StructType + column_names: list[str] | None = None + model: models.Model | None = None + is_from_broker: bool = False + source_table: str | None = None + source_database: Literal["public", "int", "raw", "rpt"] | None = None + swap_table: str | None = None + swap_schema: str | None = None + partition_column: str | None = None + partition_column_type: Literal["date", "numeric"] | None = None + is_partition_column_unique: bool = False + source_schema: dict[str, str] | None = None + custom_schema: str = "" + delta_table_create_options: dict[str, str | bool] | None = None + delta_table_create_partitions: list[str] | None = None + tsvectors: dict[str, list[str]] | None = None + + +@dataclass(kw_only=True) +class QueryTableSpec(TableSpec): + source_query: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + source_query_incremental: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + postgres_seq_name: str | None = None + postgres_partition_spec: dict[str, Any] | None = None + + +@dataclass(kw_only=True) +class ArchiveTableSpec(TableSpec): + destination_table: str + archive_date_field: str diff --git a/usaspending_api/etl/tests/data/delta_model_for_test.py b/usaspending_api/etl/tests/data/delta_model_for_test.py index e4d63eb538..15841ed1fd 100644 --- a/usaspending_api/etl/tests/data/delta_model_for_test.py +++ b/usaspending_api/etl/tests/data/delta_model_for_test.py @@ -2,9 +2,13 @@ from django.db import models +from usaspending_api.etl.table_specs import TableSpec + class TestModel(models.Model): - id = models.IntegerField(primary_key=True, help_text="surrogate primary key defined in Broker") + id = models.IntegerField( + primary_key=True, help_text="surrogate primary key defined in Broker" + ) test_timestamp = models.DateTimeField(null=True, blank=True) class Meta: @@ -26,21 +30,17 @@ class Meta: """ TEST_TABLE_SPEC = { - "test_table": { - "model": TestModel, - "is_from_broker": False, - "source_table": "test_table", - "source_database": "temp", - "destination_database": "temp", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": TEST_TABLE_DELTA, - "source_schema": None, - "custom_schema": "", - "column_names": ["id", "test_timestamp"], - "tsvectors": None, - } + "test_table": TableSpec( + **{ + "model": TestModel, + "source_table": "test_table", + "source_database": "temp", + "destination_database": "temp", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": 
TEST_TABLE_DELTA, + "column_names": ["id", "test_timestamp"], + } + ) } diff --git a/usaspending_api/etl/tests/integration/test_create_delta_table.py b/usaspending_api/etl/tests/integration/test_create_delta_table.py index b36597868f..e7e4106f18 100644 --- a/usaspending_api/etl/tests/integration/test_create_delta_table.py +++ b/usaspending_api/etl/tests/integration/test_create_delta_table.py @@ -21,7 +21,7 @@ def _verify_delta_table_creation( delta_table_spec = TABLE_SPEC[delta_table_name] cmd_args = [f"--destination-table={delta_table_name}", f"--spark-s3-bucket={s3_bucket}"] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database if alt_db: cmd_args += [f"--alt-db={alt_db}"] expected_db_name = alt_db diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index dcd0b38ab8..c66b60fb18 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -225,11 +225,11 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 else: expected_table_name = delta_table_name.split(".")[-1] - partition_col = TABLE_SPEC[delta_table_name].get("partition_column") + partition_col = TABLE_SPEC[delta_table_name].partition_column if dummy_data is None: # get the postgres data to compare - model = TABLE_SPEC[delta_table_name]["model"] - is_from_broker = TABLE_SPEC[delta_table_name]["is_from_broker"] + model = TABLE_SPEC[delta_table_name].model + is_from_broker = TABLE_SPEC[delta_table_name].is_from_broker if delta_table_name == "summary_state_view": dummy_query = f"SELECT * from {expected_table_name}" if partition_col is not None: @@ -243,7 +243,7 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 elif is_from_broker: # model can be None if loading from the Broker broker_connection = connections[settings.BROKER_DB_ALIAS] - source_broker_name = TABLE_SPEC[delta_table_name]["source_table"] + source_broker_name = TABLE_SPEC[delta_table_name].source_table with broker_connection.cursor() as cursor: dummy_query = f"SELECT * from {source_broker_name}" if partition_col is not None: @@ -266,7 +266,7 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 assert equal_datasets( dummy_data, received_data, - TABLE_SPEC[delta_table_name]["custom_schema"], + TABLE_SPEC[delta_table_name].custom_schema, ignore_fields, ) @@ -304,9 +304,10 @@ def verify_delta_table_loaded_from_delta( call_command(load_command, *cmd_args) # get the postgres data to compare + source_table = ( - TABLE_SPEC[delta_table_name]["source_table"] - or TABLE_SPEC[delta_table_name]["swap_table"] + TABLE_SPEC[delta_table_name].source_table + or TABLE_SPEC[delta_table_name].swap_table ) temp_schema = "temp" if source_table: @@ -314,7 +315,7 @@ def verify_delta_table_loaded_from_delta( else: tmp_table_name = f"{temp_schema}.{expected_table_name}_temp" postgres_query = f"SELECT * FROM {tmp_table_name}" - partition_col = TABLE_SPEC[delta_table_name]["partition_column"] + partition_col = TABLE_SPEC[delta_table_name].partition_column if partition_col is not None: postgres_query = f"{postgres_query} ORDER BY {partition_col}" with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -331,7 +332,7 @@ def verify_delta_table_loaded_from_delta( assert equal_datasets( postgres_data, delta_data, - TABLE_SPEC[delta_table_name]["custom_schema"], + TABLE_SPEC[delta_table_name].custom_schema, 
ignore_fields=ignore_fields, ) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py index bf74f2e3fc..2a897a4b07 100644 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py @@ -5,22 +5,34 @@ from copy import deepcopy from datetime import datetime, timedelta, timezone + from django.core.management import call_command from model_bakery import baker from pytest import mark -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date -from usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) +from usaspending_api.config import CONFIG +from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( _BEGINNING_OF_TIME, _INITIAL_SOURCE_TABLE_LOAD_DATETIME, _InitialRunWithPostgresLoader, _TableLoadInfo, +) +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( TestInitialRun as InitialRun, # Remove 'test' prefix to avoid pytest running these tests twice - TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, # Remove 'test' prefix to avoid pytest running these tests twice ) -from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( + # Remove 'test' prefix to avoid pytest running these tests twice + TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, +) class _TransactionFabsFpdsCore: @@ -60,7 +72,7 @@ def unexpected_paths_source_tables_only_test_core(self): self.spark.sql(f"create database if not exists {raw_db};") self.spark.sql(f"use {raw_db};") self.spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=self.s3_data_bucket, @@ -68,7 +80,7 @@ def unexpected_paths_source_tables_only_test_core(self): ) ) self.spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=self.s3_data_bucket, @@ -92,7 +104,9 @@ def unexpected_paths_source_tables_only_test_core(self): } # Even though nothing will have been loaded to that table, the table whose etl_level has been called will # have its last load date set to the date of the source tables' load. - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify(self.spark, [], [], **kwargs) # 2. 
With raw.transaction_normalized and raw.awards still not created, call load_transactions_in_delta @@ -102,18 +116,26 @@ def unexpected_paths_source_tables_only_test_core(self): # need to reset the last load date on transaction_fabs update_last_load_date(self.etl_level, _BEGINNING_OF_TIME) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, # but all of the transaction ids should be 1 larger than expected there. - expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) for item in expected_transaction_id_lookup: item["transaction_id"] += 1 # Also, the last load date of the transaction_id_lookup table and of the table whose etl_level is being # called should be updated to the load time of the source tables - kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_transaction_id_lookup, @@ -130,20 +152,31 @@ def unexpected_paths_source_tables_only_test_core(self): delta_data = [row.asDict() for row in self.spark.sql(query).collect()] if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) for item in expected_transaction_fabs_fpds: item["transaction_id"] += 1 assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") def unexpected_paths_test_core( - self, load_other_raw_tables, expected_initial_transaction_id_lookup, expected_initial_award_id_lookup + self, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, ): # 1. Call load_transactions_in_delta with etl-level of initial_run first, making sure to load # raw.transaction_normalized along with the source tables, but don't copy the raw tables to int. # Then immediately call load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. 
- InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables, initial_copy=False) + InitialRun.initial_run( + self.s3_data_bucket, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Even without the call to load_transactions_in_delta with etl-level of transaction_id_lookup, the appropriate @@ -157,7 +190,9 @@ def unexpected_paths_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_initial_transaction_id_lookup, @@ -172,9 +207,13 @@ def unexpected_paths_test_core( query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" delta_data = [row.asDict() for row in self.spark.sql(query).collect()] if len(self.expected_initial_transaction_fabs) > 0: - assert equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) # 2. Test inserting, updating, and deleting without calling load_transactions_in_delta with etl-level # of transaction_id_lookup before calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. @@ -233,9 +272,13 @@ def unexpected_paths_test_core( # However, this call should *NOT* pick up the inserts or deletes, since those transactions will not # have changed in the transaction_id_lookup table. if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") @@ -264,8 +307,12 @@ def happy_paths_test_core( ): # 1, Test calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s after calling with # etl-levels of initial_run and transaction_id_lookup. - InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + InitialRun.initial_run( + self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables + ) + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Verify the tables. The transaction and award id lookup tables should be the same as during the initial run. 
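A minimal sketch of the etl-level ordering these test hunks exercise (illustrative only, not part of the patch; the bucket name stands in for the s3_unittest_data_bucket fixture used by the tests):

    from django.core.management import call_command

    # Placeholder bucket name; in the tests this value comes from the
    # s3_unittest_data_bucket fixture.
    spark_s3_bucket = "unittest-data-bucket"

    # initial_run seeds the raw/int tables, transaction_id_lookup assigns the
    # surrogate transaction ids, and only then can the per-table etl level
    # (transaction_fabs or transaction_fpds) load its int-schema table.
    call_command(
        "load_transactions_in_delta",
        "--etl-level", "initial_run",
        "--spark-s3-bucket", spark_s3_bucket,
    )
    call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup")
    call_command("load_transactions_in_delta", "--etl-level", "transaction_fabs")
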
@@ -277,7 +324,9 @@ def happy_paths_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_initial_transaction_id_lookup, @@ -289,14 +338,23 @@ def happy_paths_test_core( ) # Verify key fields in transaction_fabs table - transaction_fabs_fpds_query = ( - f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - ) - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] + transaction_fabs_fpds_query = f""" + SELECT {', '.join(self.compare_fields)} + FROM int.{self.etl_level} + ORDER BY {self.pk_field} + """ + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] if len(self.expected_initial_transaction_fabs) > 0: - assert equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) # 2. Test inserting, updating, and deleting records followed by calling load_transactions_in_delta with # etl-levels of transaction_id_lookup and then transaction_f[ab|pd]s. @@ -364,14 +422,18 @@ def happy_paths_test_core( ) # Need to load changes into the transaction_id_lookup table. - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) for pop_index in expected_transaction_id_lookup_pops: expected_transaction_id_lookup.pop(pop_index) expected_transaction_id_lookup_append.update( @@ -383,26 +445,41 @@ def happy_paths_test_core( assert equal_datasets(expected_transaction_id_lookup, delta_data, "") # Verify country code scalar transformation - query = f"SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" + query = f""" + SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ delta_data = [row.asDict() for row in self.spark.sql(query).collect()] assert len(delta_data) == 1 assert delta_data[0]["legal_entity_country_code"] == "USA" assert delta_data[0]["place_of_perform_country_c"] == "USA" # Verify country name scalar transformation - query = f"SELECT DISTINCT legal_entity_country_name, place_of_perform_country_n FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" + query = f""" + SELECT DISTINCT legal_entity_country_name, place_of_perform_country_n + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ delta_data = [row.asDict() for row in 
self.spark.sql(query).collect()] assert len(delta_data) == 1 assert delta_data[0]["legal_entity_country_name"] == "UNITED STATES" assert delta_data[0]["place_of_perform_country_n"] == "UNITED STATES" # Verify key fields in transaction_f[ab|pd]s table - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) expected_transaction_fabs_fpds.pop(1) expected_transaction_fabs_fpds.pop(1) expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime @@ -420,7 +497,10 @@ def happy_paths_test_core( # Verify that the last_load_dates of the transaction_id_lookup table and the table whose etl_level has been # called did NOT change, since only one of the broker source tables' last load date was changed. - assert get_last_load_date("transaction_id_lookup") == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + assert ( + get_last_load_date("transaction_id_lookup") + == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) assert get_last_load_date(self.etl_level) == _INITIAL_SOURCE_TABLE_LOAD_DATETIME def happy_paths_no_pg_loader_test_core( @@ -442,7 +522,9 @@ def happy_paths_no_pg_loader_test_core( self.etl_level, initial_transaction_fabs_fpds, ), - _TableLoadInfo(self.spark, "awards", InitialRunNoPostgresLoader.initial_awards), + _TableLoadInfo( + self.spark, "awards", InitialRunNoPostgresLoader.initial_awards + ), ), InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, InitialRunNoPostgresLoader.expected_initial_award_id_lookup, @@ -459,7 +541,9 @@ class TestTransactionFabs: usas_source_table_name = "published_fabs" broker_source_table_name = "source_assistance_transaction" baker_table = "transactions.SourceAssistanceTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[0].keys() + compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[ + 0 + ].keys() new_afa_generated_unique = "award_assist_0004_trans_0001" new_unique_award_key = "award_assist_0004" baker_kwargs = { @@ -479,7 +563,9 @@ class TestTransactionFabs: "unique_award_key": new_unique_award_key.upper(), } - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_initial_transaction_fabs): + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fabs + ): return _TransactionFabsFpdsCore( spark, s3_data_bucket, @@ -496,28 +582,46 @@ def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_i @mark.django_db(transaction=True) def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fabs + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fabs, ) 
transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() @mark.django_db(transaction=True) def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, ) transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() @mark.django_db(transaction=True) def test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, ) transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( InitialRunNoPostgresLoader.initial_transaction_fabs, @@ -534,7 +638,9 @@ class TestTransactionFpds: usas_source_table_name = "detached_award_procurement" broker_source_table_name = "source_procurement_transaction" baker_table = "transactions.SourceProcurementTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fpds[0].keys() + compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fpds[ + 0 + ].keys() new_detached_award_proc_unique = "award_procure_0004_trans_0001" new_unique_award_key = "award_procure_0004" baker_kwargs = { @@ -552,7 +658,9 @@ class TestTransactionFpds: "unique_award_key": new_unique_award_key.upper(), } - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_initial_transaction_fpds): + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fpds + ): return _TransactionFabsFpdsCore( spark, s3_data_bucket, @@ -569,28 +677,46 @@ def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_i @mark.django_db(transaction=True) def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fpds + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fpds, ) transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() @mark.django_db(transaction=True) def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, ) 
transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() @mark.django_db(transaction=True) def test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, ) transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( InitialRunNoPostgresLoader.initial_transaction_fpds, diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py index 978cfa345d..b623129860 100644 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py @@ -3,29 +3,41 @@ NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py """ -import dateutil import re -import pyspark - from copy import deepcopy from dataclasses import dataclass from datetime import datetime, timedelta, timezone -from django.db import connection +from typing import Any, Dict, Optional, Sequence +from unittest.mock import patch + +import dateutil +import pyspark from django.core.management import call_command +from django.db import connection from model_bakery import baker from pyspark.sql import SparkSession from pytest import mark, raises -from typing import Any, Dict, Optional, Sequence -from unittest.mock import patch -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) from usaspending_api.common.helpers.spark_helpers import load_dict_to_delta_table -from usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets -from usaspending_api.transactions.delta_models.transaction_fabs import TRANSACTION_FABS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_fpds import TRANSACTION_FPDS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_normalized import TRANSACTION_NORMALIZED_COLUMNS from usaspending_api.config import CONFIG from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) +from usaspending_api.transactions.delta_models.transaction_fabs import ( + TRANSACTION_FABS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_fpds import ( + TRANSACTION_FPDS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_normalized import ( + TRANSACTION_NORMALIZED_COLUMNS, +) _BEGINNING_OF_TIME = datetime(1970, 1, 1, tzinfo=timezone.utc) _INITIAL_DATETIME = datetime(2022, 10, 31, tzinfo=timezone.utc) @@ -144,7 +156,9 @@ class _TableLoadInfo: overwrite: Optional[bool] = False -def _load_tables_to_delta(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None): +def _load_tables_to_delta( + s3_data_bucket, load_source_tables=True, load_other_raw_tables=None +): if load_source_tables: 
load_delta_table_from_postgres("published_fabs", s3_data_bucket) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) @@ -152,22 +166,42 @@ def _load_tables_to_delta(s3_data_bucket, load_source_tables=True, load_other_ra if load_other_raw_tables: for item in load_other_raw_tables: if isinstance(item, _TableLoadInfo): - load_dict_to_delta_table(item.spark, s3_data_bucket, "raw", item.table_name, item.data, item.overwrite) + load_dict_to_delta_table( + item.spark, + s3_data_bucket, + "raw", + item.table_name, + item.data, + item.overwrite, + ) else: load_delta_table_from_postgres(item, s3_data_bucket) class TestInitialRun: @staticmethod - def initial_run(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None, initial_copy=True): + def initial_run( + s3_data_bucket, + load_source_tables=True, + load_other_raw_tables=None, + initial_copy=True, + ): _load_tables_to_delta(s3_data_bucket, load_source_tables, load_other_raw_tables) - call_params = ["load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_data_bucket] + call_params = [ + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_data_bucket, + ] if not initial_copy: call_params.append("--no-initial-copy") call_command(*call_params) @staticmethod - def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_load=None): + def verify_transaction_ids( + spark, expected_transaction_id_lookup, expected_last_load=None + ): # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] @@ -180,7 +214,10 @@ def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_ max_transaction_id = cursor.fetchone()[0] if expected_transaction_id_lookup: assert max_transaction_id == max( - [transaction["transaction_id"] for transaction in expected_transaction_id_lookup] + [ + transaction["transaction_id"] + for transaction in expected_transaction_id_lookup + ] ) else: assert max_transaction_id == 1 @@ -188,12 +225,16 @@ def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_ # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value. 
with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) @staticmethod def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None): # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] assert equal_datasets(expected_award_id_lookup, delta_data, "") @@ -203,7 +244,9 @@ def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None): # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_award_id = cursor.fetchone()[0] if expected_award_id_lookup: - assert max_award_id == max([award["award_id"] for award in expected_award_id_lookup]) + assert max_award_id == max( + [award["award_id"] for award in expected_award_id_lookup] + ) else: assert max_award_id == 1 @@ -221,9 +264,13 @@ def verify_lookup_info( expected_load_load_award_id_lookup=None, ): TestInitialRun.verify_transaction_ids( - spark, expected_transaction_id_lookup, expected_last_load_transaction_id_lookup + spark, + expected_transaction_id_lookup, + expected_last_load_transaction_id_lookup, + ) + TestInitialRun.verify_award_ids( + spark, expected_award_id_lookup, expected_load_load_award_id_lookup ) - TestInitialRun.verify_award_ids(spark, expected_award_id_lookup, expected_load_load_award_id_lookup) @staticmethod def verify_raw_vs_int_tables(spark, table_name, col_names): @@ -269,11 +316,13 @@ def verify( ) # int.award_ids_delete_modified should exist, but be empty - actual_count = spark.sql("SELECT COUNT(*) AS count from int.award_ids_delete_modified").collect()[0]["count"] + actual_count = spark.sql( + "SELECT COUNT(*) AS count from int.award_ids_delete_modified" + ).collect()[0]["count"] assert actual_count == 0 # Make sure int.transaction_[normalized,fabs,fpds] tables have been created and have the expected sizes. 
- for table_name, expected_count, expected_last_load, col_names in zip( + for table_name, expected_count, _expected_last_load, col_names in zip( (f"transaction_{t}" for t in ("normalized", "fabs", "fpds")), (expected_normalized_count, expected_fabs_count, expected_fpds_count), ( @@ -281,9 +330,16 @@ def verify( expected_last_load_transaction_fabs, expected_last_load_transaction_fpds, ), - (list(TRANSACTION_NORMALIZED_COLUMNS), TRANSACTION_FABS_COLUMNS, TRANSACTION_FPDS_COLUMNS), + ( + list(TRANSACTION_NORMALIZED_COLUMNS), + TRANSACTION_FABS_COLUMNS, + TRANSACTION_FPDS_COLUMNS, + ), + strict=False, ): - actual_count = spark.sql(f"SELECT COUNT(*) AS count from int.{table_name}").collect()[0]["count"] + actual_count = spark.sql( + f"SELECT COUNT(*) AS count from int.{table_name}" + ).collect()[0]["count"] assert actual_count == expected_count if expected_count > 0: @@ -300,16 +356,20 @@ def verify( else: raise e else: - TestInitialRun.verify_raw_vs_int_tables(spark, table_name, col_names) + TestInitialRun.verify_raw_vs_int_tables( + spark, table_name, col_names + ) @mark.django_db(transaction=True) - def test_edge_cases_using_only_source_tables(self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db): + def test_edge_cases_using_only_source_tables( + self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db + ): # Setup some source tables without data, this test does not require these tables to be populated raw_db = "raw" spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -317,7 +377,7 @@ def test_edge_cases_using_only_source_tables(self, spark, s3_unittest_data_bucke ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -349,16 +409,23 @@ class _InitialRunWithPostgresLoader: { "transaction_id": id, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[id - 1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[id - 1][ + "afa_generated_unique" + ].upper(), } for id in range(1, len(_INITIAL_ASSISTS) + 1) ] + [ { "transaction_id": id, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[id - 6]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[id - 6][ + "detached_award_proc_unique" + ].upper(), } - for id in range(len(_INITIAL_ASSISTS) + 1, len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) + for id in range( + len(_INITIAL_ASSISTS) + 1, + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1, + ) ] expected_initial_award_id_lookup = [ @@ -373,7 +440,12 @@ class _InitialRunWithPostgresLoader: { "award_id": ( int(procure["unique_award_key"].split("_")[-1]) - + max([int(assist["unique_award_key"].split("_")[-1]) for assist in _INITIAL_ASSISTS]) + + max( + [ + int(assist["unique_award_key"].split("_")[-1]) + for assist in _INITIAL_ASSISTS + ] + ) ), "is_fpds": True, "transaction_unique_id": procure["detached_award_proc_unique"].upper(), @@ -385,7 +457,9 @@ class _InitialRunWithPostgresLoader: expected_initial_transaction_fabs = [ { **assist, - "action_date": 
dateutil.parser.parse(assist["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), "afa_generated_unique": assist["afa_generated_unique"].upper(), "transaction_id": assist["published_fabs_id"], "unique_award_key": assist["unique_award_key"].upper(), @@ -396,9 +470,12 @@ class _InitialRunWithPostgresLoader: expected_initial_transaction_fpds = [ { **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), - "transaction_id": procure["detached_award_procurement_id"] + len(_INITIAL_ASSISTS), + "transaction_id": procure["detached_award_procurement_id"] + + len(_INITIAL_ASSISTS), "unique_award_key": procure["unique_award_key"].upper(), } for procure in _INITIAL_PROCURES @@ -410,52 +487,72 @@ class TestInitialRunNoPostgresLoader: { "transaction_id": 1, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 2, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 3, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 6, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 7, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 8, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 9, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 10, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), }, ] @@ -463,62 +560,102 @@ class TestInitialRunNoPostgresLoader: { "award_id": 1, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": 
_INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), }, { "award_id": 2, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), }, { "award_id": 2, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[2][ + "unique_award_key" + ].upper(), }, { "award_id": 3, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), }, { "award_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), }, { "award_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[2]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[2][ + "unique_award_key" + ].upper(), }, { "award_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), }, { "award_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[4][ + "unique_award_key" + ].upper(), }, { "award_id": 6, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), }, { "award_id": 6, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[4]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[4][ + "unique_award_key" + ].upper(), }, ] @@ -528,49 +665,73 @@ class TestInitialRunNoPostgresLoader: { "id": 1, "update_date": 
initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 2, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 3, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, { "id": 4, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, { "id": 5, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 6, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, ] @@ -580,9 +741,13 @@ class TestInitialRunNoPostgresLoader: "id": 1, "award_id": 1, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[0]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[0]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), @@ -591,9 +756,13 @@ class TestInitialRunNoPostgresLoader: "id": 2, "award_id": 3, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[0]["action_date"]).date(), + 
"action_date": dateutil.parser.parse( + _INITIAL_PROCURES[0]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[0]["unique_award_key"].upper(), @@ -602,9 +771,13 @@ class TestInitialRunNoPostgresLoader: "id": 3, "award_id": 2, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[1]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[1]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), @@ -613,9 +786,13 @@ class TestInitialRunNoPostgresLoader: "id": 4, "award_id": 4, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[1]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[1]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[1]["unique_award_key"].upper(), @@ -624,9 +801,13 @@ class TestInitialRunNoPostgresLoader: "id": 5, "award_id": 2, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[2]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[2]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), @@ -635,9 +816,13 @@ class TestInitialRunNoPostgresLoader: "id": 6, "award_id": 4, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[2]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[2]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[2]["unique_award_key"].upper(), @@ -646,9 +831,13 @@ class TestInitialRunNoPostgresLoader: "id": 7, "award_id": 5, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + 
"transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), @@ -657,9 +846,13 @@ class TestInitialRunNoPostgresLoader: "id": 8, "award_id": 5, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[4]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), @@ -668,9 +861,13 @@ class TestInitialRunNoPostgresLoader: "id": 9, "award_id": 6, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), @@ -679,9 +876,13 @@ class TestInitialRunNoPostgresLoader: "id": 10, "award_id": 6, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), @@ -691,7 +892,9 @@ class TestInitialRunNoPostgresLoader: initial_transaction_fabs = [ { **assist, - "action_date": dateutil.parser.parse(assist["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), "afa_generated_unique": assist["afa_generated_unique"].upper(), "transaction_id": (assist["published_fabs_id"] - 1) * 2 + 1, "unique_award_key": assist["unique_award_key"].upper(), @@ -700,7 +903,9 @@ class TestInitialRunNoPostgresLoader: ] + [ { **_INITIAL_ASSISTS[4], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]) + .date() + .isoformat(), "afa_generated_unique": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), "transaction_id": 8, "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), @@ -710,7 +915,9 @@ class TestInitialRunNoPostgresLoader: initial_transaction_fpds = [ { **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), "transaction_id": procure["detached_award_procurement_id"] * 2, 
"unique_award_key": procure["unique_award_key"].upper(), @@ -719,15 +926,23 @@ class TestInitialRunNoPostgresLoader: ] + [ { **_INITIAL_PROCURES[3], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "transaction_id": 9, "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), }, { **_INITIAL_PROCURES[4], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), "transaction_id": 10, "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), }, @@ -736,7 +951,9 @@ class TestInitialRunNoPostgresLoader: # This test will only load the source tables from postgres, and NOT use the Postgres transaction loader # to populate any other Delta tables, so can only test for NULLs originating in Delta. @mark.django_db(transaction=True) - @patch("usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions") + @patch( + "usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions" + ) def test_nulls_in_trans_norm_unique_award_key_from_delta( self, orphaned_txns_patch, @@ -749,7 +966,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -757,7 +974,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -765,7 +982,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( ) ) spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -808,9 +1025,16 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( """ ) - with raises(ValueError, match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!"): + with raises( + ValueError, + match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!", + ): call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, ) spark.sql( @@ -849,19 +1073,32 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( """ ) - with raises(ValueError, match="Found 2 
NULLs in 'unique_award_key' in table raw.transaction_normalized!"): + with raises( + ValueError, + match="Found 2 NULLs in 'unique_award_key' in table raw.transaction_normalized!", + ): call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, ) @mark.django_db(transaction=True) def test_happy_path_scenarios( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ - _TableLoadInfo(spark, "transaction_normalized", self.initial_transaction_normalized), + _TableLoadInfo( + spark, "transaction_normalized", self.initial_transaction_normalized + ), _TableLoadInfo(spark, "awards", self.initial_awards), ] # Setup some source tables with data, without loading these Delta Tables from Postgres @@ -870,7 +1107,7 @@ def test_happy_path_scenarios( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -878,7 +1115,7 @@ def test_happy_path_scenarios( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -915,7 +1152,10 @@ def test_happy_path_scenarios( "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } TestInitialRun.verify( - spark, self.expected_initial_transaction_id_lookup, self.expected_initial_award_id_lookup, **kwargs + spark, + self.expected_initial_transaction_id_lookup, + self.expected_initial_award_id_lookup, + **kwargs, ) # 2. Call initial_run with initial-copy, and have all raw tables populated @@ -927,10 +1167,18 @@ def test_happy_path_scenarios( _TableLoadInfo(spark, "transaction_fpds", self.initial_transaction_fpds), ] # Don't call Postgres loader or re-load the source tables, though. 
- TestInitialRun.initial_run(s3_unittest_data_bucket, False, load_other_raw_tables) - kwargs["expected_last_load_transaction_normalized"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fabs"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fpds"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + TestInitialRun.initial_run( + s3_unittest_data_bucket, False, load_other_raw_tables + ) + kwargs["expected_last_load_transaction_normalized"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fabs"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fpds"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify( spark, self.expected_initial_transaction_id_lookup, @@ -945,7 +1193,11 @@ def test_happy_path_scenarios( class TestTransactionIdLookup: @mark.django_db(transaction=True) def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Setup some source tables with data, without loading these Delta Tables from Postgres # for efficiency reasons. @@ -953,7 +1205,7 @@ def test_unexpected_paths( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -961,7 +1213,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -991,7 +1243,7 @@ def test_unexpected_paths( # First, create blank raw.transaction_normalized and raw.awards tables spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -999,7 +1251,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( + TABLE_SPEC["awards"].delta_table_create_sql.format( DESTINATION_TABLE="awards", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1009,7 +1261,9 @@ def test_unexpected_paths( # Then, call load_transactions_in_delta with etl-level of initial_run and verify. # Don't reload the source tables, and don't do initial copy of transaction tables, though. - TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) kwargs = { "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, @@ -1020,16 +1274,22 @@ def test_unexpected_paths( TestInitialRun.verify(spark, [], [], **kwargs) # Then, call load_transactions_in_delta with etl-level of transaction_id_lookup. 
- call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, # but all of the transaction ids should be 1 larger than expected there. - expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) for item in expected_transaction_id_lookup: item["transaction_id"] += 1 # Also, the last load date for the transaction_id_lookup table should be updated to the date of the # initial loads. - kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify(spark, expected_transaction_id_lookup, [], **kwargs) @staticmethod @@ -1047,7 +1307,7 @@ def _happy_path_test_core( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1055,7 +1315,7 @@ def _happy_path_test_core( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1080,7 +1340,10 @@ def _happy_path_test_core( ) # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, ) # 1. Test deleting the transaction(s) with the last transaction ID(s) from the appropriate raw table, @@ -1094,13 +1357,17 @@ def _happy_path_test_core( WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 """ ) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) expected_transaction_id_lookup.pop() expected_transaction_id_lookup.pop() assert equal_datasets(expected_transaction_id_lookup, delta_data, "") @@ -1115,7 +1382,9 @@ def _happy_path_test_core( # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value as previously. with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) # 3. 
Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of # transaction_id_lookup. @@ -1126,18 +1395,28 @@ def _happy_path_test_core( insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) load_delta_table_from_postgres("published_fabs", s3_data_bucket) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) expected_transaction_id_lookup.pop() expected_transaction_id_lookup.pop() @@ -1160,19 +1439,28 @@ def _happy_path_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - TestInitialRun.verify(spark, expected_transaction_id_lookup, expected_initial_award_id_lookup, **kwargs) + TestInitialRun.verify( + spark, + expected_transaction_id_lookup, + expected_initial_award_id_lookup, + **kwargs, + ) # Also, make sure transaction_id_seq hasn't gone backwards with connection.cursor() as cursor: cursor.execute("SELECT nextval('transaction_id_seq')") # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_transaction_id = cursor.fetchone()[0] - assert max_transaction_id == (len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) # Add one for the insert + assert max_transaction_id == ( + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1 + ) # Add one for the insert # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value as previously. with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of # transaction_id_lookup, and test that the results are as expected. 
@@ -1180,10 +1468,16 @@ def _happy_path_test_core( insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) spark.sql( @@ -1199,7 +1493,9 @@ def _happy_path_test_core( """ ) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" @@ -1211,7 +1507,9 @@ def _happy_path_test_core( { "transaction_id": 12, "is_fpds": True, - "transaction_unique_id": _NEW_PROCURE["detached_award_proc_unique"].upper(), + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), } ) assert equal_datasets(expected_transaction_id_lookup, delta_data, "") @@ -1220,15 +1518,23 @@ def _happy_path_test_core( @mark.django_db(transaction=True) def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards ), - _TableLoadInfo(spark, "awards", TestInitialRunNoPostgresLoader.initial_awards), ] self._happy_path_test_core( @@ -1244,7 +1550,11 @@ def test_happy_path_scenarios_no_pg_loader( class TestAwardIdLookup: @mark.django_db(transaction=True) def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # First, setup some source tables with data, without loading these Delta Tables from Postgres # for efficiency reasons. 
@@ -1252,7 +1562,7 @@ def test_unexpected_paths( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1260,7 +1570,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1290,7 +1600,7 @@ def test_unexpected_paths( # First, create blank raw.transaction_normalized and raw.awards tables spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1298,7 +1608,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( + TABLE_SPEC["awards"].delta_table_create_sql.format( DESTINATION_TABLE="awards", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1308,7 +1618,9 @@ def test_unexpected_paths( # Then, call load_transactions_in_delta with etl-level of initial_run and verify. # Don't reload the source tables, and don't do initial copy of transaction tables, though. - TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) kwargs = { "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, @@ -1323,11 +1635,15 @@ def test_unexpected_paths( # The expected award_id_lookup table should be the same as in TestInitialRunWithPostgresLoader, # but all of the award ids should be 1 larger than expected there. - expected_award_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_award_id_lookup) + expected_award_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_award_id_lookup + ) for item in expected_award_id_lookup: item["award_id"] += 1 # Also, the last load date for the award_id_lookup table should be updated to the date of the initial loads. 
- kwargs["expected_last_load_award_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_award_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify(spark, [], expected_award_id_lookup, **kwargs) @staticmethod @@ -1346,7 +1662,7 @@ def _happy_path_test_core( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1354,7 +1670,7 @@ def _happy_path_test_core( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1379,7 +1695,10 @@ def _happy_path_test_core( ) # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, ) # 1. Test deleting the transactions with the last award ID from the appropriate raw table, @@ -1400,15 +1719,23 @@ def _happy_path_test_core( insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) load_delta_table_from_postgres("published_fabs", s3_data_bucket) call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] expected_award_id_lookup = deepcopy(expected_initial_award_id_lookup) @@ -1435,7 +1762,12 @@ def _happy_path_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - TestInitialRun.verify(spark, expected_initial_transaction_id_lookup, expected_award_id_lookup, **kwargs) + TestInitialRun.verify( + spark, + expected_initial_transaction_id_lookup, + expected_award_id_lookup, + **kwargs, + ) # Make sure award_id_seq hasn't gone backwards with connection.cursor() as cursor: @@ -1443,7 +1775,11 @@ def _happy_path_test_core( # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_award_id = cursor.fetchone()[0] assert ( - max_award_id == max([award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards]) + 1 + max_award_id + == max( + [award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards] + ) + + 1 ) # Add one for the insert # Since this test just called nextval(), need to reset 
the sequence with the is_called flag set to false @@ -1458,10 +1794,16 @@ def _happy_path_test_core( insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) spark.sql( @@ -1480,7 +1822,9 @@ def _happy_path_test_core( call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] for pop in expected_award_id_lookup_pops: @@ -1489,7 +1833,9 @@ def _happy_path_test_core( { "award_id": 8, "is_fpds": True, - "transaction_unique_id": _NEW_PROCURE["detached_award_proc_unique"].upper(), + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), "generated_unique_award_id": _NEW_PROCURE["unique_award_key"].upper(), } ) @@ -1500,19 +1846,29 @@ def _happy_path_test_core( # Verify award_ids_delete_modified table query = "SELECT * FROM int.award_ids_delete_modified ORDER BY award_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - assert equal_datasets([{"award_id": partially_deleted_award_id}], delta_data, "") + assert equal_datasets( + [{"award_id": partially_deleted_award_id}], delta_data, "" + ) @mark.django_db(transaction=True) def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards ), - _TableLoadInfo(spark, "awards", TestInitialRunNoPostgresLoader.initial_awards), ] self._happy_path_test_core( diff --git a/usaspending_api/etl/tests/unit/test_spark.py b/usaspending_api/etl/tests/unit/test_spark.py deleted file mode 100644 index 3d1c6cf0a3..0000000000 --- a/usaspending_api/etl/tests/unit/test_spark.py +++ /dev/null @@ -1,17 +0,0 @@ -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC - - -def test_table_spec_consistency(): - table_spec_config_groups = { - "LOAD_QUERY_TABLE_SPEC": LOAD_QUERY_TABLE_SPEC, - "LOAD_TABLE_TABLE_SPEC": LOAD_TABLE_TABLE_SPEC, - } - for table_spec_group_name, table_spec_config_group in 
table_spec_config_groups.items(): - unioned_table_spec_keys = set() - for table_name, config in table_spec_config_group.items(): - unioned_table_spec_keys = unioned_table_spec_keys.union(set(list(config.keys()))) - for table_name, config in table_spec_config_group.items(): - diff = unioned_table_spec_keys - set(list(config.keys())) - if diff: - raise Exception(f"{table_name} is missing the following {table_spec_group_name} values: {diff}")