Skip to content

Commit 36c6910

Browse files
Merge pull request #4626 from fedspendingtransparency/fix/dev-14824-handle-datetime-format-on-load
[DEV-14824] Downmerge into QAT
2 parents 60cb39d + 6f8d44f commit 36c6910

8 files changed

Lines changed: 19 additions & 20 deletions

File tree

usaspending_api/common/helpers/spark_helpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ def clean_postgres_sql_for_spark_sql(
708708
if identifier_replacements:
709709
for old, new in identifier_replacements.items():
710710
matches = re.finditer(
711-
rf"(\s+|^|\(|'){old}(\s+|$|\(|')",
711+
rf"(\s+|^|\(){old}(\s+|$|\()",
712712
spark_sql,
713713
flags=re.IGNORECASE | re.MULTILINE,
714714
)

usaspending_api/database_scripts/etl/award_delta_view.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ SELECT
2020
"award_amount",
2121
"total_subsidy_cost",
2222
"total_loan_value",
23-
CAST(TO_CHAR(update_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS update_date,
23+
"update_date",
2424

2525
"recipient_name",
2626
"recipient_unique_id",
@@ -34,7 +34,7 @@ SELECT
3434

3535
"action_date",
3636
"fiscal_year",
37-
CAST(TO_CHAR(last_modified_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS last_modified_date,
37+
"last_modified_date",
3838
"period_of_performance_start_date",
3939
"period_of_performance_current_end_date",
4040
"date_signed",

usaspending_api/database_scripts/etl/subaward_es_view.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ SELECT
100100
s.prime_award_group,
101101
s.prime_award_type,
102102
s.latest_transaction_id,
103-
CAST(TO_CHAR(s.last_modified_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS last_modified_date,
103+
s.last_modified_date,
104104
s.awarding_toptier_agency_name,
105105
s.awarding_toptier_agency_abbreviation,
106106
s.funding_toptier_agency_name,

usaspending_api/database_scripts/etl/transaction_delta_view.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@ SELECT
2323
END AS display_award_id,
2424
"action_date",
2525
"fiscal_action_date",
26-
CAST(TO_CHAR(last_modified_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS last_modified_date,
26+
"last_modified_date",
2727
"fiscal_year",
2828
"award_certified_date",
2929
"award_fiscal_year",
3030
"award_date_signed",
31-
CAST(TO_CHAR(update_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS update_date,
31+
"update_date",
3232
"award_update_date",
33-
CAST(TO_CHAR(etl_update_date, 'yyyy-MM-dd HH24:MI:SS') AS TIMESTAMP) AS etl_update_date,
33+
"etl_update_date",
3434
"period_of_performance_start_date",
3535
"period_of_performance_current_end_date",
3636
"ordering_period_end_date",

usaspending_api/etl/elasticsearch_loader_helpers/controller_for_spark.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,22 +77,13 @@ def ensure_view_exists(
7777
f"DROP VIEW IF EXISTS {sql_view_name};", ""
7878
)
7979

80-
# Replacements to supports converting timestamp to a formatted string in both Postgres and Spark
81-
datetime_replacements = {
82-
"TO_CHAR": "DATE_FORMAT",
83-
"yyyy-MM-dd HH24:MI:SS": "yyyy-MM-dd HH:mm:ss", # different formats to support 24-hour time
84-
}
85-
8680
identifier_replacements = {}
8781
if self.config["load_type"] == "transaction":
8882
identifier_replacements["transaction_search"] = "rpt.transaction_search"
89-
identifier_replacements.update(datetime_replacements)
9083
elif self.config["load_type"] == "award":
9184
identifier_replacements["award_search"] = "rpt.award_search"
92-
identifier_replacements.update(datetime_replacements)
9385
elif self.config["load_type"] == "subaward":
9486
identifier_replacements["toptier_agency"] = "global_temp.toptier_agency"
95-
identifier_replacements.update(datetime_replacements)
9687
elif self.config["load_type"] == "recipient":
9788
identifier_replacements = None
9889
else:

usaspending_api/etl/elasticsearch_loader_helpers/transform_data.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
convert_json_array_to_list_of_str,
1111
convert_json_data_to_dict,
1212
format_log,
13+
format_timestamp_as_string,
1314
)
1415

1516
logger = logging.getLogger("script")
@@ -19,7 +20,9 @@ def transform_award_data(worker: TaskSpec, records: List[dict]) -> List[dict]:
1920
replace_fields = {
2021
"spending_by_defc": convert_json_data_to_dict,
2122
"federal_accounts": convert_json_array_to_list_of_str,
23+
"last_modified_date": format_timestamp_as_string,
2224
"program_activities": convert_json_data_to_dict,
25+
"update_date": format_timestamp_as_string,
2326
}
2427
# TODO: Move some of the 1:1 agg_keys that match a field already on Elasticsearch
2528
insert_fields = {
@@ -64,8 +67,11 @@ def transform_award_data(worker: TaskSpec, records: List[dict]) -> List[dict]:
6467

6568
def transform_transaction_data(worker: TaskSpec, records: List[dict]) -> List[dict]:
6669
replace_fields = {
70+
"etl_update_date": format_timestamp_as_string,
6771
"federal_accounts": convert_json_array_to_list_of_str,
72+
"last_modified_date": format_timestamp_as_string,
6873
"program_activities": convert_json_data_to_dict,
74+
"update_date": format_timestamp_as_string,
6975
}
7076
# TODO: Move some of the 1:1 agg_keys that match a field already on Elasticsearch
7177
insert_fields = {
@@ -108,6 +114,7 @@ def transform_transaction_data(worker: TaskSpec, records: List[dict]) -> List[di
108114

109115
def transform_subaward_data(worker: TaskSpec, records: List[dict]) -> List[dict]:
110116
replace_fields = {
117+
"last_modified_date": format_timestamp_as_string,
111118
"program_activities": convert_json_data_to_dict,
112119
}
113120
insert_fields = {

usaspending_api/etl/elasticsearch_loader_helpers/utilities.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import re
44
from dataclasses import dataclass
5+
from datetime import datetime
56
from pathlib import Path
67
from random import choice
78
from typing import Any, Generator, List, Optional, Union
@@ -136,6 +137,10 @@ def convert_json_array_to_list_of_str(json_data: Union[list, str]) -> Optional[L
136137
return result
137138

138139

140+
def format_timestamp_as_string(value_to_format: datetime, datetime_format: str = "%Y-%m-%d %H:%M:%S") -> str:
141+
return datetime.strftime(value_to_format, datetime_format) if value_to_format else value_to_format
142+
143+
139144
def execute_sql_statement(cmd: str | Composed, results: bool = False, verbose: bool = False) -> Optional[List[dict]]:
140145
"""Simple function to execute SQL using a single-use psycopg connection"""
141146
rows = None

usaspending_api/etl/management/commands/elasticsearch_indexer.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,6 @@ def parse_cli_args(options: dict, es_client: Elasticsearch) -> dict: # noqa: PL
241241
logger.error(f"--start-datetime is too early. Set no earlier than {config['initial_datetime']}")
242242
raise SystemExit(1)
243243

244-
# Format to include timezone, but remove milliseconds to allow proper comparison with datetime values
245-
# that are captured on the documents
246-
config["starting_date"] = datetime.strftime(config["starting_date"], "%Y-%m-%d %H:%M:%S%z")
247-
248244
return config
249245

250246

0 commit comments

Comments
 (0)