|
8 | 8 | import re |
9 | 9 | from datetime import datetime |
10 | 10 |
|
11 | | -from brus_backend_common.models import DEFCBronze, DEFCSilver, DEFCGroup |
| 11 | +from brus_backend_common.models import DEFCBronze, DEFCGroup, DEFCGold |
12 | 12 | from brus_backend_common.models.lakehouse_model import update_external_data_load_date |
13 | 13 | from brus_backend_common.helpers.aws import _get_boto3 |
14 | | -from brus_backend_common.helpers.spark import SparkScriptSession |
15 | 14 | from brus_backend_common.helpers.pandas import check_dataframe_diff |
16 | 15 | from brus_backend_common.helpers.scripts import ( |
17 | 16 | clean_data, |
@@ -203,107 +202,106 @@ def main(local_file: str | None = None, force_reload: bool = False, metrics_json |
203 | 202 |
|
204 | 203 | s3 = _get_boto3("client", "s3") |
205 | 204 |
|
206 | | - with SparkScriptSession() as spark: |
207 | | - raw_model = DEFCBronze() |
208 | | - if not raw_model.exists(): |
209 | | - raise ValueError(f"{raw_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
210 | | - |
211 | | - group_model = DEFCGroup() |
212 | | - if not group_model.exists(): |
213 | | - raise ValueError(f"{group_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
214 | | - |
215 | | - int_model = DEFCSilver(spark=spark) |
216 | | - if not int_model.exists(): |
217 | | - raise ValueError(f"{int_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
218 | | - |
219 | | - start_time = datetime.now() |
220 | | - metrics_json["start_time"] = str(start_time) |
221 | | - |
222 | | - logger.info("Parsing DEFC data") |
223 | | - try: |
224 | | - if not local_file: |
225 | | - raw_data = raw_model.to_pandas_df(dtype=str, na_filter=False) |
226 | | - else: |
227 | | - raw_data = pd.read_csv(local_file, dtype=str, na_filter=False) |
228 | | - except pd.errors.EmptyDataError: |
229 | | - metrics_json["blank_file"] = True |
230 | | - metrics_json["exit_code"] = 4 # exit code chosen arbitrarily, to indicate distinct failure states |
231 | | - return metrics_json |
232 | | - headers = set([header.upper() for header in list(raw_data)]) |
233 | | - |
234 | | - if not VALID_HEADERS.issubset(headers): |
235 | | - logger.error("Missing required headers. Required headers include: %s" % str(VALID_HEADERS)) |
236 | | - metrics_json["exit_code"] = 4 |
237 | | - return metrics_json |
238 | | - metrics_json["records_received"] = len(raw_data) |
239 | | - # Creating a dataframe of the export csv first and then copying columns to match the database |
240 | | - raw_data = raw_data.rename(columns={"DEFC_CODE": "DEFC", "DEFC_TITLE": "Public Law"}) |
241 | | - |
242 | | - group_model_df = group_model.to_pandas_df() |
243 | | - group_mapping = group_model_df.groupby("group")["code"].agg(list).to_dict() |
244 | | - |
245 | | - raw_data = apply_defc_derivations(raw_data, group_mapping) |
246 | | - |
247 | | - raw_data = add_defc_outliers(raw_data, group_mapping) |
248 | | - |
249 | | - # Clear any lingering np.nan's |
250 | | - raw_data = raw_data.replace({np.nan: None}) |
251 | | - |
252 | | - logger.info("Checking for differences in DEFC data") |
253 | | - defc_mapping = { |
254 | | - "defc": "code", |
255 | | - "public_law": "public_laws", |
256 | | - "public_law_short_title": "public_law_short_titles", |
257 | | - "group_name": "group", |
258 | | - "urls": "urls", |
259 | | - "is_valid": "is_valid", |
260 | | - "earliest_public_law_enactment_date": "earliest_pl_action_date", |
261 | | - } |
262 | | - data = clean_data(raw_data, defc_mapping, {}) |
263 | | - diff_found = check_dataframe_diff(data, int_model.to_pandas_df(), ["defc_id"], ["code"], date_format="%Y-%m-%d") |
264 | | - if force_reload or diff_found: |
265 | | - |
266 | | - # The only diff should be whenever a new code is added. Noting it here |
267 | | - if diff_found: |
268 | | - incoming_defcs = list(data["code"]) |
269 | | - curr_defcs = list(int_model.to_pandas_df()["code"]) |
270 | | - diff_defcs = list(set(incoming_defcs) - set(curr_defcs)) |
271 | | - metrics_json["new_defc"] = diff_defcs |
272 | | - logger.info(f"Difference found: {diff_defcs}") |
273 | | - |
274 | | - logger.info("Overwriting new DEFC data to Broker") |
275 | | - int_model.save(data) |
276 | | - |
277 | | - update_external_data_load_date(int_model, start_time, datetime.now()) |
278 | | - logger.info("{} records inserted to DEFC".format(len(data))) |
279 | | - |
280 | | - # convert the arrays to pipe-delimited strings |
281 | | - defc_delim = "|" |
282 | | - array_cols = ["Public Law", "Public Law Short Title", "URLs"] |
283 | | - for array_col in array_cols: |
284 | | - raw_data[array_col] = raw_data[array_col].apply(lambda value: defc_delim.join(value)) |
285 | | - |
286 | | - header_order = [ |
287 | | - "DEFC", |
288 | | - "Public Law", |
289 | | - "Public Law Short Title", |
290 | | - "Group Name", |
291 | | - "URLs", |
292 | | - "Is Valid", |
293 | | - "Earliest Public Law Enactment Date", |
294 | | - ] |
295 | | - raw_data = raw_data[header_order] |
296 | | - export_name = "def_codes.csv" |
297 | | - logger.info("Exporting loaded DEFC file to {}".format(export_name)) |
298 | | - raw_data.to_csv(export_name, index=0) |
299 | | - |
300 | | - s3.upload_file(export_name, CONFIG.PUBLIC_FILES_BUCKET, export_name) |
301 | | - |
302 | | - os.remove(export_name) |
| 205 | + raw_model = DEFCBronze() |
| 206 | + if not raw_model.exists(): |
| 207 | + raise ValueError(f"{raw_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
| 208 | + |
| 209 | + group_model = DEFCGroup() |
| 210 | + if not group_model.exists(): |
| 211 | + raise ValueError(f"{group_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
| 212 | + |
| 213 | + gold_model = DEFCGold() |
| 214 | + if not gold_model.exists(): |
| 215 | + raise ValueError(f"{gold_model.TABLE_REF} doesn't exist. Use create_migrate_delta_table beforehand.") |
| 216 | + |
| 217 | + start_time = datetime.now() |
| 218 | + metrics_json["start_time"] = str(start_time) |
| 219 | + |
| 220 | + logger.info("Parsing DEFC data") |
| 221 | + try: |
| 222 | + if not local_file: |
| 223 | + raw_data = raw_model.to_pandas_df(dtype=str, na_filter=False) |
303 | 224 | else: |
304 | | - logger.info("No differences found, skipping defc table reload.") |
| 225 | + raw_data = pd.read_csv(local_file, dtype=str, na_filter=False) |
| 226 | + except pd.errors.EmptyDataError: |
| 227 | + metrics_json["blank_file"] = True |
| 228 | + metrics_json["exit_code"] = 4 # exit code chosen arbitrarily, to indicate distinct failure states |
| 229 | + return metrics_json |
| 230 | + headers = set([header.upper() for header in list(raw_data)]) |
| 231 | + |
| 232 | + if not VALID_HEADERS.issubset(headers): |
| 233 | + logger.error("Missing required headers. Required headers include: %s" % str(VALID_HEADERS)) |
| 234 | + metrics_json["exit_code"] = 4 |
| 235 | + return metrics_json |
| 236 | + metrics_json["records_received"] = len(raw_data) |
| 237 | + # Creating a dataframe of the export csv first and then copying columns to match the database |
| 238 | + raw_data = raw_data.rename(columns={"DEFC_CODE": "DEFC", "DEFC_TITLE": "Public Law"}) |
| 239 | + |
| 240 | + group_model_df = group_model.to_pandas_df() |
| 241 | + group_mapping = group_model_df.groupby("group")["code"].agg(list).to_dict() |
| 242 | + |
| 243 | + raw_data = apply_defc_derivations(raw_data, group_mapping) |
| 244 | + |
| 245 | + raw_data = add_defc_outliers(raw_data, group_mapping) |
| 246 | + |
| 247 | + # Clear any lingering np.nan's |
| 248 | + raw_data = raw_data.replace({np.nan: None}) |
| 249 | + |
| 250 | + logger.info("Checking for differences in DEFC data") |
| 251 | + defc_mapping = { |
| 252 | + "defc": "code", |
| 253 | + "public_law": "public_laws", |
| 254 | + "public_law_short_title": "public_law_short_titles", |
| 255 | + "group_name": "group", |
| 256 | + "urls": "urls", |
| 257 | + "is_valid": "is_valid", |
| 258 | + "earliest_public_law_enactment_date": "earliest_pl_action_date", |
| 259 | + } |
| 260 | + data = clean_data(raw_data, defc_mapping, {}) |
| 261 | + diff_found = check_dataframe_diff(data, gold_model.to_pandas_df(), ["defc_id"], ["code"], date_format="%Y-%m-%d") |
| 262 | + if force_reload or diff_found: |
| 263 | + |
| 264 | + # The only diff should be whenever a new code is added. Noting it here |
| 265 | + if diff_found: |
| 266 | + incoming_defcs = list(data["code"]) |
| 267 | + curr_defcs = list(gold_model.to_pandas_df()["code"]) |
| 268 | + diff_defcs = list(set(incoming_defcs) - set(curr_defcs)) |
| 269 | + metrics_json["new_defc"] = diff_defcs |
| 270 | + logger.info(f"Difference found: {diff_defcs}") |
| 271 | + |
| 272 | + logger.info("Overwriting new DEFC data to Broker") |
| 273 | + gold_model.save(data) |
| 274 | + |
| 275 | + update_external_data_load_date(gold_model, start_time, datetime.now()) |
| 276 | + logger.info("{} records inserted to DEFC".format(len(data))) |
| 277 | + |
| 278 | + # convert the arrays to pipe-delimited strings |
| 279 | + defc_delim = "|" |
| 280 | + array_cols = ["Public Law", "Public Law Short Title", "URLs"] |
| 281 | + for array_col in array_cols: |
| 282 | + raw_data[array_col] = raw_data[array_col].apply(lambda value: defc_delim.join(value)) |
| 283 | + |
| 284 | + header_order = [ |
| 285 | + "DEFC", |
| 286 | + "Public Law", |
| 287 | + "Public Law Short Title", |
| 288 | + "Group Name", |
| 289 | + "URLs", |
| 290 | + "Is Valid", |
| 291 | + "Earliest Public Law Enactment Date", |
| 292 | + ] |
| 293 | + raw_data = raw_data[header_order] |
| 294 | + export_name = "def_codes.csv" |
| 295 | + logger.info("Exporting loaded DEFC file to {}".format(export_name)) |
| 296 | + raw_data.to_csv(export_name, index=0) |
| 297 | + |
| 298 | + s3.upload_file(export_name, CONFIG.PUBLIC_FILES_BUCKET, export_name) |
| 299 | + |
| 300 | + os.remove(export_name) |
| 301 | + else: |
| 302 | + logger.info("No differences found, skipping defc table reload.") |
305 | 303 |
|
306 | | - total_defc_count = int_model.count() |
| 304 | + total_defc_count = gold_model.count() |
307 | 305 |
|
308 | 306 | metrics_json["total_defc_count"] = total_defc_count |
309 | 307 |
|
|
0 commit comments