|
| 1 | +import requests |
| 2 | +import pandas as pd |
| 3 | +import datetime |
| 4 | +import os |
| 5 | +import click |
| 6 | +import tabula |
| 7 | +from . import store_utils as utils |
| 8 | +from vtools import ts_splice |
| 9 | +from csv import QUOTE_NONNUMERIC, QUOTE_MINIMAL |
| 10 | + |
| 11 | + |
@click.command()
@click.option("--base-dir", default="smscg")
@click.option("--outfile", default="dms_smscg_gate.csv")
def main(base_dir="smscg", outfile="dms_smscg_gate.csv"):
    """Build the reconciled SMSCG gate-operation record.

    Downloads the archived PDF log and the actively maintained XLSX log into
    ``<base_dir>/raw``, reconciles them (the newer log wins on overlap), and
    writes the merged series to ``<base_dir>/converted/<outfile>``.

    Parameters
    ----------
    base_dir : str
        Working directory holding the ``raw`` and ``converted`` subdirs.
        Click always supplies this; the signature default mirrors the option
        default so a direct call behaves the same way.
    outfile : str
        File name (not path) of the CSV written under ``converted``.
    """
    raw_dir = os.path.join(base_dir, "raw")
    convert_dir = os.path.join(base_dir, "converted")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(convert_dir, exist_ok=True)
    df_archive = download_and_parse_archived_pdf(raw_dir)
    df_active = download_and_parse_active_gate_log(raw_dir)
    df_final = reconcile_archive_with_new(df_archive, df_active)
    # QUOTE_MINIMAL quotes only fields that require it (e.g. free-text
    # remarks containing commas); per-column quoting was considered but
    # never adopted, so the helper for it has been dropped.
    out_path = os.path.join(convert_dir, outfile)
    df_final.to_csv(out_path, index=True, quoting=QUOTE_MINIMAL)
| 27 | + |
| 28 | + |
| 29 | + |
def reconcile_archive_with_new(df_archive, df_new):
    """Merge the archived (PDF) and active (XLSX) gate logs into one series.

    Parameters
    ----------
    df_archive, df_new : pandas.DataFrame
        Datetime-indexed gate logs. Neither input is modified; where the two
        records overlap in time, ``df_new`` takes precedence.

    Returns
    -------
    pandas.DataFrame
        Time-sorted frame with a unique ``datetime`` index, a fixed column
        set, and gate-state codes normalized to Open/Closed/Tidal.
    """
    # Copy so the renames below do not mutate the caller's frames.
    df_archive = df_archive.copy()
    df_new = df_new.copy()
    df_archive.index.name = "datetime"
    df_new.index.name = "datetime"  # should already hold, modulo case convention

    # Normalize header style: lower-case with underscores instead of spaces.
    df_archive.columns = [c.lower().replace(" ", "_") for c in df_archive.columns]
    df_new.columns = [c.lower().replace(" ", "_") for c in df_new.columns]
    final_columns = ["flashboards", "gate_1", "gate_2", "gate_3",
                     "action", "remarks", "user_remarks"]
    df_archive = df_archive.reindex(columns=final_columns)
    df_new = df_new.reindex(columns=final_columns)

    # Splice the two records, preferring the newer log wherever they overlap.
    df_final = ts_splice((df_archive, df_new), transition="prefer_last")

    # Historical spellings of gate state collapsed onto three labels;
    # anything unrecognized is left as-is via the fillna fallback.
    mapping = {
        "O": "Open",
        "OPEN": "Open",
        "C": "Closed",
        "CLSD": "Closed",
        "CLOSED": "Closed",
        "OP": "Tidal",
        "M-OP": "Tidal",
        "TIDAL": "Tidal",
    }
    for col in ("gate_1", "gate_2", "gate_3"):
        normalized = df_final[col].astype("string").str.strip().str.upper()
        df_final.loc[:, col] = normalized.map(mapping).fillna(df_final[col])

    # Ensure the time index is unique (keep the later record) and sorted.
    df_final = df_final[~df_final.index.duplicated(keep="last")]
    return df_final.sort_index()
| 61 | + |
def download_and_parse_active_gate_log(raw_dir="raw"):
    """
    Download the current Suisun Marsh Salinity Control Gates log (XLSX) from
    the California Natural Resources Agency open-data portal.
    # https://data.cnra.ca.gov/dataset/suisun-marsh-salinity-control-gates-log/resource/265729e9-4ac0-469e-828b-2564ac077689

    Parameters
    ----------
    raw_dir : str
        Directory where the downloaded spreadsheet is saved.

    Returns
    -------
    pandas.DataFrame
        Log indexed by its first column (parsed as dates), sorted by time.

    Raises
    ------
    requests.HTTPError
        If the download does not succeed.
    """
    utils.ensure_dir(raw_dir)
    url = "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/resource/265729e9-4ac0-469e-828b-2564ac077689/download/smscg-log.xlsx"
    response = requests.get(url)
    # raise_for_status() instead of assert: asserts are stripped under -O.
    response.raise_for_status()
    fname = url.split("/")[-1]
    xlsfname = os.path.join(raw_dir, fname.split(".")[0] + ".xlsx")
    with open(xlsfname, "wb") as fh:
        fh.write(response.content)
    df = pd.read_excel(xlsfname, parse_dates=True, index_col=0)
    # NOTE(review): the old raw->converted path rewrite used "/raw/" with
    # trailing separators and could never match os.path.dirname output, so it
    # was a no-op; nothing is written to a converted dir here.
    return df.sort_index()
| 83 | + |
| 84 | + |
| 85 | + |
def download_and_parse_archived_pdf(base_dir="raw"):
    """
    Download and parse the archived PDF log of the Suisun Marsh Salinity
    Control Gates.

    Parameters
    ----------
    base_dir : str
        Directory where the downloaded PDF is saved.

    Returns
    -------
    pandas.DataFrame
        All tables extracted from the PDF, concatenated, indexed by the
        parsed ``DATE`` column (unparseable dates become NaT) and sorted.

    Raises
    ------
    requests.HTTPError
        If the download does not succeed.
    """
    utils.ensure_dir(base_dir)
    url = "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/resource/7b3ab962-202b-43c2-9ac7-08f2303b153b/download/histsmscgopnew.pdf"
    response = requests.get(url)
    # raise_for_status() instead of assert: asserts are stripped under -O.
    response.raise_for_status()
    pdf_fname = os.path.join(base_dir, "histsmscgopnew.pdf")
    with open(pdf_fname, "wb") as fh:
        fh.write(response.content)

    # tabula-py yields one DataFrame per table it detects in the PDF.
    tables = tabula.read_pdf(
        pdf_fname, pages="all", multiple_tables=True, encoding="ISO-8859-1"
    )  # encoding chosen for Windows-produced PDFs, presumably — verify

    # Re-index every table on its DATE column, then combine into one frame.
    for i, tbl in enumerate(tables):
        tbl["DATE"] = pd.to_datetime(tbl["DATE"], errors="coerce")
        tables[i] = tbl.set_index("DATE")
    df = pd.concat(tables)
    # NOTE(review): the old raw->converted path rewrite used "/raw/" with
    # trailing separators and could never match os.path.dirname output, so it
    # was a no-op; nothing is written to a converted dir here.
    return df.sort_index()
| 115 | + |
| 116 | +if __name__ == "__main__": |
| 117 | + main() |
0 commit comments