Skip to content

Commit 47b258a

Browse files
eli@water.ca.gov authored and committed
Added download_smscg.py as modification of 'montezuma' script
1 parent 7b44838 commit 47b258a

File tree

3 files changed

+120
-1
lines changed

3 files changed

+120
-1
lines changed

dms_datastore/download_smscg.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import requests
2+
import pandas as pd
3+
import datetime
4+
import os
5+
import click
6+
import tabula
7+
from . import store_utils as utils
8+
from vtools import ts_splice
9+
from csv import QUOTE_NONNUMERIC, QUOTE_MINIMAL
10+
11+
12+
@click.command()
@click.option("--base-dir", default="smscg", help="Working directory that will hold raw/ and converted/ subdirectories.")
@click.option("--outfile", default="dms_smscg_gate.csv", help="File name for the merged output CSV.")
def main(base_dir=".", outfile="dms_smscg_gate.csv"):
    """Download, reconcile, and export the SMSCG gate operation log.

    Fetches the archived PDF log and the active Excel log, splices them
    together (newer records preferred where they overlap), and writes the
    merged series to ``<base_dir>/converted/<outfile>``.

    Parameters
    ----------
    base_dir : str
        Base working directory; raw downloads go under ``base_dir/raw``
        and the final CSV under ``base_dir/converted``.
    outfile : str
        Name of the final merged CSV file.
    """
    # NOTE(review): the click default is "smscg" but the Python default is
    # "." — confirm which is intended for direct (non-CLI) callers.
    raw_dir = os.path.join(base_dir, "raw")
    convert_dir = os.path.join(base_dir, "converted")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(convert_dir, exist_ok=True)
    df0 = download_and_parse_archived_pdf(raw_dir)
    df1 = download_and_parse_active_gate_log(raw_dir)
    df_final = reconcile_archive_with_new(df0, df1)
    outfile = os.path.join(convert_dir, outfile)
    # QUOTE_MINIMAL quotes only fields containing delimiters, keeping the
    # free-text remarks columns readable.
    df_final.to_csv(outfile, index=True, quoting=QUOTE_MINIMAL)
27+
28+
29+
30+
def reconcile_archive_with_new(df_archive, df_new):
    """Merge the archived PDF log with the newer Excel log into one series.

    Both frames are normalized to a common column schema and a shared
    index name, spliced so that newer data wins where the two sources
    overlap, and the per-gate status codes are translated to canonical
    labels. The result has a sorted index with unique timestamps.

    Parameters
    ----------
    df_archive : pandas.DataFrame
        Historical log parsed from the archived PDF.
    df_new : pandas.DataFrame
        Current log parsed from the active Excel file.

    Returns
    -------
    pandas.DataFrame
        Spliced, de-duplicated, time-sorted gate log.
    """
    df_archive.index.name = "datetime"
    df_new.index.name = "datetime"  # should already hold, modulo case convention

    # Normalize column labels: lower-case with underscores instead of spaces.
    df_archive.columns = [c.lower().replace(" ", "_") for c in df_archive.columns]
    df_new.columns = [c.lower().replace(" ", "_") for c in df_new.columns]

    schema = ["flashboards", "gate_1", "gate_2", "gate_3", "action", "remarks", "user_remarks"]
    df_archive = df_archive.reindex(columns=schema)
    df_new = df_new.reindex(columns=schema)
    merged = ts_splice((df_archive, df_new), transition="prefer_last")

    # Canonical spellings for the raw gate-status codes.
    canonical = {
        "O": "Open",
        "OPEN": "Open",
        "C": "Closed",
        "CLSD": "Closed",
        "CLOSED": "Closed",
        "OP": "Tidal",
        "M-OP": "Tidal",
        "TIDAL": "Tidal",
    }
    for gate in ("gate_1", "gate_2", "gate_3"):
        codes = merged[gate].astype("string").str.strip().str.upper()
        # Unrecognized codes fall back to the original cell value.
        merged.loc[:, gate] = codes.map(canonical).fillna(merged[gate])

    # Keep timestamps unique (last record wins) and the index sorted.
    merged = merged[~merged.index.duplicated(keep="last")]
    return merged.sort_index()
61+
62+
def download_and_parse_active_gate_log(raw_dir="raw"):
    """Download the current SMSCG operation log (Excel) as a DataFrame.

    Source: California Natural Resources Agency open-data portal,
    https://data.cnra.ca.gov/dataset/suisun-marsh-salinity-control-gates-log/resource/265729e9-4ac0-469e-828b-2564ac077689

    Parameters
    ----------
    raw_dir : str
        Directory where the downloaded .xlsx file is cached.

    Returns
    -------
    pandas.DataFrame
        Log entries indexed by the first spreadsheet column (parsed as
        dates), sorted ascending.

    Raises
    ------
    requests.HTTPError
        If the download request fails.
    """
    utils.ensure_dir(raw_dir)
    url = (
        "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/"
        "resource/265729e9-4ac0-469e-828b-2564ac077689/download/smscg-log.xlsx"
    )
    response = requests.get(url)
    # Fail loudly on HTTP errors; a bare assert would be stripped under -O.
    response.raise_for_status()
    fname = url.split("/")[-1]
    xlsfname = os.path.join(raw_dir, fname.split(".")[0] + ".xlsx")
    with open(xlsfname, "wb") as fh:
        fh.write(response.content)
    df = pd.read_excel(xlsfname, parse_dates=True, index_col=0)
    return df.sort_index()
83+
84+
85+
86+
def download_and_parse_archived_pdf(base_dir="raw"):
    """Download and parse the archived PDF log of the SMSCG operations.

    The historical log is distributed as a multi-page PDF table; each
    page is extracted with tabula-py and the per-page tables are
    concatenated into a single time-indexed DataFrame.

    Parameters
    ----------
    base_dir : str
        Directory where the downloaded PDF is cached.

    Returns
    -------
    pandas.DataFrame
        Historical log entries indexed by the DATE column (rows whose
        date fails to parse receive a NaT index), sorted ascending.

    Raises
    ------
    requests.HTTPError
        If the download request fails.
    """
    utils.ensure_dir(base_dir)
    url = (
        "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/"
        "resource/7b3ab962-202b-43c2-9ac7-08f2303b153b/download/histsmscgopnew.pdf"
    )
    response = requests.get(url)
    # Fail loudly on HTTP errors; a bare assert would be stripped under -O.
    response.raise_for_status()
    pdf_fname = os.path.join(base_dir, "histsmscgopnew.pdf")
    with open(pdf_fname, "wb") as fh:
        fh.write(response.content)

    # Parse every page; ISO-8859-1 avoids decode errors (Windows-origin PDFs).
    tables = tabula.read_pdf(
        pdf_fname, pages="all", multiple_tables=True, encoding="ISO-8859-1"
    )

    # Give each page-table a datetime index, then stack them into one frame.
    for i, tbl in enumerate(tables):
        tbl["DATE"] = pd.to_datetime(tbl["DATE"], errors="coerce")
        tables[i] = tbl.set_index("DATE")
    df = pd.concat(tables)
    return df.sort_index()
115+
116+
# Allow running the module directly as a script (mirrors the
# download_smscg console-script entry point declared in setup.py).
if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ dependencies = [
3737
"cfgrib",
3838
"diskcache",
3939
"pytest",
40-
"pytest-runner"
40+
"pytest-runner",
41+
"tabula-py"
4142
]
4243

4344
[project.optional-dependencies]

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
"download_cimis=dms_datastore.download_cimis:main",
7272
"download_dcc=dms_datastore.download_dcc:main",
7373
"download_montezuma_gates=dms_datastore.download_montezuma_gates:main",
74+
"download_smscg=dms_datastore.download_smscg:main",
7475
"compare_directories=dms_datastore.compare_directories:main",
7576
"populate_repo=dms_datastore.populate_repo:main",
7677
"station_info=dms_datastore.station_info:main",

0 commit comments

Comments
 (0)