Skip to content

Commit da31259

Browse files
committed
DD-124 Montezuma salinity gates active and archived parsed
1 parent addf66f commit da31259

File tree

1 file changed

+41
-1
lines changed

1 file changed

+41
-1
lines changed

dms_datastore/download_montezuma_gates.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
11
import requests
22
import pandas as pd
3-
from . import store_utils as utils
43
import datetime
54
import os
65
import click
6+
import tabula
7+
from . import store_utils as utils
78

89

910
@click.command()
@click.option(
    "--base-dir",
    default="data/raw/montezuma_gate_log",
    show_default=True,
    help="Directory passed to both download steps as their base directory.",
)
def main(base_dir="data/raw/montezuma_gate_log"):
    """Download and parse the Montezuma Slough salinity control gate logs.

    Runs the active gate log step first and the archived PDF log step
    second, handing the same base directory to both. Click also shows this
    text as the command's --help description.
    """
    download_and_parse_active_gate_log(base_dir)
    download_and_parse_archived_pdf(base_dir)
15+
16+
17+
def download_and_parse_active_gate_log(base_dir="data/raw/montezuma_gate"):
1218
"""
1319
Download the Montezuma Slough Salinity Control Gates log from the California Natural Resources Agency
1420
# https://data.cnra.ca.gov/dataset/suisun-marsh-salinity-control-gates-log/resource/265729e9-4ac0-469e-828b-2564ac077689
@@ -28,3 +34,37 @@ def main(base_dir="data/raw/montezuma_gate_log"):
2834
conv_dir = os.path.dirname(xlsfname).replace("/raw/", "/converted/")
2935
utils.ensure_dir(conv_dir)
3036
df.to_csv(os.path.join(conv_dir, fname.split(".")[0] + ".csv"))
37+
38+
39+
def download_and_parse_archived_pdf(base_dir="data/raw/montezuma_gate_log"):
    """
    Download and parse the archived PDF log of the Montezuma Slough Salinity Control Gates.

    The PDF is saved as ``histsmscgopnew.pdf`` inside ``base_dir``. Every table
    that tabula extracts is indexed by its ``DATE`` column (parsed as
    datetimes), the tables are concatenated and sorted by date, and the result
    is written as ``histsmscgopnew.csv`` into the directory obtained by
    replacing ``/raw/`` with ``/converted/`` in ``base_dir``.

    Parameters
    ----------
    base_dir : str
        Directory that receives the raw PDF. It is passed to
        ``utils.ensure_dir`` before the file is written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If no table could be extracted from the PDF.
    KeyError
        If an extracted table has no ``DATE`` column.
    """
    utils.ensure_dir(base_dir)
    url = "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/resource/7b3ab962-202b-43c2-9ac7-08f2303b153b/download/histsmscgopnew.pdf"

    # A timeout keeps a stalled connection from hanging the job forever, and
    # raise_for_status() (unlike assert) still runs under `python -O`.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    pdf_fname = os.path.join(base_dir, "histsmscgopnew.pdf")
    with open(pdf_fname, "wb") as fh:
        fh.write(response.content)

    # Parse the PDF using tabula-py.
    # NOTE(review): the explicit encoding was marked "for windows maybe?" by
    # the original author -- confirm whether it is still required.
    tables = tabula.read_pdf(
        pdf_fname, pages="all", multiple_tables=True, encoding="ISO-8859-1"
    )
    if not tables:
        raise ValueError(f"No tables could be extracted from {pdf_fname}")

    # Index each table by date. errors="coerce" turns unparseable DATE values
    # into NaT instead of raising; those rows are kept and sort to the end.
    indexed = [
        table.assign(DATE=pd.to_datetime(table["DATE"], errors="coerce")).set_index(
            "DATE"
        )
        for table in tables
    ]

    # Combine all tables into a single DataFrame.
    df = pd.concat(indexed).sort_index()

    # Save the DataFrame to CSV. The replace only redirects the output when
    # the path contains "/raw/" written with forward slashes; otherwise the
    # CSV lands next to the PDF.
    conv_dir = os.path.dirname(pdf_fname).replace("/raw/", "/converted/")
    utils.ensure_dir(conv_dir)
    df.to_csv(os.path.join(conv_dir, "histsmscgopnew.csv"), index=True)
67+
68+
69+
if __name__ == "__main__":
    # Script entry point: hand control to the click command.
    main()

0 commit comments

Comments
 (0)