11import requests
22import pandas as pd
3- from . import store_utils as utils
43import datetime
54import os
65import click
6+ import tabula
7+ from . import store_utils as utils
78
89
@click.command()
@click.option("--base-dir", default="data/raw/montezuma_gate_log")
def main(base_dir="data/raw/montezuma_gate_log"):
    """
    Command-line entry point: fetch and convert the Montezuma gate logs.

    Runs the active gate-log step first, then the archived PDF step, handing
    the same ``base_dir`` to each.
    """
    # Order matters only in that it mirrors the sequence of the two steps.
    steps = (
        download_and_parse_active_gate_log,
        download_and_parse_archived_pdf,
    )
    for run_step in steps:
        run_step(base_dir)
15+
16+
17+ def download_and_parse_active_gate_log (base_dir = "data/raw/montezuma_gate" ):
1218 """
1319 Download the Montezuma Slough Salinity Control Gates log from the California Natural Resources Agency
1420 # https://data.cnra.ca.gov/dataset/suisun-marsh-salinity-control-gates-log/resource/265729e9-4ac0-469e-828b-2564ac077689
@@ -28,3 +34,37 @@ def main(base_dir="data/raw/montezuma_gate_log"):
2834 conv_dir = os .path .dirname (xlsfname ).replace ("/raw/" , "/converted/" )
2935 utils .ensure_dir (conv_dir )
3036 df .to_csv (os .path .join (conv_dir , fname .split ("." )[0 ] + ".csv" ))
37+
38+
def download_and_parse_archived_pdf(base_dir="data/raw/montezuma_gate_log"):
    """
    Download and parse the archived PDF log of the Montezuma Slough Salinity Control Gates.

    The PDF is saved as ``histsmscgopnew.pdf`` inside ``base_dir``. Every table
    that tabula extracts from it is indexed by its ``DATE`` column (values that
    cannot be parsed as dates become ``NaT``), the tables are concatenated and
    sorted by that index, and the result is written as ``histsmscgopnew.csv``
    into the directory obtained by replacing ``/raw/`` with ``/converted/`` in
    the PDF's directory.

    Parameters
    ----------
    base_dir : str
        Directory that receives the downloaded PDF. It is passed to
        ``utils.ensure_dir`` first (presumably creating it if missing).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    ValueError
        If tabula extracts no tables from the PDF.
    KeyError
        If an extracted table has no ``DATE`` column.
    """
    utils.ensure_dir(base_dir)
    url = "https://data.cnra.ca.gov/dataset/e76622ca-b6e9-4e78-a08e-deb9580d49b3/resource/7b3ab962-202b-43c2-9ac7-08f2303b153b/download/histsmscgopnew.pdf"
    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(url, timeout=60)
    # Explicit status check: unlike ``assert`` it is not stripped under ``-O``.
    response.raise_for_status()
    pdf_fname = os.path.join(base_dir, "histsmscgopnew.pdf")
    with open(pdf_fname, "wb") as fh:
        fh.write(response.content)

    # Parse the PDF using tabula-py.
    # NOTE(review): the explicit encoding was carried over from the original
    # code, which suggested it may be needed on Windows -- confirm.
    tables = tabula.read_pdf(
        pdf_fname, pages="all", multiple_tables=True, encoding="ISO-8859-1"
    )
    if not tables:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No tables could be extracted from {pdf_fname}")

    # Combine all tables into a single DataFrame indexed and sorted by date.
    indexed_tables = [
        table.assign(
            DATE=pd.to_datetime(table["DATE"], errors="coerce")
        ).set_index("DATE")
        for table in tables
    ]
    df = pd.concat(indexed_tables).sort_index()

    # Save the DataFrame to CSV
    conv_dir = os.path.dirname(pdf_fname).replace("/raw/", "/converted/")
    utils.ensure_dir(conv_dir)
    df.to_csv(os.path.join(conv_dir, "histsmscgopnew.csv"), index=True)
67+
68+
69+ if __name__ == "__main__" :
70+ main ()
0 commit comments