3131def load_california_hcd_data (
3232 data_path : Path ,
3333) -> tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
34- df = pd .read_csv (data_path / "data/apr/table-a2-combined .csv.gz" )
34+ df = pd .read_csv (data_path / "data/apr/tablea2 .csv.gz" )
3535
3636 # BPS doesn't include mobile homes, so we shouldn't include them here either
37- df = df [df ["UNIT_CAT_DESC" ] != "Mobile Home Unit" ].copy ()
37+ df = df [df ["UNIT_CAT" ] != "MH" ].copy ()
38+
39+ # Has some values that are not numbers (e.g. "2020-08-02")
40+ df ["BP_ABOVE_MOD_INCOME" ] = pd .to_numeric (
41+ df ["BP_ABOVE_MOD_INCOME" ], errors = "coerce"
42+ )
3843
3944 df ["units" ] = df [BUILDING_PERMIT_COLUMNS ].sum (axis = "columns" , numeric_only = True )
4045
@@ -51,19 +56,16 @@ def load_california_hcd_data(
5156
5257 df ["building_type" ] = np .select (
5358 [
54- df ["UNIT_CAT_DESC" ] == "Accessory Dwelling Unit" ,
55- df ["UNIT_CAT_DESC" ].isin (
56- ["Single-Family Detached Unit" , "Single-Family Attached Unit" ]
57- ),
58- (df ["UNIT_CAT_DESC" ] == "2-, 3-, and 4-Plex Units per Structure" )
59- & df ["units" ].isin ([1 , 2 ]),
59+ df ["UNIT_CAT" ] == "ADU" ,
60+ df ["UNIT_CAT" ].isin (["SFD" , "SFA" ]),
61+ (df ["UNIT_CAT" ] == "2 to 4" ) & df ["units" ].isin ([1 , 2 ]),
6062 # If there are 3, 4, or more units in the project, assume it's 3 or 4.
6163 # TBH my prior is that 2-plexes are way more common than 3- or 4-plexes.
6264 # But for simplicity let's just put them in 3-to-4.
6365 # From 2018 to 2022, there are only ~2400 units worth of 2/3/4 unit projects
6466 # with >4 units in the project. So misclassifying these is not a big deal.
65- df ["UNIT_CAT_DESC " ] == "2-, 3-, and 4-Plex Units per Structure " ,
66- df ["UNIT_CAT_DESC " ] == "5 or More Units Per Structure " ,
67+ df ["UNIT_CAT " ] == "2 to 4 " ,
68+ df ["UNIT_CAT " ] == "5+ " ,
6769 ],
6870 [
6971 "adu" ,
@@ -96,7 +98,7 @@ def _aggregate_to_geography(
9698 df ["bldgs" ] = 1
9799
98100 if level == "place" :
99- index_cols = ["JURS_NAME " , "CNTY_NAME" , "year" ]
101+ index_cols = ["JURIS_NAME " , "CNTY_NAME" , "year" ]
100102 elif level == "county" :
101103 index_cols = ["CNTY_NAME" , "year" ]
102104 elif level == "state" :
@@ -119,25 +121,25 @@ def _aggregate_to_geography(
119121
120122 if level == "place" :
121123 # Confirm that we can drop county because in California, a city can't span multiple counties
122- assert (wide_df [["JURS_NAME " , "year" ]].value_counts () == 1 ).all ()
124+ assert (wide_df [["JURIS_NAME " , "year" ]].value_counts () == 1 ).all ()
123125 wide_df = wide_df .drop (columns = ["CNTY_NAME" ])
124126 if level == "place" :
125127 old_wide_df = wide_df
126128 # Add place_or_county_code
127129 wide_df = wide_df .merge (
128- _load_fips_crosswalk (data_path ), left_on = "JURS_NAME " , right_on = "name"
130+ _load_fips_crosswalk (data_path ), left_on = "JURIS_NAME " , right_on = "name"
129131 ).drop (columns = ["name" , "county_code" ])
130132 if len (old_wide_df ) != len (wide_df ):
131- dropped_cities = set (old_wide_df ["JURS_NAME " ]) - set (wide_df ["JURS_NAME " ])
132- added_cities = set (wide_df ["JURS_NAME " ]) - set (old_wide_df ["JURS_NAME " ])
133+ dropped_cities = set (old_wide_df ["JURIS_NAME " ]) - set (wide_df ["JURIS_NAME " ])
134+ added_cities = set (wide_df ["JURIS_NAME " ]) - set (old_wide_df ["JURIS_NAME " ])
133135 raise ValueError (
134136 f"wide_df had { len (old_wide_df )} rows before merge and { len (wide_df )} rows after merge. "
135137 f"{ dropped_cities = } { added_cities = } "
136138 )
137139 elif level == "county" :
138140 # Add county_code
139141 old_rows = len (wide_df )
140- wide_df ["name" ] = wide_df ["CNTY_NAME" ] + " COUNTY"
142+ wide_df ["name" ] = wide_df ["CNTY_NAME" ]. str . upper () + " COUNTY"
141143 wide_df = wide_df .merge (_load_fips_crosswalk (data_path ), on = "name" ).drop (
142144 columns = ["CNTY_NAME" , "name" , "place_or_county_code" ]
143145 )
0 commit comments