Skip to content

Commit f135ffc

Browse files
committed
Update to 2024 APR dataset
1 parent 7b04cd0 commit f135ffc

1 file changed

Lines changed: 18 additions & 16 deletions

File tree

python/housing_data/california_hcd_data.py

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,15 @@
3131
def load_california_hcd_data(
3232
data_path: Path,
3333
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
34-
df = pd.read_csv(data_path / "data/apr/table-a2-combined.csv.gz")
34+
df = pd.read_csv(data_path / "data/apr/tablea2.csv.gz")
3535

3636
# BPS doesn't include mobile homes, so we shouldn't include them here either
37-
df = df[df["UNIT_CAT_DESC"] != "Mobile Home Unit"].copy()
37+
df = df[df["UNIT_CAT"] != "MH"].copy()
38+
39+
# Has some values that are not numbers (e.g. "2020-08-02")
40+
df["BP_ABOVE_MOD_INCOME"] = pd.to_numeric(
41+
df["BP_ABOVE_MOD_INCOME"], errors="coerce"
42+
)
3843

3944
df["units"] = df[BUILDING_PERMIT_COLUMNS].sum(axis="columns", numeric_only=True)
4045

@@ -51,19 +56,16 @@ def load_california_hcd_data(
5156

5257
df["building_type"] = np.select(
5358
[
54-
df["UNIT_CAT_DESC"] == "Accessory Dwelling Unit",
55-
df["UNIT_CAT_DESC"].isin(
56-
["Single-Family Detached Unit", "Single-Family Attached Unit"]
57-
),
58-
(df["UNIT_CAT_DESC"] == "2-, 3-, and 4-Plex Units per Structure")
59-
& df["units"].isin([1, 2]),
59+
df["UNIT_CAT"] == "ADU",
60+
df["UNIT_CAT"].isin(["SFD", "SFA"]),
61+
(df["UNIT_CAT"] == "2 to 4") & df["units"].isin([1, 2]),
6062
# If there are 3, 4, or more units in the project, assume it's 3 or 4.
6163
# TBH my prior is that 2-plexes are way more common than 3- or 4-plexes.
6264
# But for simplicity let's just put them in 3-to-4.
6365
# From 2018 to 2022, there are only ~2400 units worth of 2/3/4 unit projects
6466
# with >4 units in the project. So misclassifying these is not a big deal.
65-
df["UNIT_CAT_DESC"] == "2-, 3-, and 4-Plex Units per Structure",
66-
df["UNIT_CAT_DESC"] == "5 or More Units Per Structure",
67+
df["UNIT_CAT"] == "2 to 4",
68+
df["UNIT_CAT"] == "5+",
6769
],
6870
[
6971
"adu",
@@ -96,7 +98,7 @@ def _aggregate_to_geography(
9698
df["bldgs"] = 1
9799

98100
if level == "place":
99-
index_cols = ["JURS_NAME", "CNTY_NAME", "year"]
101+
index_cols = ["JURIS_NAME", "CNTY_NAME", "year"]
100102
elif level == "county":
101103
index_cols = ["CNTY_NAME", "year"]
102104
elif level == "state":
@@ -119,25 +121,25 @@ def _aggregate_to_geography(
119121

120122
if level == "place":
121123
# Confirm that we can drop county because in California, a city can't span multiple counties
122-
assert (wide_df[["JURS_NAME", "year"]].value_counts() == 1).all()
124+
assert (wide_df[["JURIS_NAME", "year"]].value_counts() == 1).all()
123125
wide_df = wide_df.drop(columns=["CNTY_NAME"])
124126
if level == "place":
125127
old_wide_df = wide_df
126128
# Add place_or_county_code
127129
wide_df = wide_df.merge(
128-
_load_fips_crosswalk(data_path), left_on="JURS_NAME", right_on="name"
130+
_load_fips_crosswalk(data_path), left_on="JURIS_NAME", right_on="name"
129131
).drop(columns=["name", "county_code"])
130132
if len(old_wide_df) != len(wide_df):
131-
dropped_cities = set(old_wide_df["JURS_NAME"]) - set(wide_df["JURS_NAME"])
132-
added_cities = set(wide_df["JURS_NAME"]) - set(old_wide_df["JURS_NAME"])
133+
dropped_cities = set(old_wide_df["JURIS_NAME"]) - set(wide_df["JURIS_NAME"])
134+
added_cities = set(wide_df["JURIS_NAME"]) - set(old_wide_df["JURIS_NAME"])
133135
raise ValueError(
134136
f"wide_df had {len(old_wide_df)} rows before merge and {len(wide_df)} rows after merge. "
135137
f"{dropped_cities=} {added_cities=}"
136138
)
137139
elif level == "county":
138140
# Add county_code
139141
old_rows = len(wide_df)
140-
wide_df["name"] = wide_df["CNTY_NAME"] + " COUNTY"
142+
wide_df["name"] = wide_df["CNTY_NAME"].str.upper() + " COUNTY"
141143
wide_df = wide_df.merge(_load_fips_crosswalk(data_path), on="name").drop(
142144
columns=["CNTY_NAME", "name", "place_or_county_code"]
143145
)

0 commit comments

Comments
 (0)