Skip to content

Commit ca6dc1e

Browse files
committed
Create core functions to plot stability of various tags over time.
1 parent 0177db8 commit ca6dc1e

1 file changed

Lines changed: 194 additions & 0 deletions

File tree

src/openpois/osm/change_plots.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# -------------------------------------------------------------
2+
# Copyright (c) Henry Spatial Analysis. All rights reserved.
3+
# Licensed under the MIT License. See LICENSE in project root for information.
4+
# -------------------------------------------------------------
5+
6+
"""
7+
This module creates plots showing the stability of various OSM tags over time.
8+
"""
9+
10+
import numpy as np
11+
import pandas as pd
12+
import plotnine as gg
13+
from functools import reduce
14+
15+
16+
def change_plot_reshape_data(
    observations: pd.DataFrame,
    no_change_col: str,
    change_col: str,
    final_observation_col: str,
    day_range: int = 365*10,
) -> pd.DataFrame:
    """
    Reshape data for the change plot. The data comes in with one row per POI-tag, and
    is reshaped by elapsed days since the POI-tag was added. For each elapsed day, each
    row falls into one of four categories:
        1. Confirmed unchanged: The tag was last observed unchanged *after* this day
        2. Confirmed changed: The tag was observed changed on or *before* this day,
            and the row has not yet aged out
        3. Unsure: The tag was last observed unchanged on or *before* this day, has
            not yet been observed changed, and the row has not yet aged out
        4. Aged out: The elapsed time between when the tag was added and our data
            download is on or *before* this day, so it is excluded from the totals

    Args:
        observations: DataFrame with observations. Each row is an iteration of a
            tag, with the three columns described below.
        no_change_col: Column name for the days elapsed from when the tag was added to
            when it was last confirmed (observed unchanged).
        change_col: Column name for the days elapsed from when the tag was added to when
            it was changed. For tags that were unchanged, this will be infinity.
        final_observation_col: Column name for the days elapsed from when the tag was
            added to when this data was downloaded.
        day_range: Maximum elapsed time period to plot, in days

    Returns:
        DataFrame with one row per elapsed day (0 to day_range - 1) and columns:
            no_change, unknown, change, aged_out: Row counts for each category
            all: no_change + change + unknown (rows still in scope on this day)
            ymin: Share of in-scope rows that are confirmed unchanged
            ymax: Share of in-scope rows that are confirmed unchanged or unsure
            day: Elapsed day
            year: Elapsed day divided by 365
        ymin and ymax are NaN on days where no rows remain in scope.
    """
    # Extract each column once as a float array so the per-day loop only performs
    # cheap array comparisons. Missing values become NaN, which compares as False.
    last_confirmed = observations[no_change_col].to_numpy(dtype=float, na_value=np.nan)
    changed_at = observations[change_col].to_numpy(dtype=float, na_value=np.nan)
    downloaded_at = observations[final_observation_col].to_numpy(
        dtype=float, na_value=np.nan
    )

    no_change = np.zeros(day_range, dtype=np.int64)
    unknown = np.zeros(day_range, dtype=np.int64)
    change = np.zeros(day_range, dtype=np.int64)
    aged_out = np.zeros(day_range, dtype=np.int64)

    for day_i in range(day_range):
        in_scope = day_i < downloaded_at
        is_changed = changed_at <= day_i
        no_change[day_i] = np.count_nonzero(day_i < last_confirmed)
        # A row that has already been observed changed must not also be counted as
        # unsure; otherwise it is double counted in `all` and skews ymin / ymax.
        unknown[day_i] = np.count_nonzero(
            (last_confirmed <= day_i) & ~is_changed & in_scope
        )
        change[day_i] = np.count_nonzero(is_changed & in_scope)
        aged_out[day_i] = np.count_nonzero(downloaded_at <= day_i)

    reshaped = pd.DataFrame({
        'no_change': no_change,
        'unknown': unknown,
        'change': change,
        'aged_out': aged_out,
    })
    # Plain column arithmetic keeps this working on any pandas version
    reshaped['all'] = reshaped['no_change'] + reshaped['change'] + reshaped['unknown']
    reshaped['ymin'] = reshaped['no_change'] / reshaped['all']
    reshaped['ymax'] = (reshaped['no_change'] + reshaped['unknown']) / reshaped['all']
    reshaped['day'] = np.arange(day_range)
    reshaped['year'] = reshaped['day'] / 365
    return reshaped
82+
83+
84+
def change_plot_create(
    observations: pd.DataFrame,
    no_change_col: str = 'no_change',
    change_col: str = 'change',
    final_observation_col: str = 'final_obs',
    title: str = None,
    subtitle: str = None,
    x_label: str = '',
    y_label: str = '',
    day_range: int = 365*10,
) -> gg.ggplot:
    """
    Build one change plot: a ribbon spanning the 'ymin' and 'ymax' columns of the
    reshaped observations, outlined by a line along each edge, plotted against
    elapsed years.

    Args:
        observations: DataFrame with observations. Each row is an iteration of a
            tag, with the three columns described below.
        no_change_col: Column name for the days elapsed from when the tag was added to
            when it was last confirmed (observed unchanged).
        change_col: Column name for the days elapsed from when the tag was added to when
            it was changed. For tags that were unchanged, this will be infinity.
        final_observation_col: Column name for the days elapsed from when the tag was
            added to when this data was downloaded.
        title: Plot title
        subtitle: Plot subtitle
        x_label: Label for the x axis (elapsed years)
        y_label: Label for the y axis (share of tags, shown as a percentage)
        day_range: Maximum elapsed time period to plot, in days

    Returns:
        ggplot object
    """
    plot_data = change_plot_reshape_data(
        observations=observations,
        no_change_col=no_change_col,
        change_col=change_col,
        final_observation_col=final_observation_col,
        day_range=day_range,
    )

    # Axis breaks: quarter steps from 0% to 100% on y, whole years on x
    n_years = day_range / 365
    share_breaks = np.arange(0, 1.01, 0.25)
    year_breaks = np.arange(n_years + 1)

    # Layers, labels, scales and theme, in the order they are added to the plot
    components = [
        gg.geom_ribbon(fill='blue', alpha=0.4),
        gg.geom_line(mapping=gg.aes(y='ymin'), color='black', alpha=0.5),
        gg.geom_line(mapping=gg.aes(y='ymax'), color='black', alpha=0.5),
        gg.labs(title=title, subtitle=subtitle, x=x_label, y=y_label),
        gg.scale_y_continuous(
            limits=(0, 1.01),
            breaks=share_breaks,
            labels=[f"{x*100:.0f}%" for x in share_breaks],
        ),
        gg.scale_x_continuous(
            limits=(0, n_years + 0.01),
            breaks=year_breaks,
            labels=[f"{x:.0f}" for x in year_breaks],
        ),
        gg.theme_bw(),
    ]

    fig = gg.ggplot(
        data=plot_data,
        mapping=gg.aes(x='year', ymin='ymin', ymax='ymax'),
    )
    for component in components:
        fig = fig + component
    return fig
147+
148+
149+
def change_multiplot_create(
    observations: pd.DataFrame,
    col: str,
    top_n: int = 9,
    no_change_col: str = 'no_change',
    change_col: str = 'change',
    final_observation_col: str = 'final_obs',
    day_range: int = 365*10,
) -> gg.ggplot:
    """
    Create a multi-panel change plot, with one panel for each of the most common
    values found in a column.

    Args:
        observations: DataFrame with observations. Rows where `col` is missing are
            dropped before plotting.
        col: Column name for the tag to plot.
        top_n: Number of tags to plot, ordered by number of observations.
        no_change_col: Column name passed through to change_plot_create.
        change_col: Column name passed through to change_plot_create.
        final_observation_col: Column name passed through to change_plot_create.
        day_range: Maximum elapsed time period to plot, in days

    Returns:
        The figure produced by joining panels within a row using `|` and stacking
        the rows using `/`.

    Raises:
        ValueError: If there are no tag values to plot (no non-missing values in
            `col`, or `top_n` is less than 1).
    """
    # Drop rows where the tag is missing
    obs_sub = observations.dropna(subset=[col])
    # Get the top occurrences of particular tags
    top_tags = obs_sub[col].value_counts().head(top_n).index
    if len(top_tags) == 0:
        raise ValueError(f"No values of '{col}' are available to plot.")

    # Create a list of ggplot objects
    fig_list = []
    for tag in top_tags:
        # Boolean indexing rather than DataFrame.query(), so that column names which
        # are not valid Python identifiers (e.g. 'addr:street') still work
        obs_sub_tag = obs_sub.loc[obs_sub[col] == tag]
        fig = change_plot_create(
            observations=obs_sub_tag,
            # str() so that non-string tag values can still be used as titles
            title=str(tag).title(),
            subtitle=f"N = {obs_sub_tag.shape[0]}",
            no_change_col=no_change_col,
            change_col=change_col,
            final_observation_col=final_observation_col,
            day_range=day_range,
        )
        fig_list.append(fig)

    # Compose the individual plots into a roughly square grid. Rows are sliced from
    # the plain list (earlier rows take any extra panels) so that the plot objects
    # are never coerced into a NumPy array.
    n_figs = len(fig_list)
    n_rows = int(np.ceil(np.sqrt(n_figs)))
    per_row, n_extra = divmod(n_figs, n_rows)
    composed_rows = []
    start = 0
    for row_i in range(n_rows):
        stop = start + per_row + (1 if row_i < n_extra else 0)
        composed_rows.append(
            reduce(lambda gg1, gg2: gg1 | gg2, fig_list[start:stop])
        )
        start = stop
    composed_fig = reduce(lambda row1, row2: row1 / row2, composed_rows)
    return composed_fig

0 commit comments

Comments
 (0)