Skip to content

Commit 740c7c3

Browse files
committed
Functions to format observations from OSM versions + changesets.
1 parent 56a710b commit 740c7c3

1 file changed

Lines changed: 162 additions & 0 deletions

File tree

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# -------------------------------------------------------------
2+
# Copyright (c) Henry Spatial Analysis. All rights reserved.
3+
# Licensed under the MIT License. See LICENSE in project root for information.
4+
# -------------------------------------------------------------
5+
6+
"""
7+
This module formats OSM changes and versions into observations, which can be more easily
8+
queried and statistically analyzed.
9+
"""
10+
11+
import numpy as np
12+
import pandas as pd
13+
14+
def format_one_observation(
    changes_df: pd.DataFrame,
    versions_df: pd.DataFrame,
    tag_key: str,
    keep_keys: list[str]
) -> pd.DataFrame:
    """
    Format a single POI's changes and versions into observations.

    The function walks the POI's versions in ascending order. For each version it reads
    the rows of `changes_df` with the same `version`, keyed by the `key` column, and
    uses the `change` column ("Added" / "Changed" / "Deleted") and the special
    `visible` key ("true" / "false" in the `value` column) to decide what happened.
    Observations are only recorded from the version where `tag_key` is first added.

    Args:
        changes_df: DataFrame with changes data for a single POI. The columns `id`,
            `version`, `key`, `value` and `change` are read.
        versions_df: DataFrame with versions data for a single POI. The columns
            `version`, `changeset`, `timestamp` and `user` are read.
        tag_key: Key of the tag to format.
        keep_keys: Keys to keep in the observations.

    Returns:
        DataFrame with formatted observations, indexed 0..n-1, or an empty DataFrame
        if `tag_key` was never added. Each observation has the following columns:
        - id: unique identifier for the POI
        - version: unique identifier for this version of the POI. Observations are
          uniquely identified by `id` + `version`.
        - changeset: unique identifier for this changeset. Changesets can include changes
          to multiple POIs.
        - obs_timestamp: timestamp for this observation (this version of the POI).
        - last_obs_timestamp: timestamp for the previous recorded observation of this POI.
        - last_tag_timestamp: timestamp of the last change (tag added/changed/deleted,
          POI deleted or re-added) prior to this observation.
        - last_tag_user: username of the user who made that last change
        - user: username of the user who made this observation
        - tag_key: key of the relevant tag
        - tag_value: value of the relevant tag. If the tag or the POI was deleted, this
          will be NA.
        - changed: 1 if the tag was added, changed or deleted, or the POI was deleted or
          re-added in this observation; 0 otherwise.
        - deleted: was the POI deleted in this observation? 1 if deleted, 0 if not
          deleted. All deleted POIs will have `tag_value` = NA and `changed` = 1.
        - Additionally, there will be one column per entry of `keep_keys`. Each lists
          that tag's value, if present, at the time of this observation.
    """
    # Setup
    names = [
        "version", "changeset", "obs_timestamp", "last_obs_timestamp",
        "last_tag_timestamp", "user", "last_tag_user", "tag_value", "changed", "deleted"
    ] + list(keep_keys)
    # Working dictionary for the latest observation. It is deliberately reused across
    # iterations so that unchanged fields (tag value, keep keys) carry forward.
    latest_obs = {name: None for name in names}
    last_tag_timestamp = None
    last_obs_timestamp = None
    last_tag_user = None
    last_tag_value = None
    # Only start recording observations when the relevant tag is first added
    add_to_list = False
    obs_records = []

    # Iterate through all versions of the POI
    version_ids = sorted(versions_df["version"].unique().tolist())
    for v_idx in version_ids:
        version = versions_df[versions_df["version"] == v_idx].iloc[0].to_dict()
        changeset = changes_df[changes_df["version"] == v_idx].set_index("key")

        latest_obs["version"] = v_idx
        latest_obs["changeset"] = version["changeset"]
        latest_obs["obs_timestamp"] = version["timestamp"]
        latest_obs["last_obs_timestamp"] = last_obs_timestamp
        latest_obs["last_tag_timestamp"] = last_tag_timestamp
        latest_obs["last_tag_user"] = last_tag_user
        latest_obs["user"] = version["user"]

        # Refresh the keep keys touched by this version. A deleted key is no longer
        # present on the POI, so it is cleared rather than copied from the change row.
        for key in keep_keys:
            if key in changeset.index:
                if changeset.loc[key, "change"] == "Deleted":
                    latest_obs[key] = None
                else:
                    latest_obs[key] = changeset.loc[key, "value"]

        # Determine what is happening to the tag and to the POI itself
        tag_change = (
            changeset.loc[tag_key, "change"] if tag_key in changeset.index else None
        )
        visible = (
            changeset.loc["visible", "value"] if "visible" in changeset.index else None
        )
        tag_added = tag_change == "Added"
        tag_changed = tag_change == "Changed"
        tag_deleted = tag_change == "Deleted"
        poi_deleted = visible == "false"
        # A re-add only counts once the tag has been seen at least once
        poi_re_added = add_to_list and visible == "true"
        any_change = (
            tag_added or tag_changed or tag_deleted or poi_deleted or poi_re_added
        )
        latest_obs["changed"] = np.int64(any_change)
        latest_obs["deleted"] = np.int64(poi_deleted)

        # Only start adding observations to the list after the relevant tag is first added
        if tag_added:
            add_to_list = True
        if tag_added or tag_changed:
            last_tag_value = changeset.loc[tag_key, "value"]
            latest_obs["tag_value"] = last_tag_value
        if tag_deleted:
            # Forget the old value so a later POI re-add cannot resurrect it
            last_tag_value = None
        if tag_deleted or poi_deleted:
            latest_obs["tag_value"] = None
        if poi_re_added:
            latest_obs["tag_value"] = last_tag_value
        if any_change:
            # Update the "last change" markers for the *next* observation
            last_tag_timestamp = version["timestamp"]
            last_tag_user = version["user"]
        if add_to_list:
            # Snapshot the working dict; it keeps being mutated on later iterations
            obs_records.append(dict(latest_obs))
            last_obs_timestamp = latest_obs["obs_timestamp"]

    # Combine observations from all versions
    if not obs_records:
        return pd.DataFrame()
    formatted_obs_df = pd.DataFrame(obs_records, columns=names)
    formatted_obs_df["id"] = changes_df["id"].iloc[0]
    formatted_obs_df["tag_key"] = tag_key
    return formatted_obs_df
133+
134+
135+
def format_observations(
    changes_df: pd.DataFrame,
    versions_df: pd.DataFrame,
    tag_key: str,
    keep_keys: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Format changes and versions into observations.

    Each distinct `id` in `changes_df` is formatted separately with
    `format_one_observation`, and the per-POI results are stacked into one frame.

    Args:
        changes_df: DataFrame with changes data. Must have an `id` column.
        versions_df: DataFrame with versions data. Must have an `id` column.
        tag_key: Key of the tag to format.
        keep_keys: Keys to keep in the observations. Defaults to
            `["amenity", "shop", "healthcare", "leisure"]` when None.

    Returns:
        DataFrame with observations for all POIs, indexed 0..n-1. An empty DataFrame
        is returned when there are no POIs or none of them produced observations.
    """
    # Build the default per call so no list object is shared between calls
    if keep_keys is None:
        keep_keys = ["amenity", "shop", "healthcare", "leisure"]

    per_poi_frames = [
        format_one_observation(
            changes_df=changes_df[changes_df["id"] == this_id],
            versions_df=versions_df[versions_df["id"] == this_id],
            tag_key=tag_key,
            keep_keys=keep_keys,
        )
        for this_id in changes_df["id"].unique().tolist()
    ]
    # Drop POIs with no observations; pd.concat raises on an empty list
    non_empty_frames = [frame for frame in per_poi_frames if not frame.empty]
    if not non_empty_frames:
        return pd.DataFrame()
    # ignore_index gives each observation a unique row label across POIs
    return pd.concat(non_empty_frames, ignore_index=True)

0 commit comments

Comments
 (0)