|
| 1 | +# ------------------------------------------------------------- |
| 2 | +# Copyright (c) Henry Spatial Analysis. All rights reserved. |
| 3 | +# Licensed under the MIT License. See LICENSE in project root for information. |
| 4 | +# ------------------------------------------------------------- |
| 5 | + |
| 6 | +""" |
| 7 | +This module formats OSM changes and versions into observations, which can be more easily |
| 8 | +queried and statistically analyzed. |
| 9 | +""" |
| 10 | + |
| 11 | +import numpy as np |
| 12 | +import pandas as pd |
| 13 | + |
def _change_field_is(
    changeset: pd.DataFrame,
    key: str,
    field: str,
    expected: str
) -> bool:
    """Return True when `key` is present in `changeset` and its `field` equals `expected`."""
    return bool((key in changeset.index) and (changeset.loc[key, field] == expected))


def format_one_observation(
    changes_df: pd.DataFrame,
    versions_df: pd.DataFrame,
    tag_key: str,
    keep_keys: list[str]
) -> pd.DataFrame:
    """
    Format a single POI's changes and versions into observations.

    Versions are processed in ascending `version` order. Observations are only emitted
    starting from the version in which `tag_key` is first added; if the tag is never
    added, an empty DataFrame is returned.

    Args:
        changes_df: DataFrame with changes data for a single POI. The columns read here
            are `id`, `version`, `key`, `value`, and `change`.
        versions_df: DataFrame with versions data for a single POI. The columns read
            here are `version`, `changeset`, `timestamp`, and `user`.
        tag_key: Key of the tag to format.
        keep_keys: Keys to keep in the observations.

    Returns:
        DataFrame with formatted observations. Each observation has the following columns:
        - id: unique identifier for the POI
        - version: unique identifier for this version of the POI. Observations are
          uniquely identified by `id` + `version`.
        - changeset: unique identifier for this changeset. Changesets can include changes
          to multiple POIs.
        - obs_timestamp: timestamp for this observation (this version of the POI).
        - last_obs_timestamp: timestamp for the previous observation of this POI.
        - last_tag_timestamp: timestamp for the last time the tag was changed prior to this
          observation.
        - last_tag_user: username of the user who last changed the relevant tag
        - user: username of the user who made this observation
        - tag_key: key of the relevant tag
        - tag_value: value of the relevant tag. If the POI was deleted, this will be NA.
        - changed: was the tag changed in this observation? 1 if changed, 0 if unchanged.
        - deleted: was the POI deleted in this observation? 1 if deleted, 0 if not
          deleted. All deleted POIs will have `tag_value` = NA and `changed` = 1.
        - Additionally, there will be one column for each key in `keep_keys`, listing the
          most recent value seen for that key at the time of this observation.
    """
    # Setup
    obs_list = []
    names = [
        "version", "changeset", "obs_timestamp", "last_obs_timestamp",
        "last_tag_timestamp", "user", "last_tag_user", "tag_value", "changed", "deleted"
    ] + list(keep_keys)
    # Working dictionary for the latest observation. It is reused across versions, so
    # `tag_value` and the keep keys carry forward until a later version overwrites them.
    latest_obs = {name: None for name in names}
    last_tag_timestamp = None
    last_obs_timestamp = None
    last_tag_user = None
    last_tag_value = None
    # Only start recording observations when the relevant tag is first added
    add_to_list = False
    # Iterate through all versions of the POI
    version_ids = sorted(versions_df["version"].unique().tolist())
    for v_idx in version_ids:
        version = versions_df[versions_df["version"] == v_idx].iloc[0].to_dict()
        changeset = changes_df[changes_df["version"] == v_idx].set_index("key")
        latest_obs.update({
            "version": v_idx,
            "changeset": version["changeset"],
            "obs_timestamp": version["timestamp"],
            "last_obs_timestamp": last_obs_timestamp,
            "last_tag_timestamp": last_tag_timestamp,
            "last_tag_user": last_tag_user,
            "user": version["user"],
        })
        # Add all of the latest keep keys
        for key in keep_keys:
            if key in changeset.index:
                latest_obs[key] = changeset.loc[key, "value"]
        # Determine what is happening to the tag
        tag_added = _change_field_is(changeset, tag_key, "change", "Added")
        tag_changed = _change_field_is(changeset, tag_key, "change", "Changed")
        tag_deleted = _change_field_is(changeset, tag_key, "change", "Deleted")
        poi_deleted = _change_field_is(changeset, "visible", "value", "false")
        # A POI only counts as re-added once observations have started being recorded
        poi_re_added = (
            add_to_list and _change_field_is(changeset, "visible", "value", "true")
        )
        any_change = (
            tag_added or tag_changed or tag_deleted or poi_deleted or poi_re_added
        )
        latest_obs["changed"] = np.int64(any_change)
        # Record the deletion flag for every version; previously this column was
        # declared but never assigned, so it was always None.
        latest_obs["deleted"] = np.int64(poi_deleted)
        # Only start adding observations to the list after the relevant tag is first added
        if tag_added:
            add_to_list = True
        if tag_added or tag_changed:
            last_tag_value = changeset.loc[tag_key, "value"]
            latest_obs["tag_value"] = last_tag_value
        # A deleted tag or deleted POI has no tag value for this observation
        if tag_deleted or poi_deleted:
            latest_obs["tag_value"] = None
        # A re-added POI gets back the last value recorded for the tag
        if poi_re_added:
            latest_obs["tag_value"] = last_tag_value
        # When anything changed, update the tag timestamp and user for the *next*
        # observation
        if any_change:
            last_tag_timestamp = version["timestamp"]
            last_tag_user = version["user"]
        if add_to_list:
            obs_list.append(pd.DataFrame({k: [v] for k, v in latest_obs.items()}))
            last_obs_timestamp = latest_obs["obs_timestamp"]
    # Combine observations from all changesets
    if not obs_list:
        return pd.DataFrame()
    formatted_obs_df = pd.concat(obs_list)
    formatted_obs_df["id"] = changes_df.iloc[0, :]["id"]
    formatted_obs_df["tag_key"] = tag_key
    return formatted_obs_df
| 133 | + |
| 134 | + |
def format_observations(
    changes_df: pd.DataFrame,
    versions_df: pd.DataFrame,
    tag_key: str,
    keep_keys: "list[str] | None" = None
) -> pd.DataFrame:
    """
    Format changes and versions into observations.

    Each unique `id` in `changes_df` is formatted separately with
    `format_one_observation`, and the per-POI results are concatenated in the order the
    ids first appear in `changes_df`.

    Args:
        changes_df: DataFrame with changes data. Must contain an `id` column.
        versions_df: DataFrame with versions data. Must contain an `id` column.
        tag_key: Key of the tag to format.
        keep_keys: Keys to keep in the observations. Defaults to
            `["amenity", "shop", "healthcare", "leisure"]` when omitted or None.

    Returns:
        DataFrame with observations. An empty DataFrame is returned when `changes_df`
        contains no ids.
    """
    # Build the default per call rather than sharing one mutable list across calls
    if keep_keys is None:
        keep_keys = ["amenity", "shop", "healthcare", "leisure"]
    per_poi_observations = [
        format_one_observation(
            changes_df=changes_df[changes_df["id"] == this_id],
            versions_df=versions_df[versions_df["id"] == this_id],
            tag_key=tag_key,
            keep_keys=keep_keys,
        )
        for this_id in changes_df["id"].unique().tolist()
    ]
    # pd.concat raises ValueError on an empty list, so handle "no POIs" explicitly
    if not per_poi_observations:
        return pd.DataFrame()
    return pd.concat(per_poi_observations)
0 commit comments