99import numpy as np
1010import pandas as pd
1111from pathlib import Path
12+
13+ import matplotlib
14+ matplotlib .use ("Agg" )
1215import plotnine as gg
1316
17+ from openpois .osm .change_plots import change_plot_create , change_multiplot_create
18+
1419# ----------------------------------------------------------------------------------------
1520# Configuration constants
1621# ----------------------------------------------------------------------------------------
1722
1823DATA_VERSION = "20260129"
1924SAVE_DIR = Path ("~/data/openpois" ).expanduser () / DATA_VERSION
25+ VIZ_DIR = SAVE_DIR / "viz"
2026OSM_KEYS = ["amenity" , "shop" , "healthcare" , "leisure" ]
2127TAG_KEY = "name"
2228END_DATE = pd .Timestamp ('2025-12-31' , tz = 'UTC' )
2329
24- max_days = 365 * 10
30+ max_days = 365 * 10
31+ VIZ_DIR .mkdir (parents = True , exist_ok = True )
32+
33+ # ----------------------------------------------------------------------------------------
34+ # Plotting functions
35+ # ----------------------------------------------------------------------------------------
36+
def fig_save(
    fig: gg.ggplot, stub: str, width: float = 10, height: float = 6, **kwargs
) -> None:
    """
    Save a ggplot figure as a PNG file inside ``VIZ_DIR``.

    Args:
        fig: The plotnine figure to write to disk.
        stub: File name without extension; the output path is
            ``VIZ_DIR / f"{stub}.png"``.
        width: Figure width, in inches.
        height: Figure height, in inches.
        **kwargs: Extra keyword arguments forwarded unchanged to ``fig.save``.

    Returns:
        None. The only effect is the call to ``fig.save``.
    """
    fig.save(
        # No whitespace between the stub and the extension: a stray space here
        # would produce file names such as "osm_changes_name_all .png".
        filename=VIZ_DIR / f"{stub}.png",
        width=width,
        height=height,
        units='in',
        dpi=300,
        verbose=False,
        **kwargs
    )
53+
2554
2655# ----------------------------------------------------------------------------------------
2756# Main workflow
2857# ----------------------------------------------------------------------------------------
2958
3059if __name__ == "__main__" :
3160 # Read observations
61+ # Drop the first observation for each POI (when the POI was first added) - the last
62+ # observation timestamp will be missing for these rows
3263 timestamp_cols = ['obs_timestamp' , 'last_obs_timestamp' , 'last_tag_timestamp' ]
3364 observations_df = (pd .read_csv (SAVE_DIR / f"osm_observations_{ TAG_KEY } .csv" )
3465 .dropna (subset = timestamp_cols )
4273 ).astype (int )
4374 )
4475 # Prepare timediffs in days:
45- # t1: Time elapsed until the final confirmation of the previous tag
46- # t2: Time elapsed from previous tag to changed tag
76+ # no_change: Time elapsed until the final confirmation of the previous tag
77+ # change: Time elapsed from previous tag to changed tag
78+ # final_obs: Time elapsed from previous tag to data download
4779 changed_tags = (observations_df
4880 .query ('changed == 1' )
4981 .assign (
50- t1 = (pd .col ('last_obs_timestamp' ) - pd .col ('last_tag_timestamp' )).dt .days ,
51- t2 = (pd .col ('obs_timestamp' ) - pd .col ('last_tag_timestamp' )).dt .days ,
52- t3 = np .inf # (END_DATE - pd.col('last_tag_timestamp')).dt.days
82+ no_change = (
83+ pd .col ('last_obs_timestamp' ) - pd .col ('last_tag_timestamp' )
84+ ).dt .days ,
85+ change = (pd .col ('obs_timestamp' ) - pd .col ('last_tag_timestamp' )).dt .days ,
86+ final_obs = (END_DATE - pd .col ('last_tag_timestamp' )).dt .days
5387 )
5488 )
5589 unchanged_tags = (observations_df
5690 .query ('(changed == 0) & (latest_version == 1)' )
5791 .assign (
58- t1 = (pd .col ('obs_timestamp' ) - pd .col ('last_tag_timestamp' )).dt .days ,
59- t2 = np .inf , # (END_DATE - pd.col('last_tag_timestamp')).dt.days ,
60- t3 = np . inf
92+ no_change = (pd .col ('obs_timestamp' ) - pd .col ('last_tag_timestamp' )).dt .days ,
93+ change = np .inf ,
94+ final_obs = ( END_DATE - pd . col ( 'last_tag_timestamp' )). dt . days
6195 )
6296 )
6397 # Format changes
6498 to_plot_df = pd .concat ([changed_tags , unchanged_tags ])
65- # Create a plot
66- reshaped_df = (
67- pd .DataFrame ({
68- 'yes' : [np .sum (day_i < to_plot_df ['t1' ]) for day_i in range (max_days )],
69- 'unknown' : [
70- np .sum ((to_plot_df ['t1' ] <= day_i ) & (day_i < to_plot_df ['t2' ]))
71- for day_i in range (max_days )
72- ],
73- 'no' : [
74- np .sum ((to_plot_df ['t2' ] <= day_i ) & (day_i < to_plot_df ['t3' ]))
75- for day_i in range (max_days )
76- ],
77- })
78- .assign (
79- all = pd .col ('yes' ) + pd .col ('no' ) + pd .col ('unknown' ),
80- ymin = pd .col ('yes' ) / pd .col ('all' ),
81- ymax = (pd .col ('yes' ) + pd .col ('unknown' )) / pd .col ('all' ),
82- year = np .arange (max_days ) / 365 ,
83- )
99+ # Create a plot for all tags
100+ fig = change_plot_create (
101+ observations = to_plot_df ,
102+ no_change_col = 'no_change' ,
103+ change_col = 'change' ,
104+ final_observation_col = 'final_obs' ,
105+ day_range = max_days ,
106+ title = f"Stability of the `{ TAG_KEY } ` tag over time" ,
107+ x_label = "Years since tag" ,
108+ y_label = "Proportion remaining unchanged" ,
84109 )
85- fig = (
86- gg .ggplot (
87- reshaped_df ,
88- gg .aes (x = 'year' , ymin = 'ymin' , ymax = 'ymax' )) +
89- gg .geom_ribbon (fill = 'blue' , alpha = 0.4 ) +
90- gg .geom_line (gg .aes (y = 'ymin' ), color = 'black' , alpha = 0.5 ) +
91- gg .geom_line (gg .aes (y = 'ymax' ), color = 'black' , alpha = 0.5 ) +
92- gg .labs (
93- x = "Years from tag" ,
94- y = "Proportion remaining unchanged" ,
95- title = f"Proportion of `{ TAG_KEY } ` tags unchanged over time"
96- ) +
97- gg .scale_y_continuous (
98- limits = (0 , 1.01 ),
99- breaks = np .arange (0 , 1 , 0.25 ),
100- labels = [f"{ x * 100 :.0f} %" for x in np .arange (0 , 1 , 0.25 )]
101- ) +
102- gg .theme_bw ()
103- )
104- fig .save (
105- SAVE_DIR / f"osm_observations_{ TAG_KEY } .png" ,
106- width = 10 ,
107- height = 6 ,
108- units = 'in' ,
109- dpi = 300 ,
110- )
110+ fig_save (fig , stub = f"osm_changes_{ TAG_KEY } _all" )
111+
112+ # Create multi-panel plots for the top tags in each OSM category
113+ for subtype in OSM_KEYS :
114+ fig = change_multiplot_create (
115+ observations = to_plot_df ,
116+ col = subtype ,
117+ top_n = 9 ,
118+ no_change_col = 'no_change' ,
119+ change_col = 'change' ,
120+ final_observation_col = 'final_obs' ,
121+ day_range = max_days ,
122+ )
123+ fig_save (
124+ fig = fig ,
125+ stub = f"osm_changes_{ TAG_KEY } _{ subtype } " ,
126+ height = 12 ,
127+ width = 12
128+ )
0 commit comments