1- import overpass
2- import xml .etree .ElementTree as ET
3- import requests
1+ """
2+ Exploratory script for downloading OSM data.
3+
4+ This script uses the openpois.osm_download module to:
5+ 1. Collect element IDs across a date range using the Overpass API.
6+ 2. Download element histories from the OSM API.
7+ 3. Save the results to CSV files.
8+ """
49import datetime
5- import time
6- import pandas as pd
7-
8-
9- TIMEOUT = 1000
10- BBOX = {'ymin' : 47.41 , 'xmin' : - 122.48 , 'ymax' : 47.79 , 'xmax' : - 122.16 }
11- START_DATE = datetime .datetime (2016 , 1 , 1 ) # Earliest option is September 13, 2012
12- END_DATE = datetime .datetime (2026 , 1 , 1 ) # Latest
13- DATE_INTERVAL = datetime .timedelta (days = 7 )
14- OSM_KEYS = ['amenity' , 'shop' , 'healthcare' , 'leisure' ]
15-
16- # Create date range
17- date_range = [
18- START_DATE + i * DATE_INTERVAL
19- for i in range (((END_DATE - START_DATE ) // DATE_INTERVAL ) + 1 )
20- ]
21- if date_range [- 1 ] != END_DATE :
22- date_range .append (END_DATE )
23-
24-
def build_query_string(
    date: datetime.datetime,
    bbox: dict,
    keys: list,
    timeout: int
) -> str:
    """
    Builds an Overpass QL query string for the given date, bbox, keys, and timeout.

    The query requests only the IDs (`out ids`) of every node, way and
    relation (`nwr`) inside the bounding box that carries at least one of the
    given tag keys, evaluated against the database state selected by the
    `[date:...]` setting.

    Args:
        date: The date to query for. Only the calendar date is used; the time
            of day is always rendered as 00:00:00Z.
        bbox: The bounding box to query for, a dict with the keys 'ymin',
            'xmin', 'ymax' and 'xmax'.
        keys: The OSM tag keys to query for (e.g. 'amenity'). One `nwr`
            statement is emitted per key; an empty list yields an empty union.
        timeout: The timeout for the query, written into the `[timeout:...]`
            setting.
    Returns:
        A query string.
    """
    # Render the timestamp separately so no stray whitespace can end up inside
    # the quoted date literal.
    timestamp = date.strftime("%Y-%m-%dT00:00:00Z")
    bbox_filter = (
        f"({bbox['ymin']},{bbox['xmin']},{bbox['ymax']},{bbox['xmax']})"
    )
    query_lines = [
        f"[out:xml][timeout:{timeout}]",
        f'[date:"{timestamp}"];',
        "(",
        *(f"  nwr{bbox_filter}[{key}];" for key in keys),
        ");",
        "out ids;",
    ]
    return "\n".join(query_lines) + "\n"
57-
# Every element ID seen on any queried date, grouped by OSM element type.
# Sets are used so that an element matched on several dates (or under several
# keys) is only recorded once.
consider_ids = {
    'node': set(),
    'way': set(),
    'relation': set(),
}

api = overpass.API(
    timeout=TIMEOUT,
    endpoint="https://maps.mail.ru/osm/tools/overpass/api/interpreter"
)
failed_dates = []
succeed_dates = []

for this_date in date_range:
    start_time = time.time()
    try:
        # One request per key keeps each individual query small.
        for key in OSM_KEYS:
            # Query all matching elements from this date
            query_string = build_query_string(
                date=this_date,
                bbox=BBOX,
                keys=[key],
                timeout=TIMEOUT
            )
            result_xml = api.get(query=query_string, build=False)
            # Get all IDs for matching elements; add them to the consider_ids sets
            result_etree = ET.fromstring(result_xml)
            for e_type in consider_ids:
                consider_ids[e_type].update(
                    element.get('id')
                    for element in result_etree.findall(f'.//{e_type}')
                )
    except Exception as e:
        # Best effort: remember the date and keep going. IDs already collected
        # for earlier keys of this date stay in consider_ids.
        failed_dates.append(this_date)
        print(
            f"Failed to query date {this_date}; adding to failed_dates ({e})"
        )
        # NOTE(review): the pause is kept in the failure path only; confirm
        # that this matches the intended back-off behaviour.
        time.sleep(1)
    else:
        print(
            f"Successfully queried date {this_date} in {time.time() - start_time} seconds"
        )
        succeed_dates.append(this_date)
99-
100-
101- # Save elements
102- elements_table = pd .concat ([
103- pd .DataFrame ({'type' : e_type , 'id' : list (consider_ids [e_type ])})
104- for e_type in consider_ids
105- ])
106- elements_table .to_csv (
107- '~/data/openpois/osm_elements.csv' ,
108- index = False
10+ import os
11+ from openpois .osm_download import (
12+ build_date_range ,
13+ collect_element_ids ,
14+ download_element_histories ,
10915)
11016
# Print the full structure of the Etree
def print_etree_structure(elem, indent=0):
    """
    Prints one line per element of the tree rooted at `elem`.

    Elements are printed parent-first, children in document order, each line
    prefixed with one space per nesting level. The traversal uses an explicit
    stack instead of recursion so that very deeply nested trees do not hit
    Python's recursion limit.

    Args:
        elem: The root element to print.
        indent: The number of leading spaces for the root element.
    """
    pending = [(elem, indent)]
    while pending:
        node, depth = pending.pop()
        print(' ' * depth + f"<{node.tag} {dict(node.attrib)} >")
        # Push children in reverse so the first child is popped (printed) first.
        pending.extend((child, depth + 1) for child in reversed(list(node)))
17+ # -----------------------------------------------------------------------------
18+ # Configuration constants
19+ # -----------------------------------------------------------------------------
11620
117- def process_version (version_etree : ET .ElementTree ) -> tuple [pd .DataFrame , set [tuple [str , str ]]]:
118- # Extract version metadata
119- tag_keys = ['lat' , 'lon' , 'visible' ]
120- non_tag_df = pd .DataFrame (
121- [{key : version_etree .get (key ) for key in version_etree .attrib if key not in tag_keys }]
# Timeout passed to both the Overpass ID collection and the history download.
TIMEOUT = 1000
# Bounding box of the study area (ymin/ymax are latitudes, xmin/xmax longitudes).
BBOX = {"ymin": 47.41, "xmin": -122.48, "ymax": 47.79, "xmax": -122.16}
START_DATE = datetime.datetime(2016, 1, 1)  # Earliest option is September 13, 2012
END_DATE = datetime.datetime(2025, 12, 31)  # Latest
# Spacing between consecutive snapshot dates.
DATE_INTERVAL = datetime.timedelta(days=7)
# OSM tag keys whose elements are collected.
OSM_KEYS = ["amenity", "shop", "healthcare", "leisure"]
# Expand "~" explicitly: os.makedirs() treats it as a literal directory name,
# whereas pandas expands it when writing, so without this the directory that
# gets created and the directory the CSVs are written to would differ.
SAVE_DIR = os.path.expanduser("~/data/osm_example_data")

os.makedirs(SAVE_DIR, exist_ok=True)
30+
31+
32+ # -----------------------------------------------------------------------------
33+ # Main workflow
34+ # -----------------------------------------------------------------------------
35+
36+ if __name__ == "__main__" :
37+ # Create date range
38+ date_range = build_date_range (
39+ start_date = START_DATE ,
40+ end_date = END_DATE ,
41+ interval = DATE_INTERVAL ,
12242 )
123- non_tag_df ['type' ] = version_etree .tag
124- # Get all k,v pairs for this version
125- tag_tuples = [
126- (key , version_etree .get (key ))
127- for key in version_etree .attrib
128- if key in tag_keys
129- ]
130- for tag_item in version_etree .findall ('.//tag' ):
131- tag_tuples .append ((tag_item .get ('k' ), tag_item .get ('v' )))
132- return non_tag_df , set (tag_tuples )
133-
def compare_tags(v1: set[tuple[str, str]], v2: set[tuple[str, str]]) -> pd.DataFrame:
    """
    Get all changes between two sets of key-value pairs.

    Args:
        v1: The (key, value) pairs of the earlier version.
        v2: The (key, value) pairs of the later version.
    Returns:
        A DataFrame with the columns 'key', 'value' and 'change' and a fresh
        0..n-1 index. 'change' is 'Added' for a pair whose key only exists in
        v2, 'Changed' for a key present in both versions with a different
        value (the row carries the new value), and 'Deleted' for a key that
        only exists in v1. Added/changed rows come first, then deleted rows,
        each group sorted by key and value.
    """
    def _ordered(pairs: set[tuple[str, str]]) -> list[tuple[str, str]]:
        # Set iteration order depends on the hash seed; sort so the output is
        # reproducible between runs. str() keeps a missing (None) value sortable.
        return sorted(pairs, key=lambda kv: (str(kv[0]), str(kv[1])))

    new_df = pd.DataFrame(_ordered(v2 - v1), columns=['key', 'value'])
    new_df['change'] = 'Added'
    removed_df = pd.DataFrame(_ordered(v1 - v2), columns=['key', 'value'])
    removed_df['change'] = 'Deleted'
    # A key on both sides of the difference had its value replaced: report it
    # once, as 'Changed', using the row that holds the new value.
    new_df.loc[new_df['key'].isin(removed_df['key']), 'change'] = 'Changed'
    removed_df = removed_df.loc[~removed_df['key'].isin(new_df['key']), :]
    return pd.concat([new_df, removed_df], ignore_index=True)
147-
def process_element(element_etree: ET.Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process an element and return all changes over multiple versions.

    Each child of `element_etree` is treated as one version of the element.
    Versions are compared pairwise in iteration order; the first version is
    compared against an empty tag set, so all of its tags are reported as new.

    Args:
        element_etree: The parsed history document whose children are the
            versions of a single element.
    Returns:
        A tuple (versions, changes): the concatenated per-version metadata
        rows, and the concatenated tag changes with 'id' and 'version' columns
        identifying the version that introduced each change. Both are empty
        DataFrames when `element_etree` has no children.
    """
    previous_tags = set()
    versions_list = []
    changes_list = []
    for version_etree in element_etree:
        non_tag_df, current_tags = process_version(version_etree)
        versions_list.append(non_tag_df)
        changes_df = compare_tags(previous_tags, current_tags)
        changes_df['id'] = non_tag_df['id'].iloc[0]
        changes_df['version'] = non_tag_df['version'].iloc[0]
        changes_list.append(changes_df)
        previous_tags = current_tags
    if not versions_list:
        # pd.concat() raises ValueError on an empty list; a history without
        # versions simply contributes no rows.
        return pd.DataFrame(), pd.DataFrame()
    return pd.concat(versions_list), pd.concat(changes_list)
162-
16343
# Per-element results, concatenated once all histories have been fetched.
versions_list = []
changes_list = []
# Rows of elements_table whose history could not be fetched or parsed.
failed_rows = []

for idx, row in elements_table.iterrows():
    print(f" Row {idx} : type={row['type']} , id={row['id']} ")
    history_url = (
        f"https://api.openstreetmap.org/api/0.6/{row['type']}/{row['id']}/history"
    )
    try:
        history_response = requests.get(history_url, timeout=TIMEOUT)
        # An HTTP error body is not a history document; fail here instead of
        # feeding it to the XML parser.
        history_response.raise_for_status()
        history_etree = ET.fromstring(history_response.text)
        versions_df, changes_df = process_element(history_etree)
    except Exception as e:
        # Best effort: record the row and move on so that one bad element
        # does not abort the whole download.
        print(f"Failed to get history for row {idx} : {e} ")
        failed_rows.append(row)
        time.sleep(1)
        continue
    versions_list.append(versions_df)
    changes_list.append(changes_df)
    if idx % 100 == 0:
        print(f"Processed {idx} rows")
44+ # Collect element IDs from Overpass
45+ elements_table , succeed_dates , failed_dates = collect_element_ids (
46+ date_range = date_range ,
47+ bbox = BBOX ,
48+ osm_keys = OSM_KEYS ,
49+ timeout = TIMEOUT ,
50+ )
18451
185- versions_df = pd .concat (versions_list )
186- changes_df = pd .concat (changes_list )
52+ # Save elements table
53+ elements_table .to_csv (
54+ os .path .join (SAVE_DIR , "osm_elements.csv" ),
55+ index = False ,
56+ )
57+ print (f"Saved { len (elements_table )} elements to osm_elements.csv" )
58+ print (f"Succeeded on { len (succeed_dates )} dates, failed on { len (failed_dates )} " )
59+
60+ # Download element histories
61+ versions_df , changes_df , failed_rows = download_element_histories (
62+ elements_table = elements_table ,
63+ timeout = TIMEOUT ,
64+ progress = True ,
65+ )
18766
188- versions_df .to_csv ('osm_versions.csv' , index = False )
189- changes_df .to_csv ('osm_changes.csv' , index = False )
67+ # Save results
68+ versions_df .to_csv (
69+ os .path .join (SAVE_DIR , "osm_versions.csv" ),
70+ index = False ,
71+ )
72+ changes_df .to_csv (
73+ os .path .join (SAVE_DIR , "osm_changes.csv" ),
74+ index = False ,
75+ )
76+ print (f"Saved { len (versions_df )} versions and { len (changes_df )} changes" )
77+
78+ print (f"Failed on { len (failed_rows )} elements" )
79+ failed_elements = elements_table .iloc [failed_rows , :]
80+ failed_elements .to_csv (
81+ os .path .join (SAVE_DIR , "osm_failed_elements.csv" ),
82+ index = False ,
83+ )
0 commit comments