Skip to content

Commit cf94976

Browse files
committed
Refactor OSM download functions into the package.
1 parent a0c313d commit cf94976

2 files changed

Lines changed: 406 additions & 181 deletions

File tree

exploratory/osm_download.py

Lines changed: 75 additions & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -1,189 +1,83 @@
1-
import overpass
2-
import xml.etree.ElementTree as ET
3-
import requests
1+
"""
2+
Exploratory script for downloading OSM data.
3+
4+
This script uses the openpois.osm_download module to:
5+
1. Collect element IDs across a date range using the Overpass API.
6+
2. Download element histories from the OSM API.
7+
3. Save the results to CSV files.
8+
"""
49
import datetime
5-
import time
6-
import pandas as pd
7-
8-
9-
TIMEOUT = 1000
10-
BBOX = {'ymin': 47.41, 'xmin': -122.48, 'ymax': 47.79, 'xmax': -122.16}
11-
START_DATE = datetime.datetime(2016, 1, 1) # Earliest option is September 13, 2012
12-
END_DATE = datetime.datetime(2026, 1, 1) # Latest
13-
DATE_INTERVAL = datetime.timedelta(days=7)
14-
OSM_KEYS = ['amenity', 'shop', 'healthcare', 'leisure']
15-
16-
# Create date range
17-
date_range = [
18-
START_DATE + i * DATE_INTERVAL
19-
for i in range(((END_DATE - START_DATE) // DATE_INTERVAL) + 1)
20-
]
21-
if date_range[-1] != END_DATE:
22-
date_range.append(END_DATE)
23-
24-
25-
def build_query_string(
26-
date: datetime.datetime,
27-
bbox: dict,
28-
keys: list,
29-
timeout: int
30-
) -> str:
31-
"""
32-
Builds a query string for the given date, bbox, keys, and timeout.
33-
34-
Args:
35-
date: The date to query for.
36-
bbox: The bounding box to query for.
37-
keys: The OSM tag keys to query for.
38-
timeout: The timeout for the query.
39-
Returns:
40-
A query string.
41-
"""
42-
query_string = f"""
43-
[out:xml][timeout:{timeout}]
44-
[date:"{date.strftime("%Y-%m-%dT00:00:00Z")}"];
45-
(
46-
"""
47-
def add_group(key: str) -> str:
48-
prefix = f"nwr({bbox['ymin']}, {bbox['xmin']}, {bbox['ymax']}, {bbox['xmax']})"
49-
return f"{prefix}[{key}];\n"
50-
for key in keys:
51-
query_string += add_group(key)
52-
query_string += """
53-
);
54-
out ids;
55-
"""
56-
return query_string
57-
58-
consider_ids = {
59-
'node': set(),
60-
'way': set(),
61-
'relation': set(),
62-
}
63-
64-
# Build query string
65-
66-
api = overpass.API(
67-
timeout = TIMEOUT,
68-
endpoint = "https://maps.mail.ru/osm/tools/overpass/api/interpreter"
69-
)
70-
failed_dates = []
71-
succeed_dates = []
72-
73-
for this_date in date_range:
74-
try:
75-
start_time = time.time()
76-
for key in OSM_KEYS:
77-
# Query all matching elements from this date
78-
query_string = build_query_string(
79-
date = this_date,
80-
bbox = BBOX,
81-
keys = [key],
82-
timeout = TIMEOUT
83-
)
84-
result_xml = api.get(query = query_string, build = False)
85-
# Get all IDs for matching elements; add them to the consider_ids sets
86-
result_etree = ET.fromstring(result_xml)
87-
for e_type in consider_ids:
88-
elements = result_etree.findall(f'.//{e_type}')
89-
for element in elements:
90-
consider_ids[e_type].add(element.get('id'))
91-
print(
92-
f"Successfully queried date {this_date} in {time.time() - start_time} seconds"
93-
)
94-
succeed_dates.append(this_date)
95-
except Exception as e:
96-
failed_dates.append(this_date)
97-
print(f"Failed to query date {this_date}; adding to failed_dates")
98-
time.sleep(1)
99-
100-
101-
# Save elements
102-
elements_table = pd.concat([
103-
pd.DataFrame({'type': e_type, 'id': list(consider_ids[e_type])})
104-
for e_type in consider_ids
105-
])
106-
elements_table.to_csv(
107-
'~/data/openpois/osm_elements.csv',
108-
index = False
10+
import os
11+
from openpois.osm_download import (
12+
build_date_range,
13+
collect_element_ids,
14+
download_element_histories,
10915
)
11016

111-
# Print the full structure of the Etree
112-
def print_etree_structure(elem, indent=0):
113-
print(' ' * indent + f"<{elem.tag} {dict(elem.attrib)}>")
114-
for child in elem:
115-
print_etree_structure(child, indent + 1)
17+
# -----------------------------------------------------------------------------
18+
# Configuration constants
19+
# -----------------------------------------------------------------------------
11620

117-
def process_version(version_etree: ET.ElementTree) -> tuple[pd.DataFrame, set[tuple[str, str]]]:
118-
# Extract version metadata
119-
tag_keys = ['lat', 'lon', 'visible']
120-
non_tag_df = pd.DataFrame(
121-
[{key: version_etree.get(key) for key in version_etree.attrib if key not in tag_keys}]
21+
TIMEOUT = 1000
22+
BBOX = {"ymin": 47.41, "xmin": -122.48, "ymax": 47.79, "xmax": -122.16}
23+
START_DATE = datetime.datetime(2016, 1, 1) # Earliest option is September 13, 2012
24+
END_DATE = datetime.datetime(2025, 12, 31) # Latest
25+
DATE_INTERVAL = datetime.timedelta(days = 7)
26+
OSM_KEYS = ["amenity", "shop", "healthcare", "leisure"]
27+
# Expand "~" once here: os.makedirs() would otherwise treat it as a literal directory name.
SAVE_DIR = os.path.expanduser("~/data/osm_example_data")
28+
29+
os.makedirs(SAVE_DIR, exist_ok = True)
30+
31+
32+
# -----------------------------------------------------------------------------
33+
# Main workflow
34+
# -----------------------------------------------------------------------------
35+
36+
if __name__ == "__main__":
37+
# Create date range
38+
date_range = build_date_range(
39+
start_date = START_DATE,
40+
end_date = END_DATE,
41+
interval = DATE_INTERVAL,
12242
)
123-
non_tag_df['type'] = version_etree.tag
124-
# Get all k,v pairs for this version
125-
tag_tuples = [
126-
(key, version_etree.get(key))
127-
for key in version_etree.attrib
128-
if key in tag_keys
129-
]
130-
for tag_item in version_etree.findall('.//tag'):
131-
tag_tuples.append((tag_item.get('k'), tag_item.get('v')))
132-
return non_tag_df, set(tag_tuples)
133-
134-
def compare_tags(v1: set[tuple[str, str]], v2: set[tuple[str, str]]) -> pd.DataFrame:
135-
"""Get all changes between two sets of key-value pairs."""
136-
new_tuples = list(v2 - v1)
137-
removed_tuples = list(v1 - v2)
138-
new_df = pd.DataFrame(new_tuples, columns = ['key', 'value'])
139-
new_df['change'] = 'Added'
140-
removed_df = pd.DataFrame(removed_tuples, columns = ['key', 'value'])
141-
removed_df['change'] = 'Deleted'
142-
# Check for changed keys
143-
new_df.loc[new_df['key'].isin(removed_df['key']), 'change'] = 'Changed'
144-
removed_df = removed_df.loc[~removed_df['key'].isin(new_df['key']), :]
145-
all_changes_df = pd.concat([new_df, removed_df])
146-
return all_changes_df
147-
148-
def process_element(element_etree: ET.ElementTree) -> tuple[pd.DataFrame, pd.DataFrame]:
149-
"""Process an element and return all changes over multiple versions."""
150-
previous_tags = set()
151-
versions_list = []
152-
changes_list = []
153-
for version_etree in element_etree:
154-
non_tag_df, current_tags = process_version(version_etree)
155-
versions_list.append(non_tag_df)
156-
changes_df = compare_tags(previous_tags, current_tags)
157-
changes_df['id'] = non_tag_df['id'].iloc[0]
158-
changes_df['version'] = non_tag_df['version'].iloc[0]
159-
changes_list.append(changes_df)
160-
previous_tags = current_tags
161-
return pd.concat(versions_list), pd.concat(changes_list)
162-
16343

164-
versions_list = []
165-
changes_list = []
166-
failed_rows = []
167-
168-
for idx, row in elements_table.iterrows():
169-
print(f" Row {idx}: type={row['type']}, id={row['id']}")
170-
history_url = f"https://api.openstreetmap.org/api/0.6/{row['type']}/{row['id']}/history"
171-
try:
172-
history_response = requests.get(history_url, timeout = TIMEOUT)
173-
except Exception as e:
174-
print(f"Failed to get history for row {idx}: {e}")
175-
failed_rows.append(row)
176-
time.sleep(1)
177-
continue
178-
history_etree = ET.fromstring(history_response.text)
179-
versions_df, changes_df = process_element(history_etree)
180-
versions_list.append(versions_df)
181-
changes_list.append(changes_df)
182-
if idx % 100 == 0:
183-
print(f"Processed {idx} rows")
44+
# Collect element IDs from Overpass
45+
elements_table, succeed_dates, failed_dates = collect_element_ids(
46+
date_range = date_range,
47+
bbox = BBOX,
48+
osm_keys = OSM_KEYS,
49+
timeout = TIMEOUT,
50+
)
18451

185-
versions_df = pd.concat(versions_list)
186-
changes_df = pd.concat(changes_list)
52+
# Save elements table
53+
elements_table.to_csv(
54+
os.path.join(SAVE_DIR, "osm_elements.csv"),
55+
index = False,
56+
)
57+
print(f"Saved {len(elements_table)} elements to osm_elements.csv")
58+
print(f"Succeeded on {len(succeed_dates)} dates, failed on {len(failed_dates)}")
59+
60+
# Download element histories
61+
versions_df, changes_df, failed_rows = download_element_histories(
62+
elements_table = elements_table,
63+
timeout = TIMEOUT,
64+
progress = True,
65+
)
18766

188-
versions_df.to_csv('osm_versions.csv', index = False)
189-
changes_df.to_csv('osm_changes.csv', index = False)
67+
# Save results
68+
versions_df.to_csv(
69+
os.path.join(SAVE_DIR, "osm_versions.csv"),
70+
index=False,
71+
)
72+
changes_df.to_csv(
73+
os.path.join(SAVE_DIR, "osm_changes.csv"),
74+
index = False,
75+
)
76+
print(f"Saved {len(versions_df)} versions and {len(changes_df)} changes")
77+
78+
print(f"Failed on {len(failed_rows)} elements")
79+
failed_elements = elements_table.iloc[failed_rows, :]
80+
failed_elements.to_csv(
81+
os.path.join(SAVE_DIR, "osm_failed_elements.csv"),
82+
index = False,
83+
)

0 commit comments

Comments
 (0)