"""add_geocodes.py

Add latitude/longitude to school records using a postcode CSV lookup,
falling back to the Nominatim geocoder for schools the lookup cannot resolve.
"""
import json
import pandas as pd
import requests
import io
import os
import time
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
# Configuration
# Input: JSON file holding the school records to geocode.
SCHOOL_DATA_FILE = os.path.join('data', 'school_data_comprehensive.json')
# Output: geocoded copy of the data. This is a different path from the input,
# so the input file itself is left untouched.
OUTPUT_DATA_FILE = os.path.join('data', 'school_data_final.json')
# Source CSV of Australian postcodes with place names and coordinates.
POSTCODE_CSV_URL = 'https://raw.githubusercontent.com/Elkfox/Australian-Postcode-Data/master/au_postcodes.csv'
# Value matched against the CSV's state_code column ('VIC' = Victoria).
STATE_FILTER = 'VIC'
def fetch_postcode_data(url):
    """Download the postcode CSV located at *url*.

    Returns the response body as text, or None when the request fails
    (connection problem, timeout, or a non-success HTTP status).
    """
    print(f"Fetching postcode data from {url}...")
    try:
        # 30 s timeout so a stalled server cannot hang the script forever.
        resp = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions handled below.
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching postcode data: {err}")
        return None

    print("Successfully fetched postcode data.")
    return resp.text
def create_geocode_lookup(csv_content, state_filter):
    """Build a (suburb, postcode) -> (latitude, longitude) lookup from CSV text.

    Args:
        csv_content: Text of a postcode CSV containing the columns
            ``state_code``, ``place_name``, ``postcode``, ``latitude`` and
            ``longitude``. Falsy content yields an empty lookup.
        state_filter: Only rows whose ``state_code`` equals this value are used.

    Returns:
        A dict keyed by ``(lowercased, stripped place name, postcode string)``
        with ``(latitude, longitude)`` float tuples as values. Rows that share
        a key are averaged. An empty dict is returned when the content is
        empty, a required column is missing, or the CSV cannot be processed.
    """
    print(f"Processing postcode data for state: {state_filter}...")
    if not csv_content:
        return {}

    # 'state_code' is required too: it is used for filtering below.
    required_cols = ['state_code', 'place_name', 'postcode', 'latitude', 'longitude']
    try:
        # Read postcodes as text so leading zeros (e.g. '0800') are preserved
        # instead of being parsed as integers.
        df = pd.read_csv(io.StringIO(csv_content), dtype={'postcode': str})

        # Validate the schema before touching any column.
        if not all(col in df.columns for col in required_cols):
            print(f"Error: CSV missing one or more required columns: {required_cols}")
            return {}

        df_state = df[df['state_code'] == state_filter].copy()
        print(f"Found {len(df_state)} entries for {state_filter}.")

        # Rows without a postcode or coordinates cannot contribute a usable
        # entry; keeping them would produce NaN coordinates in the lookup.
        df_state = df_state.dropna(subset=['postcode', 'latitude', 'longitude'])

        # Normalise the key parts: postcode as stripped text, place name as
        # stripped lowercase text (missing names become '').
        df_state['postcode'] = df_state['postcode'].astype(str).str.strip()
        df_state['place_name'] = (
            df_state['place_name'].fillna('').astype(str).str.strip().str.lower()
        )

        # Duplicate (suburb, postcode) pairs are collapsed to their mean
        # position. This can be slightly off when distinct places share a
        # key, but it is a reasonable approximation.
        averaged = (
            df_state.groupby(['place_name', 'postcode'])[['latitude', 'longitude']]
            .mean()
        )

        keys = averaged.index.tolist()  # list of (place_name, postcode) tuples
        latitudes = averaged['latitude'].tolist()
        longitudes = averaged['longitude'].tolist()
        geocode_lookup = dict(zip(keys, zip(latitudes, longitudes)))

        print(f"Created geocode lookup with {len(geocode_lookup)} unique suburb/postcode entries.")
        return geocode_lookup
    except Exception as e:
        # Best-effort: any parsing/processing failure results in an empty lookup.
        print(f"Error processing postcode CSV data: {e}")
        return {}
# Initialize Nominatim geocoder with rate limiting.
# The user_agent string identifies this application to the Nominatim service.
geolocator = Nominatim(user_agent="school_geocoder_app_v1") # Replace with your app name
# Rate limiter to respect Nominatim's usage policy (at most 1 request per second).
# Calls are spaced at least 2 seconds apart, which stays within that limit.
geocode_nominatim = RateLimiter(geolocator.geocode, min_delay_seconds=2)
def _nominatim_lookup(address_query, school_name):
    """Geocode *address_query* with Nominatim; return (lat, lon) or None.

    Every failure (no result, timeout, service error, anything unexpected) is
    logged and reported as None so the caller can carry on with the next school.
    """
    try:
        location = geocode_nominatim(address_query, addressdetails=True, timeout=10)
        if location:
            coords = (location.latitude, location.longitude)
            print(f" Nominatim SUCCESS: Found coords ({coords[0]}, {coords[1]}) for '{school_name}'")
            return coords
        # Explicitly log when Nominatim returns None
        print(f" Nominatim INFO: No location found for query: '{address_query}' (Result was None)")
    except GeocoderTimedOut:
        print(f" Nominatim ERROR (Timeout): Geocoder timed out for query: '{address_query}'")
        time.sleep(2)  # Brief pause before the next request
    except GeocoderServiceError as e:
        print(f" Nominatim ERROR (Service): Geocoder service error for query: '{address_query}': {e}")
        time.sleep(5)  # Back off longer when the service itself is failing
    except Exception as e:
        # Best-effort: one bad lookup must not abort the whole run.
        print(f" Nominatim ERROR (Unexpected: {type(e).__name__}): An unexpected error occurred during Nominatim lookup for '{address_query}': {e}")
    return None


def add_geocodes_to_schools(school_data_file, output_file, geocode_lookup):
    """Add ``latitude``/``longitude`` to every school and save the result.

    Each school is first resolved through *geocode_lookup*; schools it cannot
    resolve are sent to Nominatim. Schools that neither method resolves get
    ``None`` coordinates and are additionally written to
    ``failed_geocoding.json`` next to *output_file*.

    Args:
        school_data_file: Path to a JSON file whose top-level object has a
            ``'schools'`` list of dicts. Each school is read for
            ``'suburb_name'`` (suburb), ``'suburb'`` (used as the postcode)
            and ``'name'``.
        output_file: Path the updated JSON document is written to; its
            directory is created when missing.
        geocode_lookup: Dict mapping ``(lowercase suburb, postcode string)``
            to ``(latitude, longitude)``.

    Returns:
        None. Load/save problems are printed rather than raised, and nothing
        is written when the input cannot be loaded or contains no schools.
    """
    print(f"Loading school data from {school_data_file}...")
    try:
        with open(school_data_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: School data file not found at {school_data_file}")
        return
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {school_data_file}: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading {school_data_file}: {e}")
        return

    # A top-level value that is not an object has no 'schools' entry; treat it
    # as "no schools" instead of failing on the .get() call.
    schools = data.get('schools', []) if isinstance(data, dict) else []
    if not schools:
        print("Warning: No schools found in the data file.")
        return

    print(f"Adding geocodes to {len(schools)} schools...")
    schools_updated_count = 0
    schools_missing_geo_count = 0
    failed_schools = []  # Schools that neither the lookup nor Nominatim resolved

    for school in schools:
        suburb_name = str(school.get('suburb_name', '')).strip().lower()
        # The postcode is stored under the 'suburb' key in this data set.
        postcode = str(school.get('suburb', '')).strip()

        coords = geocode_lookup.get((suburb_name, postcode))
        if not coords:
            # --- Fallback to Nominatim ---
            # NOTE: the state is hard-coded to VIC in the query string.
            school_name = school.get('name', '')
            address_query = f"{school_name}, {suburb_name}, {postcode}, VIC, Australia"
            print(f" Postcode lookup failed for ('{suburb_name}', '{postcode}'). Trying Nominatim with: '{address_query}'")
            coords = _nominatim_lookup(address_query, school_name)

        if coords:
            school['latitude'] = coords[0]
            school['longitude'] = coords[1]
            schools_updated_count += 1
        else:
            school['latitude'] = None
            school['longitude'] = None
            schools_missing_geo_count += 1
            failed_schools.append(school)

    print(f"Finished processing schools. Added coordinates for {schools_updated_count} schools.")
    if schools_missing_geo_count > 0:
        print(f"Warning: Could not find coordinates for {schools_missing_geo_count} schools.")

    # Ensure the output directory exists
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        print(f"Creating output directory: {output_dir}")
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(output_dir, exist_ok=True)

    print(f"Saving updated school data to {output_file}...")
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
    except IOError as e:
        print(f"Error saving updated data to {output_file}: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred saving {output_file}: {e}")
        return
    print("Successfully saved updated data.")

    # Save schools that failed geocoding (only after the main file was written)
    if failed_schools:
        failed_file_path = os.path.join(output_dir, 'failed_geocoding.json')
        print(f"\nSaving {len(failed_schools)} schools that failed geocoding to {failed_file_path}...")
        try:
            with open(failed_file_path, 'w', encoding='utf-8') as f:
                json.dump(failed_schools, f, indent=4)
            print(f"Successfully saved failed geocoding schools to {failed_file_path}")
        except IOError as e:
            print(f"Error saving failed geocoding schools to {failed_file_path}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred saving {failed_file_path}: {e}")
if __name__ == "__main__":
    # Script entry point: fetch the postcode CSV, build the lookup, then
    # geocode the schools. Each stage runs only if the previous one succeeded.
    print("--- Starting Geocoding Script ---")
    csv_text = fetch_postcode_data(POSTCODE_CSV_URL)
    if not csv_text:
        print("Failed to fetch postcode data. Cannot proceed.")
    else:
        geo_lookup = create_geocode_lookup(csv_text, STATE_FILTER)
        if not geo_lookup:
            print("Failed to create geocode lookup. Cannot proceed.")
        else:
            add_geocodes_to_schools(SCHOOL_DATA_FILE, OUTPUT_DATA_FILE, geo_lookup)
    print("--- Geocoding Script Finished ---")