-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupdate_database.py
More file actions
274 lines (232 loc) · 11.5 KB
/
update_database.py
File metadata and controls
274 lines (232 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import os
import json
import shutil
import csv
import re
from datetime import datetime
# File paths
DATA_DIR = "data"
JS_DIR = "js"
COMPREHENSIVE_SCHOOL_DATA = os.path.join(DATA_DIR, "school_data_comprehensive.json")
EXISTING_SCHOOL_DATA = os.path.join(DATA_DIR, "school_data_final.json")
VIC_SUBURBS_DATA = os.path.join(DATA_DIR, "victoria_suburbs.json")
# Source for the geocode database, assumed to be generated by another script (e.g., comprehensive_scraper.py)
SOURCE_GEOCODE_FROM_DATA_DIR = os.path.join(DATA_DIR, "generated_geocode_for_frontend.js")
# Target geocode database file used by the frontend
TARGET_GEOCODE_IN_JS_DIR = os.path.join(JS_DIR, "geocode-db.js")
# Backup directory
BACKUP_DIR = os.path.join(DATA_DIR, "backups")
os.makedirs(BACKUP_DIR, exist_ok=True)
def backup_existing_files():
"""Create backups of existing data files before updating them."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
files_to_backup = [
EXISTING_SCHOOL_DATA,
TARGET_GEOCODE_IN_JS_DIR # Backup the target file in the js directory
]
for file_path in files_to_backup:
if os.path.exists(file_path):
filename = os.path.basename(file_path)
backup_path = os.path.join(BACKUP_DIR, f"{filename}.{timestamp}.bak")
try:
shutil.copy2(file_path, backup_path)
print(f"Created backup: {backup_path}")
except Exception as e:
print(f"Error creating backup of {file_path}: {e}")
def normalize_school_name(name):
"""Normalize school name for matching."""
if not name:
return ''
# Convert to lowercase
normalized = name.lower()
# Remove common suffixes
normalized = re.sub(r'\s+campus$', '', normalized)
# Replace variations in campus indicators
normalized = normalized.replace(' - ', ' ')
# Remove extra spaces
normalized = re.sub(r'\s+', ' ', normalized).strip()
return normalized
def merge_rankings_data():
"""Merge rankings data from CSV into the main JSON database."""
rankings_file = os.path.join(DATA_DIR, "processed_rankings.csv")
if not os.path.exists(rankings_file):
print(f"Rankings file not found at {rankings_file}")
return False
if not os.path.exists(EXISTING_SCHOOL_DATA):
print(f"Existing school data not found at {EXISTING_SCHOOL_DATA}")
return False
print(f"Loading rankings from {rankings_file}...")
# Load rankings data
rankings_data = {}
try:
with open(rankings_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
# Only process if we have a valid rank and it's the latest year (2024)
# The CSV contains multiple years, we want to update the main record with the latest
if row.get('Year') == '2024':
name = row.get('School')
if name:
normalized_name = normalize_school_name(name)
rankings_data[normalized_name] = row
except Exception as e:
print(f"Error reading rankings CSV: {e}")
return False
print(f"Loaded rankings for {len(rankings_data)} schools (Year 2024).")
# Load existing school data
try:
with open(EXISTING_SCHOOL_DATA, 'r', encoding='utf-8') as f:
data = json.load(f)
schools = data.get('schools', [])
except Exception as e:
print(f"Error loading existing school data: {e}")
return False
updated_count = 0
for school in schools:
school_name = school.get('name')
if not school_name:
continue
normalized_name = normalize_school_name(school_name)
if normalized_name in rankings_data:
ranking_info = rankings_data[normalized_name]
# Update fields
school['Median_VCE_Score'] = float(ranking_info['Median_VCE_Score']) if ranking_info.get('Median_VCE_Score') else None
school['Pct_Scores_40_Plus'] = float(ranking_info['Pct_Scores_40_Plus']) if ranking_info.get('Pct_Scores_40_Plus') else None
school['Completion_Rate'] = float(ranking_info['Completion_Rate']) if ranking_info.get('Completion_Rate') else None
school['Tertiary_Application_Rate'] = float(ranking_info['Tertiary_Application_Rate']) if ranking_info.get('Tertiary_Application_Rate') else None
school['rank'] = int(float(ranking_info['Rank'])) if ranking_info.get('Rank') else None
school['score_2023'] = float(ranking_info['Weighted_Score']) if ranking_info.get('Weighted_Score') else None # Mapping Weighted_Score to score_2023 for now as generic score
updated_count += 1
# Save updated data
try:
with open(EXISTING_SCHOOL_DATA, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=4, ensure_ascii=False)
print(f"Successfully merged rankings into school database.")
print(f" - {updated_count} schools updated with ranking data.")
return True
except Exception as e:
print(f"Error saving updated school data: {e}")
return False
def update_school_database():
"""Update the existing school database with comprehensive data."""
if not os.path.exists(COMPREHENSIVE_SCHOOL_DATA):
print(f"Comprehensive school data not found at {COMPREHENSIVE_SCHOOL_DATA}")
print("Please run comprehensive_scraper.py first.")
return False
# Load comprehensive school data
try:
with open(COMPREHENSIVE_SCHOOL_DATA, 'r', encoding='utf-8') as f:
comprehensive_data = json.load(f)
comprehensive_schools = comprehensive_data.get('schools', [])
except Exception as e:
print(f"Error loading comprehensive school data: {e}")
return False
# Load existing school data if available
existing_schools = []
existing_school_ids = set()
if os.path.exists(EXISTING_SCHOOL_DATA):
try:
with open(EXISTING_SCHOOL_DATA, 'r', encoding='utf-8') as f:
existing_data = json.load(f)
existing_schools = existing_data.get('schools', [])
# Convert existing schools list to a dictionary keyed by ID for efficient lookup
existing_schools_dict = {school.get('id'): school for school in existing_schools if school.get('id')}
except Exception as e:
print(f"Error loading existing school data: {e}")
existing_schools_dict = {}
# Merge school data, updating existing records and adding new ones
merged_schools_dict = existing_schools_dict.copy()
new_schools_added = 0
schools_updated = 0
for school in comprehensive_schools:
school_id = school.get('id')
if not school_id:
continue # Skip schools without an ID
if school_id in merged_schools_dict:
# Update existing school record
existing_school = merged_schools_dict[school_id]
updated = False
# Prioritize non-Unknown type, especially from VIC Government source
current_type = existing_school.get('type', 'Unknown')
new_type = school.get('type', 'Unknown')
if (current_type == 'Unknown' and new_type != 'Unknown') or \
('VIC Government' in school.get('data_source', '') and new_type != 'Unknown'):
existing_school['type'] = new_type
updated = True
# Update rank if missing in existing or if new source is Better Education
current_rank = existing_school.get('rank')
new_rank = school.get('rank')
if new_rank is not None and (current_rank is None or 'Better Education' in school.get('data_source', '')):
# Only update rank if new rank is not None
# Prioritize Better Education rank if available, otherwise take any rank if current is None
if current_rank is None or 'Better Education' in school.get('data_source', ''):
existing_school['rank'] = new_rank
# Also update score if rank is updated from BE
if 'Better Education' in school.get('data_source', ''):
existing_school['score_2023'] = school.get('score_2023')
updated = True
# Update other potentially missing fields like address, suburb_name, postcode
for key in ['address', 'suburb_name', 'suburb']:
if not existing_school.get(key) and school.get(key):
existing_school[key] = school.get(key)
updated = True
if updated:
schools_updated += 1
else:
# Add new school record
merged_schools_dict[school_id] = school
new_schools_added += 1
# Convert merged dictionary back to a list
merged_schools_list = list(merged_schools_dict.values())
# Save merged data
merged_data = {"schools": merged_schools_list}
try:
with open(EXISTING_SCHOOL_DATA, 'w', encoding='utf-8') as f:
json.dump(merged_data, f, indent=4, ensure_ascii=False)
print(f"Successfully updated school database.")
print(f" - {new_schools_added} new schools added.")
print(f" - {schools_updated} existing schools updated.")
print(f"Total schools in database: {len(merged_schools_list)}")
return True
except Exception as e:
print(f"Error saving merged school data: {e}")
return False
def update_geocode_database():
"""Update the geocode database by copying a generated file from the data directory to the js directory."""
if not os.path.exists(SOURCE_GEOCODE_FROM_DATA_DIR):
print(f"Source geocode database not found at {SOURCE_GEOCODE_FROM_DATA_DIR}")
print("Please ensure it's generated (e.g., by comprehensive_scraper.py or similar) and named correctly before running this update script.")
return False
# Check if source and destination are the same file
if os.path.abspath(SOURCE_GEOCODE_FROM_DATA_DIR) == os.path.abspath(TARGET_GEOCODE_IN_JS_DIR):
print(f"Skipping geocode update: Source and destination are the same file ({SOURCE_GEOCODE_FROM_DATA_DIR}).")
return True
try:
shutil.copy2(SOURCE_GEOCODE_FROM_DATA_DIR, TARGET_GEOCODE_IN_JS_DIR)
print(f"Successfully updated geocode database by copying '{SOURCE_GEOCODE_FROM_DATA_DIR}' to '{TARGET_GEOCODE_IN_JS_DIR}'")
return True
except shutil.SameFileError:
print(f"Error updating geocode database: Source '{SOURCE_GEOCODE_FROM_DATA_DIR}' and destination '{TARGET_GEOCODE_IN_JS_DIR}' are the same file. This indicates a configuration issue.")
return False
except Exception as e:
print(f"Error updating geocode database: {e}")
return False
def main():
print("=== Updating Schoolify Database ===")
# Create backups of existing files
print("\nCreating backups of existing data files...")
backup_existing_files()
# Update school database
print("\nUpdating school database...")
update_school_database()
# Merge rankings data
print("\nMerging rankings data...")
merge_rankings_data()
# Update geocode database
print("\nUpdating geocode database...")
update_geocode_database()
print("\n=== Database Update Complete ===")
print("\nTo view updated database statistics, run:")
print("python count_database_entries.py")
if __name__ == "__main__":
main()