-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_manual_data.py
More file actions
139 lines (123 loc) · 6.83 KB
/
merge_manual_data.py
File metadata and controls
139 lines (123 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import json
from thefuzz import process
import os
# Define file paths relative to the script location so the script works
# no matter what the current working directory is when it is invoked.
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(script_dir, 'data')
# Hand-entered corrections to be merged into the final dataset.
manual_data_path = os.path.join(data_dir, 'manually_collected_school_data.json')
# The authoritative merged dataset (read, updated in place, rewritten).
final_data_path = os.path.join(data_dir, 'school_data_final.json')
failed_geocoding_path = os.path.join(data_dir, 'failed_geocoding.json') # Path for failed geocoding data
# Load the manually collected data. This input is mandatory: nothing
# downstream can run without it, so abort with a non-zero exit status
# if it is missing or malformed.
try:
    with open(manual_data_path, 'r', encoding='utf-8') as f:
        manual_data = json.load(f)
    print(f"Successfully loaded {len(manual_data)} entries from {manual_data_path}")
except FileNotFoundError:
    print(f"Error: Manual data file not found at {manual_data_path}")
    # sys.exit(1): exit() is an interactive convenience from the site
    # module (unavailable under `python -S`), and the original exit()
    # reported success (status 0) on a fatal error.
    sys.exit(1)
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {manual_data_path}")
    sys.exit(1)
# Load the final school data and normalise it to a list of school dicts.
# The file may be either a bare JSON list, or a dict wrapping the list
# under a 'schools' key; schools_key records which form was found so the
# file can be written back later in the same shape.
try:
    with open(final_data_path, 'r', encoding='utf-8') as f:
        final_data_content = json.load(f)
    if isinstance(final_data_content, list):
        final_data = final_data_content
        schools_key = None  # Data is a list directly
    elif isinstance(final_data_content, dict) and 'schools' in final_data_content:
        final_data = final_data_content['schools']
        schools_key = 'schools'
    else:
        print(f"Error: Unexpected structure in {final_data_path}. Expected a list or a dict with a 'schools' key.")
        # Fatal: report failure with a non-zero status (exit() is an
        # interactive helper and returned status 0 here).
        sys.exit(1)
    print(f"Successfully loaded {len(final_data)} entries from {final_data_path}")
except FileNotFoundError:
    print(f"Error: Final data file not found at {final_data_path}")
    sys.exit(1)
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {final_data_path}")
    sys.exit(1)
# Load the failed geocoding data.
# Sentinel protocol for failed_schools, consumed by the final block:
#   list -> loaded successfully (possibly empty); safe to prune/rewrite
#   None -> load failed; the rewrite step is skipped entirely
failed_schools = []
try:
    with open(failed_geocoding_path, 'r', encoding='utf-8') as f:
        failed_schools = json.load(f)
    print(f"Successfully loaded {len(failed_schools)} entries from {failed_geocoding_path}")
except FileNotFoundError:
    # Missing file is expected on a first run — not an error.
    print(f"Warning: Failed geocoding file not found at {failed_geocoding_path}. Will create if needed.")
    failed_schools = []  # Start with empty list if file doesn't exist
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {failed_geocoding_path}. Skipping update for this file.")
    failed_schools = None  # Indicate error loading
except Exception as e:
    # Deliberately broad best-effort catch: any other failure (e.g.
    # permissions) disables the rewrite instead of crashing the merge.
    print(f"Error loading {failed_geocoding_path}: {e}. Skipping update for this file.")
    failed_schools = None
# Map each school name in final_data to its list index so a fuzzy-matched
# name can be resolved back to the record it came from. Unnamed entries
# are skipped; duplicate names keep the last index seen.
final_school_data_map = {school.get('name', ''): i for i, school in enumerate(final_data) if school.get('name')}
final_school_names = list(final_school_data_map.keys()) # Keep the list for extractOne choices
update_count = 0
match_threshold = 80 # Score threshold for fuzzy matching (0-100)
# Normalized (strip + lower) final_data names that actually received an
# update — used afterwards to prune the failed-geocoding list.
updated_final_names = set()
def _merge_value(entry, key, current):
    """Return entry[key] unless it is missing or JSON null, else current.

    The previous `entry.get(key, current)` pattern let an explicit null
    in the manual data clobber a value final_data already had.
    """
    value = entry.get(key)
    return current if value is None else value

# Iterate through manually collected data and update final data.
for manual_entry in manual_data:
    manual_name = manual_entry.get('school_name')
    if not manual_name:
        continue
    # Fuzzy-match against the known names. With a list of choices,
    # extractOne returns (choice, score), or None when nothing reaches
    # score_cutoff.
    best_match_result = process.extractOne(manual_name, final_school_names, score_cutoff=match_threshold)
    if best_match_result:
        matched_name, score = best_match_result
        if matched_name in final_school_data_map:
            index = final_school_data_map[matched_name]
            print(f"Matching '{manual_name}' with '{matched_name}' (Index: {index}, Score: {score})")
            target_school = final_data[index]
        else:
            # Defensive: should be unreachable, since the choices are
            # exactly the map's keys.
            print(f"Error: Matched name '{matched_name}' not found in map for '{manual_name}'. Skipping.")
            continue
        # Copy fields over; a missing or null manual value never
        # overwrites existing data.
        for field in ('suburb', 'postcode', 'latitude', 'longitude', 'address'):
            target_school[field] = _merge_value(manual_entry, field, target_school.get(field))
        update_count += 1
        # Record the normalized final_data name so the failed-geocoding
        # list can be pruned later.
        updated_final_names.add(matched_name.strip().lower())
    else:
        print(f"No suitable match found for '{manual_name}' in {final_data_path} (Threshold: {match_threshold})")
# Persist the merged result, preserving the top-level shape the file had
# when it was read: re-wrap the list under its key, or write the bare list.
if schools_key:
    output_data = final_data_content
    output_data[schools_key] = final_data
else:
    output_data = final_data
try:
    with open(final_data_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\nSuccessfully updated {update_count} entries.")
    print(f"Updated school data saved to {final_data_path}")
except IOError as e:
    # Report and continue — the failed-geocoding rewrite below may still succeed.
    print(f"Error saving updated data to {final_data_path}: {e}")
# Rewrite the failed-geocoding file, dropping every school whose
# normalized name was filled in during the merge above. Skipped entirely
# when the earlier load of that file failed (failed_schools is None).
if failed_schools is None:
    print(f"Skipped updating {failed_geocoding_path} due to loading errors.")
else:
    remaining_failed_schools = [
        entry
        for entry in failed_schools
        if entry.get('name', '').strip().lower() not in updated_final_names
    ]
    try:
        with open(failed_geocoding_path, 'w', encoding='utf-8') as f:
            json.dump(remaining_failed_schools, f, indent=2, ensure_ascii=False)
        print(f"\nSuccessfully updated {failed_geocoding_path}.")
        print(f"{len(remaining_failed_schools)} schools remain in the failed geocoding list.")
    except IOError as e:
        print(f"Error saving updated failed geocoding data to {failed_geocoding_path}: {e}")