-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_manual_data.py
More file actions
139 lines (123 loc) · 6.83 KB
/
merge_manual_data.py
File metadata and controls
139 lines (123 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import json
from thefuzz import process
import os
# Define file paths relative to the script location so the script works
# no matter what the current working directory is when it is invoked.
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(script_dir, 'data')
# Hand-entered corrections to be merged into the final dataset.
manual_data_path = os.path.join(data_dir, 'manually_collected_school_data.json')
# The authoritative merged dataset (read, updated in place, rewritten).
final_data_path = os.path.join(data_dir, 'school_data_final.json')
failed_geocoding_path = os.path.join(data_dir, 'failed_geocoding.json') # Path for failed geocoding data
# Load the manually collected data. This input is mandatory: nothing
# downstream can run without it, so abort with a non-zero exit status
# if it is missing or malformed.
try:
    with open(manual_data_path, 'r', encoding='utf-8') as f:
        manual_data = json.load(f)
    print(f"Successfully loaded {len(manual_data)} entries from {manual_data_path}")
except FileNotFoundError:
    print(f"Error: Manual data file not found at {manual_data_path}")
    # sys.exit(1): exit() is an interactive convenience from the site
    # module (unavailable under `python -S`), and the original exit()
    # reported success (status 0) on a fatal error.
    sys.exit(1)
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {manual_data_path}")
    sys.exit(1)
# Load the final school data and normalise it to a list of school dicts.
# The file may be either a bare JSON list, or a dict wrapping the list
# under a 'schools' key; schools_key records which form was found so the
# file can be written back later in the same shape.
try:
    with open(final_data_path, 'r', encoding='utf-8') as f:
        final_data_content = json.load(f)
    if isinstance(final_data_content, list):
        final_data = final_data_content
        schools_key = None  # Data is a list directly
    elif isinstance(final_data_content, dict) and 'schools' in final_data_content:
        final_data = final_data_content['schools']
        schools_key = 'schools'
    else:
        print(f"Error: Unexpected structure in {final_data_path}. Expected a list or a dict with a 'schools' key.")
        # Fatal: report failure with a non-zero status (exit() is an
        # interactive helper and returned status 0 here).
        sys.exit(1)
    print(f"Successfully loaded {len(final_data)} entries from {final_data_path}")
except FileNotFoundError:
    print(f"Error: Final data file not found at {final_data_path}")
    sys.exit(1)
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {final_data_path}")
    sys.exit(1)
# Load the failed geocoding data.
# Sentinel protocol for failed_schools, consumed by the final block:
#   list -> loaded successfully (possibly empty); safe to prune/rewrite
#   None -> load failed; the rewrite step is skipped entirely
failed_schools = []
try:
    with open(failed_geocoding_path, 'r', encoding='utf-8') as f:
        failed_schools = json.load(f)
    print(f"Successfully loaded {len(failed_schools)} entries from {failed_geocoding_path}")
except FileNotFoundError:
    # Missing file is expected on a first run — not an error.
    print(f"Warning: Failed geocoding file not found at {failed_geocoding_path}. Will create if needed.")
    failed_schools = []  # Start with empty list if file doesn't exist
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {failed_geocoding_path}. Skipping update for this file.")
    failed_schools = None  # Indicate error loading
except Exception as e:
    # Deliberately broad best-effort catch: any other failure (e.g.
    # permissions) disables the rewrite instead of crashing the merge.
    print(f"Error loading {failed_geocoding_path}: {e}. Skipping update for this file.")
    failed_schools = None
# Map each school name in final_data to its list index so a fuzzy-matched
# name can be resolved back to the record it came from. Unnamed entries
# are skipped; duplicate names keep the last index seen.
final_school_data_map = {school.get('name', ''): i for i, school in enumerate(final_data) if school.get('name')}
final_school_names = list(final_school_data_map.keys()) # Keep the list for extractOne choices
update_count = 0
match_threshold = 80 # Score threshold for fuzzy matching (0-100)
# Normalized (strip + lower) final_data names that actually received an
# update — used afterwards to prune the failed-geocoding list.
updated_final_names = set()
def _merge_value(entry, key, current):
    """Return entry[key] unless it is missing or JSON null, else current.

    The previous `entry.get(key, current)` pattern let an explicit null
    in the manual data clobber a value final_data already had.
    """
    value = entry.get(key)
    return current if value is None else value

# Iterate through manually collected data and update final data.
for manual_entry in manual_data:
    manual_name = manual_entry.get('school_name')
    if not manual_name:
        continue
    # Fuzzy-match against the known names. With a list of choices,
    # extractOne returns (choice, score), or None when nothing reaches
    # score_cutoff.
    best_match_result = process.extractOne(manual_name, final_school_names, score_cutoff=match_threshold)
    if best_match_result:
        matched_name, score = best_match_result
        if matched_name in final_school_data_map:
            index = final_school_data_map[matched_name]
            print(f"Matching '{manual_name}' with '{matched_name}' (Index: {index}, Score: {score})")
            target_school = final_data[index]
        else:
            # Defensive: should be unreachable, since the choices are
            # exactly the map's keys.
            print(f"Error: Matched name '{matched_name}' not found in map for '{manual_name}'. Skipping.")
            continue
        # Copy fields over; a missing or null manual value never
        # overwrites existing data.
        for field in ('suburb', 'postcode', 'latitude', 'longitude', 'address'):
            target_school[field] = _merge_value(manual_entry, field, target_school.get(field))
        update_count += 1
        # Record the normalized final_data name so the failed-geocoding
        # list can be pruned later.
        updated_final_names.add(matched_name.strip().lower())
    else:
        print(f"No suitable match found for '{manual_name}' in {final_data_path} (Threshold: {match_threshold})")
# Persist the merged result, preserving the top-level shape the file had
# when it was read: re-wrap the list under its key, or write the bare list.
if schools_key:
    output_data = final_data_content
    output_data[schools_key] = final_data
else:
    output_data = final_data
try:
    with open(final_data_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\nSuccessfully updated {update_count} entries.")
    print(f"Updated school data saved to {final_data_path}")
except IOError as e:
    # Report and continue — the failed-geocoding rewrite below may still succeed.
    print(f"Error saving updated data to {final_data_path}: {e}")
# Rewrite the failed-geocoding file, dropping every school whose
# normalized name was filled in during the merge above. Skipped entirely
# when the earlier load of that file failed (failed_schools is None).
if failed_schools is None:
    print(f"Skipped updating {failed_geocoding_path} due to loading errors.")
else:
    remaining_failed_schools = [
        entry
        for entry in failed_schools
        if entry.get('name', '').strip().lower() not in updated_final_names
    ]
    try:
        with open(failed_geocoding_path, 'w', encoding='utf-8') as f:
            json.dump(remaining_failed_schools, f, indent=2, ensure_ascii=False)
        print(f"\nSuccessfully updated {failed_geocoding_path}.")
        print(f"{len(remaining_failed_schools)} schools remain in the failed geocoding list.")
    except IOError as e:
        print(f"Error saving updated failed geocoding data to {failed_geocoding_path}: {e}")