-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data.py
More file actions
134 lines (119 loc) · 6.2 KB
/
clean_data.py
File metadata and controls
134 lines (119 loc) · 6.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import json
import os
import re
from io import StringIO

import requests
# Define paths relative to the script location so the script works from any CWD
script_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = os.path.join(script_dir, 'data', 'school_data_typed.json') # Use typed data as input
output_file_path = os.path.join(script_dir, 'data', 'school_data_final.json') # Final output file
error_log_path = os.path.join(script_dir, 'data', 'cleaning_errors.log')  # Written only when errors occur
def clean_school_name(name):
    """Strip trailing state/postcode noise from a school name.

    Removes patterns like ", VIC 3000" (optional comma, whitespace, the
    literal "VIC", whitespace, four digits) while preserving campus
    information embedded in the name.

    Args:
        name: The raw school name. Non-string values pass through unchanged.

    Returns:
        The cleaned, whitespace-trimmed name, or the original value when it
        is not a string.
    """
    if isinstance(name, str):
        # Only remove state/postcode patterns, preserve campus info
        name = re.sub(r',?\s+VIC\s+\d{4}', '', name)
        return name.strip()
    return name  # Return original if not a string
def determine_school_type(name):
    """Heuristically classify a school from keywords in its name.

    Returns one of "Catholic", "Independent/Private", "Public", or
    "Unknown". Checks run in that priority order, so a name matching both
    Catholic and Independent keywords is classified Catholic.
    """
    lowered = name.lower()
    # NOTE(review): the bare substring 'st ' also matches names like
    # "West Park" — confirm whether a word-boundary check is wanted.
    catholic_markers = (
        'catholic', 'st ', 'our lady', 'sacred heart', 'marist', 'xav',
    )
    # Broad bucket for Independent/Private schools; needs refinement.
    independent_markers = (
        'grammar', 'college', 'girls', 'boys', 'christian', 'lutheran',
        'islamic', 'jewish', 'presbyterian', 'anglican',
    )
    public_markers = ('high school', 'secondary college', 'p-12', 'p-9')

    if any(marker in lowered for marker in catholic_markers):
        return "Catholic"
    if any(marker in lowered for marker in independent_markers):
        return "Independent/Private"
    if any(marker in lowered for marker in public_markers):
        return "Public"
    # No keyword matched.
    return "Unknown"
# Community-maintained CSV of Australian postcode/suburb/state records.
POSTCODE_CSV_URL = 'https://raw.githubusercontent.com/Elkfox/Australian-Postcode-Data/master/au_postcodes.csv'
def get_postcode_suburb_map():
    """Fetch the national postcode CSV and build a postcode -> suburb map.

    Only rows with state_code 'VIC' are kept, and only the first suburb seen
    for each postcode, so the result is a simple one-to-one lookup.

    Returns:
        dict: Mapping of postcode string to suburb name. Empty on download
        or parse failure — errors are printed, not raised.
    """
    postcode_map = {}
    print(f"Fetching postcode data from: {POSTCODE_CSV_URL}")
    try:
        # Timeout prevents the script from hanging forever on a stalled
        # connection — requests.get has no default timeout.
        response = requests.get(POSTCODE_CSV_URL, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        csv_data = StringIO(response.text)
        reader = csv.DictReader(csv_data)
        for row in reader:
            postcode = row.get('postcode')
            suburb = row.get('place_name')
            state = row.get('state_code')
            # Only add VIC postcodes for now, and only if not already present
            # (to get one suburb per postcode for simplicity)
            if postcode and suburb and state == 'VIC' and postcode not in postcode_map:
                postcode_map[postcode] = suburb
        print(f"Successfully created postcode map with {len(postcode_map)} VIC entries.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching postcode data: {e}")
    except Exception as e:
        print(f"Error processing postcode CSV: {e}")
    return postcode_map
def main():
    """Clean school data end to end.

    Reads school_data_typed.json, tidies school names, infers a school type
    from name keywords, maps the postcode held in each record's 'suburb'
    field to a suburb name, and writes school_data_final.json. Errors are
    collected and written to cleaning_errors.log rather than raised, so a
    failed run still leaves diagnostics behind.
    """
    postcode_suburb_map = get_postcode_suburb_map()
    errors = []
    try:
        # Ensure the output directory exists
        output_dir = os.path.dirname(output_file_path)
        if not os.path.exists(output_dir):
            # exist_ok guards the race between the exists() check above and
            # the actual creation (another process may create it in between).
            os.makedirs(output_dir, exist_ok=True)
            print(f"Created directory: {output_dir}")
        # Read the input JSON file
        print(f"Reading data from: {input_file_path}")
        with open(input_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print("Processing school data...")
        cleaned_schools = []
        if 'schools' in data and isinstance(data['schools'], list):
            for i, school in enumerate(data['schools']):
                original_name = school.get('name')
                cleaned_name = clean_school_name(original_name)
                if cleaned_name != original_name:
                    print(f" Cleaned name {i+1}: '{original_name}' -> '{cleaned_name}'")
                school['name'] = cleaned_name
                # Determine and update school type. Guard against a missing
                # or non-string name, which would otherwise crash
                # determine_school_type's .lower() call and abort the run.
                if isinstance(cleaned_name, str):
                    school_type = determine_school_type(cleaned_name)
                else:
                    school_type = "Unknown"
                school['type'] = school_type
                # Add suburb name from map
                postcode = school.get('suburb')  # Assumes 'suburb' field holds the postcode
                suburb_name = postcode_suburb_map.get(postcode, 'Not Found')
                school['suburb_name'] = suburb_name  # Add new field for actual name
                if school_type != "Unknown":
                    print(f" Identified type for '{cleaned_name}': {school_type}")
                if suburb_name != 'Not Found':
                    print(f" Mapped postcode {postcode} to suburb: {suburb_name}")
                else:
                    print(f" Could not map postcode: {postcode}")
                cleaned_schools.append(school)
            data['schools'] = cleaned_schools
        else:
            errors.append("Error: 'schools' key not found or is not a list in the JSON data.")
            print("Error: 'schools' key not found or is not a list.")
        # Write the cleaned data to the output JSON file
        print(f"Writing cleaned data to: {output_file_path}")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print("Data cleaning complete.")
    except FileNotFoundError:
        errors.append(f"Error: Input file not found at {input_file_path}")
        print(f"Error: Input file not found at {input_file_path}")
    except json.JSONDecodeError:
        errors.append(f"Error: Could not decode JSON from {input_file_path}")
        print(f"Error: Could not decode JSON from {input_file_path}")
    except Exception as e:
        errors.append(f"An unexpected error occurred: {e}")
        print(f"An unexpected error occurred: {e}")
    # Log errors if any
    if errors:
        print(f"Errors occurred during processing. See {error_log_path} for details.")
        try:
            with open(error_log_path, 'w', encoding='utf-8') as log_f:
                for error in errors:
                    log_f.write(error + '\n')
        except Exception as log_e:
            print(f"Additionally, failed to write error log: {log_e}")


if __name__ == "__main__":
    main()