-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data.py
More file actions
134 lines (119 loc) · 6.2 KB
/
clean_data.py
File metadata and controls
134 lines (119 loc) · 6.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import json
import os
import re
from io import StringIO

import requests
# Define paths relative to the script location so the script works from any CWD
script_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = os.path.join(script_dir, 'data', 'school_data_typed.json') # Use typed data as input
output_file_path = os.path.join(script_dir, 'data', 'school_data_final.json') # Final output file
error_log_path = os.path.join(script_dir, 'data', 'cleaning_errors.log')  # Written only when errors occur
def clean_school_name(name):
    """Strip trailing state/postcode noise from a school name.

    Removes patterns like ", VIC 3000" (optional comma, whitespace, the
    literal "VIC", whitespace, four digits) while preserving campus
    information embedded in the name.

    Args:
        name: The raw school name. Non-string values pass through unchanged.

    Returns:
        The cleaned, whitespace-trimmed name, or the original value when it
        is not a string.
    """
    if isinstance(name, str):
        # Only remove state/postcode patterns, preserve campus info
        name = re.sub(r',?\s+VIC\s+\d{4}', '', name)
        return name.strip()
    return name  # Return original if not a string
def determine_school_type(name):
    """Heuristically classify a school from keywords in its name.

    Returns one of "Catholic", "Independent/Private", "Public", or
    "Unknown". Checks run in that priority order, so a name matching both
    Catholic and Independent keywords is classified Catholic.
    """
    lowered = name.lower()
    # NOTE(review): the bare substring 'st ' also matches names like
    # "West Park" — confirm whether a word-boundary check is wanted.
    catholic_markers = (
        'catholic', 'st ', 'our lady', 'sacred heart', 'marist', 'xav',
    )
    # Broad bucket for Independent/Private schools; needs refinement.
    independent_markers = (
        'grammar', 'college', 'girls', 'boys', 'christian', 'lutheran',
        'islamic', 'jewish', 'presbyterian', 'anglican',
    )
    public_markers = ('high school', 'secondary college', 'p-12', 'p-9')

    if any(marker in lowered for marker in catholic_markers):
        return "Catholic"
    if any(marker in lowered for marker in independent_markers):
        return "Independent/Private"
    if any(marker in lowered for marker in public_markers):
        return "Public"
    # No keyword matched.
    return "Unknown"
# Community-maintained CSV of Australian postcode/suburb/state records.
POSTCODE_CSV_URL = 'https://raw.githubusercontent.com/Elkfox/Australian-Postcode-Data/master/au_postcodes.csv'
def get_postcode_suburb_map():
    """Fetch the national postcode CSV and build a postcode -> suburb map.

    Only rows with state_code 'VIC' are kept, and only the first suburb seen
    for each postcode, so the result is a simple one-to-one lookup.

    Returns:
        dict: Mapping of postcode string to suburb name. Empty on download
        or parse failure — errors are printed, not raised.
    """
    postcode_map = {}
    print(f"Fetching postcode data from: {POSTCODE_CSV_URL}")
    try:
        # Timeout prevents the script from hanging forever on a stalled
        # connection — requests.get has no default timeout.
        response = requests.get(POSTCODE_CSV_URL, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        csv_data = StringIO(response.text)
        reader = csv.DictReader(csv_data)
        for row in reader:
            postcode = row.get('postcode')
            suburb = row.get('place_name')
            state = row.get('state_code')
            # Only add VIC postcodes for now, and only if not already present
            # (to get one suburb per postcode for simplicity)
            if postcode and suburb and state == 'VIC' and postcode not in postcode_map:
                postcode_map[postcode] = suburb
        print(f"Successfully created postcode map with {len(postcode_map)} VIC entries.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching postcode data: {e}")
    except Exception as e:
        print(f"Error processing postcode CSV: {e}")
    return postcode_map
def main():
    """Clean school data end to end.

    Reads school_data_typed.json, tidies school names, infers a school type
    from name keywords, maps the postcode held in each record's 'suburb'
    field to a suburb name, and writes school_data_final.json. Errors are
    collected and written to cleaning_errors.log rather than raised, so a
    failed run still leaves diagnostics behind.
    """
    postcode_suburb_map = get_postcode_suburb_map()
    errors = []
    try:
        # Ensure the output directory exists
        output_dir = os.path.dirname(output_file_path)
        if not os.path.exists(output_dir):
            # exist_ok guards the race between the exists() check above and
            # the actual creation (another process may create it in between).
            os.makedirs(output_dir, exist_ok=True)
            print(f"Created directory: {output_dir}")
        # Read the input JSON file
        print(f"Reading data from: {input_file_path}")
        with open(input_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print("Processing school data...")
        cleaned_schools = []
        if 'schools' in data and isinstance(data['schools'], list):
            for i, school in enumerate(data['schools']):
                original_name = school.get('name')
                cleaned_name = clean_school_name(original_name)
                if cleaned_name != original_name:
                    print(f" Cleaned name {i+1}: '{original_name}' -> '{cleaned_name}'")
                school['name'] = cleaned_name
                # Determine and update school type. Guard against a missing
                # or non-string name, which would otherwise crash
                # determine_school_type's .lower() call and abort the run.
                if isinstance(cleaned_name, str):
                    school_type = determine_school_type(cleaned_name)
                else:
                    school_type = "Unknown"
                school['type'] = school_type
                # Add suburb name from map
                postcode = school.get('suburb')  # Assumes 'suburb' field holds the postcode
                suburb_name = postcode_suburb_map.get(postcode, 'Not Found')
                school['suburb_name'] = suburb_name  # Add new field for actual name
                if school_type != "Unknown":
                    print(f" Identified type for '{cleaned_name}': {school_type}")
                if suburb_name != 'Not Found':
                    print(f" Mapped postcode {postcode} to suburb: {suburb_name}")
                else:
                    print(f" Could not map postcode: {postcode}")
                cleaned_schools.append(school)
            data['schools'] = cleaned_schools
        else:
            errors.append("Error: 'schools' key not found or is not a list in the JSON data.")
            print("Error: 'schools' key not found or is not a list.")
        # Write the cleaned data to the output JSON file
        print(f"Writing cleaned data to: {output_file_path}")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print("Data cleaning complete.")
    except FileNotFoundError:
        errors.append(f"Error: Input file not found at {input_file_path}")
        print(f"Error: Input file not found at {input_file_path}")
    except json.JSONDecodeError:
        errors.append(f"Error: Could not decode JSON from {input_file_path}")
        print(f"Error: Could not decode JSON from {input_file_path}")
    except Exception as e:
        errors.append(f"An unexpected error occurred: {e}")
        print(f"An unexpected error occurred: {e}")
    # Log errors if any
    if errors:
        print(f"Errors occurred during processing. See {error_log_path} for details.")
        try:
            with open(error_log_path, 'w', encoding='utf-8') as log_f:
                for error in errors:
                    log_f.write(error + '\n')
        except Exception as log_e:
            print(f"Additionally, failed to write error log: {log_e}")


if __name__ == "__main__":
    main()