-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_vic_govt_schools.py
More file actions
238 lines (202 loc) · 10.7 KB
/
fetch_vic_govt_schools.py
File metadata and controls
238 lines (202 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import requests
import json
import os
import csv
import re
import csv
import io # Required for StringIO
from bs4 import BeautifulSoup
from time import sleep
# Output directories and files.
# Layout: data/downloads/vic_govt_schools.json holds the cached fetch result.
DATA_DIR = "data"
DOWNLOADS_DIR = os.path.join(DATA_DIR, "downloads")
VIC_GOVT_SCHOOLS_FILE = os.path.join(DOWNLOADS_DIR, "vic_govt_schools.json")
# Ensure directories exist (idempotent — safe to re-run).
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
# URLs for Victorian government school data
# FIND_SCHOOL_URL = "https://www.findmyschool.vic.gov.au/" # No longer primary source
# VIC_GOVT_SCHOOLS_URL = "https://www.education.vic.gov.au/about/research/Pages/datavic.aspx" # Page exists, but direct CSV is better
# Direct link to the CSV dataset (Update this URL if it changes)
# Found via https://discover.data.vic.gov.au/dataset/school-locations-2023/resource/92fdd072-4666-4cc6-a28a-749c826297a7
VIC_SCHOOLS_CSV_URL = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv346-schoollocations2023.csv"
# Browser-like User-Agent so the download is not rejected as a bot request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
def fetch_school_list_page(url):
    """Fetch a page from the Victorian government school finder.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body as text, or None if the request failed.
    """
    try:
        # timeout added for consistency with fetch_vic_govt_school_data();
        # without it a stalled connection would block this script forever.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {url}: {e}")
        return None
def extract_school_data_from_page(html_content):
    """Parse school records out of a school-finder listing page.

    Args:
        html_content: Raw HTML text of one listing page.

    Returns:
        A list of school dicts with keys: id, name, suburb_name, suburb
        (postcode), type, address, data_source. Empty if nothing matched.
    """
    results = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # The site's markup is not guaranteed stable, so several plausible
    # container selectors are tried in order of preference.
    listings = (soup.select('.school-listing')
                or soup.select('.school-item')
                or soup.select('.school-card'))

    for listing in listings:
        try:
            # Each field likewise has fallback selectors.
            name_el = (listing.select_one('.school-name')
                       or listing.select_one('h3')
                       or listing.select_one('h2'))
            addr_el = (listing.select_one('.school-address')
                       or listing.select_one('.address'))
            kind_el = (listing.select_one('.school-type')
                       or listing.select_one('.type'))

            if not name_el:
                continue  # no usable record without a name

            school_name = name_el.get_text(strip=True)
            full_address = addr_el.get_text(strip=True) if addr_el else ""

            # Pull "SUBURB VIC 1234" components out of the address, if present.
            locality = ""
            post_code = ""
            match = re.search(r'([A-Za-z\s]+)\s+(VIC)\s+(\d{4})', full_address)
            if match:
                locality = match.group(1).strip()
                post_code = match.group(3)

            # Normalise the free-text sector label into a standard bucket.
            sector_label = kind_el.get_text(strip=True) if kind_el else "Unknown"
            lowered = sector_label.lower()
            if "government" in lowered or "public" in lowered:
                sector_label = "Public"
            elif "catholic" in lowered:
                sector_label = "Catholic"
            elif "independent" in lowered or "private" in lowered:
                sector_label = "Independent/Private"

            results.append({
                'id': f"VIC-{school_name.replace(' ', '')[:15]}",
                'name': school_name,
                'suburb_name': locality,
                'suburb': post_code,
                'type': sector_label,
                'address': full_address,
                'data_source': 'VIC Government',
            })
        except Exception as e:
            print(f"Error extracting school data: {e}")

    return results
def _load_cached_schools():
    """Best-effort load of the JSON cache file; returns a list (possibly empty)."""
    if os.path.exists(VIC_GOVT_SCHOOLS_FILE):
        try:
            with open(VIC_GOVT_SCHOOLS_FILE, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
            schools = cached_data.get('schools', [])
            print(f"Loaded {len(schools)} schools from cache as fallback.")
            return schools
        except Exception as cache_e:
            print(f"Error loading cached data during fallback: {cache_e}")
    print("No fresh data fetched and cache unavailable or failed to load.")
    return []


def _parse_school_row(row):
    """Map one CSV row (dict) onto the project's school record schema.

    Args:
        row: A dict from csv.DictReader; column names vary between dataset
             releases, so common alternatives are tried for each field.

    Returns:
        A school dict, or None when the row lacks a name or postcode
        (both are required for a usable record and a stable ID).
    """
    name = row.get('Official_School_Name', row.get('School_Name', '')).strip()
    suburb_name = row.get('Town', row.get('Address_Town', '')).strip()
    postcode = row.get('Postal_Postcode', row.get('Postcode', '')).strip()
    # e.g. Government, Catholic, Independent
    school_sector = row.get('Education_Sector', row.get('School_Sector', '')).strip()

    # Prefer a single full-address column; otherwise join the split lines.
    address_line1 = row.get('Postal_Address_Line_1', '').strip()
    address_line2 = row.get('Postal_Address_Line_2', '').strip()
    full_address_from_parts = f"{address_line1} {address_line2}".strip()
    address = row.get('Full_Address', full_address_from_parts).strip()

    # e.g. Primary, Secondary, Pri/Sec, P-12, Special
    school_level_raw = row.get('School_Type', '').strip()

    if not name or not postcode:
        return None

    # Map sector to the standardized type labels used across the project.
    sector_to_type = {
        'Government': 'Public',
        'Catholic': 'Catholic',
        'Independent': 'Independent/Private',
    }
    school_type = sector_to_type.get(school_sector, 'Other')

    # Determine school level. Parenthesised to make the precedence explicit:
    # "Combined" means (Primary AND Secondary) or an explicit P-12 label.
    if ('Primary' in school_level_raw and 'Secondary' in school_level_raw) \
            or 'P-12' in school_level_raw:
        school_level = 'Combined'
    elif 'Secondary' in school_level_raw:
        school_level = 'Secondary'
    elif 'Primary' in school_level_raw:
        school_level = 'Primary'
    elif 'Special' in school_level_raw:
        school_level = 'Special'
    else:
        school_level = 'Other'

    # Unique-ish ID built from postcode + alphanumeric-only name prefix.
    clean_name = re.sub(r'[^a-zA-Z0-9]', '', name)
    return {
        'id': f"VIC-{postcode}-{clean_name[:15]}",
        'name': name,
        'suburb_name': suburb_name,
        'suburb': postcode,  # 'suburb' key used for postcode in add_geocodes.py
        'type': school_type,  # Public, Catholic, Independent/Private, Other
        'level': school_level,  # Primary, Secondary, Combined, Special, Other
        'address': address,
        'data_source': 'VIC Government (CSV)',
    }


def fetch_vic_govt_school_data():
    """Fetch comprehensive Victorian government school data from the official CSV dataset.

    Downloads the CSV, parses each row into the project's school schema,
    writes the result to the JSON cache, and returns the list of schools.
    On ANY failure (network, CSV parse, or unexpected error) it falls back
    to the cached JSON file — previously only network errors used the
    cache, while csv.Error/other errors silently returned [].

    Returns:
        A list of school dicts (possibly from cache), or [] on total failure.
    """
    print(f"Attempting to fetch Victorian government school data from: {VIC_SCHOOLS_CSV_URL}")
    all_schools = []
    try:
        response = requests.get(VIC_SCHOOLS_CSV_URL, headers=HEADERS, timeout=30)
        response.raise_for_status()
        print("Successfully downloaded CSV data.")

        # Treat the downloaded text as an in-memory file for csv.DictReader.
        reader = csv.DictReader(io.StringIO(response.text))
        print("Parsing CSV data...")
        for row in reader:
            try:
                school = _parse_school_row(row)
                if school is not None:
                    all_schools.append(school)
            except Exception as e:
                # One malformed row should not abort the whole import.
                print(f"Error processing row: {row}. Error: {e}")
        print(f"Successfully parsed {len(all_schools)} schools from CSV.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching CSV data: {e}")
        print("Could not fetch fresh data. Checking cache...")
        return _load_cached_schools()
    except csv.Error as e:
        print(f"Error reading or parsing CSV data: {e}")
        return _load_cached_schools()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return _load_cached_schools()

    # Save the fetched data to cache for future use.
    if all_schools:
        try:
            os.makedirs(os.path.dirname(VIC_GOVT_SCHOOLS_FILE), exist_ok=True)
            with open(VIC_GOVT_SCHOOLS_FILE, 'w', encoding='utf-8') as f:
                json.dump({"schools": all_schools}, f, indent=4, ensure_ascii=False)
            print(f"Saved {len(all_schools)} fetched schools to cache file: {VIC_GOVT_SCHOOLS_FILE}")
        except Exception as e:
            print(f"Error saving fetched data to cache: {e}")

    print(f"Fetched {len(all_schools)} Victorian government schools from CSV.")
    return all_schools
def main():
    """CLI entry point: fetch the VIC school data and print a summary."""
    print("=== Victorian Government Schools Data Fetcher ===")

    # Fetch Victorian government school data.
    schools = fetch_vic_govt_school_data()
    print(f"\nTotal schools collected: {len(schools)}")
    print(f"School types distribution:")

    # Tally how many schools fall under each normalised type label.
    tally = {}
    for record in schools:
        label = record.get('type', 'Unknown')
        tally[label] = tally.get(label, 0) + 1
    for label, count in tally.items():
        print(f" {label}: {count}")

    print("\nData file created:")
    print(f"- {VIC_GOVT_SCHOOLS_FILE}")
    print("\nTo incorporate this data into the comprehensive database, run:")
    print("python comprehensive_scraper.py")


if __name__ == "__main__":
    main()