# scraper.py
# Scrapes the Better Education "Top Secondary Schools (VIC)" ranking page
# and writes the parsed results to data/school_data.json.
import requests
from bs4 import BeautifulSoup
import json
import os
import re
# URL of the Better Education VIC top-secondary-schools ranking page to scrape.
URL = "https://bettereducation.com.au/school/secondary/vic/vic_top_secondary_schools.aspx"
# Output location: scraped results are written as JSON under the data/ directory.
OUTPUT_DIR = "data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "school_data.json")
def scrape_school_data():
    """Fetches and parses school ranking data from Better Education.

    Returns:
        dict: ``{"schools": [...]}`` where each entry carries ``id``,
        ``rank``, ``name``, ``suburb``, ``type``, ``score_2023`` and
        ``rank_history``; or ``None`` when the page cannot be fetched
        or no ranking table is found.
    """
    print(f"Fetching data from {URL}...")
    try:
        # requests has no default timeout -- without one a stalled server
        # would hang this script forever.
        response = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'},
                                timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None
    print("Parsing HTML content...")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the main table containing the school rankings.
    # Inspecting the page, the table seems to be the first one after an
    # <h4> tag containing 'Top Secondary Schools'.
    table = None
    h4_tag = soup.find('h4', string=re.compile(r'Top Secondary Schools'))
    if h4_tag:
        table = h4_tag.find_next_sibling('table')
    if not table:
        print("Error: Could not find the school ranking table on the page.")
        # Fallback: try finding any large table, might need adjustment
        tables = soup.find_all('table')
        if tables:
            print("Attempting fallback table detection...")
            # A simple heuristic: find a table with a reasonable number of rows
            for t in tables:
                if len(t.find_all('tr')) > 20:  # Adjust threshold as needed
                    table = t
                    print("Found a potential fallback table.")
                    break
        if not table:
            print("Error: Fallback table detection failed.")
            return None
    # Compile once and reuse for every row instead of recompiling per column.
    score_re = re.compile(r'^\d{1,2}(\.\d)?$')  # Matches 99, 99.9
    schools = []
    rows = table.find_all('tr')
    print(f"Found {len(rows) - 1} potential school entries...")  # Subtract header row
    # Skip the header row (usually the first row)
    for row in rows[1:]:
        cols = row.find_all('td')
        if len(cols) < 4:  # Expecting at least Rank, Name, Suburb, Score
            print(f"Skipping row with insufficient columns: {len(cols)}")
            continue
        try:
            rank_text = cols[0].get_text(strip=True)
            # Handle ranks like '1=' or '100=' -- keep the leading digits.
            # Match once and reuse (the original ran the regex twice).
            rank_match = re.match(r'\d+', rank_text)
            rank = int(rank_match.group()) if rank_match else None
            name = cols[1].get_text(strip=True)
            suburb = cols[2].get_text(strip=True)
            # Score might be in different columns depending on the year/layout.
            # Try finding a column with a clear score format (e.g., XX.X).
            score_2023 = None
            for i in range(3, len(cols)):
                score_text = cols[i].get_text(strip=True)
                if score_re.match(score_text):
                    score_2023 = float(score_text)
                    break
            # Placeholder for type and rank_history as they are not directly
            # available on this table.
            school_type = 'Unknown'  # This info might be on linked pages
            rank_history = [rank] if rank is not None else []  # Only current rank available
            if rank is not None and name and suburb and score_2023 is not None:
                schools.append({
                    'id': f"BE-{rank}-{name.replace(' ', '')[:10]}",  # Create a simple ID
                    'rank': rank,
                    'name': name,
                    'suburb': suburb,
                    'type': school_type,
                    'score_2023': score_2023,
                    'rank_history': rank_history  # Only current year's rank
                })
            else:
                print(f"Skipping row due to missing data: Rank={rank}, Name={name}, Suburb={suburb}, Score={score_2023}")
        except (ValueError, AttributeError, IndexError, TypeError) as e:
            print(f"Skipping row due to parsing error: {e} - Row content: {row.get_text(strip=True)}")
    print(f"Successfully parsed {len(schools)} schools.")
    return {"schools": schools}
def save_data(data):
    """Persist scraped school data to OUTPUT_FILE as UTF-8 JSON.

    Silently returns when *data* is falsy or contains no schools; file
    errors are reported to stdout rather than raised, so a failed save
    never aborts the caller.
    """
    schools = data.get("schools") if data else None
    if not schools:
        print("No data to save.")
        return
    try:
        # Make sure the target directory exists before opening the file.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Saving data to {OUTPUT_FILE}...")
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error saving data to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during saving: {e}")
    else:
        print("Data saved successfully.")
# Script entry point: scrape, save, then print usage hints.
if __name__ == "__main__":
    scraped_data = scrape_school_data()
    save_data(scraped_data)
    print("\n--- Scraping complete ---")
    # Plain strings here -- the original used f-string prefixes with no
    # placeholders (ruff F541); output is unchanged.
    print("To update the data, simply run this script again:")
    print("python scraper.py")
    print("Make sure you have the required libraries installed:")
    print("pip install -r requirements.txt")