# scraper.py
# Scrapes the Better Education "Top Secondary Schools (VIC)" ranking page
# and writes the parsed results to data/school_data.json.
import requests
from bs4 import BeautifulSoup
import json
import os
import re
# URL of the Better Education VIC top-secondary-schools ranking page to scrape.
URL = "https://bettereducation.com.au/school/secondary/vic/vic_top_secondary_schools.aspx"
# Output location: scraped results are written as JSON under the data/ directory.
OUTPUT_DIR = "data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "school_data.json")
def scrape_school_data():
    """Fetches and parses school ranking data from Better Education.

    Returns:
        dict: ``{"schools": [...]}`` where each entry carries ``id``,
        ``rank``, ``name``, ``suburb``, ``type``, ``score_2023`` and
        ``rank_history``; or ``None`` when the page cannot be fetched
        or no ranking table is found.
    """
    print(f"Fetching data from {URL}...")
    try:
        # requests has no default timeout -- without one a stalled server
        # would hang this script forever.
        response = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'},
                                timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None
    print("Parsing HTML content...")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the main table containing the school rankings.
    # Inspecting the page, the table seems to be the first one after an
    # <h4> tag containing 'Top Secondary Schools'.
    table = None
    h4_tag = soup.find('h4', string=re.compile(r'Top Secondary Schools'))
    if h4_tag:
        table = h4_tag.find_next_sibling('table')
    if not table:
        print("Error: Could not find the school ranking table on the page.")
        # Fallback: try finding any large table, might need adjustment
        tables = soup.find_all('table')
        if tables:
            print("Attempting fallback table detection...")
            # A simple heuristic: find a table with a reasonable number of rows
            for t in tables:
                if len(t.find_all('tr')) > 20:  # Adjust threshold as needed
                    table = t
                    print("Found a potential fallback table.")
                    break
        if not table:
            print("Error: Fallback table detection failed.")
            return None
    # Compile once and reuse for every row instead of recompiling per column.
    score_re = re.compile(r'^\d{1,2}(\.\d)?$')  # Matches 99, 99.9
    schools = []
    rows = table.find_all('tr')
    print(f"Found {len(rows) - 1} potential school entries...")  # Subtract header row
    # Skip the header row (usually the first row)
    for row in rows[1:]:
        cols = row.find_all('td')
        if len(cols) < 4:  # Expecting at least Rank, Name, Suburb, Score
            print(f"Skipping row with insufficient columns: {len(cols)}")
            continue
        try:
            rank_text = cols[0].get_text(strip=True)
            # Handle ranks like '1=' or '100=' -- keep the leading digits.
            # Match once and reuse (the original ran the regex twice).
            rank_match = re.match(r'\d+', rank_text)
            rank = int(rank_match.group()) if rank_match else None
            name = cols[1].get_text(strip=True)
            suburb = cols[2].get_text(strip=True)
            # Score might be in different columns depending on the year/layout.
            # Try finding a column with a clear score format (e.g., XX.X).
            score_2023 = None
            for i in range(3, len(cols)):
                score_text = cols[i].get_text(strip=True)
                if score_re.match(score_text):
                    score_2023 = float(score_text)
                    break
            # Placeholder for type and rank_history as they are not directly
            # available on this table.
            school_type = 'Unknown'  # This info might be on linked pages
            rank_history = [rank] if rank is not None else []  # Only current rank available
            if rank is not None and name and suburb and score_2023 is not None:
                schools.append({
                    'id': f"BE-{rank}-{name.replace(' ', '')[:10]}",  # Create a simple ID
                    'rank': rank,
                    'name': name,
                    'suburb': suburb,
                    'type': school_type,
                    'score_2023': score_2023,
                    'rank_history': rank_history  # Only current year's rank
                })
            else:
                print(f"Skipping row due to missing data: Rank={rank}, Name={name}, Suburb={suburb}, Score={score_2023}")
        except (ValueError, AttributeError, IndexError, TypeError) as e:
            print(f"Skipping row due to parsing error: {e} - Row content: {row.get_text(strip=True)}")
    print(f"Successfully parsed {len(schools)} schools.")
    return {"schools": schools}
def save_data(data):
    """Persist scraped school data to OUTPUT_FILE as UTF-8 JSON.

    Silently returns when *data* is falsy or contains no schools; file
    errors are reported to stdout rather than raised, so a failed save
    never aborts the caller.
    """
    schools = data.get("schools") if data else None
    if not schools:
        print("No data to save.")
        return
    try:
        # Make sure the target directory exists before opening the file.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Saving data to {OUTPUT_FILE}...")
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error saving data to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during saving: {e}")
    else:
        print("Data saved successfully.")
# Script entry point: scrape, save, then print usage hints.
if __name__ == "__main__":
    scraped_data = scrape_school_data()
    save_data(scraped_data)
    print("\n--- Scraping complete ---")
    # Plain strings here -- the original used f-string prefixes with no
    # placeholders (ruff F541); output is unchanged.
    print("To update the data, simply run this script again:")
    print("python scraper.py")
    print("Make sure you have the required libraries installed:")
    print("pip install -r requirements.txt")