# schoolify/rank_schools.py (repository: nullnuller/schoolify, branch: master)
import json
import csv
import argparse
import os
import pandas as pd
import numpy as np

# Default file locations. These are relative paths, so they resolve against
# the current working directory of the process running the script.
# Input: default for the --metrics_csv option (the metrics CSV to rank).
DEFAULT_METRICS_CSV = 'data/downloads/combined_school_metrics.csv'
# Output: default for the --output_csv option (the ranked CSV).
DEFAULT_OUTPUT_CSV = 'data/processed_rankings.csv'
# Output used instead of --output_csv whenever --dry-run is passed.
DRY_RUN_OUTPUT_CSV = 'data/processed_rankings_dry_run.csv'

def load_metrics_data(csv_file_path):
    """Read the school metrics CSV at ``csv_file_path`` into a DataFrame.

    Returns the parsed DataFrame on success. If the file does not exist, or
    any other error is raised while reading it, a message is printed and
    None is returned instead of propagating the exception.
    """
    metrics = None
    try:
        metrics = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_file_path}")
    except Exception as err:
        # Best-effort loader: report the problem and let the caller see None.
        print(f"An unexpected error occurred while loading CSV: {err}")
    return metrics

def calculate_rankings(df):
    """
    Calculates a weighted VCE score and a rank for every row of ``df``.

    Weights: Median VCE score 40% (rescaled with ``/ 50 * 100`` so it is on
    the same 0-100 scale as the percentage metrics), 40+ scores 40%,
    completion rate 10%, tertiary application rate 10%. A metric that is
    missing (or cannot be parsed as a number) contributes 0 and the remaining
    weights are NOT re-normalised; a row with no usable metric at all gets a
    NaN score and therefore a NaN rank.

    Ranks are computed within each ``Year`` when that column exists (so
    different years are never compared with each other), otherwise across
    the whole frame. Ties share the lowest rank (``method='min'``).

    Args:
        df: DataFrame of school metrics, or None.

    Returns:
        A new DataFrame (the input is never modified) with the metric columns
        coerced to numeric plus ``Weighted_Score`` and ``Rank`` columns,
        sorted by Year (descending) then Rank (ascending), or by Rank alone
        when there is no ``Year`` column. For None an empty DataFrame is
        returned; for an empty frame a copy with the two extra (empty/NaN)
        columns is returned, so callers always receive a DataFrame.
    """
    if df is None:
        return pd.DataFrame()

    # Work on a copy so the caller's frame is never mutated, regardless of
    # which columns are present.
    df = df.copy()

    if df.empty:
        # Nothing to score; still expose the output columns for a stable schema.
        df['Weighted_Score'] = np.nan
        df['Rank'] = np.nan
        return df

    metric_weights = (
        ('Median_VCE_Score', 0.40),
        ('Pct_Scores_40_Plus', 0.40),
        ('Completion_Rate', 0.10),
        ('Tertiary_Application_Rate', 0.10),
    )

    score = pd.Series(0.0, index=df.index)
    has_any_metric = pd.Series(False, index=df.index)
    for col, weight in metric_weights:
        if col not in df.columns:
            continue
        # Unparseable values become NaN and are treated as missing.
        df[col] = pd.to_numeric(df[col], errors='coerce')
        values = df[col]
        if col == 'Median_VCE_Score':
            # Rescale to 0-100 so it is comparable with the percentages.
            values = values / 50 * 100
        # Missing metrics contribute 0 to the sum (no re-weighting).
        score = score + (values * weight).fillna(0.0)
        has_any_metric = has_any_metric | values.notna()

    # Rows without a single valid metric must not be ranked as "score 0".
    df['Weighted_Score'] = score.where(has_any_metric)

    if 'Year' in df.columns:
        # Rank within each year; mixing years in one ranking would be wrong.
        df['Rank'] = df.groupby('Year')['Weighted_Score'].rank(ascending=False, method='min')
        df = df.sort_values(['Year', 'Rank'], ascending=[False, True])
    else:
        df['Rank'] = df['Weighted_Score'].rank(ascending=False, method='min')
        df = df.sort_values('Rank', ascending=True)

    return df

def save_rankings_to_csv(df, csv_file_path):
    """Saves the ranked data to a CSV file.

    Only the known ranking columns that are actually present in ``df`` are
    written, in a fixed order and without the index. The parent directory of
    ``csv_file_path`` is created when the path has one.

    Args:
        df: DataFrame of ranked schools, or None (in which case nothing is
            written).
        csv_file_path: Destination path; may be a bare filename, which is
            written to the current working directory.

    Returns:
        None. Any error raised while writing is caught and reported with a
        printed message rather than propagated.
    """
    if df is None:
        return

    try:
        # os.path.dirname() is '' for a bare filename and os.makedirs('')
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(csv_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Columns to save, in output order. Standardized_School_Name is
        # intentionally not written here.
        cols_to_save = ['School', 'Median_VCE_Score', 'Pct_Scores_40_Plus', 'Completion_Rate', 'Tertiary_Application_Rate', 'Year', 'Weighted_Score', 'Rank']

        # Keep only the columns this frame actually has.
        cols_to_save = [c for c in cols_to_save if c in df.columns]

        df.to_csv(csv_file_path, index=False, columns=cols_to_save)
        print(f"Successfully saved rankings to {csv_file_path}")
    except Exception as e:
        # Deliberate best-effort: report the failure instead of crashing.
        print(f"Error saving CSV: {e}")

def main():
    """Command-line entry point: load the metrics CSV, rank it, save the result.

    Options: --metrics_csv (input path), --output_csv (output path) and
    --dry-run, which redirects the output to DRY_RUN_OUTPUT_CSV regardless of
    --output_csv. Returns early, after printing a message, if the metrics
    cannot be loaded.
    """
    cli = argparse.ArgumentParser(description="Ranks schools based on VCE metrics.")

    # Declared as data so every option is registered the same way, in order.
    option_specs = (
        ('--metrics_csv', {
            'type': str,
            'default': DEFAULT_METRICS_CSV,
            'help': f"Path to the input metrics CSV file (default: {DEFAULT_METRICS_CSV})",
        }),
        ('--output_csv', {
            'type': str,
            'default': DEFAULT_OUTPUT_CSV,
            'help': f"Path to the output CSV file for rankings (default: {DEFAULT_OUTPUT_CSV})",
        }),
        ('--dry-run', {
            'action': 'store_true',
            'help': "Perform a dry run. Output will be written to a separate file.",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    print("Starting school ranking process...")
    print(f"Loading metrics from: {options.metrics_csv}")

    metrics = load_metrics_data(options.metrics_csv)
    if metrics is None:
        print("Failed to load data. Exiting.")
        return

    print(f"Loaded {len(metrics)} rows.")

    print("Calculating rankings...")
    ranked = calculate_rankings(metrics)

    # A dry run never touches the real output file.
    if options.dry_run:
        destination = DRY_RUN_OUTPUT_CSV
        print(f"DRY RUN active. Output will be saved to: {destination}")
    else:
        destination = options.output_csv
        print(f"Saving rankings to: {destination}")

    save_rankings_to_csv(ranked, destination)
    print("Ranking process complete.")

# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()