schoolify/query_school_metrics.py at master · nullnuller/schoolify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import json
import pandas as pd
import os

# Define the path to the school data file
DATA_DIR = "data"
SCHOOL_DATA_FILE = os.path.join(DATA_DIR, "school_data_final.json")

# Define the metrics to display in the table
# These should align with RANKING_METRICS_KEYS from js/data.js and other relevant fields
METRIC_COLUMNS = [
    'name',
    'rank',  # The calculated rank from the application
    'Median_VCE_Score',
    'Pct_Scores_40_Plus',
    'Completion_Rate',
    'Tertiary_Application_Rate',
    # You can add other fields from your JSON like 'type', 'suburb_name', 'level', 'score_2023'
    # 'type',
    # 'suburb_name',
]

def display_school_metrics():
    """
    Loads school data from school_data_final.json, filters out primary schools,
    displays key metrics for remaining schools sorted by VCE rank,
    and provides statistics on VCE metric availability.
    """
    if not os.path.exists(SCHOOL_DATA_FILE):
        print(f"Error: School data file not found at {SCHOOL_DATA_FILE}")
        return

    try:
        with open(SCHOOL_DATA_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {SCHOOL_DATA_FILE}: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading {SCHOOL_DATA_FILE}: {e}")
        return

    all_schools = data.get('schools', [])
    if not all_schools:
        print("No schools found in the data file.")
        return

    # Filter out primary schools
    schools_to_display = [s for s in all_schools if s.get('level') != 'Primary']

    if not schools_to_display:
        print("No non-primary schools found to display.")
        return

    print(f"Found {len(schools_to_display)} non-primary schools. Preparing metrics table...\n")

    # Prepare data for pandas DataFrame
    table_data = []
    for school in schools_to_display:
        row = {}
        for col in METRIC_COLUMNS:
            row[col] = school.get(col, None)  # Use .get() for safety, defaulting to None
        table_data.append(row)

    if not table_data:
        print("No data to display in table.")
        return

    df = pd.DataFrame(table_data)

    # Ensure all desired columns are present in the DataFrame, even if all values are None
    # And reorder them as defined in METRIC_COLUMNS
    existing_cols = [col for col in METRIC_COLUMNS if col in df.columns]
    missing_cols = [col for col in METRIC_COLUMNS if col not in df.columns]
    for col in missing_cols:
        df[col] = None # Add missing columns filled with None
    df = df[METRIC_COLUMNS] # Ensure correct order

    # Convert 'rank' to numeric for sorting, coercing errors to NaT (which will be sorted last)
    df['rank'] = pd.to_numeric(df['rank'], errors='coerce')
    # Sort by VCE rank (ascending, N/A ranks at the bottom)
    df = df.sort_values(by='rank', ascending=True, na_position='last')

    # Display the DataFrame
    # Adjust pandas display options for better console readability
    pd.set_option('display.max_rows', None)        # Show all rows
    pd.set_option('display.max_columns', None)     # Show all columns
    pd.set_option('display.width', 1000)           # Adjust console width
    pd.set_option('display.colheader_justify', 'left')
    pd.set_option('display.precision', 2)          # For floating point numbers

    print(df.to_string(index=False))

    # Calculate and display statistics for VCE metric availability
    print("\n--- VCE Metric Availability Statistics (for non-primary schools) ---")
    vce_metric_keys = ['Median_VCE_Score', 'Pct_Scores_40_Plus', 'Completion_Rate', 'Tertiary_Application_Rate']
    metric_availability_counts = {1: 0, 2: 0, 3: 0, 4: 0, 0: 0} # Include 0 for completeness

    for school in schools_to_display:
        num_available_metrics = 0
        for key in vce_metric_keys:
            value = school.get(key)
            # Check if value is not None and not an empty string (though metrics are likely numeric/None)
            if value is not None and str(value).strip() != "":
                num_available_metrics += 1

        if num_available_metrics in metric_availability_counts:
            metric_availability_counts[num_available_metrics] += 1
        elif num_available_metrics > 0 : # Should not happen if dict is 0-4
             metric_availability_counts[num_available_metrics] = 1


    print(f"Schools with all 4 VCE metrics: {metric_availability_counts.get(4, 0)}")
    print(f"Schools with 3 VCE metrics:      {metric_availability_counts.get(3, 0)}")
    print(f"Schools with 2 VCE metrics:      {metric_availability_counts.get(2, 0)}")
    print(f"Schools with 1 VCE metric:       {metric_availability_counts.get(1, 0)}")
    print(f"Schools with 0 VCE metrics:      {metric_availability_counts.get(0, 0)}")
    total_processed_for_stats = sum(metric_availability_counts.values())
    print(f"Total non-primary schools processed for stats: {total_processed_for_stats}")


    # Optionally, you can save this table to a CSV file
    # output_csv_file = "school_metrics_summary.csv"
    # try:
    #     df.to_csv(output_csv_file, index=False, encoding='utf-8')
    #     print(f"\nTable also saved to {output_csv_file}")
    # except Exception as e:
    #     print(f"\nError saving table to CSV: {e}")

if __name__ == "__main__":
    display_school_metrics()