-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_school_data.py
More file actions
198 lines (175 loc) · 10.4 KB
/
analyze_school_data.py
File metadata and controls
198 lines (175 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import pandas as pd
import numpy as np
import os
import logging
from io import StringIO
def clean_metric_value(value):
    """Convert one raw metric cell to a float, mapping placeholders to NaN.

    The source CSVs use '-', 'I/D', '< 4', 'N/A' and the empty string as
    "no data" markers; those, and any other unparseable string, become
    np.nan. Non-string values are returned unchanged.
    """
    # Non-strings (already-numeric cells, NaN floats) pass straight through.
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if stripped in {'-', 'I/D', '< 4', 'N/A', ''}:
        return np.nan
    try:
        return float(stripped)
    except ValueError:
        # Any other non-numeric text is treated as missing data.
        return np.nan
def analyze_data():
    """Combine the 2021-2024 VCE completion/achievement CSVs into one file.

    For each yearly CSV under data/downloads: selects and renames that
    year's metric columns to a standard set, cleans the values to floats,
    drops footer/summary rows, normalises school names, then concatenates
    all years and writes combined_school_metrics.csv.

    Side effects: (re)configures logging to 'analysis_log.txt' next to this
    file (filemode='w' truncates it each run) and writes the combined CSV.
    Missing/unreadable input files are logged and skipped. Returns None.
    """
    log_file_path = os.path.join(os.path.dirname(__file__), 'analysis_log.txt')
    logging.basicConfig(filename=log_file_path, level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s', filemode='w')
    logger = logging.getLogger()

    base_path = os.path.join('data', 'downloads')
    output_file = os.path.join(base_path, 'combined_school_metrics.csv')

    # Per-year file specs. The header row position and the raw column
    # headings differ between years, so each entry maps that year's
    # headings onto the standard output column names.
    files_to_process = [
        {
            'year': 2021,
            'path': os.path.join(base_path, '2021SeniorSecondaryCompletionAndAchievementInformation.csv'),
            'header_row': 10,
            'columns_map': {
                'School': 'School',
                'Median VCE study score': 'Median_VCE_Score',
                'Percentage of study scores of 40 and over': 'Pct_Scores_40_Plus',
                'Percentage of satisfactory VCE completions in 2021': 'Completion_Rate',
                'Percentage of VCE students applying for tertiary places': 'Tertiary_Application_Rate'
            }
        },
        {
            'year': 2022,
            'path': os.path.join(base_path, '2022SeniorSecondaryCompletionAndAchievementInformation.csv'),
            'header_row': 8,
            'columns_map': {
                'School': 'School',
                'Median VCE study score': 'Median_VCE_Score',
                'Percentage of study scores of 40 and over': 'Pct_Scores_40_Plus',
                'Percentage of satisfactory VCE completions in 2022': 'Completion_Rate',
                'Percentage of VCE students applying for tertiary places': 'Tertiary_Application_Rate'
            }
        },
        {
            'year': 2023,
            'path': os.path.join(base_path, '2023SeniorSecondaryCompletionAndAchievementInformation.csv'),
            'header_row': 10,
            'columns_map': {
                'School': 'School',
                'Median VCE study score': 'Median_VCE_Score',
                'Percentage of study scores of 40 and over': 'Pct_Scores_40_Plus',
                'Percentage of satisfactory VCE completions': 'Completion_Rate',
                'Percentage of VCE students applying for tertiary places through the Victorian Tertiary Admissions Centre (VTAC)': 'Tertiary_Application_Rate'
            }
        },
        {
            'year': 2024,
            'path': os.path.join(base_path, '2024SeniorSecondaryCompletionAndAchievementInformation.csv'),
            'header_row': 10,
            'columns_map': {
                'School': 'School',
                'Median VCE study score': 'Median_VCE_Score',
                'Percentage of study scores of 40 and over': 'Pct_Scores_40_Plus',
                'Percentage of satisfactory VCE completions': 'Completion_Rate',
                'Percentage of VCE students applying for tertiary places through the Victorian Tertiary Admissions Centre (VTAC)': 'Tertiary_Application_Rate'
            }
        }
    ]

    # Standard output column names that hold numeric metrics.
    metric_cols_standard = ['Median_VCE_Score', 'Pct_Scores_40_Plus',
                            'Completion_Rate', 'Tertiary_Application_Rate']

    all_data = []
    for file_info in files_to_process:
        logger.info(f"Processing {file_info['path']}...")
        try:
            df = pd.read_csv(file_info['path'], header=file_info['header_row'])

            # Select and rename columns; tolerate (but log) missing ones.
            current_file_columns = list(file_info['columns_map'].keys())
            missing_cols = [col for col in current_file_columns if col not in df.columns]
            if missing_cols:
                logger.warning(f"Missing columns in {file_info['path']}: {missing_cols}")
                current_file_columns = [col for col in current_file_columns if col in df.columns]
            if not current_file_columns:
                logger.error(f"No relevant columns found in {file_info['path']}. Skipping.")
                continue

            df_selected = df[current_file_columns].copy()
            df_selected.rename(columns=file_info['columns_map'], inplace=True)
            df_selected['Year'] = file_info['year']

            # Clean metric columns: placeholder strings -> NaN, rest -> float.
            for col in metric_cols_standard:
                if col in df_selected.columns:
                    df_selected[col] = df_selected[col].apply(clean_metric_value).astype(float)

            # Filter out rows that might be aggregate/summary rows, then
            # drop rows where the school name is NaN (footer/summary rows).
            df_selected = df_selected[~df_selected['School'].astype(str).str.contains('©|Notes:|Total|Average', case=False, na=False)]
            df_selected = df_selected.dropna(subset=['School'])

            # BUG FIX: this normalisation previously sat after the loop (and
            # after the combined CSV had already been written), so it only
            # touched the last year's frame — and raised NameError when no
            # input file was readable. It now runs for every year, before the
            # frame is appended. Whitespace is trimmed but the original
            # casing/campus info is preserved; the lowercased copy is a key
            # for cross-year matching.
            df_selected['School'] = df_selected['School'].astype(str).str.strip()
            df_selected['Standardized_School_Name'] = df_selected['School'].str.lower()

            all_data.append(df_selected)
            logger.info(f"Successfully processed {file_info['path']}. Shape: {df_selected.shape}")
        except FileNotFoundError:
            logger.error(f"File not found {file_info['path']}")
        except Exception as e:
            logger.exception(f"Error processing {file_info['path']}: {e}")

    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        logger.info(f"\nCombined data shape: {combined_df.shape}")
        logger.info("\nFirst 5 rows of combined data:")
        logger.info(combined_df.head().to_string())
        logger.info("\nLast 5 rows of combined data:")
        logger.info(combined_df.tail().to_string())
        logger.info("\nInfo on combined data:")
        # DataFrame.info prints to a stream; capture it for the log file.
        log_buffer = StringIO()
        combined_df.info(buf=log_buffer)
        logger.info(log_buffer.getvalue())
        # Save to CSV
        combined_df.to_csv(output_file, index=False, encoding='utf-8')
        logger.info(f"\nCombined data saved to '{output_file}'")
    else:
        logger.info("\nNo data processed.")
# Script entry point: run the CSV-combination pipeline when executed directly.
if __name__ == "__main__":
    analyze_data()