schoolify/schoolify_data_processor.py at master · nullnuller/schoolify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
"""
Schoolify Data Processor

This script combines all the necessary data collection and processing steps for the Schoolify application
into a single unified workflow. It handles downloading postcodes, fetching school data from multiple sources,
processing academic metrics, calculating rankings, and updating the database.

Usage:
    python schoolify_data_processor.py [--skip-download] [--skip-geocoding] [--verbose]

Options:
    --skip-download    Skip downloading data files (use existing files)
    --skip-geocoding   Skip the geocoding step (which can be time-consuming)
    --verbose          Show detailed progress information
"""

import os
import sys
import time
import json
import shutil
import argparse
import traceback
import subprocess
import pandas as pd
import requests
from datetime import datetime
import re
import csv
import logging

# Configure logging.
# This runs at import time: records at INFO level and above are written both
# to 'schoolify_processor.log' (a relative path, so the file is created in the
# current working directory) and to a console StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('schoolify_processor.log'),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by every function in this script.
logger = logging.getLogger('schoolify')

# Constants
# All data lives under <directory containing this script>/data.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')
DOWNLOADS_DIR = os.path.join(DATA_DIR, 'downloads')  # target of download_file()
BACKUPS_DIR = os.path.join(DATA_DIR, 'backups')  # target of create_backup()

# Ensure directories exist (import-time side effect; safe to repeat because
# exist_ok=True)
for directory in [DATA_DIR, DOWNLOADS_DIR, BACKUPS_DIR]:
    os.makedirs(directory, exist_ok=True)

# File paths
# Main school database: read by fetch_better_education_data(), rewritten by
# update_database().
SCHOOL_DATA_FILE = os.path.join(DATA_DIR, 'school_data_final.json')
SCHOOL_DATA_COMPREHENSIVE_FILE = os.path.join(DATA_DIR, 'school_data_comprehensive.json')
# Victorian suburb list written by download_postcodes().
SUBURB_DATA_FILE = os.path.join(DATA_DIR, 'victoria_suburbs.json')
# JavaScript lookup module written by generate_geocode_db().
GEOCODE_DB_FILE = os.path.join(BASE_DIR, 'js', 'geocode-db.js')
# Written by create_geocode_cache(), read by add_geocodes().
GEOCODE_CACHE_FILE = os.path.join(DOWNLOADS_DIR, 'geocode_cache.json')
# Processed-school cache written and re-read by fetch_vic_govt_school_data().
VIC_GOVT_SCHOOLS_FILE = os.path.join(DOWNLOADS_DIR, 'vic_govt_schools.json')
# Schools that add_geocodes() could not locate.
FAILED_GEOCODING_FILE = os.path.join(DATA_DIR, 'failed_geocoding.json')

# URLs
AUS_POSTCODES_URL = 'https://raw.githubusercontent.com/Elkfox/Australian-Postcode-Data/master/data/au_postcodes.csv'
VIC_SCHOOLS_URL = 'https://www.education.vic.gov.au/Documents/about/research/datavic/dv310-allschoolslist-2023.csv'
BETTER_EDUCATION_URL = 'https://bettereducation.com.au/results/vce.aspx?all=true'

# VCAA Achievement Data files
# Maps reporting year -> expected CSV path inside DOWNLOADS_DIR.
# process_vcaa_achievement_data() only checks whether each file exists and
# skips a year with a warning when it does not.
VCAA_ACHIEVEMENT_FILES = {
    2021: os.path.join(DOWNLOADS_DIR, '2021SeniorSecondaryCompletionAndAchievementInformation.csv'),
    2022: os.path.join(DOWNLOADS_DIR, '2022SeniorSecondaryCompletionAndAchievementInformation.csv'),
    2023: os.path.join(DOWNLOADS_DIR, '2023SeniorSecondaryCompletionAndAchievementInformation.csv'),
    2024: os.path.join(DOWNLOADS_DIR, '2024SeniorSecondaryCompletionAndAchievementInformation.csv')
}

# VCAA Achievement Data configuration
# Maps reporting year -> how to read that year's CSV:
#   'header_row'  - row index handed to pandas.read_csv(header=...)
#   'columns_map' - source column title -> metric key used in this script
# All years currently share one layout, so each entry is built from the same
# template. Every year still gets its own dict, so one year can be customised
# later without affecting the others.
# Note: process_vcaa_achievement_data() reads the 2022 file with header=None
# and fixed column positions, so the 2022 entry is not consulted there.
_VCAA_DEFAULT_HEADER_ROW = 10
_VCAA_DEFAULT_COLUMNS_MAP = {
    'School': 'School',
    'Median VCE study score': 'Median_VCE_Score',
    'Percentage of study scores of 40 and over': 'Pct_Scores_40_Plus',
    'Percentage of satisfactory VCE completions': 'Completion_Rate',
    'Percentage of VCE students applying for tertiary places through the Victorian Tertiary Admissions Centre (VTAC)': 'Tertiary_Application_Rate',
}
VCAA_CONFIG = {
    year: {
        'header_row': _VCAA_DEFAULT_HEADER_ROW,
        'columns_map': dict(_VCAA_DEFAULT_COLUMNS_MAP),
    }
    for year in (2021, 2022, 2023, 2024)
}

def create_backup(file_path):
    """Copy ``file_path`` into ``BACKUPS_DIR`` under a timestamped ``.bak`` name.

    The backup is named ``<original basename>.<YYYYmmdd_HHMMSS>.bak``.

    Args:
        file_path: Path of the file to back up.

    Returns:
        True when the copy succeeded. False when ``file_path`` does not exist
        or the copy failed; a failure is logged rather than raised.
    """
    if os.path.exists(file_path):
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_path = os.path.join(
            BACKUPS_DIR, f"{os.path.basename(file_path)}.{stamp}.bak"
        )
        try:
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(file_path, backup_path)
            logger.info(f"Created backup: {backup_path}")
        except Exception as e:
            logger.error(f"Failed to create backup of {file_path}: {e}")
        else:
            return True
    # Nothing to back up, or the copy failed.
    return False

def download_file(url, filename, force=False, timeout=60):
    """Download ``url`` into the downloads directory and return the local path.

    The body is streamed into ``<filename>.part`` and only renamed to the final
    name once the transfer has completed. An interrupted download therefore
    never leaves a truncated file that a later run would mistake for a
    finished one, and a failed forced re-download keeps the previous copy.

    Args:
        url: Address to fetch.
        filename: Name of the file to create inside ``DOWNLOADS_DIR``.
        force: When True, download even if the file already exists.
        timeout: Seconds to wait for the server (connect/read) before giving
            up; without a timeout a stalled server would hang the script.

    Returns:
        The path of the downloaded (or already present) file, or None when the
        download failed. Failures are logged, not raised.
    """
    file_path = os.path.join(DOWNLOADS_DIR, filename)

    if os.path.exists(file_path) and not force:
        logger.info(f"File already exists: {file_path}")
        return file_path

    partial_path = f"{file_path}.part"
    try:
        logger.info(f"Downloading {url} to {file_path}...")
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Atomic on the same filesystem: the final name only ever refers to a
        # complete file.
        os.replace(partial_path, file_path)

        logger.info(f"Download complete: {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"Failed to download {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Best-effort cleanup: the partial file may never have been created.
            pass
        return None

def _postcode_coordinate(text):
    """Parse a latitude/longitude CSV cell, returning 0 when blank or not a number."""
    try:
        return float(text) if text else 0
    except (TypeError, ValueError):
        return 0

def _victorian_suburb_from_row(row):
    """Build a suburb record from one postcode CSV row, or None if its postcode is outside 3000-3999."""
    # csv.DictReader fills missing cells of a short row with None.
    postcode = (row.get('postcode') or '').strip()
    if not (postcode.isdigit() and 3000 <= int(postcode) <= 3999):
        return None
    return {
        'name': row.get('place_name') or '',
        'postcode': postcode,
        'state': 'VIC',
        'lat': _postcode_coordinate(row.get('lat')),
        'lon': _postcode_coordinate(row.get('long')),
    }

def download_postcodes():
    """Download Australian postcodes data and extract Victorian suburbs.

    Rows whose postcode lies in 3000-3999 are written to ``SUBURB_DATA_FILE``
    as ``{'suburbs': [...]}`` and then passed to ``create_geocode_cache``.
    A blank or malformed coordinate is stored as 0, so a single bad row does
    not abort the whole extraction.

    NOTE(review): the column names 'postcode', 'place_name', 'lat' and 'long'
    are assumed to match the downloaded CSV header - confirm against the file.

    Returns:
        True on success, False if the download or the extraction failed
        (failures are logged, not raised).
    """
    logger.info("=== Downloading Australian Postcodes Data ===")

    # Download postcodes CSV
    postcodes_file = download_file(AUS_POSTCODES_URL, 'aus_postcodes.csv')
    if not postcodes_file:
        logger.error("Failed to download postcodes data")
        return False

    # Extract Victorian postcodes (3000-3999)
    try:
        logger.info("Extracting Victorian suburbs...")

        # newline='' is what the csv module expects, so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(postcodes_file, 'r', encoding='utf-8', newline='') as f:
            vic_suburbs = [
                suburb
                for suburb in map(_victorian_suburb_from_row, csv.DictReader(f))
                if suburb is not None
            ]

        # Save Victorian suburbs data
        with open(SUBURB_DATA_FILE, 'w', encoding='utf-8') as f:
            json.dump({'suburbs': vic_suburbs}, f, indent=2)

        logger.info(f"Extracted {len(vic_suburbs)} Victorian suburbs")

        # Create geocode cache
        create_geocode_cache(vic_suburbs)

        return True
    except Exception as e:
        logger.error(f"Error extracting Victorian postcodes: {e}")
        traceback.print_exc()
        return False

def create_geocode_cache(suburbs):
    """Write a name -> coordinates lookup to ``GEOCODE_CACHE_FILE``.

    Each suburb with a non-empty name and non-zero lat/lon contributes two
    keys: the lower-cased, stripped name, and that name followed by a space
    and the postcode. When names repeat, the later suburb overwrites the
    earlier entry.

    Args:
        suburbs: Iterable of dicts with 'name', 'postcode', 'lat' and 'lon'.

    Returns:
        True once the cache file has been written.
    """
    logger.info("Creating geocode cache...")

    cache = {}
    for entry in suburbs:
        # Suburbs without a name or with missing (0) coordinates are useless
        # for lookups.
        if not (entry['name'] and entry['lat'] and entry['lon']):
            continue

        name_key = entry['name'].lower().strip()
        cache.update({
            name_key: {'lat': entry['lat'], 'lon': entry['lon']},
            # Also add with postcode for better matching
            f"{name_key} {entry['postcode']}": {'lat': entry['lat'], 'lon': entry['lon']},
        })

    # Save geocode cache
    with open(GEOCODE_CACHE_FILE, 'w', encoding='utf-8') as out:
        json.dump(cache, out, indent=2)

    logger.info(f"Created geocode cache with {len(cache)} entries")
    return True

def _vic_csv_text(value):
    """Return a CSV cell as text, mapping missing values (None/NaN) to ''."""
    if value is None or pd.isna(value):
        return ''
    return value if isinstance(value, str) else str(value)

def _vic_csv_postcode(value):
    """Return a postcode cell as text; a float-typed 3000.0 becomes '3000'."""
    # pandas turns an integer column into floats as soon as one cell is empty.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return _vic_csv_text(value)

def fetch_vic_govt_school_data():
    """Fetch Victorian government school data.

    Uses the processed copy in ``VIC_GOVT_SCHOOLS_FILE`` when it can be loaded.
    Otherwise downloads the CSV, converts each row to a school dict and saves
    the result to that file.

    Empty cells are stored as '' rather than NaN. NaN would make
    ``determine_school_level`` raise, discarding every school, and would also
    be written to the JSON file as the non-standard token ``NaN``.

    NOTE(review): the ``School_*`` column names are assumed to match the CSV
    header - confirm against the downloaded file; a column that is absent
    silently yields '' for that field.

    Returns:
        A list of school dicts with keys 'name', 'address', 'suburb',
        'postcode', 'phone', 'type', 'level', 'source' and, when both
        coordinates are present, 'lat' and 'lon'. An empty list is returned
        when downloading or processing fails.
    """
    logger.info("=== Fetching Victorian Government School Data ===")

    # Check if data already exists
    if os.path.exists(VIC_GOVT_SCHOOLS_FILE):
        try:
            with open(VIC_GOVT_SCHOOLS_FILE, 'r', encoding='utf-8') as f:
                schools = json.load(f)
            logger.info(f"Loaded {len(schools)} schools from existing file")
            return schools
        except Exception as e:
            logger.warning(f"Failed to load existing data, will download again: {e}")

    # Download the CSV file
    csv_file = download_file(VIC_SCHOOLS_URL, 'vic_schools_2023.csv')
    if not csv_file:
        logger.error("Failed to download Victorian schools data")
        return []

    try:
        # Process the CSV file
        schools = []
        df = pd.read_csv(csv_file, encoding='utf-8')

        for _, row in df.iterrows():
            name = _vic_csv_text(row.get('School_Name'))
            school_type = _vic_csv_text(row.get('School_Type'))
            school = {
                'name': name,
                'address': _vic_csv_text(row.get('School_Address')),
                'suburb': _vic_csv_text(row.get('School_Suburb')),
                'postcode': _vic_csv_postcode(row.get('School_Postcode')),
                'phone': _vic_csv_text(row.get('School_Phone')),
                'type': school_type,
                'level': determine_school_level(name, school_type),
                'source': 'vic_govt'
            }

            # Add coordinates if available
            lat = row.get('School_Latitude')
            lon = row.get('School_Longitude')
            if pd.notna(lat) and pd.notna(lon):
                school['lat'] = float(lat)
                school['lon'] = float(lon)

            schools.append(school)

        # Save the processed data
        with open(VIC_GOVT_SCHOOLS_FILE, 'w', encoding='utf-8') as f:
            json.dump(schools, f, indent=2)

        logger.info(f"Processed {len(schools)} Victorian government schools")
        return schools
    except Exception as e:
        logger.error(f"Error processing Victorian schools data: {e}")
        traceback.print_exc()
        return []

# Name fragments that mark a school covering both primary and secondary years.
_COMBINED_NAME_TERMS = ('p-12', 'p-9', 'prep to year')
# 'high', 'sc' and 'ps' must be whole words: as plain substrings they would
# match names such as "Highvale" or "School".
_SECONDARY_NAME_RE = re.compile(r'secondary|college|\b(?:high|sc)\b')
_PRIMARY_NAME_RE = re.compile(r'primary|elementary|\bps\b')

def determine_school_level(name, school_type):
    """Determine the school level based on name and type.

    The name is checked first and the declared type is only a fallback.
    Combined-range markers ("P-12", "P-9", "Prep to Year") are tested before
    the secondary keywords, because such schools are usually called
    "... P-12 College" and would otherwise be reported as secondary.

    Args:
        name: School name; anything that is not a string (None, NaN) is
            treated as empty.
        school_type: Declared school type, handled the same way.
            NOTE(review): the 'pri/sec' and 'special' fallbacks assume those
            values appear in the type column - confirm against the data.

    Returns:
        One of 'Secondary', 'Primary', 'Combined', 'Special' or 'Unknown'.
    """
    name = name.lower() if isinstance(name, str) else ""
    school_type = school_type.lower() if isinstance(school_type, str) else ""

    if any(term in name for term in _COMBINED_NAME_TERMS):
        return 'Combined'
    if _SECONDARY_NAME_RE.search(name):
        return 'Secondary'
    if _PRIMARY_NAME_RE.search(name):
        return 'Primary'
    if 'special' in name:  # also covers "specialist"
        return 'Special'

    # The name was inconclusive; fall back to the declared type.
    if 'pri/sec' in school_type:
        return 'Combined'
    if 'secondary' in school_type:
        return 'Secondary'
    if 'primary' in school_type:
        return 'Primary'
    if 'special' in school_type:
        return 'Special'
    return 'Unknown'

def fetch_better_education_data():
    """Return the base list of ranked schools.

    Scraping the Better Education website is not implemented here. This
    placeholder returns the 'schools' list stored in ``SCHOOL_DATA_FILE``
    when that file exists.

    Returns:
        The list of school dicts from the existing data file, or an empty list
        when the file is missing or cannot be read.
    """
    logger.info("=== Fetching Better Education School Rankings ===")

    # Placeholder implementation: a real one would scrape the website (for
    # example with requests or selenium) instead of re-reading earlier output.
    try:
        if not os.path.exists(SCHOOL_DATA_FILE):
            # Nothing to start from; real scraping would populate this list.
            logger.warning("No existing data found and web scraping not implemented in this script")
            return []

        with open(SCHOOL_DATA_FILE, 'r', encoding='utf-8') as source:
            stored = json.load(source)
        base_schools = stored.get('schools', [])
        logger.info(f"Using {len(base_schools)} schools from existing data as base")
        return base_schools
    except Exception as e:
        logger.error(f"Error fetching Better Education data: {e}")
        return []

def _school_name_key(school):
    """Return the case-insensitive lookup key for a school dict, or '' if it has no usable name."""
    name = school.get('name')
    # A missing, None or NaN name cannot be matched and must not raise.
    return name.lower().strip() if isinstance(name, str) else ''

def merge_school_data(better_education_schools, govt_schools):
    """Merge school data from different sources.

    Schools are matched on their lower-cased, stripped name. For a match, the
    existing school only receives the government fields it lacks or holds as
    an empty/falsy value. Unmatched government schools are appended.
    Entries without a usable (string, non-blank) name are ignored for
    matching instead of aborting the merge.

    Args:
        better_education_schools: Base list of school dicts. It is modified in
            place: matched dicts gain fields and new schools are appended.
        govt_schools: Government school dicts to merge in.

    Returns:
        The same ``better_education_schools`` list, after merging.
    """
    logger.info("=== Merging School Data from Different Sources ===")

    # Create a dictionary of existing schools for easy lookup
    existing_schools = {}
    for school in better_education_schools:
        key = _school_name_key(school)
        if key:
            existing_schools[key] = school

    # Count statistics
    added_count = 0
    updated_count = 0

    # Process government schools
    for govt_school in govt_schools:
        govt_name = _school_name_key(govt_school)
        if not govt_name:
            continue

        school = existing_schools.get(govt_name)
        if school is not None:
            # Only update fields that don't already exist
            for key, value in govt_school.items():
                if not school.get(key):
                    school[key] = value

            updated_count += 1
        else:
            # Add new school, and register it so a later duplicate name is
            # treated as an update rather than appended twice.
            better_education_schools.append(govt_school)
            existing_schools[govt_name] = govt_school
            added_count += 1

    logger.info(f"Merged school data: {added_count} added, {updated_count} updated")
    return better_education_schools

def generate_geocode_db(suburbs):
    """Generate the geocode-db.js file from suburbs data.

    Writes ``GEOCODE_DB_FILE``: a JavaScript module holding a ``geocodeDb``
    object (suburb name -> ``{ lat, lon }``) plus the exported
    ``getSuburbCoordinates`` lookup function. Any existing file is backed up
    first.

    Keys are emitted through ``json.dumps``, so they are double-quoted and
    fully escaped. A name containing an apostrophe or backslash therefore
    cannot break the generated script. Keys are lower-cased and stripped,
    matching the normalisation the JavaScript lookup applies to its argument.

    Args:
        suburbs: Sequence of dicts with 'name', 'lat' and 'lon'. Entries with
            an empty name or a missing (0) coordinate are left out.

    Returns:
        True when the file was written, False on any error (logged).
    """
    logger.info("=== Generating Geocode Database ===")

    # Create backup of existing file
    create_backup(GEOCODE_DB_FILE)

    try:
        # Generate JavaScript code
        header = """/**
 * Geocode Database for Schoolify
 *
 * This file contains a comprehensive database of Victorian suburbs with their coordinates.
 * It is used by the geocoding service for fast local lookups without API calls.
 *
 * Auto-generated on: {date}
 */

const geocodeDb = {{
""".format(date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        # One object-literal line per usable suburb
        entries = []
        for suburb in suburbs:
            if suburb['name'] and suburb['lat'] and suburb['lon']:
                key = json.dumps(suburb['name'].lower().strip())
                entries.append(f"  {key}: {{ lat: {suburb['lat']}, lon: {suburb['lon']} }}")

        # Joining avoids both quadratic string growth and a dangling comma
        # when the last suburb is skipped.
        body = (",\n".join(entries) + "\n") if entries else ""

        footer = """/**
 * Get coordinates for a suburb from the local database
 * @param {string} suburb - Suburb name to look up
 * @returns {Object|null} - Object with lat and lon properties, or null if not found
 */
export function getSuburbCoordinates(suburb) {
  if (!suburb) return null;

  // Normalize the suburb name for lookup
  const normalizedSuburb = suburb.toLowerCase().trim();

  // Direct lookup
  if (geocodeDb[normalizedSuburb]) {
    return geocodeDb[normalizedSuburb];
  }

  // Try with postcode removed (if present)
  const withoutPostcode = normalizedSuburb.replace(/\\s+\\d{4}$/, '');
  if (withoutPostcode !== normalizedSuburb && geocodeDb[withoutPostcode]) {
    return geocodeDb[withoutPostcode];
  }

  // Try flexible matching
  for (const key in geocodeDb) {
    // Check if the key contains the suburb or vice versa
    if (key.includes(normalizedSuburb) || normalizedSuburb.includes(key)) {
      return geocodeDb[key];
    }
  }

  // Not found
  return null;
}
"""

        # Close the geocodeDb object between the entries and the lookup code
        js_code = header + body + "};\n\n" + footer

        # Ensure the directory exists
        os.makedirs(os.path.dirname(GEOCODE_DB_FILE), exist_ok=True)

        # Write to file
        with open(GEOCODE_DB_FILE, 'w', encoding='utf-8') as f:
            f.write(js_code)

        # Report what was actually written, not the size of the input.
        logger.info(f"Generated geocode database with {len(entries)} suburbs")
        return True
    except Exception as e:
        logger.error(f"Error generating geocode database: {e}")
        traceback.print_exc()
        return False

def update_database(comprehensive_schools):
    """Update the existing database with new comprehensive data.

    Merges ``comprehensive_schools`` into ``SCHOOL_DATA_FILE``, matching on
    the lower-cased, stripped school name:

    * a school present in both is replaced by the comprehensive record, which
      first inherits any of 'rank', 'rank_change', 'rank_year',
      'weighted_score' and 'metrics' that it lacks;
    * a school only in the existing file is kept unchanged;
    * a school only in the comprehensive data is added.

    NOTE: both inputs are indexed by name, so records without a 'name' key are
    not carried into the output and records sharing a name collapse to the
    last one seen.

    The merged data is serialised to text before the file is opened for
    writing. If a value cannot be encoded as JSON, the existing database is
    left intact instead of being truncated.

    Args:
        comprehensive_schools: Iterable of school dicts; matched records are
            modified in place when they inherit preserved fields.

    Returns:
        True on success, False on any error (logged).
    """
    logger.info("=== Updating Database with Comprehensive Data ===")

    # Create backups of existing files
    create_backup(SCHOOL_DATA_FILE)

    try:
        # Load existing data if available
        existing_data = {'schools': []}
        if os.path.exists(SCHOOL_DATA_FILE):
            with open(SCHOOL_DATA_FILE, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)

        existing_schools = existing_data.get('schools', [])
        logger.info(f"Loaded {len(existing_schools)} schools from existing database")

        # Index both sides by normalised name for easy lookup
        existing_dict = {
            school['name'].lower().strip(): school
            for school in existing_schools
            if 'name' in school
        }
        comprehensive_dict = {
            school['name'].lower().strip(): school
            for school in comprehensive_schools
            if 'name' in school
        }

        # Merge the data
        merged_schools = []
        added_count = 0
        updated_count = 0

        # Fields computed earlier that a fresh record must not lose
        preserve_fields = ('rank', 'rank_change', 'rank_year', 'weighted_score', 'metrics')

        # First, process existing schools
        for key, school in existing_dict.items():
            if key in comprehensive_dict:
                # Update existing school with new data
                comp_school = comprehensive_dict[key]
                for field in preserve_fields:
                    if field in school and field not in comp_school:
                        comp_school[field] = school[field]

                merged_schools.append(comp_school)
                updated_count += 1
            else:
                # Keep existing school
                merged_schools.append(school)

        # Then, add new schools from comprehensive data
        for key, school in comprehensive_dict.items():
            if key not in existing_dict:
                merged_schools.append(school)
                added_count += 1

        # Serialise first: opening the file with 'w' truncates it, so a
        # failure must happen before that point.
        payload = json.dumps({'schools': merged_schools}, indent=2)

        # Save the merged data
        with open(SCHOOL_DATA_FILE, 'w', encoding='utf-8') as f:
            f.write(payload)

        logger.info(f"Updated database: {added_count} schools added, {updated_count} schools updated")
        return True
    except Exception as e:
        logger.error(f"Error updating database: {e}")
        traceback.print_exc()
        return False

def _geocode_field_text(value):
    """Return a suburb/postcode field as stripped text; empty, None and NaN become ''."""
    if not value:
        return ''
    if isinstance(value, float):
        if value != value:  # NaN is the only value unequal to itself
            return ''
        if value.is_integer():
            # A postcode stored as 3000.0 must look up as "3000".
            return str(int(value))
    return str(value).strip()

def add_geocodes(schools, skip_geocoding=False):
    """Add geocodes to schools that don't have them.

    For every school lacking 'lat'/'lon', the geocode cache loaded from
    ``GEOCODE_CACHE_FILE`` is searched with, in order, "<suburb> <postcode>",
    "<suburb>" and "<postcode>". The suburb is lower-cased. Suburb and
    postcode may be stored as non-strings (for example an integer postcode)
    and are converted to text first. Schools that cannot be located are
    written to ``FAILED_GEOCODING_FILE``.

    Args:
        schools: List of school dicts; coordinates are added in place.
        skip_geocoding: When True, return ``schools`` untouched.

    Returns:
        The same ``schools`` list.
    """
    if skip_geocoding:
        logger.info("Skipping geocoding step as requested")
        return schools

    logger.info("=== Adding Geocodes to Schools ===")

    # Load geocode cache
    geocode_cache = {}
    if os.path.exists(GEOCODE_CACHE_FILE):
        try:
            with open(GEOCODE_CACHE_FILE, 'r', encoding='utf-8') as f:
                geocode_cache = json.load(f)
            logger.info(f"Loaded geocode cache with {len(geocode_cache)} entries")
        except Exception as e:
            logger.warning(f"Failed to load geocode cache: {e}")

    # Track schools that failed geocoding
    failed_geocoding = []

    # Count statistics
    total_schools = len(schools)
    already_geocoded = 0
    newly_geocoded = 0

    for i, school in enumerate(schools):
        if i % 100 == 0 and i > 0:
            logger.info(f"Processed {i}/{total_schools} schools")

        # Skip if already has coordinates
        if school.get('lat') and school.get('lon'):
            already_geocoded += 1
            continue

        suburb = _geocode_field_text(school.get('suburb')).lower()
        postcode = _geocode_field_text(school.get('postcode'))

        # Candidate cache keys, most specific first
        candidates = []
        if suburb and postcode:
            candidates.append(f"{suburb} {postcode}")
        if suburb:
            candidates.append(suburb)
        if postcode:
            candidates.append(postcode)

        match = next(
            (geocode_cache[key] for key in candidates if key in geocode_cache),
            None,
        )

        if match is not None:
            school['lat'] = match['lat']
            school['lon'] = match['lon']
            newly_geocoded += 1
        else:
            # If still not geocoded, add to failed list
            failed_geocoding.append({
                'name': school.get('name', ''),
                'suburb': suburb,
                'postcode': postcode
            })

    # Save failed geocoding list
    with open(FAILED_GEOCODING_FILE, 'w', encoding='utf-8') as f:
        json.dump(failed_geocoding, f, indent=2)

    failed_count = len(failed_geocoding)
    logger.info(f"Geocoding complete: {already_geocoded} already had coordinates, {newly_geocoded} newly geocoded, {failed_count} failed")
    return schools

def clean_school_name(name):
    """Strip "VIC <4-digit postcode>" fragments from a school name.

    Only the state/postcode pattern (with an optional leading comma) is
    removed, so campus details in the name are preserved. Surrounding
    whitespace is trimmed.

    Args:
        name: The raw school name.

    Returns:
        The cleaned name, or ``name`` unchanged when it is not a string.
    """
    if not isinstance(name, str):
        # Nothing to clean for None, numbers, etc.
        return name

    state_postcode = re.compile(r',?\s+VIC\s+\d{4}')
    without_postcode = state_postcode.sub('', name)
    return without_postcode.strip()

def _vcaa_native(value):
    """Return a cell value as a plain Python object, mapping missing values (None/NaN) to None."""
    if value is None or pd.isna(value):
        return None
    # pandas hands back numpy scalars; numpy integers in particular cannot be
    # serialised by the json module, so unwrap them here.
    return value.item() if hasattr(value, 'item') else value

def process_vcaa_achievement_data():
    """Process VCAA achievement data for multiple years.

    Reads every file listed in ``VCAA_ACHIEVEMENT_FILES`` that exists. A
    missing file is skipped with a warning. A year whose file cannot be
    processed is logged and skipped.

    * 2022 is read without a header row and its metrics are taken from fixed
      column positions (6, 7, 12 and 13), coerced to numbers.
      NOTE(review): the source marks these positions as examples - confirm
      them against the real file.
    * Other years use the header row and column mapping from ``VCAA_CONFIG``.
      String values containing '%' are converted to floats; other strings are
      kept as they are.

    Returns:
        A dict mapping the school name exactly as it appears in the file to
        ``{year: metrics}``. ``metrics`` uses the keys 'Median_VCE_Score',
        'Pct_Scores_40_Plus', 'Completion_Rate' and
        'Tertiary_Application_Rate' (plus 'School' for header-based years).
        Values are plain Python objects, with None for missing data.
        NOTE(review): ``calculate_school_rankings`` iterates its ``vcaa_data``
        argument as year -> records, which is not this shape - confirm how
        the two are meant to connect.
    """
    logger.info("=== Processing VCAA Achievement Data ===")

    vcaa_lookup = {}

    for year, file_path in VCAA_ACHIEVEMENT_FILES.items():
        if not os.path.exists(file_path):
            logger.warning(f"VCAA data file for {year} not found: {file_path}")
            continue

        try:
            config = VCAA_CONFIG.get(year, {})
            header_row = config.get('header_row', 0)
            columns_map = config.get('columns_map', {})

            # Special handling for 2022 data which has a different format
            if year == 2022:
                # No usable header row: every line is data and the first
                # column holds the school name.
                df = pd.read_csv(file_path, header=None)
                logger.debug(f"First rows of 2022 data:\n{df.head()}")

                for row in df.itertuples(index=False, name=None):
                    school_name = row[0]
                    if isinstance(school_name, str) and school_name.strip():
                        # Fixed positions (adjust indices as needed)
                        metrics = {
                            'Median_VCE_Score': _vcaa_native(pd.to_numeric(row[6], errors='coerce')),
                            'Pct_Scores_40_Plus': _vcaa_native(pd.to_numeric(row[7], errors='coerce')),
                            'Completion_Rate': _vcaa_native(pd.to_numeric(row[12], errors='coerce')),
                            'Tertiary_Application_Rate': _vcaa_native(pd.to_numeric(row[13], errors='coerce'))
                        }
                        vcaa_lookup.setdefault(school_name, {})[year] = metrics

                logger.info(f"Processed VCAA data for {year}: {len(df)} schools")
            else:
                # Standard processing for other years
                df = pd.read_csv(file_path, header=header_row)

                if 'School' not in df.columns:
                    available_cols = ', '.join([str(col) for col in df.columns])
                    logger.error(f"Error processing VCAA data for {year}: 'School' column not found")
                    logger.error(f"Available columns: {available_cols}")
                    continue

                # Which mapped columns exist is the same for every row, so
                # work it out once.
                present_columns = [
                    (src_col, dest_col)
                    for src_col, dest_col in columns_map.items()
                    if src_col in df.columns
                ]

                # Process each school
                for _, row in df.iterrows():
                    school_name = row.get('School')
                    if not school_name or pd.isna(school_name):
                        continue

                    # Extract metrics using column mapping
                    metrics = {}
                    for src_col, dest_col in present_columns:
                        value = row.get(src_col)
                        # Convert string percentages to float
                        if isinstance(value, str) and '%' in value:
                            try:
                                value = float(value.replace('%', '').strip())
                            except ValueError:
                                value = None
                        metrics[dest_col] = _vcaa_native(value)

                    vcaa_lookup.setdefault(school_name, {})[year] = metrics

                logger.info(f"Processed VCAA data for {year}: {len(df)} schools")
        except Exception as e:
            logger.error(f"Error processing VCAA data for {year}: {e}")
            traceback.print_exc()

    return vcaa_lookup

def _ranking_name_key(raw_name):
    """Return the lowercase, stripped lookup key for a school name ('' if unusable)."""
    if not isinstance(raw_name, str):
        return ''
    return raw_name.lower().strip()


def _ranking_metric(raw_value):
    """Coerce a VCAA metric to float; return None when it is missing or non-numeric."""
    if isinstance(raw_value, str):
        # Tolerate values such as "95%" that were left as text upstream.
        raw_value = raw_value.replace('%', '').strip()
    try:
        number = float(raw_value)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself.
    return number if number == number else None


def calculate_school_rankings(schools, vcaa_data):
    """Attach VCAA metrics to schools and rank them by a weighted score.

    Args:
        schools: list of school dicts. Each is matched to VCAA data by its
            case-insensitive, whitespace-stripped 'name'. The dicts are
            mutated in place.
        vcaa_data: VCAA achievement data in either of two layouts:
            ``{school_name: {year: metrics_dict}}`` or
            ``{year: [metrics_dict_with_'School'_key, ...]}``.

    Returns:
        A list starting with the ranked schools (best first, each carrying
        'weighted_score' and 'rank'), followed by every unranked school that
        has an 'id' not already present among the ranked ones.
    """
    logger.info("=== Calculating School Rankings ===")

    # Build {normalised school name: {year: metrics}} from whichever layout
    # was supplied. A dict value means the outer key is a school name; any
    # other iterable is treated as a per-year list of school records.
    vcaa_lookup = {}
    for outer_key, payload in (vcaa_data or {}).items():
        if isinstance(payload, dict):
            school_key = _ranking_name_key(outer_key)
            if not school_key:
                continue
            for year, metrics in payload.items():
                if isinstance(metrics, dict):
                    vcaa_lookup.setdefault(school_key, {})[year] = metrics
        else:
            for school_data in payload or ():
                if not isinstance(school_data, dict):
                    continue
                school_key = _ranking_name_key(school_data.get('School'))
                if school_key:
                    vcaa_lookup.setdefault(school_key, {})[outer_key] = school_data

    # Copy the most recent year's metrics onto each matching school.
    for school in schools:
        yearly = vcaa_lookup.get(_ranking_name_key(school.get('name')))
        if not yearly:
            continue

        latest_year = max(yearly)
        vcaa_school_data = yearly[latest_year]

        school['median_vce_score'] = vcaa_school_data.get('Median_VCE_Score')
        school['pct_scores_40_plus'] = vcaa_school_data.get('Pct_Scores_40_Plus')
        school['completion_rate'] = vcaa_school_data.get('Completion_Rate')
        school['tertiary_application_rate'] = vcaa_school_data.get('Tertiary_Application_Rate')
        school['vcaa_year'] = latest_year

    # Simplified weighted formula. A usable median score is required for a
    # school to be ranked; any other metric that is missing counts as 0.
    weights = (
        ('median_vce_score', 0.4),
        ('pct_scores_40_plus', 0.3),
        ('completion_rate', 0.2),
        ('tertiary_application_rate', 0.1),
    )
    ranked_schools = []
    for school in schools:
        if _ranking_metric(school.get('median_vce_score')) is None:
            continue
        weighted_score = sum(
            (_ranking_metric(school.get(field)) or 0.0) * weight
            for field, weight in weights
        )
        school['weighted_score'] = round(weighted_score, 2)
        ranked_schools.append(school)

    # Highest score first; list.sort is stable, so ties keep input order.
    ranked_schools.sort(key=lambda x: x.get('weighted_score', 0), reverse=True)

    for position, school in enumerate(ranked_schools, start=1):
        school['rank'] = position

    # Append the unranked schools after the ranked ones.
    # NOTE(review): unranked schools without an 'id' are not carried over;
    # confirm that this is intended.
    ranked_school_ids = {school.get('id') for school in ranked_schools if 'id' in school}
    for school in schools:
        if 'id' in school and school['id'] not in ranked_school_ids:
            ranked_schools.append(school)

    logger.info(f"Calculated rankings for {len(ranked_schools)} schools")
    return ranked_schools

def run_command(command):
    """Run a command and return its stripped standard output.

    Args:
        command: either a shell command string (executed through the shell)
            or a sequence of program arguments (executed directly, without a
            shell, so no quoting of the arguments is needed).

    Returns:
        The command's stdout with surrounding whitespace removed, or None if
        the command exits with a non-zero status or cannot be started.
    """
    # With shell=True a sequence would have only its first item treated as
    # the command on POSIX, so the shell is used for string commands only.
    use_shell = isinstance(command, str)
    try:
        result = subprocess.run(command, shell=use_shell, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Command failed: {command}")
        logger.error(f"Error: {e.stderr}")
        return None
    except OSError as e:
        # Raised when a directly-executed program is missing or not runnable.
        logger.error(f"Command failed: {command}")
        logger.error(f"Error: {e}")
        return None
    return result.stdout.strip()

def main():
    """Run the complete Schoolify data pipeline from the command line.

    Command-line flags:
        --skip-download   do not download the postcodes data
        --skip-geocoding  passed through to the geocoding step
        --verbose         switch the logger to DEBUG level

    Returns:
        False if the postcodes download fails or no Victorian government
        school data is fetched; True once the comprehensive school file has
        been written and the database updated.
    """
    cli = argparse.ArgumentParser(description='Schoolify Data Processor')
    cli.add_argument('--skip-download', action='store_true', help='Skip downloading data files')
    cli.add_argument('--skip-geocoding', action='store_true', help='Skip geocoding step')
    cli.add_argument('--verbose', action='store_true', help='Show detailed progress information')
    options = cli.parse_args()

    if options.verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("=== Starting Schoolify Data Processing ===")

    # Step 1: postcodes / suburb data (optional).
    if options.skip_download:
        logger.info("Skipping download of postcodes data as requested")
    elif not download_postcodes():
        logger.error("Failed to download and process postcodes data")
        return False

    # Step 2: government school data is mandatory for the rest of the run.
    government_schools = fetch_vic_govt_school_data()
    if not government_schools:
        logger.error("Failed to fetch Victorian government school data")
        return False

    # Steps 3-7: fetch the second source, merge, geocode, then rank.
    better_ed_schools = fetch_better_education_data()
    combined_schools = merge_school_data(better_ed_schools, government_schools)
    located_schools = add_geocodes(combined_schools, skip_geocoding=options.skip_geocoding)
    achievement_data = process_vcaa_achievement_data()
    comprehensive_schools = calculate_school_rankings(located_schools, achievement_data)

    # Step 8: build the geocode database when suburb data is on disk.
    # Failures here are logged and do not abort the run.
    if os.path.exists(SUBURB_DATA_FILE):
        try:
            with open(SUBURB_DATA_FILE, 'r', encoding='utf-8') as handle:
                suburbs = json.load(handle).get('suburbs', [])
                if suburbs and not generate_geocode_db(suburbs):
                    logger.warning("Failed to generate geocode database")
        except Exception as e:
            logger.error(f"Error loading suburb data for geocode database: {e}")

    # Step 9: persist the comprehensive data set, then update the database.
    with open(SCHOOL_DATA_COMPREHENSIVE_FILE, 'w', encoding='utf-8') as handle:
        json.dump({'schools': comprehensive_schools}, handle, indent=2)

    update_database(comprehensive_schools)

    logger.info("=== Schoolify Data Processing Complete ===")
    return True

if __name__ == "__main__":
    # Translate main()'s truthy/falsy outcome into a process exit status.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)