From 4cba7ff3fb99bc6154261bf8db0d7ac1f85c6ace Mon Sep 17 00:00:00 2001 From: yashin Date: Mon, 1 Dec 2025 15:33:48 +0800 Subject: [PATCH 1/4] FEAT: Implement core 2024 EAVS cleaning logic and CLC variable mappings Adds the clean_2024() function, creates the 2024.yaml file, and updates all year YAMLs (2020, 2022) with CLC priority variables (A-list, C1-C9, E1-E3). This also incorporates robust column filtering and essential pipeline stabilization fixes. --- .gitignore | 4 + eavs/assets/column_mappings/2020.yaml | 468 +++++++++++++++++++ eavs/assets/column_mappings/2022.yaml | 448 ++++++++++++++++++ eavs/assets/column_mappings/2024.yaml | 638 ++++++++++++++++++++++++++ eavs/clean.py | 344 ++++++++------ 5 files changed, 1753 insertions(+), 149 deletions(-) create mode 100644 eavs/assets/column_mappings/2024.yaml diff --git a/.gitignore b/.gitignore index bb1b3be..28c95f3 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,10 @@ target/ # Jupyter Notebook .ipynb_checkpoints +# Ignore all files inside notebooks directory but track the .gitkeep file +notebooks/* +!notebooks/.gitkeep + # IPython profile_default/ ipython_config.py diff --git a/eavs/assets/column_mappings/2020.yaml b/eavs/assets/column_mappings/2020.yaml index c44dd5c..cfa1d6f 100644 --- a/eavs/assets/column_mappings/2020.yaml +++ b/eavs/assets/column_mappings/2020.yaml @@ -168,3 +168,471 @@ - name: voters_removed_nonresponse dtype: int64 raw_name: A9e + + # ------------------------------- + # Mail Ballot Variables (C1) + # ------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." 
+ + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." + + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." + + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." + + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." + + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." + + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." 
+ + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # Renamed from `drop_boxes_comments` to `drop_box_availability_comments`. + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." + + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." + + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." + + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office." + + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # Renamed from `drop_boxes_comments` to `drop_boxes_logistics_comments`. + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." + + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process."
+ + - name: mail_ballots_successfully_cured + dtype: int64 + raw_name: C7b + description: "Mail ballots successfully cured." + + - name: mail_ballots_unsuccessfully_cured + dtype: int64 + raw_name: C7c + description: "Mail ballots unsuccessfully cured." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." + + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." + + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." + + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature." + + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." + + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." 
+ + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." + + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." + + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." + + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." + + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." + + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." 
+ + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." + + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." + + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." + + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected." + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." + + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." + + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." 
+ + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." + + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." + + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." + + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_3 + dtype: int64 + raw_name: E2l + description: "Provisional ballots cast for other reason 3." 
+ + - name: provisional_ballots_cast_other_3_text + dtype: string + raw_name: E2l_Other + description: "Text description for provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." + + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." + + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." + + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." + + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." 
diff --git a/eavs/assets/column_mappings/2022.yaml b/eavs/assets/column_mappings/2022.yaml index cd69b9a..ef9cbf1 100644 --- a/eavs/assets/column_mappings/2022.yaml +++ b/eavs/assets/column_mappings/2022.yaml @@ -168,3 +168,451 @@ - name: voters_removed_nonresponse dtype: int64 raw_name: A9e + + # ------------------------------- + # Mail Ballot Variables (C1) + # ------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." + + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." + + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." + + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." + + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." 
+ + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." + + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." + + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # Renamed from `drop_boxes_comments` to `drop_box_availability_comments`. + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." + + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." + + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." + + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office."
+ + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # Renamed from `drop_boxes_comments` to `drop_boxes_logistics_comments`. + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." + + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." + + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." + + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." + + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature."
+ + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." + + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." + + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." + + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." + + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." + + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." 
+ + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." + + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." + + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." + + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." + + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." + + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected." + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." 
+ + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." + + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." + + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." + + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." + + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." + + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." 
+ + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." + + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." + + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." + + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." 
+ + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." diff --git a/eavs/assets/column_mappings/2024.yaml b/eavs/assets/column_mappings/2024.yaml new file mode 100644 index 0000000..dc1d7e3 --- /dev/null +++ b/eavs/assets/column_mappings/2024.yaml @@ -0,0 +1,638 @@ +- version: "1.0" + columns: + - name: fips_code + dtype: string + raw_name: FIPSCode + - name: jurisdiction_name + dtype: string + raw_name: Jurisdiction_Name + - name: state + dtype: string + raw_name: State_Full + - name: state_abbr + dtype: string + raw_name: State_Abbr + - name: registered_eligible_voters + dtype: int64 + raw_name: A1a + - name: active_voters + dtype: int64 + raw_name: A1b + - name: inactive_voters + dtype: float64 + raw_name: A1c + - name: total_registrations_received + dtype: int64 + raw_name: A3a + - name: new_valid_registrations + dtype: int64 + raw_name: A3b + - name: pre_registrations + dtype: int64 + raw_name: A3c + - name: duplicate_registrations + dtype: int64 + raw_name: A3d + - name: rejected_registrations + dtype: int64 + raw_name: A3e + - name: intrajurisdiction_registration_updates + dtype: int64 + raw_name: A3f + - name: interjurisdiction_registration_updates + dtype: int64 + raw_name: A3g + - name: total_forms_mail_fax_email + dtype: int64 + raw_name: A4a + - name: new_registrations_mail_fax_email + dtype: int64 + raw_name: A5a + - name: duplicate_registrations_mail_fax_email + dtype: int64 + raw_name: A6a + - name: rejected_registrations_mail_fax_email + dtype: int64 + raw_name: A7a + - name: total_forms_in_person + dtype: int64 + raw_name: A4b + - name: new_registrations_in_person + dtype: int64 + raw_name: A5b + - name: duplicate_registrations_in_person + dtype: int64 + raw_name: A6b + - 
name: rejected_registrations_in_person + dtype: int64 + raw_name: A7b + - name: total_forms_online + dtype: int64 + raw_name: A4c + - name: new_registrations_online + dtype: int64 + raw_name: A5c + - name: duplicate_registrations_online + dtype: int64 + raw_name: A6c + - name: rejected_registrations_online + dtype: int64 + raw_name: A7c + - name: total_forms_dmv + dtype: int64 + raw_name: A4d + - name: new_registrations_dmv + dtype: int64 + raw_name: A5d + - name: duplicate_registrations_dmv + dtype: int64 + raw_name: A6d + - name: rejected_registrations_dmv + dtype: int64 + raw_name: A7d + - name: total_forms_mandatory_nvra + dtype: int64 + raw_name: A4e + - name: new_registrations_mandatory_nvra + dtype: int64 + raw_name: A5e + - name: duplicate_registrations_mandatory_nvra + dtype: int64 + raw_name: A6e + - name: rejected_registrations_mandatory_nvra + dtype: int64 + raw_name: A7e + - name: total_forms_disability_agency + dtype: int64 + raw_name: A4f + - name: new_registrations_disability_agency + dtype: int64 + raw_name: A5f + - name: duplicate_registrations_disability_agency + dtype: int64 + raw_name: A6f + - name: rejected_registrations_disability_agency + dtype: int64 + raw_name: A7f + - name: total_forms_armed_forces + dtype: int64 + raw_name: A4g + - name: new_registrations_armed_forces + dtype: int64 + raw_name: A5g + - name: duplicate_registrations_armed_forces + dtype: int64 + raw_name: A6g + - name: rejected_registrations_armed_forces + dtype: int64 + raw_name: A7g + - name: total_forms_discretionary_nvra + dtype: int64 + raw_name: A4h + - name: new_registrations_discretionary_nvra + dtype: int64 + raw_name: A5h + - name: duplicate_registrations_discretionary_nvra + dtype: int64 + raw_name: A6h + - name: rejected_registrations_discretionary_nvra + dtype: int64 + raw_name: A7h + - name: total_forms_advocacy_groups + dtype: int64 + raw_name: A4i + - name: new_registrations_advocacy_groups + dtype: int64 + raw_name: A5i + - name: 
duplicate_registrations_advocacy_groups + dtype: int64 + raw_name: A6i + - name: rejected_registrations_advocacy_groups + dtype: int64 + raw_name: A7i + - name: confirmation_notices_sent_total + dtype: int64 + raw_name: A8a + - name: confirmation_notices_undeliverable + dtype: int64 + raw_name: A8d + - name: confirmation_notices_status_unknown + dtype: int64 + raw_name: A8e + - name: voters_removed_total_2020_2022 + dtype: int64 + raw_name: A9a + - name: voters_removed_felony + dtype: int64 + raw_name: A9d + - name: voters_removed_nonresponse + dtype: int64 + raw_name: A9e + + # ------------------------------- + # Mail Ballot Variables (C1) + # ------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." + + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." + + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." + + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." 
+ + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." + + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." + + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." + + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_box_availability_comments`** + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." + + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." + + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." 
+ + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office." + + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_boxes_logistics_comments`** + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." + + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process." + + - name: mail_ballots_successfully_cured + dtype: int64 + raw_name: C7b + description: "Mail ballots successfully cured." + + - name: mail_ballots_unsuccessfully_cured + dtype: int64 + raw_name: C7c + description: "Mail ballots unsuccessfully cured." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." + + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." 
+ + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." + + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature." + + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." + + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." + + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." + + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." 
+ + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." + + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." + + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." + + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." + + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." + + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." 
+ + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." + + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected" + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." + + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." + + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." + + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." + + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." 
+ + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." + + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_3 + dtype: int64 + raw_name: E2l + description: "Provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_other_3_text + dtype: string + raw_name: E2l_Other + description: "Text description for provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." + + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." 
+ + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." + + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." + + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." 
\ No newline at end of file diff --git a/eavs/clean.py b/eavs/clean.py index e1cf87e..d04e58d 100644 --- a/eavs/clean.py +++ b/eavs/clean.py @@ -1,154 +1,200 @@ +import yaml +import re from pathlib import Path +from loguru import logger as log +from typing import Dict, Any, List -from loguru import logger import pandas as pd -from pandera.io import from_yaml -import pyarrow as pa -from yaml import safe_load - -from eavs.config import CLEANED_DATA_DIR, RAW_DATA_DIR - -COLUMN_METADATA_DIR = Path(__file__).parent / "assets" / "column_mappings" - -def load_column_mapping(year: int, version: str) -> dict: - with (COLUMN_METADATA_DIR / f"{year}.yaml").open("r") as f: - data = safe_load(f) - for dataset in data: - if dataset["version"] == version: - return dataset["columns"] - - - -PROCESSING_FNS: dict[int, callable] = {} - - -def register_cleaning_function(year): - def decorator(func): - PROCESSING_FNS[year] = func - return func - - return decorator - - -@register_cleaning_function(2022) -def clean_2022(): - metadata = load_column_mapping(2022, "1.1") - # Use mapping file dtypes, not forced float64 - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping = {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "2022" / "1.1" / "2022_EAVS_for_Public_Release_V1.1.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - - -@register_cleaning_function(2020) -def clean_2020(): - metadata = load_column_mapping(2020, "1.2") - - # Rename columns - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping 
= {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "2020" / "1.2" / "2020_EAVS_for_Public_Release_V1.2.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - - -@register_cleaning_function("timeseries") -def clean_timeseries(): - metadata = load_column_mapping("timeseries", "1.0") - - # Rename columns - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping = {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "timeseries" / "1.0" / "EAVS_Time_Series_Dataset.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - +import pandera as pa +from pandera.typing import DataFrame, Series + +# ----------------- +# 1. Configuration +# ----------------- +PROJ_ROOT = Path(__file__).resolve().parent.parent + +CONFIG_PATH = PROJ_ROOT / 'eavs' / 'assets' / 'column_mappings' + +def load_config(year: int) -> Dict[str, Any]: + """ + Dynamically loads the year-specific config file (e.g., 2022.yaml). + If the file is not found, it logs a warning and returns a safe empty dictionary. 
+ """ + config_file = CONFIG_PATH / f'{year}.yaml' + if not config_file.exists(): + log.warning(f"Config file not found for year {year}: {config_file}. Cleaning will proceed without specific variable handling.") + return {} + + try: + with open(config_file, 'r') as f: + return yaml.safe_load(f) + except Exception as e: + log.error(f"Error loading config file {config_file}: {e}") + return {} + +# ----------------- +# 2. Schema Definition +# ----------------- + +class CleanedEAVSSchema(pa.DataFrameModel): + # FIPS codes must be 5-digit strings + fips_code: Series[str] = pa.Field(str_matches=r'^\d{5}$') + + # Year of the EAVS data (e.g., 2022) + year: Series[int] = pa.Field(ge=2000, le=2030) + + class Config: + # Set strict=False to allow all EAVS variable columns (like D8) + # to exist in the DataFrame without being explicitly listed in the Schema. + strict = False + coerce = True + +schema = CleanedEAVSSchema + +# ----------------- +# 3. Cleaning Functions +# ----------------- + +def clean_data(year: int, config: Dict[str, Any]) -> pd.DataFrame: + """ + Loads raw EAVS data for a given year, applies renaming and type conversion + based on the loaded configuration, and ensures robust column selection. 
+ """ + # Look inside the raw/{year} folder + raw_data_dir = PROJ_ROOT / 'data' / 'raw' / str(year) + + # Use rglob to recursively search the version folders for any Excel file + excel_files = list(raw_data_dir.rglob('*.xls*')) + + if not excel_files: + log.warning(f"Raw EAVS file not found for year {year} within {raw_data_dir}") + return pd.DataFrame() + + # Assume the first found file is the correct one + data_path = excel_files[0] + + log.info(f"Cleaning data for {year} using file: {data_path.name}") + + # Prepare column renaming map from config (list of dicts) + # The config is a list of dictionaries, where each dict has 'raw_name' and 'name' + mapping = {col['raw_name']: col['name'] for col in config} + + # Prepare dtypes for efficient loading (optional, but good practice) + dtypes = {col['raw_name']: str for col in config} + + # Load raw data + try: + # We only load the columns specified in the config for speed and memory efficiency + df = pd.read_excel(data_path, sheet_name=0, engine='openpyxl', dtype=dtypes) + except Exception as e: + log.error(f"Error loading {data_path}: {e}") + return pd.DataFrame() + + # Standardize column names + fips_col = next((col for col in df.columns if 'FIPS' in str(col).upper()), None) + if fips_col: + df = df.rename(columns={fips_col: 'fips_code'}) + else: + log.error(f"FIPS code column not found in {year} data.") + # If FIPS is missing, we can't proceed with cleaning + return pd.DataFrame() + + # Add year column + df['year'] = year + + # --- FIX: Normalize FIPS codes --- + # Convert to string, pad with leading zeros if needed, and truncate 10-digit codes to first 5 digits + df['fips_code'] = df['fips_code'].astype(str).str.zfill(5).str[:5] + # --- END FIX --- + + # --- START INTEGRATION: Apply YAML renaming & Robust Filtering (KeyError Fix) --- + + # 1. Determine which columns specified in the mapping actually exist in the DataFrame + # This prevents the KeyError when selecting a non-existent column. 
+ mapping_keys = mapping.keys() + existing_keys = [k for k in mapping_keys if k in df.columns] + + # We must also ensure we keep the 'fips_code' and 'year' columns + cols_to_select = existing_keys + ['fips_code', 'year'] + + # 2. Filter the DataFrame to keep only the necessary columns (and FIPS/year) + df = df.filter(items=cols_to_select, axis=1) + + # 3. Apply the renaming *only* to the existing keys + # We create a mapping subset for renaming based on the existing keys + renaming_map = {k: mapping[k] for k in existing_keys} + df = df.rename(columns=renaming_map) + + # 4. Convert numerical columns to Int64Dtype (allows NaN) + # EAVS variables are typically uppercase letter followed by numbers (like A1, B1, etc.) + for col in df.columns: + # Check if the renamed column matches the EAVS variable pattern (e.g., 'A1', 'C8') + if re.match(r'^[A-Z]\d+$', str(col)): + try: + df[col] = pd.to_numeric(df[col], errors='coerce').astype(pd.Int64Dtype()) + except Exception: + log.warning(f"Could not convert column {col} to integer type.") + df[col] = pd.NA + + # --- END INTEGRATION --- + + return df + +def combine_data(cleaned_dfs: List[pd.DataFrame]) -> pd.DataFrame: + """Combines cleaned dataframes from multiple years.""" + log.info(f"Combining {len(cleaned_dfs)} years of cleaned data.") + combined_df = pd.concat(cleaned_dfs, ignore_index=True) + return combined_df + +# ----------------- +# 4. 
Main Execution +# ----------------- def main(): - schema = from_yaml(Path(__file__).parent / "assets" / "processed_schema.yaml") - # timeseries has its own processing schema - timeseries_schema_path = Path(__file__).parent / "assets" / "timeseries_process_schema.yaml" - timeseries_schema = None - if timeseries_schema_path.exists(): - timeseries_schema = from_yaml(timeseries_schema_path) - - out = {} - for year, fn in PROCESSING_FNS.items(): - logger.info(f"Cleaning data for {year}") - cleaned_df = fn() - # Validate using the timeseries-specific schema when appropriate - if year == "timeseries" and timeseries_schema is not None: - timeseries_schema.validate(cleaned_df) - else: - schema.validate(cleaned_df) - - # Write out intermediate for every dataset - interim_output_path_base = CLEANED_DATA_DIR / f"{year}" - cleaned_df.to_csv(interim_output_path_base.with_suffix(".csv"), index=False) - cleaned_df.to_parquet(interim_output_path_base.with_suffix(".parquet"), index=False) - - # Only include numeric-year datasets in the combined concatenation - # (timeseries is registered under the string key "timeseries" and - # should be validated/written but not concatenated with year index) - if isinstance(year, int): - out[year] = cleaned_df - else: - logger.info(f"Skipping concatenation for non-year dataset '{year}'") - - if out: - # concat using the numeric year keys only - concat_df = pd.concat(out, keys=out.keys(), names=("year", "")).droplevel(1) - combined_output_path_base = CLEANED_DATA_DIR / "combined" - concat_df.to_csv(combined_output_path_base.with_suffix(".csv"), index=True) - concat_df.to_parquet(combined_output_path_base.with_suffix(".parquet"), index=True) - - -if __name__ == "__main__": - main() + """Main function to clean and combine EAVS data.""" + # These years are now pulled from the combined history of the first two rebases + years = [2022, 2024] + + cleaned_dataframes = [] + for year in years: + # Dynamically load configuration for the year + year_config = 
load_config(year) + + # Check if config is loaded and non-empty + if not year_config: + log.warning(f"Skipping cleaning for year {year} due to missing or empty config.") + continue + + df = clean_data(year, year_config) + if not df.empty: + cleaned_dataframes.append(df) + + if not cleaned_dataframes: + log.error("No valid dataframes were cleaned. Exiting.") + return + + combined_df = combine_data(cleaned_dataframes) + cleaned_df = combined_df.copy() + + # --- PANDERA FIX: ensure fips_code is string before schema validation --- + cleaned_df['fips_code'] = cleaned_df['fips_code'].astype(str) + # --- END FIX --- + + try: + log.info(f"Validating combined data with {len(cleaned_df)} rows...") + schema.validate(cleaned_df) + log.success("Data validation successful!") + + # Save the cleaned and validated file + output_path = PROJ_ROOT / 'data' / 'eavs_combined_cleaned.parquet' + cleaned_df.to_parquet(output_path, index=False) + log.info(f"Cleaned data saved to {output_path}") + + except pa.errors.SchemaError as e: + log.error(f"Data validation failed: {e}") + return + + log.info("Finished EAVS Cleaning Pipeline.") + +if __name__ == '__main__': + main() \ No newline at end of file From 864a9fee3cfef76ae04e8169e0a39798e9c49db0 Mon Sep 17 00:00:00 2001 From: yashin Date: Sat, 6 Dec 2025 17:46:55 +0800 Subject: [PATCH 2/4] FEAT: Enable multi-format data export and streamline pipeline execution Implements data output into parquet, xlsx, and csv formats for both individual years and combined data. Fixes pipeline execution and updates final column mapping logic. 
--- eavs/assets/column_mappings/2020.yaml | 6 +- eavs/assets/column_mappings/2022.yaml | 2 +- eavs/assets/column_mappings/2024.yaml | 4 +- eavs/assets/column_mappings/timeseries.yaml | 297 -------------------- eavs/assets/manifest.jsonl | 1 + eavs/clean.py | 137 +++++---- run_pipeline.py | 18 ++ 7 files changed, 108 insertions(+), 357 deletions(-) delete mode 100644 eavs/assets/column_mappings/timeseries.yaml create mode 100644 run_pipeline.py diff --git a/eavs/assets/column_mappings/2020.yaml b/eavs/assets/column_mappings/2020.yaml index cfa1d6f..f7ee3fa 100644 --- a/eavs/assets/column_mappings/2020.yaml +++ b/eavs/assets/column_mappings/2020.yaml @@ -1,5 +1,5 @@ -- version: "1.2" - columns: +version: "1.2" +columns: - name: fips_code dtype: string raw_name: FIPSCode @@ -635,4 +635,4 @@ - name: provisional_ballots_rejected_already_voted dtype: int64 raw_name: E3j - description: "Provisional ballots rejected because voter already voted." + description: "Provisional ballots rejected because voter already voted." 
\ No newline at end of file diff --git a/eavs/assets/column_mappings/2022.yaml b/eavs/assets/column_mappings/2022.yaml index ef9cbf1..95bfd3c 100644 --- a/eavs/assets/column_mappings/2022.yaml +++ b/eavs/assets/column_mappings/2022.yaml @@ -1,4 +1,4 @@ -- version: "1.1" + version: "1.1" columns: - name: fips_code dtype: string diff --git a/eavs/assets/column_mappings/2024.yaml b/eavs/assets/column_mappings/2024.yaml index dc1d7e3..70a85cc 100644 --- a/eavs/assets/column_mappings/2024.yaml +++ b/eavs/assets/column_mappings/2024.yaml @@ -1,5 +1,5 @@ -- version: "1.0" - columns: + version: "1.0" + columns: - name: fips_code dtype: string raw_name: FIPSCode diff --git a/eavs/assets/column_mappings/timeseries.yaml b/eavs/assets/column_mappings/timeseries.yaml deleted file mode 100644 index 20530d9..0000000 --- a/eavs/assets/column_mappings/timeseries.yaml +++ /dev/null @@ -1,297 +0,0 @@ ---- -- version: "1.0" - columns: - - name: fips_code - dtype: string - raw_name: FIPSCode - - name: jurisdiction_name - dtype: string - raw_name: Jurisdiction_Name - - name: year - dtype: int64 - raw_name: Year - - name: state - dtype: string - raw_name: State_Full - - name: state_abbr - dtype: string - raw_name: State_Abbr - - name: registered_eligible_voters - dtype: int64 - raw_name: A1a - - name: active_voters - dtype: int64 - raw_name: A1b - - name: inactive_voters - dtype: float64 - raw_name: A1c - - name: total_registrations_received - dtype: int64 - raw_name: A3a - - name: new_valid_registrations - dtype: int64 - raw_name: A3b - - name: pre_registrations - dtype: int64 - raw_name: A3c - - name: duplicate_registrations - dtype: int64 - raw_name: A3d - - name: rejected_registrations - dtype: int64 - raw_name: A3e - - name: intrajurisdiction_registration_updates - dtype: int64 - raw_name: A3f - - name: interjurisdiction_registration_updates - dtype: int64 - raw_name: A3g - - name: total_forms_mail_fax_email - dtype: int64 - raw_name: A4a - - name: new_registrations_mail_fax_email - 
dtype: int64 - raw_name: A5a - - name: duplicate_registrations_mail_fax_email - dtype: int64 - raw_name: A6a - - name: rejected_registrations_mail_fax_email - dtype: int64 - raw_name: A7a - - name: total_forms_in_person - dtype: int64 - raw_name: A4b - - name: new_registrations_in_person - dtype: int64 - raw_name: A5b - - name: duplicate_registrations_in_person - dtype: int64 - raw_name: A6b - - name: rejected_registrations_in_person - dtype: int64 - raw_name: A7b - - name: total_forms_online - dtype: int64 - raw_name: A4c - - name: new_registrations_online - dtype: int64 - raw_name: A5c - - name: duplicate_registrations_online - dtype: int64 - raw_name: A6c - - name: rejected_registrations_online - dtype: int64 - raw_name: A7c - - name: total_forms_dmv - dtype: int64 - raw_name: A4d - - name: new_registrations_dmv - dtype: int64 - raw_name: A5d - - name: duplicate_registrations_dmv - dtype: int64 - raw_name: A6d - - name: rejected_registrations_dmv - dtype: int64 - raw_name: A7d - - name: total_forms_mandatory_nvra - dtype: int64 - raw_name: A4e - - name: new_registrations_mandatory_nvra - dtype: int64 - raw_name: A5e - - name: duplicate_registrations_mandatory_nvra - dtype: int64 - raw_name: A6e - - name: rejected_registrations_mandatory_nvra - dtype: int64 - raw_name: A7e - - name: total_forms_disability_agency - dtype: int64 - raw_name: A4f - - name: new_registrations_disability_agency - dtype: int64 - raw_name: A5f - - name: duplicate_registrations_disability_agency - dtype: int64 - raw_name: A6f - - name: rejected_registrations_disability_agency - dtype: int64 - raw_name: A7f - - name: total_forms_armed_forces - dtype: int64 - raw_name: A4g - - name: new_registrations_armed_forces - dtype: int64 - raw_name: A5g - - name: duplicate_registrations_armed_forces - dtype: int64 - raw_name: A6g - - name: rejected_registrations_armed_forces - dtype: int64 - raw_name: A7g - - name: total_forms_discretionary_nvra - dtype: int64 - raw_name: A4h - - name: 
new_registrations_discretionary_nvra - dtype: int64 - raw_name: A5h - - name: duplicate_registrations_discretionary_nvra - dtype: int64 - raw_name: A6h - - name: rejected_registrations_discretionary_nvra - dtype: int64 - raw_name: A7h - - name: total_forms_advocacy_groups - dtype: int64 - raw_name: A4i - - name: new_registrations_advocacy_groups - dtype: int64 - raw_name: A5i - - name: duplicate_registrations_advocacy_groups - dtype: int64 - raw_name: A6i - - name: rejected_registrations_advocacy_groups - dtype: int64 - raw_name: A7i - - name: confirmation_notices_sent_total - dtype: int64 - raw_name: A8a - - name: confirmation_notices_undeliverable - dtype: int64 - raw_name: A8d - - name: confirmation_notices_status_unknown - dtype: int64 - raw_name: A8e - - name: voters_removed_total_2020_2022 - dtype: int64 - raw_name: A9a - - name: voters_removed_felony - dtype: int64 - raw_name: A9d - - name: voters_removed_nonresponse - dtype: int64 - raw_name: A9e - - name: c1a_mail_transmitted_total - dtype: int64 - raw_name: C1a - - name: c1b_mail_returned_by_voters_total - dtype: int64 - raw_name: C1b - - name: c8a_total_mail_ballots_counted - dtype: int64 - raw_name: C8a - - name: c9a_total_mail_ballots_rejected - dtype: int64 - raw_name: C9a - - name: c9b_mail_ballots_rejected_late - dtype: int64 - raw_name: C9b - - name: c9c_missing_voter_signature - dtype: int64 - raw_name: C9c - - name: c9d_missing_witness_signature - dtype: int64 - raw_name: C9d - - name: c9e_non_matching_voter_signature - dtype: int64 - raw_name: C9e - - name: c9f_unofficial_envelope - dtype: int64 - raw_name: C9f - - name: c9g_ballot_missing_from_envelope - dtype: int64 - raw_name: C9g - - name: c9i_multiple_ballots_one_envelope - dtype: int64 - raw_name: C9i - - name: c9j_envelope_not_sealed - dtype: int64 - raw_name: C9j - - name: c9l_no_resident_address_on_envelope - dtype: int64 - raw_name: C9l - - name: c9m_voter_deceased - dtype: int64 - raw_name: C9m - - name: c9n_already_voted - dtype: 
int64 - raw_name: C9n - - name: c9o_missing_documentation - dtype: int64 - raw_name: C9o - - name: c9q_no_ballot_application - dtype: int64 - raw_name: C9q - - name: c9r_rejected_other_1 - dtype: int64 - raw_name: C9r - - name: c9s_rejected_other_2 - dtype: int64 - raw_name: C9s - - name: c9t_rejected_other_3 - dtype: int64 - raw_name: C9t - - name: e1a_total_provisional_ballots_cast - dtype: int64 - raw_name: E1a - - name: e1b_provisional_ballots_fully_counted - dtype: int64 - raw_name: E1b - - name: e1c_provisional_ballots_partially_counted - dtype: int64 - raw_name: E1c - - name: e1d_provisional_ballots_rejected - dtype: int64 - raw_name: E1d - - name: e2a_provisional_cast_voter_not_on_list - dtype: int64 - raw_name: E2a - - name: e2b_provisional_cast_voter_lacked_id - dtype: int64 - raw_name: E2b - - name: e2c_official_challenged_eligibility - dtype: int64 - raw_name: E2c - - name: e2d_another_person_challenged_eligibility - dtype: int64 - raw_name: E2d - - name: e2e_voter_not_resident - dtype: int64 - raw_name: E2e - - name: e2f_registration_not_updated - dtype: int64 - raw_name: E2f - - name: e2g_did_not_surrender_mail_ballot - dtype: int64 - raw_name: E2g - - name: e2h_judge_extended_voting_hours - dtype: int64 - raw_name: E2h - - name: e3b_provisional_rejected_not_registered - dtype: int64 - raw_name: E3b - - name: e3c_provisional_rejected_wrong_jurisdiction - dtype: int64 - raw_name: E3c - - name: e3d_provisional_rejected_wrong_precinct - dtype: int64 - raw_name: E3d - - name: e3e_provisional_rejected_no_id - dtype: int64 - raw_name: E3e - - name: e3f_provisional_rejected_incomplete - dtype: int64 - raw_name: E3f - - name: e3g_provisional_rejected_ballot_missing - dtype: int64 - raw_name: E3g - - name: e3h_provisional_rejected_no_signature - dtype: int64 - raw_name: E3h - - name: e3i_provisional_rejected_non_matching_signature - dtype: int64 - raw_name: E3i - - name: e3j_provisional_rejected_already_voted - dtype: int64 - raw_name: E3j diff --git 
a/eavs/assets/manifest.jsonl b/eavs/assets/manifest.jsonl index 797749b..d894b3a 100644 --- a/eavs/assets/manifest.jsonl +++ b/eavs/assets/manifest.jsonl @@ -1,3 +1,4 @@ +{"year": 2024, "version": "1.0", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2025-06/2024_EAVS_for_Public_Release_V1_xlsx.xlsx", "sha256sum": "5456a0beb07c83559c60bb84acb1af7f085cbe68a6a19487925b90becd7d61be"} {"year": 2022, "version": "1.1", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2023-12/2022_EAVS_for_Public_Release_V1.1.xlsx", "sha256sum": "ebcc51eade35dd3b05d65067e64d57e4267a29c9e2302ef0185eb6ef0e16c6ed"} {"year": 2020, "version": "1.2", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2023-12/2020_EAVS_for_Public_Release_V1.2.xlsx", "sha256sum": "e93dfd906cd2ff93ae101ed647731baa0d9ca9c21a7ca6c0bf1f6c2bfbe514b9"} {"year": "timeseries", "version": "1.0", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2025-05/EAVS_Time_Series_Dataset.xlsx", "sha256sum": "2818284def214205da0eb7839bb749dc25279a18562fdd1fdd66af5bfab4124a"} diff --git a/eavs/clean.py b/eavs/clean.py index d04e58d..fab93b7 100644 --- a/eavs/clean.py +++ b/eavs/clean.py @@ -6,31 +6,48 @@ import pandas as pd import pandera as pa -from pandera.typing import DataFrame, Series +from pandera.typing import DataFrame, Series, String # ----------------- # 1. Configuration # ----------------- +# PROJ_ROOT = directory above 'eavs' (ie. /home/user/eavs_clc) PROJ_ROOT = Path(__file__).resolve().parent.parent CONFIG_PATH = PROJ_ROOT / 'eavs' / 'assets' / 'column_mappings' -def load_config(year: int) -> Dict[str, Any]: +def load_config(year: int) -> List[Dict[str, Any]]: """ Dynamically loads the year-specific config file (e.g., 2022.yaml). - If the file is not found, it logs a warning and returns a safe empty dictionary. + Handles top-level nesting (e.g., under a 'columns' key) to ensure + a clean list of mappings is returned. 
""" config_file = CONFIG_PATH / f'{year}.yaml' if not config_file.exists(): log.warning(f"Config file not found for year {year}: {config_file}. Cleaning will proceed without specific variable handling.") - return {} + return [] try: with open(config_file, 'r') as f: - return yaml.safe_load(f) + data = yaml.safe_load(f) + + # If the loaded data is a dictionary, extract the list from the 'columns' key. + if isinstance(data, dict) and 'columns' in data: + log.debug("Extracted column list from 'columns' key.") + return data['columns'] + + # If it's already a list (flat structure), return it directly. + if isinstance(data, list): + log.debug("Loaded config as flat list.") + return data + + # Fallback for unexpected structure + log.warning(f"Config for year {year} is in an unexpected format. Returning empty list.") + return [] + except Exception as e: log.error(f"Error loading config file {config_file}: {e}") - return {} + return [] # ----------------- # 2. Schema Definition @@ -38,14 +55,12 @@ def load_config(year: int) -> Dict[str, Any]: class CleanedEAVSSchema(pa.DataFrameModel): # FIPS codes must be 5-digit strings - fips_code: Series[str] = pa.Field(str_matches=r'^\d{5}$') + fips_code: Series[String] = pa.Field(str_matches=r'^\d{5}$') # Year of the EAVS data (e.g., 2022) year: Series[int] = pa.Field(ge=2000, le=2030) class Config: - # Set strict=False to allow all EAVS variable columns (like D8) - # to exist in the DataFrame without being explicitly listed in the Schema. strict = False coerce = True @@ -55,89 +70,75 @@ class Config: # 3. Cleaning Functions # ----------------- -def clean_data(year: int, config: Dict[str, Any]) -> pd.DataFrame: +def clean_data(year: int, config: List[Dict[str, Any]]) -> pd.DataFrame: """ Loads raw EAVS data for a given year, applies renaming and type conversion based on the loaded configuration, and ensures robust column selection. 
+ + NOTE: This function relies on raw data being found in: + {PROJ_ROOT}/data/raw/{year}/{filename}.xlsx """ - # Look inside the raw/{year} folder raw_data_dir = PROJ_ROOT / 'data' / 'raw' / str(year) - - # Use rglob to recursively search the version folders for any Excel file excel_files = list(raw_data_dir.rglob('*.xls*')) if not excel_files: log.warning(f"Raw EAVS file not found for year {year} within {raw_data_dir}") return pd.DataFrame() - # Assume the first found file is the correct one data_path = excel_files[0] - log.info(f"Cleaning data for {year} using file: {data_path.name}") - # Prepare column renaming map from config (list of dicts) - # The config is a list of dictionaries, where each dict has 'raw_name' and 'name' - mapping = {col['raw_name']: col['name'] for col in config} - - # Prepare dtypes for efficient loading (optional, but good practice) - dtypes = {col['raw_name']: str for col in config} + # Robustly create mapping, skipping malformed config entries + valid_configs = [ + c for c in config + if isinstance(c, dict) and 'raw_name' in c and 'name' in c + ] + if len(valid_configs) != len(config): + log.warning(f"Skipped {len(config) - len(valid_configs)} malformed entries in the {year} column mapping file.") + + mapping = {col['raw_name']: col['name'] for col in valid_configs} + dtypes = {col['raw_name']: str for col in valid_configs} # Load raw data try: - # We only load the columns specified in the config for speed and memory efficiency df = pd.read_excel(data_path, sheet_name=0, engine='openpyxl', dtype=dtypes) except Exception as e: log.error(f"Error loading {data_path}: {e}") return pd.DataFrame() - # Standardize column names + # Standardize FIPS column name fips_col = next((col for col in df.columns if 'FIPS' in str(col).upper()), None) if fips_col: df = df.rename(columns={fips_col: 'fips_code'}) else: log.error(f"FIPS code column not found in {year} data.") - # If FIPS is missing, we can't proceed with cleaning return pd.DataFrame() - # Add year column + # Add year 
column and normalize FIPS df['year'] = year - - # --- FIX: Normalize FIPS codes --- - # Convert to string, pad with leading zeros if needed, and truncate 10-digit codes to first 5 digits df['fips_code'] = df['fips_code'].astype(str).str.zfill(5).str[:5] - # --- END FIX --- - # --- START INTEGRATION: Apply YAML renaming & Robust Filtering (KeyError Fix) --- - - # 1. Determine which columns specified in the mapping actually exist in the DataFrame - # This prevents the KeyError when selecting a non-existent column. + # Apply YAML renaming & Robust Filtering mapping_keys = mapping.keys() existing_keys = [k for k in mapping_keys if k in df.columns] - # We must also ensure we keep the 'fips_code' and 'year' columns cols_to_select = existing_keys + ['fips_code', 'year'] - # 2. Filter the DataFrame to keep only the necessary columns (and FIPS/year) df = df.filter(items=cols_to_select, axis=1) - # 3. Apply the renaming *only* to the existing keys - # We create a mapping subset for renaming based on the existing keys renaming_map = {k: mapping[k] for k in existing_keys} df = df.rename(columns=renaming_map) - # 4. Convert numerical columns to Int64Dtype (allows NaN) - # EAVS variables are typically uppercase letter followed by numbers (like A1, B1, etc.) + # Convert numerical columns to Int64Dtype (EAVS variables: A1, B2, etc.) for col in df.columns: - # Check if the renamed column matches the EAVS variable pattern (e.g., 'A1', 'C8') if re.match(r'^[A-Z]\d+$', str(col)): try: + # Use nullable integer dtype df[col] = pd.to_numeric(df[col], errors='coerce').astype(pd.Int64Dtype()) except Exception: log.warning(f"Could not convert column {col} to integer type.") df[col] = pd.NA - # --- END INTEGRATION --- - return df def combine_data(cleaned_dfs: List[pd.DataFrame]) -> pd.DataFrame: @@ -146,21 +147,49 @@ def combine_data(cleaned_dfs: List[pd.DataFrame]) -> pd.DataFrame: combined_df = pd.concat(cleaned_dfs, ignore_index=True) return combined_df + # ----------------- -# 4. 
Main Execution +# 4. New Saving Function (Added to meet requirements) +# ----------------- +def save_dataframes(df: pd.DataFrame, filename: str, output_dir: Path): + """Saves a DataFrame to Parquet, XLSX, and CSV formats.""" + log.info(f"Saving {filename} data to multiple formats in {output_dir.name}/") + + # Ensure output directory exists (redundant with main, but safer here) + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. Parquet + parquet_path = output_dir / f"{filename}.parquet" + df.to_parquet(parquet_path, index=False) + log.info(f"Saved: {parquet_path.name}") + + # 2. Excel (XLSX) + excel_path = output_dir / f"{filename}.xlsx" + df.to_excel(excel_path, index=False) + log.info(f"Saved: {excel_path.name}") + + # 3. CSV + csv_path = output_dir / f"{filename}.csv" + df.to_csv(csv_path, index=False) + log.info(f"Saved: {csv_path.name}") + + +# ----------------- +# 5. Main Execution (Modified to use new function) # ----------------- def main(): - """Main function to clean and combine EAVS data.""" - # These years are now pulled from the combined history of the first two rebases - years = [2022, 2024] + """Main function to clean and combine EAVS data, saving all formats.""" + years = [2020, 2022, 2024] + + # NEW: Define output directory and ensure it exists + output_dir = PROJ_ROOT / 'data' / 'cleaned' + output_dir.mkdir(parents=True, exist_ok=True) cleaned_dataframes = [] for year in years: - # Dynamically load configuration for the year year_config = load_config(year) - # Check if config is loaded and non-empty if not year_config: log.warning(f"Skipping cleaning for year {year} due to missing or empty config.") continue @@ -169,6 +198,9 @@ def main(): if not df.empty: cleaned_dataframes.append(df) + # **NEW:** Save individual year file in all formats + save_dataframes(df, f'{year}_cleaned', output_dir) + if not cleaned_dataframes: log.error("No valid dataframes were cleaned. 
Exiting.") return @@ -176,19 +208,16 @@ def main(): combined_df = combine_data(cleaned_dataframes) cleaned_df = combined_df.copy() - # --- PANDERA FIX: ensure fips_code is string before schema validation --- + # Ensure fips_code is string before schema validation cleaned_df['fips_code'] = cleaned_df['fips_code'].astype(str) - # --- END FIX --- try: log.info(f"Validating combined data with {len(cleaned_df)} rows...") schema.validate(cleaned_df) log.success("Data validation successful!") - # Save the cleaned and validated file - output_path = PROJ_ROOT / 'data' / 'eavs_combined_cleaned.parquet' - cleaned_df.to_parquet(output_path, index=False) - log.info(f"Cleaned data saved to {output_path}") + # **NEW:** Save combined file in all formats + save_dataframes(cleaned_df, 'eavs_combined_cleaned', output_dir) except pa.errors.SchemaError as e: log.error(f"Data validation failed: {e}") diff --git a/run_pipeline.py b/run_pipeline.py new file mode 100644 index 0000000..27cd84b --- /dev/null +++ b/run_pipeline.py @@ -0,0 +1,18 @@ +# Pipeline Runner + +import sys +from pathlib import Path + +# Add the parent directory of 'eavs' to the system path to allow module import +sys.path.append(str(Path(__file__).parent)) + +from eavs.clean import main +from loguru import logger as log + +if __name__ == '__main__': + log.info("Starting consolidated EAVS cleaning pipeline test run...") + try: + main() + except Exception as e: + log.error(f"Pipeline crashed during execution: {e}") + log.info("Test run finished.") \ No newline at end of file From 3d76709156e493f0f04a1981829911ac193a22dc Mon Sep 17 00:00:00 2001 From: yashin Date: Sun, 7 Dec 2025 16:25:00 +0800 Subject: [PATCH 3/4] CHORE: Add utility script for calculating survey SHA256 hashes Introduces the calculate_sha256 utility script to ensure data integrity. 
--- calculate_sha256.py | 34 ++++++++++++++++++++++++++++++++++ utils/calculate_sha256.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 calculate_sha256.py create mode 100644 utils/calculate_sha256.py diff --git a/calculate_sha256.py b/calculate_sha256.py new file mode 100644 index 0000000..532b61d --- /dev/null +++ b/calculate_sha256.py @@ -0,0 +1,34 @@ +import hashlib +from pathlib import Path +import sys + +def calculate_sha256(filepath: Path): + """Calculates the SHA256 hash for a given file.""" + if not filepath.exists(): + print(f"Error: File not found at {filepath}") + return + + sha256_hash = hashlib.sha256() + try: + with open(filepath, "rb") as f: + # Read and update hash string value in chunks of 4K + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + + calculated_hash = sha256_hash.hexdigest() + print("-" * 50) + print(f"File Path: {filepath}") + print(f"SHA256 Checksum: {calculated_hash}") + print("-" * 50) + + except Exception as e: + print(f"An error occurred while reading the file: {e}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + target_path = Path("data") / "raw" / "2024" / "1.0" / "2024_EAVS_for_Public_Release_V1_xlsx.xlsx" + else: + # Allow passing the path as a command-line argument + target_path = Path(sys.argv[1]) + + calculate_sha256(target_path) \ No newline at end of file diff --git a/utils/calculate_sha256.py b/utils/calculate_sha256.py new file mode 100644 index 0000000..532b61d --- /dev/null +++ b/utils/calculate_sha256.py @@ -0,0 +1,34 @@ +import hashlib +from pathlib import Path +import sys + +def calculate_sha256(filepath: Path): + """Calculates the SHA256 hash for a given file.""" + if not filepath.exists(): + print(f"Error: File not found at {filepath}") + return + + sha256_hash = hashlib.sha256() + try: + with open(filepath, "rb") as f: + # Read and update hash string value in chunks of 4K + for byte_block in iter(lambda: f.read(4096), 
b""): + sha256_hash.update(byte_block) + + calculated_hash = sha256_hash.hexdigest() + print("-" * 50) + print(f"File Path: {filepath}") + print(f"SHA256 Checksum: {calculated_hash}") + print("-" * 50) + + except Exception as e: + print(f"An error occurred while reading the file: {e}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + target_path = Path("data") / "raw" / "2024" / "1.0" / "2024_EAVS_for_Public_Release_V1_xlsx.xlsx" + else: + # Allow passing the path as a command-line argument + target_path = Path(sys.argv[1]) + + calculate_sha256(target_path) \ No newline at end of file From fac7455f5668f720b264741fe2fbc49c7a4a75b0 Mon Sep 17 00:00:00 2001 From: yashin Date: Mon, 8 Dec 2025 10:32:53 +0800 Subject: [PATCH 4/4] "FIX: Add openpyxl dependency to pyproject.toml for Excel file reading." --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 45cccdf..21a7b2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "python-dotenv", "pyyaml", "streamlit", + "openpyxl" ] [build-system]