import hashlib
import sys
from pathlib import Path
from typing import Optional


def calculate_sha256(filepath: Path, *, chunk_size: int = 65536) -> Optional[str]:
    """Print and return the SHA256 checksum of a file.

    The file is read in binary mode in blocks of ``chunk_size`` bytes so that
    large files never have to fit in memory. On success a small report
    (path and checksum) is printed to stdout and the hex digest is returned.

    Args:
        filepath: Location of the file to hash. A ``str`` is accepted as well
            as a ``Path``.
        chunk_size: Number of bytes read per iteration. Must be at least 1;
            the value does not affect the resulting digest.

    Returns:
        The lowercase hexadecimal SHA256 digest, or ``None`` when the file
        does not exist or could not be read (an error message is printed in
        that case).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # read(0) returns b"" immediately, which would silently yield the
        # digest of an empty file instead of the real one.
        raise ValueError("chunk_size must be a positive integer")

    filepath = Path(filepath)
    sha256_hash = hashlib.sha256()
    try:
        # Open first and handle the failure, rather than checking exists()
        # beforehand: the file may disappear between the check and the open.
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(chunk_size), b""):
                sha256_hash.update(byte_block)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except OSError as e:
        # Covers directories, permission problems and low-level read errors.
        print(f"An error occurred while reading the file: {e}")
        return None

    calculated_hash = sha256_hash.hexdigest()
    print("-" * 50)
    print(f"File Path: {filepath}")
    print(f"SHA256 Checksum: {calculated_hash}")
    print("-" * 50)
    return calculated_hash


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # No argument given: fall back to the 2024 raw release workbook.
        target_path = Path("data") / "raw" / "2024" / "1.0" / "2024_EAVS_for_Public_Release_V1_xlsx.xlsx"
    else:
        # Allow passing the path as a command-line argument
        target_path = Path(sys.argv[1])

    # Report failure through the exit status so shell pipelines can react.
    sys.exit(0 if calculate_sha256(target_path) is not None else 1)
------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." + + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." + + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." + + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." + + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." + + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." 
+ + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." + + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_box_availability_comments`** + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." + + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." + + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." + + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office." + + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_boxes_logistics_comments`** + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." 
+ + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process." + + - name: mail_ballots_successfully_cured + dtype: int64 + raw_name: C7b + description: "Mail ballots successfully cured." + + - name: mail_ballots_unsuccessfully_cured + dtype: int64 + raw_name: C7c + description: "Mail ballots unsuccessfully cured." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." + + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." + + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." + + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature." + + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." 
+ + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." + + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." + + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." + + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." + + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." + + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." 
+ + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." + + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." + + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." + + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." + + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected." + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." + + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." 
+ + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." + + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." + + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." + + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." + + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." 
+ + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_3 + dtype: int64 + raw_name: E2l + description: "Provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_other_3_text + dtype: string + raw_name: E2l_Other + description: "Text description for provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." + + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." + + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." + + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." 
+ + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." \ No newline at end of file diff --git a/eavs/assets/column_mappings/2022.yaml b/eavs/assets/column_mappings/2022.yaml index cd69b9a..95bfd3c 100644 --- a/eavs/assets/column_mappings/2022.yaml +++ b/eavs/assets/column_mappings/2022.yaml @@ -1,4 +1,4 @@ -- version: "1.1" + version: "1.1" columns: - name: fips_code dtype: string @@ -168,3 +168,451 @@ - name: voters_removed_nonresponse dtype: int64 raw_name: A9e + + # ------------------------------- + # Mail Ballot Variables (C1) + # ------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." + + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." + + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." 
+ + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." + + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." + + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." + + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." + + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_box_availability_comments`** + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." + + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." 
+ + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." + + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office." + + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_boxes_logistics_comments`** + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." + + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." + + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." + + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." 
+ + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature." + + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." + + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." + + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." + + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." + + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." 
+ + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." + + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." + + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." + + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." + + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." + + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." 
+ + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected." + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." + + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." + + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." + + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." + + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." + + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." 
+ + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." + + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." + + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." 
+ + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." + + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." diff --git a/eavs/assets/column_mappings/2024.yaml b/eavs/assets/column_mappings/2024.yaml new file mode 100644 index 0000000..70a85cc --- /dev/null +++ b/eavs/assets/column_mappings/2024.yaml @@ -0,0 +1,638 @@ + version: "1.0" + columns: + - name: fips_code + dtype: string + raw_name: FIPSCode + - name: jurisdiction_name + dtype: string + raw_name: Jurisdiction_Name + - name: state + dtype: string + raw_name: State_Full + - name: state_abbr + dtype: string + raw_name: State_Abbr + - name: registered_eligible_voters + dtype: int64 + raw_name: A1a + - name: active_voters + dtype: int64 + raw_name: A1b + - name: inactive_voters + dtype: float64 + raw_name: A1c + - name: total_registrations_received + dtype: int64 + raw_name: A3a + - name: new_valid_registrations + dtype: int64 + raw_name: A3b + - name: pre_registrations + dtype: int64 + raw_name: A3c + - name: duplicate_registrations + dtype: int64 + raw_name: A3d + - name: rejected_registrations + dtype: int64 + raw_name: A3e + - name: intrajurisdiction_registration_updates + dtype: int64 + raw_name: A3f + - name: interjurisdiction_registration_updates + dtype: int64 + raw_name: A3g + - name: total_forms_mail_fax_email + dtype: int64 + 
raw_name: A4a + - name: new_registrations_mail_fax_email + dtype: int64 + raw_name: A5a + - name: duplicate_registrations_mail_fax_email + dtype: int64 + raw_name: A6a + - name: rejected_registrations_mail_fax_email + dtype: int64 + raw_name: A7a + - name: total_forms_in_person + dtype: int64 + raw_name: A4b + - name: new_registrations_in_person + dtype: int64 + raw_name: A5b + - name: duplicate_registrations_in_person + dtype: int64 + raw_name: A6b + - name: rejected_registrations_in_person + dtype: int64 + raw_name: A7b + - name: total_forms_online + dtype: int64 + raw_name: A4c + - name: new_registrations_online + dtype: int64 + raw_name: A5c + - name: duplicate_registrations_online + dtype: int64 + raw_name: A6c + - name: rejected_registrations_online + dtype: int64 + raw_name: A7c + - name: total_forms_dmv + dtype: int64 + raw_name: A4d + - name: new_registrations_dmv + dtype: int64 + raw_name: A5d + - name: duplicate_registrations_dmv + dtype: int64 + raw_name: A6d + - name: rejected_registrations_dmv + dtype: int64 + raw_name: A7d + - name: total_forms_mandatory_nvra + dtype: int64 + raw_name: A4e + - name: new_registrations_mandatory_nvra + dtype: int64 + raw_name: A5e + - name: duplicate_registrations_mandatory_nvra + dtype: int64 + raw_name: A6e + - name: rejected_registrations_mandatory_nvra + dtype: int64 + raw_name: A7e + - name: total_forms_disability_agency + dtype: int64 + raw_name: A4f + - name: new_registrations_disability_agency + dtype: int64 + raw_name: A5f + - name: duplicate_registrations_disability_agency + dtype: int64 + raw_name: A6f + - name: rejected_registrations_disability_agency + dtype: int64 + raw_name: A7f + - name: total_forms_armed_forces + dtype: int64 + raw_name: A4g + - name: new_registrations_armed_forces + dtype: int64 + raw_name: A5g + - name: duplicate_registrations_armed_forces + dtype: int64 + raw_name: A6g + - name: rejected_registrations_armed_forces + dtype: int64 + raw_name: A7g + - name: 
total_forms_discretionary_nvra + dtype: int64 + raw_name: A4h + - name: new_registrations_discretionary_nvra + dtype: int64 + raw_name: A5h + - name: duplicate_registrations_discretionary_nvra + dtype: int64 + raw_name: A6h + - name: rejected_registrations_discretionary_nvra + dtype: int64 + raw_name: A7h + - name: total_forms_advocacy_groups + dtype: int64 + raw_name: A4i + - name: new_registrations_advocacy_groups + dtype: int64 + raw_name: A5i + - name: duplicate_registrations_advocacy_groups + dtype: int64 + raw_name: A6i + - name: rejected_registrations_advocacy_groups + dtype: int64 + raw_name: A7i + - name: confirmation_notices_sent_total + dtype: int64 + raw_name: A8a + - name: confirmation_notices_undeliverable + dtype: int64 + raw_name: A8d + - name: confirmation_notices_status_unknown + dtype: int64 + raw_name: A8e + - name: voters_removed_total_2020_2022 + dtype: int64 + raw_name: A9a + - name: voters_removed_felony + dtype: int64 + raw_name: A9d + - name: voters_removed_nonresponse + dtype: int64 + raw_name: A9e + + # ------------------------------- + # Mail Ballot Variables (C1) + # ------------------------------- + - name: mail_transmitted_total + dtype: int64 + raw_name: C1a + description: "Total number of mail ballots transmitted to voters." + + - name: mail_returned_by_voters + dtype: int64 + raw_name: C1b + description: "Number of mail ballots returned by voters." + + - name: mail_returned_undeliverable + dtype: int64 + raw_name: C1c + description: "Number of mail ballots returned as undeliverable." + + - name: mail_voided + dtype: int64 + raw_name: C1d + description: "Number of mail ballots voided by election officials." + + - name: mail_voted_in_person + dtype: int64 + raw_name: C1e + description: "Number of mail ballots that were instead voted in-person." + + - name: mail_unreturned + dtype: int64 + raw_name: C1f + description: "Number of mail ballots that were not returned by voters." 
+ + - name: mail_other_1 + dtype: int64 + raw_name: C1g + description: "Mail ballots categorized as 'Other 1'." + + - name: mail_other_1_text + dtype: string + raw_name: C1g_Other + description: "Text description for 'Other 1' mail ballots." + + - name: mail_other_2 + dtype: int64 + raw_name: C1h + description: "Mail ballots categorized as 'Other 2'." + + - name: mail_other_2_text + dtype: string + raw_name: C1h_Other + description: "Text description for 'Other 2' mail ballots." + + - name: mail_other_3 + dtype: int64 + raw_name: C1i + description: "Mail ballots categorized as 'Other 3'." + + - name: mail_other_3_text + dtype: string + raw_name: C1i_Other + description: "Text description for 'Other 3' mail ballots." + + - name: mail_comments + dtype: string + raw_name: C1Comments + description: "General comments or notes regarding mail ballots." + + # ------------------------------- + # Permanent Mail Registrants (C2) + # ------------------------------- + - name: total_transmitted_permanent_mail + dtype: int64 + raw_name: C2a + description: "Total mail ballots transmitted to permanent mail registrants." + + - name: permanent_mail_comments + dtype: string + raw_name: C2Comments + description: "Comments regarding permanent mail registrants." + + # ------------------------------- + # Drop Box Variables (C3–C5) + # ------------------------------- + - name: drop_boxes_total + dtype: int64 + raw_name: C3a + description: "Total number of drop boxes available." + + - name: drop_box_availability_comments + dtype: string + raw_name: C3Comments + description: "Comments regarding drop box availability." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_box_availability_comments`** + + - name: drop_boxes_election_day_total + dtype: int64 + raw_name: C4a + description: "Total drop boxes on Election Day." + + - name: drop_boxes_election_day_at_office + dtype: int64 + raw_name: C4b + description: "Election Day drop boxes located at election office." 
+ + - name: drop_boxes_election_day_not_at_office + dtype: int64 + raw_name: C4c + description: "Election Day drop boxes not located at election office." + + - name: drop_boxes_early_voting_total + dtype: int64 + raw_name: C5a + description: "Total drop boxes during early voting period." + + - name: drop_boxes_early_voting_at_office + dtype: int64 + raw_name: C5b + description: "Early voting drop boxes located at election office." + + - name: drop_boxes_early_voting_not_at_office + dtype: int64 + raw_name: C5c + description: "Early voting drop boxes not located at election office." + + - name: drop_boxes_logistics_comments + dtype: string + raw_name: C4_C5Comments + description: "Comments for drop box logistics (Election Day + Early Voting)." + # **FIXED: Changed name from `drop_boxes_comments` to `drop_boxes_logistics_comments`** + + # ------------------------------- + # Mail Ballot Returns via Drop Box (C6) + # ------------------------------- + - name: mail_ballots_returned_via_drop_box + dtype: int64 + raw_name: C6a + description: "Total mail ballots returned via drop boxes." + + - name: drop_box_return_comments + dtype: string + raw_name: C6Comments + description: "Comments about mail ballots returned via drop box." + + # ------------------------------- + # Curing Mail Ballots (C7) + # ------------------------------- + - name: mail_ballots_entered_curing + dtype: int64 + raw_name: C7a + description: "Mail ballots that entered the curing process." + + - name: mail_ballots_successfully_cured + dtype: int64 + raw_name: C7b + description: "Mail ballots successfully cured." + + - name: mail_ballots_unsuccessfully_cured + dtype: int64 + raw_name: C7c + description: "Mail ballots unsuccessfully cured." + + - name: curing_comments + dtype: string + raw_name: C7Comments + description: "Comments about curing process." 
+ + # ------------------------------- + # Mail Ballots Counted and Rejected (C8–C9) + # ------------------------------- + - name: mail_ballots_counted + dtype: int64 + raw_name: C8a + description: "Total mail ballots counted." + + - name: mail_ballots_counted_comments + dtype: string + raw_name: C8Comments + description: "Comments about counted mail ballots." + + - name: mail_ballots_rejected_total + dtype: int64 + raw_name: C9a + description: "Total mail ballots rejected." + + - name: mail_ballots_rejected_late + dtype: int64 + raw_name: C9b + description: "Mail ballots rejected because submitted late." + + - name: mail_ballots_rejected_missing_voter_signature + dtype: int64 + raw_name: C9c + description: "Mail ballots rejected due to missing voter signature." + + - name: mail_ballots_rejected_missing_witness_signature + dtype: int64 + raw_name: C9d + description: "Mail ballots rejected due to missing witness signature." + + - name: mail_ballots_rejected_non_matching_voter_signature + dtype: int64 + raw_name: C9e + description: "Mail ballots rejected due to non-matching voter signature." + + - name: mail_ballots_rejected_unofficial_envelope + dtype: int64 + raw_name: C9f + description: "Mail ballots rejected because envelope was unofficial." + + - name: mail_ballots_rejected_ballot_missing_from_envelope + dtype: int64 + raw_name: C9g + description: "Mail ballots rejected because ballot was missing from envelope." + + - name: mail_ballots_rejected_no_secrecy_envelope + dtype: int64 + raw_name: C9h + description: "Mail ballots rejected because of missing secrecy envelope." + + - name: mail_ballots_rejected_multiple_ballots_one_envelope + dtype: int64 + raw_name: C9i + description: "Mail ballots rejected because multiple ballots were in one envelope." + + - name: mail_ballots_rejected_envelope_not_sealed + dtype: int64 + raw_name: C9j + description: "Mail ballots rejected because envelope was not sealed." 
+ + - name: mail_ballots_rejected_no_postmark + dtype: int64 + raw_name: C9k + description: "Mail ballots rejected because no postmark was present." + + - name: mail_ballots_rejected_no_resident_address + dtype: int64 + raw_name: C9l + description: "Mail ballots rejected due to missing resident address on envelope." + + - name: mail_ballots_rejected_voter_deceased + dtype: int64 + raw_name: C9m + description: "Mail ballots rejected because voter was deceased." + + - name: mail_ballots_rejected_voter_already_voted + dtype: int64 + raw_name: C9n + description: "Mail ballots rejected because voter already voted." + + - name: mail_ballots_rejected_missing_documentation + dtype: int64 + raw_name: C9o + description: "Mail ballots rejected due to missing documentation." + + - name: mail_ballots_rejected_voter_not_eligible + dtype: int64 + raw_name: C9p + description: "Mail ballots rejected because voter was not eligible." + + - name: mail_ballots_rejected_no_ballot_application + dtype: int64 + raw_name: C9q + description: "Mail ballots rejected because no ballot application was found." + + - name: mail_ballots_rejected_other_1 + dtype: int64 + raw_name: C9r + description: "Mail ballots rejected categorized as Other 1." + + - name: mail_ballots_rejected_other_1_text + dtype: string + raw_name: C9r_Other + description: "Text description for Other 1 mail ballots rejected." + + - name: mail_ballots_rejected_other_2 + dtype: int64 + raw_name: C9s + description: "Mail ballots rejected categorized as Other 2." + + - name: mail_ballots_rejected_other_2_text + dtype: string + raw_name: C9s_Other + description: "Text description for Other 2 mail ballots rejected." + + - name: mail_ballots_rejected_other_3 + dtype: int64 + raw_name: C9t + description: "Mail ballots rejected categorized as Other 3." + + - name: mail_ballots_rejected_other_3_text + dtype: string + raw_name: C9t_Other + description: "Text description for Other 3 mail ballots rejected." 
+ + - name: mail_ballots_rejected_comments + dtype: string + raw_name: C9Comments + description: "General comments regarding rejected mail ballots." + + # ------------------------------- + # Provisional Ballots (E1–E3) + # ------------------------------- + - name: provisional_ballots_cast_total + dtype: int64 + raw_name: E1a + description: "Total provisional ballots cast." + + - name: provisional_ballots_fully_counted + dtype: int64 + raw_name: E1b + description: "Number of provisional ballots fully counted." + + - name: provisional_ballots_partially_counted + dtype: int64 + raw_name: E1c + description: "Number of provisional ballots partially counted." + + - name: provisional_ballots_rejected_total + dtype: int64 + raw_name: E1d + description: "Number of provisional ballots rejected" + + - name: provisional_ballots_other_status + dtype: int64 + raw_name: E1e + description: "Provisional ballots with other status." + + - name: provisional_ballots_other_status_text + dtype: string + raw_name: E1e_Other + description: "Text description for provisional ballots with other status." + + - name: provisional_ballots_comments + dtype: string + raw_name: E1Comments + description: "General comments regarding provisional ballots." + + - name: provisional_ballots_cast_voter_not_on_list + dtype: int64 + raw_name: E2a + description: "Provisional ballots cast because voter was not on list." + + - name: provisional_ballots_cast_voter_lacked_id + dtype: int64 + raw_name: E2b + description: "Provisional ballots cast because voter lacked ID." + + - name: provisional_ballots_cast_challenged_by_official + dtype: int64 + raw_name: E2c + description: "Provisional ballots cast where election official challenged eligibility." + + - name: provisional_ballots_cast_challenged_by_other + dtype: int64 + raw_name: E2d + description: "Provisional ballots cast where another person challenged eligibility." 
+ + - name: provisional_ballots_cast_voter_not_resident + dtype: int64 + raw_name: E2e + description: "Provisional ballots cast because voter was not resident." + + - name: provisional_ballots_cast_registration_not_updated + dtype: int64 + raw_name: E2f + description: "Provisional ballots cast because voter registration was not updated." + + - name: provisional_ballots_cast_voter_did_not_surrender_mail + dtype: int64 + raw_name: E2g + description: "Provisional ballots cast because voter did not surrender mail ballot." + + - name: provisional_ballots_cast_judge_extended_hours + dtype: int64 + raw_name: E2h + description: "Provisional ballots cast due to judge extending voting hours." + + - name: provisional_ballots_cast_voter_used_sdr + dtype: int64 + raw_name: E2i + description: "Provisional ballots cast where voter used Same-Day Registration (SDR)." + + - name: provisional_ballots_cast_other_1 + dtype: int64 + raw_name: E2j + description: "Provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_1_text + dtype: string + raw_name: E2j_Other + description: "Text description for provisional ballots cast for other reason 1." + + - name: provisional_ballots_cast_other_2 + dtype: int64 + raw_name: E2k + description: "Provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_2_text + dtype: string + raw_name: E2k_Other + description: "Text description for provisional ballots cast for other reason 2." + + - name: provisional_ballots_cast_other_3 + dtype: int64 + raw_name: E2l + description: "Provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_other_3_text + dtype: string + raw_name: E2l_Other + description: "Text description for provisional ballots cast for other reason 3." + + - name: provisional_ballots_cast_comments + dtype: string + raw_name: E2Comments + description: "Comments for provisional ballots cast (E2 group)." 
+ + - name: provisional_ballots_rejected_total_2 + dtype: int64 + raw_name: E3a + description: "Total provisional ballots rejected." + + - name: provisional_ballots_rejected_not_registered + dtype: int64 + raw_name: E3b + description: "Provisional ballots rejected because voter not registered." + + - name: provisional_ballots_rejected_wrong_jurisdiction + dtype: int64 + raw_name: E3c + description: "Provisional ballots rejected due to wrong jurisdiction." + + - name: provisional_ballots_rejected_wrong_precinct + dtype: int64 + raw_name: E3d + description: "Provisional ballots rejected due to wrong precinct." + + - name: provisional_ballots_rejected_no_id + dtype: int64 + raw_name: E3e + description: "Provisional ballots rejected due to no ID." + + - name: provisional_ballots_rejected_incomplete + dtype: int64 + raw_name: E3f + description: "Provisional ballots rejected because incomplete." + + - name: provisional_ballots_rejected_ballot_missing + dtype: int64 + raw_name: E3g + description: "Provisional ballots rejected because ballot missing." + + - name: provisional_ballots_rejected_no_signature + dtype: int64 + raw_name: E3h + description: "Provisional ballots rejected because of no signature." + + - name: provisional_ballots_rejected_non_matching_signature + dtype: int64 + raw_name: E3i + description: "Provisional ballots rejected due to non-matching signature." + + - name: provisional_ballots_rejected_already_voted + dtype: int64 + raw_name: E3j + description: "Provisional ballots rejected because voter already voted." 
\ No newline at end of file diff --git a/eavs/assets/column_mappings/timeseries.yaml b/eavs/assets/column_mappings/timeseries.yaml deleted file mode 100644 index 20530d9..0000000 --- a/eavs/assets/column_mappings/timeseries.yaml +++ /dev/null @@ -1,297 +0,0 @@ ---- -- version: "1.0" - columns: - - name: fips_code - dtype: string - raw_name: FIPSCode - - name: jurisdiction_name - dtype: string - raw_name: Jurisdiction_Name - - name: year - dtype: int64 - raw_name: Year - - name: state - dtype: string - raw_name: State_Full - - name: state_abbr - dtype: string - raw_name: State_Abbr - - name: registered_eligible_voters - dtype: int64 - raw_name: A1a - - name: active_voters - dtype: int64 - raw_name: A1b - - name: inactive_voters - dtype: float64 - raw_name: A1c - - name: total_registrations_received - dtype: int64 - raw_name: A3a - - name: new_valid_registrations - dtype: int64 - raw_name: A3b - - name: pre_registrations - dtype: int64 - raw_name: A3c - - name: duplicate_registrations - dtype: int64 - raw_name: A3d - - name: rejected_registrations - dtype: int64 - raw_name: A3e - - name: intrajurisdiction_registration_updates - dtype: int64 - raw_name: A3f - - name: interjurisdiction_registration_updates - dtype: int64 - raw_name: A3g - - name: total_forms_mail_fax_email - dtype: int64 - raw_name: A4a - - name: new_registrations_mail_fax_email - dtype: int64 - raw_name: A5a - - name: duplicate_registrations_mail_fax_email - dtype: int64 - raw_name: A6a - - name: rejected_registrations_mail_fax_email - dtype: int64 - raw_name: A7a - - name: total_forms_in_person - dtype: int64 - raw_name: A4b - - name: new_registrations_in_person - dtype: int64 - raw_name: A5b - - name: duplicate_registrations_in_person - dtype: int64 - raw_name: A6b - - name: rejected_registrations_in_person - dtype: int64 - raw_name: A7b - - name: total_forms_online - dtype: int64 - raw_name: A4c - - name: new_registrations_online - dtype: int64 - raw_name: A5c - - name: 
duplicate_registrations_online - dtype: int64 - raw_name: A6c - - name: rejected_registrations_online - dtype: int64 - raw_name: A7c - - name: total_forms_dmv - dtype: int64 - raw_name: A4d - - name: new_registrations_dmv - dtype: int64 - raw_name: A5d - - name: duplicate_registrations_dmv - dtype: int64 - raw_name: A6d - - name: rejected_registrations_dmv - dtype: int64 - raw_name: A7d - - name: total_forms_mandatory_nvra - dtype: int64 - raw_name: A4e - - name: new_registrations_mandatory_nvra - dtype: int64 - raw_name: A5e - - name: duplicate_registrations_mandatory_nvra - dtype: int64 - raw_name: A6e - - name: rejected_registrations_mandatory_nvra - dtype: int64 - raw_name: A7e - - name: total_forms_disability_agency - dtype: int64 - raw_name: A4f - - name: new_registrations_disability_agency - dtype: int64 - raw_name: A5f - - name: duplicate_registrations_disability_agency - dtype: int64 - raw_name: A6f - - name: rejected_registrations_disability_agency - dtype: int64 - raw_name: A7f - - name: total_forms_armed_forces - dtype: int64 - raw_name: A4g - - name: new_registrations_armed_forces - dtype: int64 - raw_name: A5g - - name: duplicate_registrations_armed_forces - dtype: int64 - raw_name: A6g - - name: rejected_registrations_armed_forces - dtype: int64 - raw_name: A7g - - name: total_forms_discretionary_nvra - dtype: int64 - raw_name: A4h - - name: new_registrations_discretionary_nvra - dtype: int64 - raw_name: A5h - - name: duplicate_registrations_discretionary_nvra - dtype: int64 - raw_name: A6h - - name: rejected_registrations_discretionary_nvra - dtype: int64 - raw_name: A7h - - name: total_forms_advocacy_groups - dtype: int64 - raw_name: A4i - - name: new_registrations_advocacy_groups - dtype: int64 - raw_name: A5i - - name: duplicate_registrations_advocacy_groups - dtype: int64 - raw_name: A6i - - name: rejected_registrations_advocacy_groups - dtype: int64 - raw_name: A7i - - name: confirmation_notices_sent_total - dtype: int64 - raw_name: A8a - - 
name: confirmation_notices_undeliverable - dtype: int64 - raw_name: A8d - - name: confirmation_notices_status_unknown - dtype: int64 - raw_name: A8e - - name: voters_removed_total_2020_2022 - dtype: int64 - raw_name: A9a - - name: voters_removed_felony - dtype: int64 - raw_name: A9d - - name: voters_removed_nonresponse - dtype: int64 - raw_name: A9e - - name: c1a_mail_transmitted_total - dtype: int64 - raw_name: C1a - - name: c1b_mail_returned_by_voters_total - dtype: int64 - raw_name: C1b - - name: c8a_total_mail_ballots_counted - dtype: int64 - raw_name: C8a - - name: c9a_total_mail_ballots_rejected - dtype: int64 - raw_name: C9a - - name: c9b_mail_ballots_rejected_late - dtype: int64 - raw_name: C9b - - name: c9c_missing_voter_signature - dtype: int64 - raw_name: C9c - - name: c9d_missing_witness_signature - dtype: int64 - raw_name: C9d - - name: c9e_non_matching_voter_signature - dtype: int64 - raw_name: C9e - - name: c9f_unofficial_envelope - dtype: int64 - raw_name: C9f - - name: c9g_ballot_missing_from_envelope - dtype: int64 - raw_name: C9g - - name: c9i_multiple_ballots_one_envelope - dtype: int64 - raw_name: C9i - - name: c9j_envelope_not_sealed - dtype: int64 - raw_name: C9j - - name: c9l_no_resident_address_on_envelope - dtype: int64 - raw_name: C9l - - name: c9m_voter_deceased - dtype: int64 - raw_name: C9m - - name: c9n_already_voted - dtype: int64 - raw_name: C9n - - name: c9o_missing_documentation - dtype: int64 - raw_name: C9o - - name: c9q_no_ballot_application - dtype: int64 - raw_name: C9q - - name: c9r_rejected_other_1 - dtype: int64 - raw_name: C9r - - name: c9s_rejected_other_2 - dtype: int64 - raw_name: C9s - - name: c9t_rejected_other_3 - dtype: int64 - raw_name: C9t - - name: e1a_total_provisional_ballots_cast - dtype: int64 - raw_name: E1a - - name: e1b_provisional_ballots_fully_counted - dtype: int64 - raw_name: E1b - - name: e1c_provisional_ballots_partially_counted - dtype: int64 - raw_name: E1c - - name: 
e1d_provisional_ballots_rejected - dtype: int64 - raw_name: E1d - - name: e2a_provisional_cast_voter_not_on_list - dtype: int64 - raw_name: E2a - - name: e2b_provisional_cast_voter_lacked_id - dtype: int64 - raw_name: E2b - - name: e2c_official_challenged_eligibility - dtype: int64 - raw_name: E2c - - name: e2d_another_person_challenged_eligibility - dtype: int64 - raw_name: E2d - - name: e2e_voter_not_resident - dtype: int64 - raw_name: E2e - - name: e2f_registration_not_updated - dtype: int64 - raw_name: E2f - - name: e2g_did_not_surrender_mail_ballot - dtype: int64 - raw_name: E2g - - name: e2h_judge_extended_voting_hours - dtype: int64 - raw_name: E2h - - name: e3b_provisional_rejected_not_registered - dtype: int64 - raw_name: E3b - - name: e3c_provisional_rejected_wrong_jurisdiction - dtype: int64 - raw_name: E3c - - name: e3d_provisional_rejected_wrong_precinct - dtype: int64 - raw_name: E3d - - name: e3e_provisional_rejected_no_id - dtype: int64 - raw_name: E3e - - name: e3f_provisional_rejected_incomplete - dtype: int64 - raw_name: E3f - - name: e3g_provisional_rejected_ballot_missing - dtype: int64 - raw_name: E3g - - name: e3h_provisional_rejected_no_signature - dtype: int64 - raw_name: E3h - - name: e3i_provisional_rejected_non_matching_signature - dtype: int64 - raw_name: E3i - - name: e3j_provisional_rejected_already_voted - dtype: int64 - raw_name: E3j diff --git a/eavs/assets/manifest.jsonl b/eavs/assets/manifest.jsonl index 797749b..d894b3a 100644 --- a/eavs/assets/manifest.jsonl +++ b/eavs/assets/manifest.jsonl @@ -1,3 +1,4 @@ +{"year": 2024, "version": "1.0", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2025-06/2024_EAVS_for_Public_Release_V1_xlsx.xlsx", "sha256sum": "5456a0beb07c83559c60bb84acb1af7f085cbe68a6a19487925b90becd7d61be"} {"year": 2022, "version": "1.1", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2023-12/2022_EAVS_for_Public_Release_V1.1.xlsx", "sha256sum": 
"ebcc51eade35dd3b05d65067e64d57e4267a29c9e2302ef0185eb6ef0e16c6ed"} {"year": 2020, "version": "1.2", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2023-12/2020_EAVS_for_Public_Release_V1.2.xlsx", "sha256sum": "e93dfd906cd2ff93ae101ed647731baa0d9ca9c21a7ca6c0bf1f6c2bfbe514b9"} {"year": "timeseries", "version": "1.0", "format": "excel", "url": "https://www.eac.gov/sites/default/files/2025-05/EAVS_Time_Series_Dataset.xlsx", "sha256sum": "2818284def214205da0eb7839bb749dc25279a18562fdd1fdd66af5bfab4124a"} diff --git a/eavs/clean.py b/eavs/clean.py index e1cf87e..fab93b7 100644 --- a/eavs/clean.py +++ b/eavs/clean.py @@ -1,154 +1,229 @@ +import yaml +import re from pathlib import Path +from loguru import logger as log +from typing import Dict, Any, List -from loguru import logger import pandas as pd -from pandera.io import from_yaml -import pyarrow as pa -from yaml import safe_load - -from eavs.config import CLEANED_DATA_DIR, RAW_DATA_DIR - -COLUMN_METADATA_DIR = Path(__file__).parent / "assets" / "column_mappings" - -def load_column_mapping(year: int, version: str) -> dict: - with (COLUMN_METADATA_DIR / f"{year}.yaml").open("r") as f: - data = safe_load(f) - for dataset in data: - if dataset["version"] == version: - return dataset["columns"] - - - -PROCESSING_FNS: dict[int, callable] = {} - - -def register_cleaning_function(year): - def decorator(func): - PROCESSING_FNS[year] = func - return func - - return decorator - - -@register_cleaning_function(2022) -def clean_2022(): - metadata = load_column_mapping(2022, "1.1") - # Use mapping file dtypes, not forced float64 - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping = {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "2022" / "1.1" / "2022_EAVS_for_Public_Release_V1.1.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## 
Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - - -@register_cleaning_function(2020) -def clean_2020(): - metadata = load_column_mapping(2020, "1.2") - - # Rename columns - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping = {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "2020" / "1.2" / "2020_EAVS_for_Public_Release_V1.2.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - - -@register_cleaning_function("timeseries") -def clean_timeseries(): - metadata = load_column_mapping("timeseries", "1.0") - - # Rename columns - dtypes = {col["raw_name"]: f"{col['dtype']}[pyarrow]" for col in metadata} - - mapping = {col["raw_name"]: col["name"] for col in metadata} - - df = pd.read_excel( - RAW_DATA_DIR / "timeseries" / "1.0" / "EAVS_Time_Series_Dataset.xlsx", - engine="calamine", - dtype_backend="pyarrow", - dtype=dtypes, - na_values=["Does not apply", "Data not available", "Valid skip"], - ) - - ## Temporary hack for weird bug in pandas - # https://github.com/pandas-dev/pandas/issues/61496 - for col in dtypes: - if dtypes[col] == "string[pyarrow]": - df[col] = df[col].astype(pd.ArrowDtype(pa.string())) - ## - - df_out = df.loc[:, mapping.keys()].rename(columns=mapping) - return df_out - +import pandera as pa +from pandera.typing import DataFrame, Series, String + +# 
----------------- +# 1. Configuration +# ----------------- +# PROJ_ROOT = directory above 'eavs' (ie. /home/user/eavs_clc) +PROJ_ROOT = Path(__file__).resolve().parent.parent + +CONFIG_PATH = PROJ_ROOT / 'eavs' / 'assets' / 'column_mappings' + +def load_config(year: int) -> List[Dict[str, Any]]: + """ + Dynamically loads the year-specific config file (e.g., 2022.yaml). + Handles top-level nesting (e.g., under a 'columns' key) to ensure + a clean list of mappings is returned. + """ + config_file = CONFIG_PATH / f'{year}.yaml' + if not config_file.exists(): + log.warning(f"Config file not found for year {year}: {config_file}. Cleaning will proceed without specific variable handling.") + return [] + + try: + with open(config_file, 'r') as f: + data = yaml.safe_load(f) + + # If the loaded data is a dictionary, extract the list from the 'columns' key. + if isinstance(data, dict) and 'columns' in data: + log.debug("Extracted column list from 'columns' key.") + return data['columns'] + + # If it's already a list (flat structure), return it directly. + if isinstance(data, list): + log.debug("Loaded config as flat list.") + return data + + # Fallback for unexpected structure + log.warning(f"Config for year {year} is in an unexpected format. Returning empty list.") + return [] + + except Exception as e: + log.error(f"Error loading config file {config_file}: {e}") + return [] + +# ----------------- +# 2. Schema Definition +# ----------------- + +class CleanedEAVSSchema(pa.DataFrameModel): + # FIPS codes must be 5-digit strings + fips_code: Series[String] = pa.Field(str_matches=r'^\d{5}$') + + # Year of the EAVS data (e.g., 2022) + year: Series[int] = pa.Field(ge=2000, le=2030) + + class Config: + strict = False + coerce = True + +schema = CleanedEAVSSchema + +# ----------------- +# 3. 
Cleaning Functions +# ----------------- + +def clean_data(year: int, config: List[Dict[str, Any]]) -> pd.DataFrame: + """ + Loads raw EAVS data for a given year, applies renaming and type conversion + based on the loaded configuration, and ensures robust column selection. + + NOTE: This function relies on raw data being found in: + /data/raw//.xlsx + """ + raw_data_dir = PROJ_ROOT / 'data' / 'raw' / str(year) + excel_files = list(raw_data_dir.rglob('*.xls*')) + + if not excel_files: + log.warning(f"Raw EAVS file not found for year {year} within {raw_data_dir}") + return pd.DataFrame() + + data_path = excel_files[0] + log.info(f"Cleaning data for {year} using file: {data_path.name}") + + # Robustly create mapping, skipping malformed config entries + valid_configs = [ + c for c in config + if isinstance(c, dict) and 'raw_name' in c and 'name' in c + ] + if len(valid_configs) != len(config): + log.warning(f"Skipped {len(config) - len(valid_configs)} malformed entries in the {year} column mapping file.") + + mapping = {col['raw_name']: col['name'] for col in valid_configs} + dtypes = {col['raw_name']: str for col in valid_configs} + + # Load raw data + try: + df = pd.read_excel(data_path, sheet_name=0, engine='openpyxl', dtype=dtypes) + except Exception as e: + log.error(f"Error loading {data_path}: {e}") + return pd.DataFrame() + + # Standardize FIPS column name + fips_col = next((col for col in df.columns if 'FIPS' in str(col).upper()), None) + if fips_col: + df = df.rename(columns={fips_col: 'fips_code'}) + else: + log.error(f"FIPS code column not found in {year} data.") + return pd.DataFrame() + + # Add year column and normalize FIPS + df['year'] = year + df['fips_code'] = df['fips_code'].astype(str).str.zfill(5).str[:5] + + # Apply YAML renaming & Robust Filtering + mapping_keys = mapping.keys() + existing_keys = [k for k in mapping_keys if k in df.columns] + + cols_to_select = existing_keys + ['fips_code', 'year'] + + df = df.filter(items=cols_to_select, axis=1) 
+ + renaming_map = {k: mapping[k] for k in existing_keys} + df = df.rename(columns=renaming_map) + + # Convert numerical columns to Int64Dtype (EAVS variables: A1, B2, etc.) + for col in df.columns: + if re.match(r'^[A-Z]\d+$', str(col)): + try: + # Use nullable integer dtype + df[col] = pd.to_numeric(df[col], errors='coerce').astype(pd.Int64Dtype()) + except Exception: + log.warning(f"Could not convert column {col} to integer type.") + df[col] = pd.NA + + return df + +def combine_data(cleaned_dfs: List[pd.DataFrame]) -> pd.DataFrame: + """Combines cleaned dataframes from multiple years.""" + log.info(f"Combining {len(cleaned_dfs)} years of cleaned data.") + combined_df = pd.concat(cleaned_dfs, ignore_index=True) + return combined_df + + +# ----------------- +# 4. New Saving Function (Added to meet requirements) +# ----------------- +def save_dataframes(df: pd.DataFrame, filename: str, output_dir: Path): + """Saves a DataFrame to Parquet, XLSX, and CSV formats.""" + log.info(f"Saving {filename} data to multiple formats in {output_dir.name}/") + + # Ensure output directory exists (redundant with main, but safer here) + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. Parquet + parquet_path = output_dir / f"{filename}.parquet" + df.to_parquet(parquet_path, index=False) + log.info(f"Saved: {parquet_path.name}") + + # 2. Excel (XLSX) + excel_path = output_dir / f"{filename}.xlsx" + df.to_excel(excel_path, index=False) + log.info(f"Saved: {excel_path.name}") + + # 3. CSV + csv_path = output_dir / f"{filename}.csv" + df.to_csv(csv_path, index=False) + log.info(f"Saved: {csv_path.name}") + + +# ----------------- +# 5. 
# -----------------
# Main Execution (Modified to use new function)
# -----------------
def main():
    """Run the EAVS cleaning pipeline end to end.

    For each survey year the column-mapping config is loaded, the raw data
    is cleaned, and the per-year result is written in every output format.
    The per-year frames are then combined and validated against ``schema``;
    the combined dataset is written only when validation succeeds.
    """
    target_dir = PROJ_ROOT / 'data' / 'cleaned'
    target_dir.mkdir(parents=True, exist_ok=True)

    per_year_frames = []
    for year in (2020, 2022, 2024):
        year_config = load_config(year)
        if not year_config:
            log.warning(f"Skipping cleaning for year {year} due to missing or empty config.")
            continue

        frame = clean_data(year, year_config)
        if frame.empty:
            # Nothing usable for this year; clean_data already logged why.
            continue

        per_year_frames.append(frame)
        save_dataframes(frame, f'{year}_cleaned', target_dir)

    if not per_year_frames:
        log.error("No valid dataframes were cleaned. Exiting.")
        return

    cleaned_df = combine_data(per_year_frames).copy()

    # The schema expects fips_code as text, so force it before validating.
    cleaned_df['fips_code'] = cleaned_df['fips_code'].astype(str)

    try:
        log.info(f"Validating combined data with {len(cleaned_df)} rows...")
        schema.validate(cleaned_df)
        log.success("Data validation successful!")

        # Only a validated frame is persisted as the combined output.
        save_dataframes(cleaned_df, 'eavs_combined_cleaned', target_dir)
    except pa.errors.SchemaError as exc:
        log.error(f"Data validation failed: {exc}")
        return

    log.info("Finished EAVS Cleaning Pipeline.")


if __name__ == '__main__':
    main()
import hashlib
import sys
from pathlib import Path
from typing import Optional, Union


def calculate_sha256(filepath: Union[str, Path], chunk_size: int = 65536) -> Optional[str]:
    """Calculate, print, and return the SHA256 checksum of a file.

    Args:
        filepath: Path of the file to hash (``str`` or ``Path``).
        chunk_size: Number of bytes read per iteration, so large files are
            never loaded into memory at once.

    Returns:
        The lowercase hexadecimal digest, or ``None`` when the file does not
        exist or cannot be read (a message is printed to stderr instead).
    """
    sha256_hash = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            # Feed the hash in fixed-size chunks until read() returns b"".
            for byte_block in iter(lambda: f.read(chunk_size), b""):
                sha256_hash.update(byte_block)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}", file=sys.stderr)
        return None
    except OSError as e:
        # Covers directories, permission problems, and other I/O failures.
        print(f"An error occurred while reading the file: {e}", file=sys.stderr)
        return None

    calculated_hash = sha256_hash.hexdigest()
    print("-" * 50)
    print(f"File Path: {filepath}")
    print(f"SHA256 Checksum: {calculated_hash}")
    print("-" * 50)
    return calculated_hash


if __name__ == "__main__":
    if len(sys.argv) < 2:
        target_path = Path("data") / "raw" / "2024" / "1.0" / "2024_EAVS_for_Public_Release_V1_xlsx.xlsx"
    else:
        # Allow passing the path as a command-line argument
        target_path = Path(sys.argv[1])

    # Exit non-zero when hashing failed so callers and scripts can detect it.
    sys.exit(0 if calculate_sha256(target_path) else 1)