diff --git a/arch/source_package.py b/arch/source_package.py index 4a285ef..5625e69 100644 --- a/arch/source_package.py +++ b/arch/source_package.py @@ -67,7 +67,16 @@ SOURCE_PACKAGE_RESOURCE_PACKAGE = "packages" SOURCE_PACKAGE_ALIASES = { "soi-table-1-1": Path("irs_soi/table_1_1"), + "soi-table-1-2": Path("irs_soi/table_1_2"), "soi-table-1-4": Path("irs_soi/table_1_4"), + "soi-table-2-1": Path("irs_soi/table_2_1"), + "soi-table-2-5": Path("irs_soi/table_2_5"), + "soi-table-4-3": Path("irs_soi/table_4_3"), + "soi-w2-statistics-2020": Path("irs_soi/w2_statistics_2020"), + "soi-ira-traditional-contributions-2022": Path( + "irs_soi/ira_traditional_contributions_2022" + ), + "soi-ira-roth-contributions-2022": Path("irs_soi/ira_roth_contributions_2022"), } SOURCE_ARTIFACT_CACHE_ENV = "ARCH_SOURCE_ARTIFACT_CACHE_DIR" SOURCE_ARTIFACT_FETCH_ENV = "ARCH_SOURCE_ARTIFACT_FETCH" diff --git a/db/data/irs_soi/ira_contributions/22in05ira.xlsx b/db/data/irs_soi/ira_contributions/22in05ira.xlsx new file mode 100644 index 0000000..025676c Binary files /dev/null and b/db/data/irs_soi/ira_contributions/22in05ira.xlsx differ diff --git a/db/data/irs_soi/ira_contributions/22in06ira.xlsx b/db/data/irs_soi/ira_contributions/22in06ira.xlsx new file mode 100644 index 0000000..33e7eaf Binary files /dev/null and b/db/data/irs_soi/ira_contributions/22in06ira.xlsx differ diff --git a/db/data/irs_soi/ira_contributions/manifest_roth_source_package.yaml b/db/data/irs_soi/ira_contributions/manifest_roth_source_package.yaml new file mode 100644 index 0000000..64746b9 --- /dev/null +++ b/db/data/irs_soi/ira_contributions/manifest_roth_source_package.yaml @@ -0,0 +1,20 @@ +source_id: irs_soi +package_id: soi-ira-roth-contributions-2022 +dataset: irs_soi_roth_ira_contributions_2022 +source_page: https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +table: Table 6. Taxpayers with Roth Individual Retirement Arrangement (IRA) Plan Contributions, + by Size of Contribution and Age of Taxpayer +files: + 2022: + filename: 22in06ira.xlsx + source_url: https://www.irs.gov/pub/irs-soi/22in06ira.xlsx + sha256: c1fb0894cb09d2486be4510b7e6ce0b8597787725b928ab8813f577d9fb13183 + size_bytes: 13617 + source_table: Table 6. Taxpayers with Roth Individual Retirement Arrangement (IRA) + Plan Contributions, by Size of Contribution and Age of Taxpayer + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-ira-roth-contributions-2022/2022/c1fb0894cb09d2486be4510b7e6ce0b8597787725b928ab8813f577d9fb13183/22in06ira.xlsx + uri: r2://arch-raw/raw/irs_soi/soi-ira-roth-contributions-2022/2022/c1fb0894cb09d2486be4510b7e6ce0b8597787725b928ab8813f577d9fb13183/22in06ira.xlsx diff --git a/db/data/irs_soi/ira_contributions/manifest_traditional_source_package.yaml b/db/data/irs_soi/ira_contributions/manifest_traditional_source_package.yaml new file mode 100644 index 0000000..2ad829d --- /dev/null +++ b/db/data/irs_soi/ira_contributions/manifest_traditional_source_package.yaml @@ -0,0 +1,20 @@ +source_id: irs_soi +package_id: soi-ira-traditional-contributions-2022 +dataset: irs_soi_traditional_ira_contributions_2022 +source_page: https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +table: Table 5. Taxpayers with Traditional Individual Retirement Arrangement (IRA) + Plan Contributions, by Size of Contribution and Age of Taxpayer +files: + 2022: + filename: 22in05ira.xlsx + source_url: https://www.irs.gov/pub/irs-soi/22in05ira.xlsx + sha256: 31ce9dd2fe631b336170af339bd7e9de86901cd20dbfd4cb8fb1c9a1d55cd90f + size_bytes: 13932 + source_table: Table 5. Taxpayers with Traditional Individual Retirement Arrangement + (IRA) Plan Contributions, by Size of Contribution and Age of Taxpayer + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-ira-traditional-contributions-2022/2022/31ce9dd2fe631b336170af339bd7e9de86901cd20dbfd4cb8fb1c9a1d55cd90f/22in05ira.xlsx + uri: r2://arch-raw/raw/irs_soi/soi-ira-traditional-contributions-2022/2022/31ce9dd2fe631b336170af339bd7e9de86901cd20dbfd4cb8fb1c9a1d55cd90f/22in05ira.xlsx diff --git a/db/data/irs_soi/table_1_2/21in12ms.xls b/db/data/irs_soi/table_1_2/21in12ms.xls new file mode 100644 index 0000000..ec390d3 Binary files /dev/null and b/db/data/irs_soi/table_1_2/21in12ms.xls differ diff --git a/db/data/irs_soi/table_1_2/22in12ms.xls b/db/data/irs_soi/table_1_2/22in12ms.xls new file mode 100644 index 0000000..973aaa0 Binary files /dev/null and b/db/data/irs_soi/table_1_2/22in12ms.xls differ diff --git a/db/data/irs_soi/table_1_2/23in12ms.xls b/db/data/irs_soi/table_1_2/23in12ms.xls new file mode 100644 index 0000000..8309319 Binary files /dev/null and b/db/data/irs_soi/table_1_2/23in12ms.xls differ diff --git a/db/data/irs_soi/table_1_2/manifest.yaml b/db/data/irs_soi/table_1_2/manifest.yaml new file mode 100644 index 0000000..1d2d33f --- /dev/null +++ b/db/data/irs_soi/table_1_2/manifest.yaml @@ -0,0 +1,42 @@ +source_id: irs_soi +package_id: soi-table-1-2 +dataset: irs_soi_soi-table-1-2 +source_page: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-complete-report-publication-1304-basic-tables-part-1 +table: Publication 1304 Table 1.2 +files: + 2023: + filename: 23in12ms.xls + source_url: https://www.irs.gov/pub/irs-soi/23in12ms.xls + sha256: 549861a1c7b38256abf9506a6111dcf82110266d0168199dfa465a5d61134490 + size_bytes: 62464 + fetched_at: '2026-05-11T11:40:08+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-1-2/2023/549861a1c7b38256abf9506a6111dcf82110266d0168199dfa465a5d61134490/23in12ms.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-1-2/2023/549861a1c7b38256abf9506a6111dcf82110266d0168199dfa465a5d61134490/23in12ms.xls + 2021: + filename: 21in12ms.xls + source_url: https://www.irs.gov/pub/irs-soi/21in12ms.xls + sha256: 5f68dae68357e3e1b8669389722c4e4d00030a9d1f4d77a41b4e2bce84b74643 + size_bytes: 61952 + fetched_at: '2026-05-11T12:24:07+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-1-2/2021/5f68dae68357e3e1b8669389722c4e4d00030a9d1f4d77a41b4e2bce84b74643/21in12ms.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-1-2/2021/5f68dae68357e3e1b8669389722c4e4d00030a9d1f4d77a41b4e2bce84b74643/21in12ms.xls + 2022: + filename: 22in12ms.xls + source_url: https://www.irs.gov/pub/irs-soi/22in12ms.xls + sha256: ffc483e16aa05762c5ac6dfdbfa86250cd4db5969b1cf6db2296539601a54503 + size_bytes: 62464 + fetched_at: '2026-05-11T12:24:07+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-1-2/2022/ffc483e16aa05762c5ac6dfdbfa86250cd4db5969b1cf6db2296539601a54503/22in12ms.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-1-2/2022/ffc483e16aa05762c5ac6dfdbfa86250cd4db5969b1cf6db2296539601a54503/22in12ms.xls diff --git a/db/data/irs_soi/table_2_1/21in21id.xls b/db/data/irs_soi/table_2_1/21in21id.xls new file mode 100644 index 0000000..b7d20f9 Binary files /dev/null and b/db/data/irs_soi/table_2_1/21in21id.xls differ diff --git a/db/data/irs_soi/table_2_1/22in21id.xls b/db/data/irs_soi/table_2_1/22in21id.xls new file mode 100644 index 0000000..8aec300 Binary files /dev/null and b/db/data/irs_soi/table_2_1/22in21id.xls differ diff --git a/db/data/irs_soi/table_2_1/23in21id.xls b/db/data/irs_soi/table_2_1/23in21id.xls new file mode 100644 index 0000000..7bef09e Binary files /dev/null and b/db/data/irs_soi/table_2_1/23in21id.xls differ diff --git a/db/data/irs_soi/table_2_1/manifest.yaml b/db/data/irs_soi/table_2_1/manifest.yaml new file mode 100644 index 0000000..2d993d7 --- /dev/null +++ b/db/data/irs_soi/table_2_1/manifest.yaml @@ -0,0 +1,42 @@ +source_id: irs_soi +package_id: soi-table-2-1 +dataset: irs_soi_soi-table-2-1 +source_page: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-complete-report-publication-1304-basic-tables-part-2 +table: Publication 1304 Table 2.1 +files: + 2023: + filename: 23in21id.xls + source_url: https://www.irs.gov/pub/irs-soi/23in21id.xls + sha256: df6cf04ed3b716dd670db055755528916ef02d59967e1af3ead59cc48bd4de80 + size_bytes: 75776 + fetched_at: '2026-05-11T11:44:21+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-1/2023/df6cf04ed3b716dd670db055755528916ef02d59967e1af3ead59cc48bd4de80/23in21id.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-1/2023/df6cf04ed3b716dd670db055755528916ef02d59967e1af3ead59cc48bd4de80/23in21id.xls + 2021: + filename: 21in21id.xls + source_url: https://www.irs.gov/pub/irs-soi/21in21id.xls + sha256: 726318d6f72f11aa9b948a3c33fa99099fe0f856c106dfd2be76b9bcc06ed5f3 + size_bytes: 73216 + fetched_at: '2026-05-11T12:24:07+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-1/2021/726318d6f72f11aa9b948a3c33fa99099fe0f856c106dfd2be76b9bcc06ed5f3/21in21id.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-1/2021/726318d6f72f11aa9b948a3c33fa99099fe0f856c106dfd2be76b9bcc06ed5f3/21in21id.xls + 2022: + filename: 22in21id.xls + source_url: https://www.irs.gov/pub/irs-soi/22in21id.xls + sha256: 0ebe525448c24c3a8fb99ca57136be27a97dcea24c04230670886c00b1db7ed9 + size_bytes: 77312 + fetched_at: '2026-05-11T12:24:07+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-1/2022/0ebe525448c24c3a8fb99ca57136be27a97dcea24c04230670886c00b1db7ed9/22in21id.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-1/2022/0ebe525448c24c3a8fb99ca57136be27a97dcea24c04230670886c00b1db7ed9/22in21id.xls diff --git a/db/data/irs_soi/table_2_5/20in25ic.xls b/db/data/irs_soi/table_2_5/20in25ic.xls new file mode 100644 index 0000000..031796f Binary files /dev/null and b/db/data/irs_soi/table_2_5/20in25ic.xls differ diff --git a/db/data/irs_soi/table_2_5/21in25ic.xls b/db/data/irs_soi/table_2_5/21in25ic.xls new file mode 100644 index 0000000..d2d9b39 Binary files /dev/null and b/db/data/irs_soi/table_2_5/21in25ic.xls differ diff --git a/db/data/irs_soi/table_2_5/22in25ic.xls b/db/data/irs_soi/table_2_5/22in25ic.xls new file mode 100644 index 0000000..2668726 Binary files /dev/null and b/db/data/irs_soi/table_2_5/22in25ic.xls differ diff --git a/db/data/irs_soi/table_2_5/23in25ic.xls b/db/data/irs_soi/table_2_5/23in25ic.xls new file mode 100644 index 0000000..6762f76 Binary files /dev/null and b/db/data/irs_soi/table_2_5/23in25ic.xls differ diff --git a/db/data/irs_soi/table_2_5/manifest.yaml b/db/data/irs_soi/table_2_5/manifest.yaml new file mode 100644 index 0000000..3ffc6e6 --- /dev/null +++ b/db/data/irs_soi/table_2_5/manifest.yaml @@ -0,0 +1,54 @@ +source_id: irs_soi +package_id: soi-table-2-5 +dataset: irs_soi_soi-table-2-5 +source_page: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-complete-report-publication-1304-basic-tables-part-2 +table: Publication 1304 Table 2.5 +files: + 2023: + filename: 23in25ic.xls + source_url: https://www.irs.gov/pub/irs-soi/23in25ic.xls + sha256: c6a58f637e42e83248b1f6f336fddc5d94ac91e04ccf68b1d089291d2d5fe1a5 + size_bytes: 55296 + fetched_at: '2026-05-11T11:50:29+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-5/2023/c6a58f637e42e83248b1f6f336fddc5d94ac91e04ccf68b1d089291d2d5fe1a5/23in25ic.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-5/2023/c6a58f637e42e83248b1f6f336fddc5d94ac91e04ccf68b1d089291d2d5fe1a5/23in25ic.xls + 2021: + filename: 21in25ic.xls + source_url: https://www.irs.gov/pub/irs-soi/21in25ic.xls + sha256: d4429c04a101b70cf23c6412f3246a3bc6a7244bad99f3561e54c302007f3285 + size_bytes: 54784 + fetched_at: '2026-05-11T12:24:30+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-5/2021/d4429c04a101b70cf23c6412f3246a3bc6a7244bad99f3561e54c302007f3285/21in25ic.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-5/2021/d4429c04a101b70cf23c6412f3246a3bc6a7244bad99f3561e54c302007f3285/21in25ic.xls + 2022: + filename: 22in25ic.xls + source_url: https://www.irs.gov/pub/irs-soi/22in25ic.xls + sha256: bd890a09ef3a35c577dcc9d94272a7ee40d836453c686d70599372b48b863884 + size_bytes: 55296 + fetched_at: '2026-05-11T12:24:30+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-5/2022/bd890a09ef3a35c577dcc9d94272a7ee40d836453c686d70599372b48b863884/22in25ic.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-5/2022/bd890a09ef3a35c577dcc9d94272a7ee40d836453c686d70599372b48b863884/22in25ic.xls + 2020: + filename: 20in25ic.xls + source_url: https://www.irs.gov/pub/irs-soi/20in25ic.xls + sha256: 28f8823857b345b59575038736011dfb750ba362df38ddb0b4b9b591a1cc7135 + size_bytes: 61440 + fetched_at: '2026-05-11T21:26:57+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-2-5/2020/28f8823857b345b59575038736011dfb750ba362df38ddb0b4b9b591a1cc7135/20in25ic.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-2-5/2020/28f8823857b345b59575038736011dfb750ba362df38ddb0b4b9b591a1cc7135/20in25ic.xls diff --git a/db/data/irs_soi/table_4_3/21in43ts.xls b/db/data/irs_soi/table_4_3/21in43ts.xls new file mode 100644 index 0000000..5819470 Binary files /dev/null and b/db/data/irs_soi/table_4_3/21in43ts.xls differ diff --git a/db/data/irs_soi/table_4_3/22in43ts.xls b/db/data/irs_soi/table_4_3/22in43ts.xls new file mode 100644 index 0000000..2489756 Binary files /dev/null and b/db/data/irs_soi/table_4_3/22in43ts.xls differ diff --git a/db/data/irs_soi/table_4_3/23in43ts.xls b/db/data/irs_soi/table_4_3/23in43ts.xls new file mode 100644 index 0000000..e71a32e Binary files /dev/null and b/db/data/irs_soi/table_4_3/23in43ts.xls differ diff --git a/db/data/irs_soi/table_4_3/manifest.yaml b/db/data/irs_soi/table_4_3/manifest.yaml new file mode 100644 index 0000000..2578e78 --- /dev/null +++ b/db/data/irs_soi/table_4_3/manifest.yaml @@ -0,0 +1,42 @@ +source_id: irs_soi +package_id: soi-table-4-3 +dataset: irs_soi_soi-table-4-3 +source_page: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-complete-report-publication-1304-basic-tables-part-4 +table: Publication 1304 Table 4.3 +files: + 2023: + filename: 23in43ts.xls + source_url: https://www.irs.gov/pub/irs-soi/23in43ts.xls + sha256: 4a65447b858f14cb39c6fdc1935c9f4cb8ee0a0c4c245baf263c29d80d1a49c8 + size_bytes: 58880 + fetched_at: '2026-05-11T11:53:23+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-4-3/2023/4a65447b858f14cb39c6fdc1935c9f4cb8ee0a0c4c245baf263c29d80d1a49c8/23in43ts.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-4-3/2023/4a65447b858f14cb39c6fdc1935c9f4cb8ee0a0c4c245baf263c29d80d1a49c8/23in43ts.xls + 2022: + filename: 22in43ts.xls + source_url: https://www.irs.gov/pub/irs-soi/22in43ts.xls + sha256: cb36e5fbd7862f54067801c4ee7abac45cc94c10ef8aed4c30adca9cdac85e4c + size_bytes: 58880 + fetched_at: '2026-05-11T12:24:30+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-4-3/2022/cb36e5fbd7862f54067801c4ee7abac45cc94c10ef8aed4c30adca9cdac85e4c/22in43ts.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-4-3/2022/cb36e5fbd7862f54067801c4ee7abac45cc94c10ef8aed4c30adca9cdac85e4c/22in43ts.xls + 2021: + filename: 21in43ts.xls + source_url: https://www.irs.gov/pub/irs-soi/21in43ts.xls + sha256: c2c4717067e661d62d4615b1147f41d780dc769ad4ee30cc0f03c07270700709 + size_bytes: 58880 + fetched_at: '2026-05-11T12:24:30+00:00' + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-table-4-3/2021/c2c4717067e661d62d4615b1147f41d780dc769ad4ee30cc0f03c07270700709/21in43ts.xls + uri: r2://arch-raw/raw/irs_soi/soi-table-4-3/2021/c2c4717067e661d62d4615b1147f41d780dc769ad4ee30cc0f03c07270700709/21in43ts.xls diff --git a/db/data/irs_soi/w2_statistics/20in04w2all.xlsx b/db/data/irs_soi/w2_statistics/20in04w2all.xlsx new file mode 100644 index 0000000..a124382 Binary files /dev/null and b/db/data/irs_soi/w2_statistics/20in04w2all.xlsx differ diff --git a/db/data/irs_soi/w2_statistics/manifest_2020_source_package.yaml b/db/data/irs_soi/w2_statistics/manifest_2020_source_package.yaml new file mode 100644 index 0000000..55b0c50 --- /dev/null +++ b/db/data/irs_soi/w2_statistics/manifest_2020_source_package.yaml @@ -0,0 +1,20 @@ +source_id: irs_soi +package_id: soi-w2-statistics-2020 +dataset: irs_soi_w2_statistics_2020 +source_page: https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics +table: Table 4.B. Summary of Items for Taxpayers with Form W-2, by Return and Earner + Type, Tax Year 2020 +files: + 2020: + filename: 20in04w2all.xlsx + source_url: https://www.irs.gov/pub/irs-soi/20in04w2all.xlsx + sha256: 1178d77618cc1d2f873506909eeec660f36e3599854f31337f9dcaec6cfc442f + size_bytes: 25491 + source_table: Table 4.B. Summary of Items for Taxpayers with Form W-2, by Return + and Earner Type, Tax Year 2020 + storage: + r2: + provider: r2 + bucket: arch-raw + key: raw/irs_soi/soi-w2-statistics-2020/2020/1178d77618cc1d2f873506909eeec660f36e3599854f31337f9dcaec6cfc442f/20in04w2all.xlsx + uri: r2://arch-raw/raw/irs_soi/soi-w2-statistics-2020/2020/1178d77618cc1d2f873506909eeec660f36e3599854f31337f9dcaec6cfc442f/20in04w2all.xlsx diff --git a/packages/irs_soi/ira_roth_contributions_2022/source_package.yaml b/packages/irs_soi/ira_roth_contributions_2022/source_package.yaml new file mode 100644 index 0000000..58915c8 --- /dev/null +++ b/packages/irs_soi/ira_roth_contributions_2022/source_package.yaml @@ -0,0 +1,66 @@ +schema_version: arch.source_package.v1 +package_id: soi-ira-roth-contributions-2022 +label: IRS SOI 2022 Roth IRA contribution totals +artifact: + source_name: irs_soi + source_table: Table 6. Taxpayers with Roth Individual Retirement Arrangement (IRA) Plan Contributions, by Size of Contribution and Age of Taxpayer + resource_package: db + resource_directory: data/irs_soi/ira_contributions + manifest: manifest_roth_source_package.yaml + vintage: tax_year_{year} + extracted_at: '2026-05-08' + extraction_method: xlsx whole-workbook used-range cell parse + parser: xlsx_used_range + artifact_year: 2022 +record_sets: +- record_set_id: irs_soi.ty{year}.roth_ira_contributions.all_taxpayers + record_set_spec_id: irs_soi.roth_ira_contributions.all_taxpayers.v1 + source_record_id_prefix: irs_soi.ty{year}.roth_ira_contributions.all_taxpayers + sheet_name: Sheet1 + period_type: tax_year + period: '{year}' + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: current + entity: tax_unit + entity_role: filing_unit + domain: individual_retirement_arrangement_contributions + groupby_dimension: irs_soi.ira_contribution_size + rows: + - value_id: all_taxpayers + label: All taxpayers + ordinal: 0 + row_number: 8 + expected_row_header_column: A + expected_row_header: All taxpayers + table_record_kind: total + guard_cells: + - column: B + row: 4 + expected_value: Total + label: total contribution size + measures: + - measure_id: taxpayer_count + label: Number of taxpayers + ordinal: 0 + column: B + source_column_id: number_of_taxpayers + expected_column_header_row: 5 + expected_column_header: "Number of \ntaxpayers" + concept: irs_soi.roth_ira_contributors + unit: count + aggregation: count + expected_cell_type: number + - measure_id: amount + label: Contribution amount + ordinal: 1 + column: C + source_column_id: amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.roth_ira_contributions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/ira_traditional_contributions_2022/source_package.yaml b/packages/irs_soi/ira_traditional_contributions_2022/source_package.yaml new file mode 100644 index 0000000..fa4f348 --- /dev/null +++ b/packages/irs_soi/ira_traditional_contributions_2022/source_package.yaml @@ -0,0 +1,66 @@ +schema_version: arch.source_package.v1 +package_id: soi-ira-traditional-contributions-2022 +label: IRS SOI 2022 Traditional IRA contribution totals +artifact: + source_name: irs_soi + source_table: Table 5. Taxpayers with Traditional Individual Retirement Arrangement (IRA) Plan Contributions, by Size of Contribution and Age of Taxpayer + resource_package: db + resource_directory: data/irs_soi/ira_contributions + manifest: manifest_traditional_source_package.yaml + vintage: tax_year_{year} + extracted_at: '2026-05-08' + extraction_method: xlsx whole-workbook used-range cell parse + parser: xlsx_used_range + artifact_year: 2022 +record_sets: +- record_set_id: irs_soi.ty{year}.traditional_ira_contributions.all_taxpayers + record_set_spec_id: irs_soi.traditional_ira_contributions.all_taxpayers.v1 + source_record_id_prefix: irs_soi.ty{year}.traditional_ira_contributions.all_taxpayers + sheet_name: Sheet1 + period_type: tax_year + period: '{year}' + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: current + entity: tax_unit + entity_role: filing_unit + domain: individual_retirement_arrangement_contributions + groupby_dimension: irs_soi.ira_contribution_size + rows: + - value_id: all_taxpayers + label: All taxpayers + ordinal: 0 + row_number: 8 + expected_row_header_column: A + expected_row_header: All taxpayers + table_record_kind: total + guard_cells: + - column: B + row: 4 + expected_value: Total + label: total contribution size + measures: + - measure_id: taxpayer_count + label: Number of taxpayers + ordinal: 0 + column: B + source_column_id: number_of_taxpayers + expected_column_header_row: 5 + expected_column_header: "Number of \ntaxpayers" + concept: irs_soi.traditional_ira_contributors + unit: count + aggregation: count + expected_cell_type: number + - measure_id: amount + label: Contribution amount + ordinal: 1 + column: C + source_column_id: amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.traditional_ira_contributions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/table_1_2/source_package.yaml b/packages/irs_soi/table_1_2/source_package.yaml new file mode 100644 index 0000000..fee4491 --- /dev/null +++ b/packages/irs_soi/table_1_2/source_package.yaml @@ -0,0 +1,137 @@ +schema_version: arch.source_package.v1 +package_id: soi-table-1-2 +label: IRS SOI Publication 1304 Table 1.2 all returns tax items +artifact: + source_name: irs_soi + source_table: Publication 1304 Table 1.2 + resource_package: db + resource_directory: data/irs_soi/table_1_2 + manifest: manifest.yaml + vintage: tax_year_{year} + extracted_at: "2026-05-11" + extraction_method: xlrd whole-workbook used-range cell parse +record_sets: + - record_set_id: irs_soi.ty{year}.table_1_2.all_returns + record_set_spec_id: irs_soi.table_1_2.all_returns.v1 + source_record_id_prefix: irs_soi.ty{year}.table_1_2.all_returns + sheet_name: TBL12 + period_type: tax_year + period: "{year}" + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: 2020_census + entity: tax_unit + entity_role: filing_unit + domain: all_individual_income_tax_returns + groupby_dimension: us:statutes/26/62#adjusted_gross_income + rows: + - value_id: all + label: All returns, total + ordinal: 0 + row_number: 9 + expected_row_header_column: A + expected_row_header: All returns, total + filters: + filing_status: all + income_range: all + guard_cells: + - column: B + row: 3 + expected_value: All returns + label: filing-status block + table_record_kind: total + measures: + - measure_id: return_count + label: Number of returns + ordinal: 0 + column: B + source_column_id: return_count + expected_column_header_row: 4 + expected_column_header: "Number\nof\nreturns" + concept: irs_soi.individual_income_tax_returns + unit: count + aggregation: count + expected_cell_type: number + - measure_id: adjusted_gross_income + label: Adjusted gross income + ordinal: 1 + column: C + source_column_id: adjusted_gross_income + expected_column_header_row: 4 + expected_column_header: "Adjusted\ngross income\nless\ndeficit" + concept: us:statutes/26/62#adjusted_gross_income + source_concept: irs_soi.adjusted_gross_income + concept_relation: exact + concept_authority: arch-us + concept_evidence_url: https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim) + concept_evidence_notes: > + IRS SOI Table 1.2 reports adjusted gross income less deficit for + individual income tax returns; IRC section 62 defines adjusted gross + income. This Arch assertion treats the SOI all-returns AGI column as + adopting that legal concept for the tax-year source record. + legal_vintage: tax_year_{year} + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_itemized_deductions_amount + label: Total itemized deductions amount + ordinal: 2 + column: E + source_column_id: total_itemized_deductions_amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.total_itemized_deductions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: standard_deduction_amount + label: Standard deduction amount + ordinal: 3 + column: G + source_column_id: standard_deduction_amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.standard_deduction + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_income_amount + label: Taxable income amount + ordinal: 4 + column: I + source_column_id: taxable_income_amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.taxable_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: income_tax_after_credits_amount + label: Income tax after credits amount + ordinal: 5 + column: K + source_column_id: income_tax_after_credits_amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.income_tax_after_credits + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_income_tax_amount + label: Total income tax amount + ordinal: 6 + column: M + source_column_id: total_income_tax_amount + expected_column_header_row: 5 + expected_column_header: Amount + concept: irs_soi.total_income_tax + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/table_2_1/source_package.yaml b/packages/irs_soi/table_2_1/source_package.yaml new file mode 100644 index 0000000..847b513 --- /dev/null +++ b/packages/irs_soi/table_2_1/source_package.yaml @@ -0,0 +1,258 @@ +schema_version: arch.source_package.v1 +package_id: soi-table-2-1 +label: IRS SOI Publication 1304 Table 2.1 itemized-return all-return totals +artifact: + source_name: irs_soi + source_table: Publication 1304 Table 2.1 + resource_package: db + resource_directory: data/irs_soi/table_2_1 + manifest: manifest.yaml + vintage: tax_year_{year} + extracted_at: "2026-05-11" + extraction_method: xlrd whole-workbook used-range cell parse +record_sets: + - record_set_id: irs_soi.ty{year}.table_2_1.itemized_all_returns + record_set_spec_id: irs_soi.table_2_1.itemized_all_returns.v1 + source_record_id_prefix: irs_soi.ty{year}.table_2_1.itemized_all_returns + sheet_name: TBL21 + period_type: tax_year + period: "{year}" + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: 2020_census + entity: tax_unit + entity_role: filing_unit + domain: individual_income_tax_returns_with_itemized_deductions + groupby_dimension: us:statutes/26/62#adjusted_gross_income + rows: + - value_id: all + label: All returns, total + ordinal: 0 + row_number: 10 + expected_row_header_column: A + expected_row_header: All returns, total + filters: + filing_status: all + income_range: all + guard_cells: + - column: A + row: 1 + expected_value: "Table 2.1. Returns with Itemized Deductions: Sources of Income,\nAdjustments, Itemized Deductions, and Tax Items, \nby Size of Adjusted Gross Income, Tax Year {year} (Filing Year {filing_year})" + label: table title + table_record_kind: total + measures: + - measure_id: itemized_return_count + label: Number of itemized returns + ordinal: 0 + column: B + source_column_id: itemized_return_count + expected_column_header_row: 3 + expected_column_header: "Number\nof\nreturns" + concept: irs_soi.returns_with_itemized_deductions + unit: count + aggregation: count + expected_cell_type: number + - measure_id: adjusted_gross_income + label: Adjusted gross income + ordinal: 1 + column: C + source_column_id: adjusted_gross_income + expected_column_header_row: 3 + expected_column_header: "Adjusted\ngross income\nless deficit" + concept: us:statutes/26/62#adjusted_gross_income + source_concept: irs_soi.adjusted_gross_income + concept_relation: exact + concept_authority: arch-us + concept_evidence_url: https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim) + concept_evidence_notes: > + IRS SOI Table 2.1 reports adjusted gross income less deficit for + individual income tax returns with itemized deductions; IRC section + 62 defines adjusted gross income. This Arch assertion treats the SOI + all-returns AGI column as adopting that legal concept for the + tax-year source record. + legal_vintage: tax_year_{year} + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_income_returns + label: Returns with total income + ordinal: 2 + column: D + source_column_id: total_income_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_total_income + unit: count + aggregation: count + expected_cell_type: number + - measure_id: total_income_amount + label: Total income amount + ordinal: 3 + column: E + source_column_id: total_income_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.total_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_wages_returns + label: Returns with total wages + ordinal: 4 + column: F + source_column_id: total_wages_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_total_wages + unit: count + aggregation: count + expected_cell_type: number + - measure_id: total_wages_amount + label: Total wages + ordinal: 5 + column: G + source_column_id: total_wages_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.total_wages + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_interest_returns + label: Returns with taxable interest + ordinal: 6 + column: T + source_column_id: taxable_interest_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_interest + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_interest_amount + label: Taxable interest + ordinal: 7 + column: U + source_column_id: taxable_interest_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.taxable_interest + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: ordinary_dividends_returns + label: Returns with ordinary dividends + ordinal: 8 + column: X + source_column_id: ordinary_dividends_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_ordinary_dividends + unit: count + aggregation: count + expected_cell_type: number + - measure_id: ordinary_dividends_amount + label: Ordinary dividends + ordinal: 9 + column: Y + source_column_id: ordinary_dividends_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.ordinary_dividends + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_ira_distributions_returns + label: Returns with taxable IRA distributions + ordinal: 10 + column: AL + source_column_id: taxable_ira_distributions_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_ira_distributions + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_ira_distributions_amount + label: Taxable IRA distributions + ordinal: 11 + column: AM + source_column_id: taxable_ira_distributions_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.taxable_ira_distributions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_pension_income_returns + label: Returns with taxable pension income + ordinal: 12 + column: AN + source_column_id: taxable_pension_income_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_pension_income + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_pension_income_amount + label: Taxable pension income + ordinal: 13 + column: AO + source_column_id: taxable_pension_income_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.taxable_pension_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_social_security_returns + label: Returns with taxable Social Security benefits + ordinal: 14 + column: BD + source_column_id: taxable_social_security_returns + expected_column_header_row: 7 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_social_security_benefits + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_social_security_amount + label: Taxable Social Security benefits + ordinal: 15 + column: BE + source_column_id: taxable_social_security_amount + expected_column_header_row: 7 + expected_column_header: Amount + concept: irs_soi.taxable_social_security_benefits + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_itemized_deductions_amount + label: Total itemized deductions amount + ordinal: 16 + column: BT + column_by_year: + 2021: BH + 2022: BT + 2023: BT + source_column_id: total_itemized_deductions_amount + expected_column_header_row: 4 + expected_column_header_by_year: + 2021: Total [2] + 2022: Total [3] + 2023: Total [3] + concept: irs_soi.total_itemized_deductions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/table_2_5/source_package.yaml b/packages/irs_soi/table_2_5/source_package.yaml new file mode 100644 index 0000000..f9b2de2 --- /dev/null +++ b/packages/irs_soi/table_2_5/source_package.yaml @@ -0,0 +1,147 @@ +schema_version: arch.source_package.v1 +package_id: soi-table-2-5 +label: IRS SOI Publication 1304 Table 2.5 EITC all-return totals +artifact: + source_name: irs_soi + source_table: Publication 1304 Table 2.5 + resource_package: db + resource_directory: data/irs_soi/table_2_5 + manifest: manifest.yaml + vintage: tax_year_{year} + extracted_at: "2026-05-11" + extraction_method: xlrd whole-workbook used-range cell parse +record_sets: + - record_set_id: irs_soi.ty{year}.table_2_5.eitc_all_returns + record_set_spec_id: irs_soi.table_2_5.eitc_all_returns.v1 + source_record_id_prefix: irs_soi.ty{year}.table_2_5.eitc_all_returns + sheet_name: TBL25 + period_type: tax_year + period: "{year}" + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: 2020_census + entity: tax_unit + entity_role: filing_unit + domain: individual_income_tax_returns_with_earned_income_credit + groupby_dimension: us:statutes/26/62#adjusted_gross_income + rows: + - value_id: total + label: Total + ordinal: 0 + row_number: 9 + expected_row_header_column: A + expected_row_header: Total + filters: + income_range: all + qualifying_children: all + guard_cells: + - column: A + row: 1 + expected_value: "Table 2.5. Returns with Earned Income Credit, by Size of Adjusted \nGross Income and Number of Qualifying Children, \nTax Year {year} (Filing Year {filing_year})" + label: table title + table_record_kind: total + measures: + - measure_id: eitc_return_count + label: Returns with earned income credit + ordinal: 0 + column: B + source_column_id: eitc_return_count + expected_column_header_row: 4 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_earned_income_credit + unit: count + aggregation: count + expected_cell_type: number + - measure_id: adjusted_gross_income + label: Adjusted gross income + ordinal: 1 + column: C + source_column_id: adjusted_gross_income + expected_column_header_row: 4 + expected_column_header: "Adjusted\ngross income\nless deficit" + concept: us:statutes/26/62#adjusted_gross_income + source_concept: irs_soi.adjusted_gross_income + concept_relation: exact + concept_authority: arch-us + concept_evidence_url: https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim) + concept_evidence_notes: > + IRS SOI Table 2.5 reports adjusted gross income less deficit for + individual income tax returns with earned income credit; IRC section + 62 defines adjusted gross income. This Arch assertion treats the SOI + total-row AGI column as adopting that legal concept for the tax-year + source record. + legal_vintage: tax_year_{year} + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: eic_earned_income_returns + label: Returns with EIC earned income + ordinal: 2 + column: H + source_column_id: eic_earned_income_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_eic_earned_income + unit: count + aggregation: count + expected_cell_type: number + - measure_id: eic_earned_income_amount + label: EIC earned income + ordinal: 3 + column: I + source_column_id: eic_earned_income_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.eic_earned_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_earned_income_credit_returns + label: Returns with total earned income credit + ordinal: 4 + column: J + source_column_id: total_earned_income_credit_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_total_earned_income_credit + unit: count + aggregation: count + expected_cell_type: number + - measure_id: total_earned_income_credit_amount + label: Total earned income credit + ordinal: 5 + column: K + source_column_id: total_earned_income_credit_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.total_earned_income_credit + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: eic_refundable_portion_returns + label: Returns with refundable EIC portion + ordinal: 6 + column: P + source_column_id: eic_refundable_portion_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_eic_refundable_portion + unit: count + aggregation: count + expected_cell_type: number + - measure_id: eic_refundable_portion_amount + label: EIC refundable portion + ordinal: 7 + column: Q + source_column_id: eic_refundable_portion_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.eic_refundable_portion + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/table_4_3/source_package.yaml b/packages/irs_soi/table_4_3/source_package.yaml new file mode 100644 index 0000000..ad01fdc --- /dev/null +++ b/packages/irs_soi/table_4_3/source_package.yaml @@ -0,0 +1,265 @@ +schema_version: arch.source_package.v1 +package_id: soi-table-4-3 +label: IRS SOI Publication 1304 Table 4.3 all-return totals +artifact: + source_name: irs_soi + source_table: Publication 1304 Table 4.3 + resource_package: db + resource_directory: data/irs_soi/table_4_3 + manifest: manifest.yaml + vintage: tax_year_{year} + extracted_at: "2026-05-11" + extraction_method: xlrd whole-workbook used-range cell parse +record_sets: + - record_set_id: irs_soi.ty{year}.table_4_3.all_returns_excluding_dependents + record_set_spec_id: irs_soi.table_4_3.all_returns_excluding_dependents.v1 + source_record_id_prefix: irs_soi.ty{year}.table_4_3.all_returns_excluding_dependents + sheet_name: Tab43 + sheet_name_by_year: + 2021: TBL14 + 2022: TBL14 + 2023: Tab43 + period_type: tax_year + period: "{year}" + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: 2020_census + entity: tax_unit + entity_role: filing_unit + domain: individual_income_tax_returns_excluding_dependents + groupby_dimension: us:statutes/26/62#adjusted_gross_income + rows: + - value_id: all + label: All returns, total + ordinal: 0 + row_number: 9 + expected_row_header_column: A + expected_row_header: All returns, total + filters: + income_percentile_range: all + guard_cells: + - column: A + row: 1 + expected_value: "Table 4.3. All Individual Returns Excluding Dependents: Number of \nReturns, Shares of Adjusted Gross Income (AGI), Selected Income \nItems, Credits, Total Income Tax, AGI Floor on Percentiles, and \nAverage Tax Rates, by Selected Expanded Descending Cumulative\nPercentiles of Returns Based on AGI, Tax Year {year} (Filing Year {filing_year})" + label: table title + table_record_kind: total + measures: + - measure_id: return_count + label: Number of returns excluding dependents + ordinal: 0 + column: B + source_column_id: return_count + expected_column_header_row: 3 + expected_column_header: "Number\nof\nreturns" + concept: irs_soi.individual_income_tax_returns_excluding_dependents + unit: count + aggregation: count + expected_cell_type: number + - measure_id: adjusted_gross_income + label: Adjusted gross income + ordinal: 1 + column: D + source_column_id: adjusted_gross_income + expected_column_header_row: 3 + expected_column_header: "Adjusted\ngross income\nless deficit" + concept: us:statutes/26/62#adjusted_gross_income + source_concept: irs_soi.adjusted_gross_income + concept_relation: exact + concept_authority: arch-us + concept_evidence_url: https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim) + concept_evidence_notes: > + IRS SOI Table 4.3 reports adjusted gross income less deficit for all + individual returns excluding dependents; IRC section 62 defines + adjusted gross income. This Arch assertion treats the SOI all-returns + AGI column as adopting that legal concept for the tax-year source + record. + legal_vintage: tax_year_{year} + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_wages_returns + label: Returns with total wages + ordinal: 2 + column: E + source_column_id: total_wages_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_total_wages + unit: count + aggregation: count + expected_cell_type: number + - measure_id: total_wages_amount + label: Total wages + ordinal: 3 + column: F + source_column_id: total_wages_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.total_wages + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_interest_returns + label: Returns with taxable interest + ordinal: 4 + column: G + source_column_id: taxable_interest_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_interest + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_interest_amount + label: Taxable interest + ordinal: 5 + column: H + source_column_id: taxable_interest_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.taxable_interest + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: ordinary_dividends_returns + label: Returns with ordinary dividends + ordinal: 6 + column: I + source_column_id: ordinary_dividends_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_ordinary_dividends + unit: count + aggregation: count + expected_cell_type: number + - measure_id: ordinary_dividends_amount + label: Ordinary dividends + ordinal: 7 + column: J + source_column_id: ordinary_dividends_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.ordinary_dividends + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: capital_asset_net_gain_returns + label: Returns with capital asset net gain less loss + ordinal: 8 + column: M + source_column_id: capital_asset_net_gain_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_capital_asset_net_gain_less_loss + unit: count + aggregation: count + expected_cell_type: number + - measure_id: capital_asset_net_gain_amount + label: Capital asset net gain less loss + ordinal: 9 + column: N + source_column_id: capital_asset_net_gain_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.capital_asset_net_gain_less_loss + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: itemized_deductions_returns + label: Returns with itemized deductions + ordinal: 10 + column: AC + source_column_id: itemized_deductions_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_itemized_deductions + unit: count + aggregation: count + expected_cell_type: number + - measure_id: itemized_deductions_amount + label: Itemized deductions + ordinal: 11 + column: AD + source_column_id: itemized_deductions_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.total_itemized_deductions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: taxable_income_returns + label: Returns with taxable income + ordinal: 12 + column: AO + source_column_id: taxable_income_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_taxable_income + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxable_income_amount + label: Taxable income + ordinal: 13 + column: AP + source_column_id: taxable_income_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.taxable_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: tax_credits_returns + label: Returns with tax credits + ordinal: 14 + column: AS + source_column_id: tax_credits_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_tax_credits + unit: count + aggregation: count + expected_cell_type: number + - measure_id: tax_credits_amount + label: Tax credits + ordinal: 15 + column: AT + source_column_id: tax_credits_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.tax_credits + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number + - measure_id: total_income_tax_returns + label: Returns with total income tax + ordinal: 16 + column: BI + source_column_id: total_income_tax_returns + expected_column_header_row: 6 + expected_column_header: "Number of\nreturns" + concept: irs_soi.returns_with_total_income_tax + unit: count + aggregation: count + expected_cell_type: number + - measure_id: total_income_tax_amount + label: Total income tax + ordinal: 17 + column: BJ + source_column_id: total_income_tax_amount + expected_column_header_row: 6 + expected_column_header: Amount + concept: irs_soi.total_income_tax + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/packages/irs_soi/w2_statistics_2020/source_package.yaml b/packages/irs_soi/w2_statistics_2020/source_package.yaml new file mode 100644 index 0000000..d999849 --- /dev/null +++ b/packages/irs_soi/w2_statistics_2020/source_package.yaml @@ -0,0 +1,157 @@ +schema_version: arch.source_package.v1 +package_id: soi-w2-statistics-2020 +label: IRS SOI 2020 Form W-2 social-security tips totals +artifact: + source_name: irs_soi + source_table: Table 4.B. Summary of Items for Taxpayers with Form W-2, by Return and Earner Type, Tax Year 2020 + resource_package: db + resource_directory: data/irs_soi/w2_statistics + manifest: manifest_2020_source_package.yaml + vintage: tax_year_{year} + extracted_at: '2026-05-08' + extraction_method: xlsx whole-workbook used-range cell parse + parser: xlsx_used_range + artifact_year: 2020 +record_sets: +- record_set_id: irs_soi.ty{year}.form_w2_social_security_tips + record_set_spec_id: irs_soi.form_w2_social_security_tips.v1 + source_record_id_prefix: irs_soi.ty{year}.form_w2_social_security_tips + sheet_name: Table 4.B + period_type: tax_year + period: '{year}' + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: current + entity: tax_unit + entity_role: filing_unit_with_form_w2 + domain: form_w2_items + groupby_dimension: irs_soi.form_w2_item + rows: + - value_id: box_7_social_security_tips + label: 'Box 7: Social security tips' + ordinal: 0 + row_number: 13 + expected_row_header_column: A + expected_row_header: 'Box 7: Social security tips' + table_record_kind: total + guard_cells: + - column: B + row: 3 + expected_value: All taxpayers + label: all taxpayers block + measures: + - measure_id: return_count + label: Number of returns + ordinal: 0 + column: B + source_column_id: number_of_returns + expected_column_header_row: 4 + expected_column_header: Number of returns + concept: irs_soi.form_w2_social_security_tip_returns + unit: count + aggregation: count + expected_cell_type: number + - measure_id: taxpayer_count + label: Number of taxpayers + ordinal: 1 + column: C + source_column_id: number_of_taxpayers + expected_column_header_row: 4 + expected_column_header: Number of taxpayers + concept: irs_soi.form_w2_social_security_tip_taxpayers + unit: count + aggregation: count + expected_cell_type: number + - measure_id: amount + label: Amount + ordinal: 2 + column: D + source_column_id: amount + expected_column_header_row: 4 + expected_column_header: Amount + concept: irs_soi.form_w2_social_security_tip_income + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number +- record_set_id: irs_soi.ty{year}.form_w2_401k_elective_deferrals + record_set_spec_id: irs_soi.form_w2_401k_elective_deferrals.v1 + source_record_id_prefix: irs_soi.ty{year}.form_w2_401k_elective_deferrals + sheet_name: Table 4.B + period_type: tax_year + period: '{year}' + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: current + entity: tax_unit + entity_role: filing_unit_with_form_w2 + domain: form_w2_items + groupby_dimension: irs_soi.form_w2_item + rows: + - value_id: box_12_d_401k_elective_deferrals + label: 'Box 12 D: 401(k) elective deferrals' + ordinal: 0 + row_number: 22 + expected_row_header_column: A + expected_row_header: D. 401(k) elective deferrals + table_record_kind: total + guard_cells: + - column: B + row: 3 + expected_value: All taxpayers + label: all taxpayers block + measures: + - measure_id: amount + label: Amount + ordinal: 0 + column: D + source_column_id: amount + expected_column_header_row: 4 + expected_column_header: Amount + concept: irs_soi.form_w2_401k_elective_deferrals + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number +- record_set_id: irs_soi.ty{year}.form_w2_designated_roth_401k_contributions + record_set_spec_id: irs_soi.form_w2_designated_roth_401k_contributions.v1 + source_record_id_prefix: irs_soi.ty{year}.form_w2_designated_roth_401k_contributions + sheet_name: Table 4.B + period_type: tax_year + period: '{year}' + geography_id: 0100000US + geography_level: country + geography_name: United States + geography_vintage: current + entity: tax_unit + entity_role: filing_unit_with_form_w2 + domain: form_w2_items + groupby_dimension: irs_soi.form_w2_item + rows: + - value_id: box_12_aa_designated_roth_401k_contributions + label: 'Box 12 AA: Designated Roth 401(k) contributions' + ordinal: 0 + row_number: 41 + expected_row_header_column: A + expected_row_header: AA. Designated Roth 401(k) contributions + table_record_kind: total + guard_cells: + - column: B + row: 3 + expected_value: All taxpayers + label: all taxpayers block + measures: + - measure_id: amount + label: Amount + ordinal: 0 + column: D + source_column_id: amount + expected_column_header_row: 4 + expected_column_header: Amount + concept: irs_soi.form_w2_designated_roth_401k_contributions + unit: usd + aggregation: sum + value_scale: 1000 + expected_cell_type: number diff --git a/tests/test_arch_bundle.py b/tests/test_arch_bundle.py index 89a3ef4..fe234dc 100644 --- a/tests/test_arch_bundle.py +++ b/tests/test_arch_bundle.py @@ -29,40 +29,67 @@ def test_build_bundle_writes_merged_consumer_contract(tmp_path): "aggregate_duplicate_key_count": 0, "entity_count": 1, "error_count": 0, - "fact_count": 340, + "fact_count": 399, "geography_count": 1, "period_count": 1, - "semantic_duplicate_key_count": 0, + "semantic_duplicate_key_count": 3, "skipped_source_count": 0, "source_count": 1, - "source_package_count": 2, - "warning_count": 0, + "source_package_count": 9, + "warning_count": 1, } - assert len(rows) == 340 + assert len(rows) == 399 assert rows[0]["aggregate_fact_key"].startswith("arch.aggregate_fact.v2:") assert rows[0]["semantic_fact_key"].startswith("arch.semantic_fact.v2:") - assert source_packages["source_package_count"] == 2 + assert source_packages["source_package_count"] == 9 assert source_packages["skipped_source_count"] == 0 assert not source_packages["skipped_sources"] - assert coverage["fact_count"] == 340 + assert coverage["fact_count"] == 399 assert coverage["counts"]["by_source"] == { - "irs_soi": 340, + "irs_soi": 399, } assert coverage["counts"]["by_source_table"] == { "irs_soi:Publication 1304 Table 1.1": 80, + "irs_soi:Publication 1304 Table 1.2": 7, "irs_soi:Publication 1304 Table 1.4": 260, + "irs_soi:Publication 1304 Table 2.1": 17, + "irs_soi:Publication 1304 Table 2.5": 8, + "irs_soi:Publication 1304 Table 4.3": 18, + ( + "irs_soi:Table 4.B. Summary of Items for Taxpayers with Form W-2, " + "by Return and Earner Type, Tax Year 2020" + ): 5, + ( + "irs_soi:Table 5. Taxpayers with Traditional Individual Retirement " + "Arrangement (IRA) Plan Contributions, by Size of Contribution and " + "Age of Taxpayer" + ): 2, + ( + "irs_soi:Table 6. Taxpayers with Roth Individual Retirement " + "Arrangement (IRA) Plan Contributions, by Size of Contribution and " + "Age of Taxpayer" + ): 2, } assert coverage["counts"]["by_period"] == { - "tax_year:2023": 340, + "tax_year:2023": 399, } assert coverage["counts"]["by_geography"] == { - "country:0100000US": 340, + "country:0100000US": 399, } assert coverage["counts"]["by_entity"] == { - "tax_unit": 340, + "tax_unit": 399, } assert not coverage["duplicates"]["aggregate_fact_keys"] - assert not coverage["duplicates"]["semantic_fact_keys"] + assert len(coverage["duplicates"]["semantic_fact_keys"]) == 3 + assert summary["warnings"] == [ + { + "code": "duplicate_semantic_fact_key", + "message": ( + "One or more semantic facts appear in multiple rows; downstream " + "consumers should reconcile or select sources." + ), + } + ] assert (output_dir / "sources" / "soi-table-1-1" / "consumer_facts.jsonl").exists() assert ( output_dir @@ -71,6 +98,12 @@ def test_build_bundle_writes_merged_consumer_contract(tmp_path): / "reports" / "build_summary.json" ).exists() + assert ( + output_dir + / "sources" + / "soi-ira-roth-contributions-2022" + / "consumer_facts.jsonl" + ).exists() def test_build_bundle_cli_supports_explicit_sources(tmp_path, capsys): diff --git a/tests/test_arch_source_package.py b/tests/test_arch_source_package.py index 748d08e..8be29dc 100644 --- a/tests/test_arch_source_package.py +++ b/tests/test_arch_source_package.py @@ -258,3 +258,63 @@ def test_validate_source_package_reports_fixture_counts(): "source_record_count": 80, "source_region_count": 1, } + + +def test_national_soi_source_package_aliases_validate_fixture_counts(): + expected_counts = { + "soi-table-1-2": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 7, + "source_record_count": 7, + "source_region_count": 1, + }, + "soi-table-2-1": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 17, + "source_record_count": 17, + "source_region_count": 1, + }, + "soi-table-2-5": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 8, + "source_record_count": 8, + "source_region_count": 1, + }, + "soi-table-4-3": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 18, + "source_record_count": 18, + "source_region_count": 1, + }, + "soi-w2-statistics-2020": { + "record_set_count": 3, + "row_count": 3, + "measure_count": 5, + "source_record_count": 5, + "source_region_count": 3, + }, + "soi-ira-traditional-contributions-2022": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 2, + "source_record_count": 2, + "source_region_count": 1, + }, + "soi-ira-roth-contributions-2022": { + "record_set_count": 1, + "row_count": 1, + "measure_count": 2, + "source_record_count": 2, + "source_region_count": 1, + }, + } + + for package_id, counts in expected_counts.items(): + report = validate_source_package(package_id, year=2023) + + assert report.valid, package_id + assert report.counts == counts