|
27 | 27 | "execution_count": null, |
28 | 28 | "metadata": {}, |
29 | 29 | "outputs": [], |
30 | | - "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n create_target_groups,\n drop_target_groups,\n get_geo_level,\n STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" |
| 30 | + "source": [ |
| 31 | + "import numpy as np\n", |
| 32 | + "import pandas as pd\n", |
| 33 | + "from policyengine_us import Microsimulation\n", |
| 34 | + "from policyengine_us_data.storage import STORAGE_FOLDER\n", |
| 35 | + "from policyengine_us_data.calibration.unified_matrix_builder import (\n", |
| 36 | + " UnifiedMatrixBuilder,\n", |
| 37 | + ")\n", |
| 38 | + "from policyengine_us_data.calibration.clone_and_assign import (\n", |
| 39 | + " assign_random_geography,\n", |
| 40 | + ")\n", |
| 41 | + "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", |
| 42 | + " create_target_groups,\n", |
| 43 | + " drop_target_groups,\n", |
| 44 | + " get_geo_level,\n", |
| 45 | + " STATE_CODES,\n", |
| 46 | + ")\n", |
| 47 | + "\n", |
| 48 | + "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", |
| 49 | + "db_uri = f\"sqlite:///{db_path}\"\n", |
| 50 | + "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" |
| 51 | + ] |
31 | 52 | }, |
32 | 53 | { |
33 | 54 | "cell_type": "code", |
|
82 | 103 | "execution_count": null, |
83 | 104 | "metadata": {}, |
84 | 105 | "outputs": [], |
85 | | - "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n n = (geo_levels == level).sum()\n if n > 0:\n print(f\" {level_names[level]}: {n} targets\")" |
| 106 | + "source": [ |
| 107 | + "print(f\"Targets: {X_sparse.shape[0]}\")\n", |
| 108 | + "print(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\n", |
| 109 | + "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n", |
| 110 | + "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n", |
| 111 | + "\n", |
| 112 | + "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n", |
| 113 | + "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n", |
| 114 | + "for level in [0, 1, 2]:\n", |
| 115 | + " n = (geo_levels == level).sum()\n", |
| 116 | + " if n > 0:\n", |
| 117 | + " print(f\" {level_names[level]}: {n} targets\")" |
| 118 | + ] |
86 | 119 | }, |
87 | 120 | { |
88 | 121 | "cell_type": "markdown", |
|
294 | 327 | "for gid, info in enumerate(group_info):\n", |
295 | 328 | " mask = target_groups == gid\n", |
296 | 329 | " vals = targets_df.loc[mask, \"value\"]\n", |
297 | | - " records.append({\n", |
298 | | - " \"group_id\": gid,\n", |
299 | | - " \"description\": info,\n", |
300 | | - " \"n_targets\": mask.sum(),\n", |
301 | | - " \"min_value\": vals.min(),\n", |
302 | | - " \"median_value\": vals.median(),\n", |
303 | | - " \"max_value\": vals.max(),\n", |
304 | | - " })\n", |
| 330 | + " records.append(\n", |
| 331 | + " {\n", |
| 332 | + " \"group_id\": gid,\n", |
| 333 | + " \"description\": info,\n", |
| 334 | + " \"n_targets\": mask.sum(),\n", |
| 335 | + " \"min_value\": vals.min(),\n", |
| 336 | + " \"median_value\": vals.median(),\n", |
| 337 | + " \"max_value\": vals.max(),\n", |
| 338 | + " }\n", |
| 339 | + " )\n", |
305 | 340 | "\n", |
306 | 341 | "group_df = pd.DataFrame(records)\n", |
307 | 342 | "print(group_df.to_string(index=False))" |
|
431 | 466 | " for r in nz_rows[:5]:\n", |
432 | 467 | " row = targets_df.iloc[r]\n", |
433 | 468 | " print(\n", |
434 | | - " f\" {row['variable']} (geo={row['geographic_id']}): \"\n", |
435 | | - " f\"{X_sparse[r, col]:.2f}\"\n", |
| 469 | + " f\" {row['variable']} (geo={row['geographic_id']}): {X_sparse[r, col]:.2f}\"\n", |
436 | 470 | " )\n", |
437 | 471 | " if len(nz_rows) > 5:\n", |
438 | 472 | " print(f\" ... and {len(nz_rows) - 5} more\")" |
|
475 | 509 | "execution_count": null, |
476 | 510 | "metadata": {}, |
477 | 511 | "outputs": [], |
478 | | - "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\" min: {nnz_per_row.min():,}\")\nprint(f\" median: {int(np.median(nnz_per_row)):,}\")\nprint(f\" mean: {nnz_per_row.mean():,.0f}\")\nprint(f\" max: {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n mask = (geo_levels == level).values\n if mask.any():\n vals = nnz_per_row[mask]\n print(\n f\" {level_names[level]:10s}: \"\n f\"n={mask.sum():>4d}, \"\n f\"median nnz={int(np.median(vals)):>7,}, \"\n f\"range=[{vals.min():,}, {vals.max():,}]\"\n )" |
| 512 | + "source": [ |
| 513 | + "nnz_per_row = np.diff(X_sparse.indptr)\n", |
| 514 | + "print(f\"Non-zeros per row:\")\n", |
| 515 | + "print(f\" min: {nnz_per_row.min():,}\")\n", |
| 516 | + "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n", |
| 517 | + "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n", |
| 518 | + "print(f\" max: {nnz_per_row.max():,}\")\n", |
| 519 | + "\n", |
| 520 | + "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n", |
| 521 | + "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n", |
| 522 | + "print(\"\\nBy geographic level:\")\n", |
| 523 | + "for level in [0, 1, 2]:\n", |
| 524 | + " mask = (geo_levels == level).values\n", |
| 525 | + " if mask.any():\n", |
| 526 | + " vals = nnz_per_row[mask]\n", |
| 527 | + " print(\n", |
| 528 | + " f\" {level_names[level]:10s}: \"\n", |
| 529 | + " f\"n={mask.sum():>4d}, \"\n", |
| 530 | + " f\"median nnz={int(np.median(vals)):>7,}, \"\n", |
| 531 | + " f\"range=[{vals.min():,}, {vals.max():,}]\"\n", |
| 532 | + " )" |
| 533 | + ] |
479 | 534 | }, |
480 | 535 | { |
481 | 536 | "cell_type": "code", |
|
498 | 553 | "clone_nnz = []\n", |
499 | 554 | "for ci in range(N_CLONES):\n", |
500 | 555 | " block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n", |
501 | | - " n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n", |
502 | | - " clone_nnz.append({\n", |
503 | | - " \"clone\": ci,\n", |
504 | | - " \"nnz\": block.nnz,\n", |
505 | | - " \"unique_states\": n_states,\n", |
506 | | - " })\n", |
| 556 | + " n_states = len(\n", |
| 557 | + " np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records])\n", |
| 558 | + " )\n", |
| 559 | + " clone_nnz.append(\n", |
| 560 | + " {\n", |
| 561 | + " \"clone\": ci,\n", |
| 562 | + " \"nnz\": block.nnz,\n", |
| 563 | + " \"unique_states\": n_states,\n", |
| 564 | + " }\n", |
| 565 | + " )\n", |
507 | 566 | "\n", |
508 | 567 | "clone_df = pd.DataFrame(clone_nnz)\n", |
509 | 568 | "print(\"Non-zeros per clone block:\")\n", |
|
666 | 725 | } |
667 | 726 | ], |
668 | 727 | "source": [ |
669 | | - "ratios = row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n", |
| 728 | + "ratios = (\n", |
| 729 | + " row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n", |
| 730 | + ")\n", |
670 | 731 | "ratio_df = targets_filtered[achievable_mask].copy()\n", |
671 | 732 | "ratio_df[\"row_sum\"] = row_sums[achievable_mask]\n", |
672 | 733 | "ratio_df[\"ratio\"] = ratios\n", |
|
0 commit comments