|
474 | 474 | "docstring": "Build all datasets with preemption-resilient checkpointing.\n\nArgs:\n upload: Whether to upload completed datasets.\n branch: Git branch to build from.\n sequential: Use sequential (non-parallel) execution.\n clear_checkpoints: Clear existing checkpoints before starting.\n skip_tests: Skip running the test suite (useful for calibration runs).\n skip_enhanced_cps: Skip enhanced_cps.py and small_enhanced_cps.py\n (useful for calibration runs that only need source_imputed H5).\n skip_stage_5: Skip source-imputed CPS and small enhanced CPS after\n enhanced_cps_2024.h5 is built.\n stage_only: Upload to HF staging only, without promoting a release.\n version: policyengine-us-data package version used for staging and\n dataset-build contracts.", |
475 | 475 | "id": "build_datasets", |
476 | 476 | "kind": "function", |
477 | | - "line": 569, |
| 477 | + "line": 536, |
478 | 478 | "metadata": { |
479 | 479 | "api_refs": [ |
480 | 480 | "modal_app.data_build.build_datasets" |
|
999 | 999 | "docstring": "Build CPS before PUF because PUF pension imputation loads CPS_2024.", |
1000 | 1000 | "id": "cps_puf_build_phase", |
1001 | 1001 | "kind": "function", |
1002 | | - "line": 437, |
| 1002 | + "line": 404, |
1003 | 1003 | "metadata": { |
1004 | 1004 | "api_refs": [ |
1005 | 1005 | "modal_app.data_build.run_cps_then_puf_phase" |
|
3463 | 3463 | "signature": "def reconcile_ss_subcomponents(data: Dict[str, Dict[int, np.ndarray]], n_cps: int, time_period: int) -> None", |
3464 | 3464 | "source_file": "policyengine_us_data/calibration/puf_impute.py" |
3465 | 3465 | }, |
| 3466 | + "stage_1_dataset_artifact_specs": { |
| 3467 | + "docstring": "Return all artifact specs known to the Stage 1 dataset build.", |
| 3468 | + "id": "stage_1_dataset_artifact_specs", |
| 3469 | + "kind": "function", |
| 3470 | + "line": 230, |
| 3471 | + "metadata": { |
| 3472 | + "api_refs": [ |
| 3473 | + "policyengine_us_data.build_datasets.artifacts.stage_1_artifact_specs" |
| 3474 | + ], |
| 3475 | + "artifacts_out": [ |
| 3476 | + "uprating_factors.csv", |
| 3477 | + "acs_2022.h5", |
| 3478 | + "irs_puf_2015.h5", |
| 3479 | + "cps_2024.h5", |
| 3480 | + "puf_2024.h5", |
| 3481 | + "extended_cps_2024.h5", |
| 3482 | + "enhanced_cps_2024.h5", |
| 3483 | + "enhanced_cps_2024.clone_diagnostics.json", |
| 3484 | + "calibration_log.csv", |
| 3485 | + "stratified_extended_cps_2024.h5", |
| 3486 | + "source_imputed_stratified_extended_cps_2024.h5", |
| 3487 | + "small_enhanced_cps_2024.h5", |
| 3488 | + "source_imputed_stratified_extended_cps.h5", |
| 3489 | + "policy_data.db", |
| 3490 | + "build_log.txt", |
| 3491 | + "data_build_checkpoint_stats.json" |
| 3492 | + ], |
| 3493 | + "description": "Canonical artifact inventory for Stage 1 dataset-build outputs.", |
| 3494 | + "id": "stage_1_dataset_artifact_specs", |
| 3495 | + "label": "Stage 1 Dataset Artifact Specs", |
| 3496 | + "node_type": "library", |
| 3497 | + "pathways": [ |
| 3498 | + "data_build", |
| 3499 | + "stage_contracts", |
| 3500 | + "pipeline_docs" |
| 3501 | + ], |
| 3502 | + "source_file": "policyengine_us_data/build_datasets/artifacts.py", |
| 3503 | + "stability": "stable", |
| 3504 | + "status": "current", |
| 3505 | + "validation_commands": [ |
| 3506 | + "uv run pytest tests/unit/test_build_dataset_specs.py" |
| 3507 | + ] |
| 3508 | + }, |
| 3509 | + "object_path": "policyengine_us_data.build_datasets.artifacts.stage_1_artifact_specs", |
| 3510 | + "signature": "def stage_1_artifact_specs() -> tuple[DatasetArtifactSpec, ...]", |
| 3511 | + "source_file": "policyengine_us_data/build_datasets/artifacts.py" |
| 3512 | + }, |
| 3513 | + "stage_1_dataset_build_specs": { |
| 3514 | + "docstring": "Return the canonical Stage 1 dataset-build substage specs.", |
| 3515 | + "id": "stage_1_dataset_build_specs", |
| 3516 | + "kind": "function", |
| 3517 | + "line": 87, |
| 3518 | + "metadata": { |
| 3519 | + "api_refs": [ |
| 3520 | + "policyengine_us_data.build_datasets.specs.stage_1_step_specs" |
| 3521 | + ], |
| 3522 | + "description": "Canonical substage taxonomy for Stage 1 dataset-build contracts, step manifests, and pipeline documentation.", |
| 3523 | + "id": "stage_1_dataset_build_specs", |
| 3524 | + "label": "Stage 1 Dataset Build Specs", |
| 3525 | + "node_type": "library", |
| 3526 | + "pathways": [ |
| 3527 | + "data_build", |
| 3528 | + "stage_contracts", |
| 3529 | + "pipeline_docs" |
| 3530 | + ], |
| 3531 | + "source_file": "policyengine_us_data/build_datasets/specs.py", |
| 3532 | + "stability": "stable", |
| 3533 | + "status": "current", |
| 3534 | + "validation_commands": [ |
| 3535 | + "uv run pytest tests/unit/test_build_dataset_specs.py" |
| 3536 | + ] |
| 3537 | + }, |
| 3538 | + "object_path": "policyengine_us_data.build_datasets.specs.stage_1_step_specs", |
| 3539 | + "signature": "def stage_1_step_specs() -> tuple[DatasetBuildStepSpec, ...]", |
| 3540 | + "source_file": "policyengine_us_data/build_datasets/specs.py" |
| 3541 | + }, |
3466 | 3542 | "staging_upload": { |
3467 | 3543 | "docstring": "Upload files to HuggingFace staging only.\n\nGCS is updated during promote_publish, not here.\nPromote must be run separately via promote_publish.", |
3468 | 3544 | "id": "staging_upload", |
|
0 commit comments