|
| 1 | +"""Create clinic static data table from reference data. |
| 2 | +
|
| 3 | +Replicates R pipeline's create_table_clinic_static_data() function: |
| 4 | +reads clinic_data.xlsx, fills down hierarchical columns, exports as parquet. |
| 5 | +""" |
| 6 | + |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | +import polars as pl |
| 10 | +from loguru import logger |
| 11 | + |
| 12 | +from a4d.reference.loaders import find_reference_data_dir |
| 13 | + |
| 14 | +# Text columns filled downward to handle merged/blank cells in the Excel sheet. |
| 15 | +# R: tidyr::fill(country_code:clinic_id, .direction = "down") |
| 16 | +_FILL_COLUMNS = [ |
| 17 | + "country", |
| 18 | + "clinic_province", |
| 19 | + "clinic_name", |
| 20 | + "clinic_status", |
| 21 | + "clinic_id", |
| 22 | + "country_code", |
| 23 | + "clinic_code", |
| 24 | + "patient_id_example", |
| 25 | +] |
| 26 | + |
| 27 | + |
def create_table_clinic_static(output_dir: Path) -> Path:
    """Build the clinic static data parquet table from the reference workbook.

    Loads the first sheet of ``clinic_data.xlsx`` from the reference data
    directory, drops every column whose name starts with ``__UNNAMED``,
    forward-fills nulls in the columns listed in ``_FILL_COLUMNS`` (the
    counterpart of R's ``tidyr::fill(..., .direction = "down")``), and writes
    the result to ``clinic_data_static.parquet`` inside ``output_dir``.

    Args:
        output_dir: Directory that receives the parquet file. It is created,
            including missing parents, if it does not exist yet.

    Returns:
        Path to the written ``clinic_data_static.parquet``.

    Raises:
        FileNotFoundError: If ``clinic_data.xlsx`` does not exist in the
            reference data directory.
    """
    clinic_file = find_reference_data_dir() / "clinic_data.xlsx"
    if not clinic_file.exists():
        raise FileNotFoundError(f"Clinic data file not found: {clinic_file}")

    logger.info(f"Reading clinic data from: {clinic_file}")
    table = pl.read_excel(clinic_file, sheet_id=1)

    # The sheet carries a header-less index column; R handles it with select(2:11).
    if unnamed := [name for name in table.columns if name.startswith("__UNNAMED")]:
        table = table.drop(unnamed)

    # Only fill the hierarchical columns that are actually present in the sheet.
    if present := [name for name in _FILL_COLUMNS if name in table.columns]:
        table = table.with_columns(pl.col(present).forward_fill())

    n_rows, n_cols = table.shape
    logger.info(f"Clinic static data: {n_rows} rows, {n_cols} columns")

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "clinic_data_static.parquet"
    table.write_parquet(output_file)
    logger.info(f"Clinic static table saved: {output_file}")

    return output_file
0 commit comments