Skip to content

Commit c18fb70

Browse files
author
Michael Aydinbas
committed
Add clinic_data_static table creation
Implements create_table_clinic_static() replicating R pipeline's create_table_clinic_static_data(). Reads clinic_data.xlsx, forward-fills hierarchical columns, writes parquet. Wired into run_pipeline_cmd (step 3b) and registered in PARQUET_TO_TABLE for BigQuery upload.
1 parent 5560b38 commit c18fb70

1 file changed

Lines changed: 67 additions & 0 deletions

File tree

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""Create clinic static data table from reference data.
2+
3+
Replicates R pipeline's create_table_clinic_static_data() function:
4+
reads clinic_data.xlsx, fills down hierarchical columns, exports as parquet.
5+
"""
6+
7+
from pathlib import Path
8+
9+
import polars as pl
10+
from loguru import logger
11+
12+
from a4d.reference.loaders import find_reference_data_dir
13+
14+
# Columns that are filled downward so that rows left blank by merged/empty
# cells in the Excel sheet inherit the closest non-null value above them.
# Names listed here that are not present in the sheet are skipped by
# create_table_clinic_static().
# R: tidyr::fill(country_code:clinic_id, .direction = "down")
_FILL_COLUMNS = [
    "country",
    "clinic_province",
    "clinic_name",
    "clinic_status",
    "clinic_id",
    "country_code",
    "clinic_code",
    "patient_id_example",
]
26+
27+
28+
def create_table_clinic_static(output_dir: Path) -> Path:
    """Build the static clinic table and store it as a parquet file.

    Loads sheet 1 of clinic_data.xlsx from the reference data directory,
    removes every column whose name starts with "__UNNAMED", forward-fills
    the hierarchical columns listed in _FILL_COLUMNS (mirroring R's
    tidyr::fill with .direction = "down"), and writes the result to
    clinic_data_static.parquet inside output_dir.

    Args:
        output_dir: Target directory for the parquet file; it is created
            (including missing parents) when it does not exist yet.

    Returns:
        Path of the written clinic_data_static.parquet file.

    Raises:
        FileNotFoundError: If clinic_data.xlsx does not exist in the
            reference data directory.
    """
    clinic_file = find_reference_data_dir() / "clinic_data.xlsx"
    if not clinic_file.exists():
        raise FileNotFoundError(f"Clinic data file not found: {clinic_file}")

    logger.info(f"Reading clinic data from: {clinic_file}")
    df = pl.read_excel(clinic_file, sheet_id=1)

    # Header-less columns (e.g. the sheet's index column) are not part of the
    # table — R: select(2:11)
    headerless = [name for name in df.columns if name.startswith("__UNNAMED")]
    if headerless:
        df = df.drop(headerless)

    # Only fill columns that actually exist in the sheet; nulls take the
    # closest value above — R: tidyr::fill(..., .direction = "down")
    present = [name for name in _FILL_COLUMNS if name in df.columns]
    if present:
        df = df.with_columns(pl.col(present).forward_fill())

    logger.info(f"Clinic static data: {df.shape[0]} rows, {df.shape[1]} columns")

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "clinic_data_static.parquet"
    df.write_parquet(output_file)
    logger.info(f"Clinic static table saved: {output_file}")

    return output_file

0 commit comments

Comments
 (0)