Skip to content

Commit faa3c28

Browse files
author
Michael Aydinbas
committed
Add a new error log table that combines all errors in a single Parquet file, in preparation for BigQuery upload
1 parent 2069a4d commit faa3c28

12 files changed

Lines changed: 111 additions & 82 deletions

File tree

pyproject.toml

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@ version = "2.0.0"
44
description = "A4D Medical Tracker Data Processing Pipeline (Python)"
55
readme = "README.md"
66
requires-python = ">=3.14"
7-
authors = [
8-
{name = "Michael Aydinbas", email = "michael.aydinbas@gmail.com"}
9-
]
10-
license = {text = "MIT"}
7+
authors = [{ name = "Michael Aydinbas", email = "michael.aydinbas@gmail.com" }]
8+
license = { text = "MIT" }
119

1210
dependencies = [
1311
"polars>=0.20.0",
@@ -48,20 +46,20 @@ build-backend = "hatchling.build"
4846
line-length = 100
4947
target-version = "py314"
5048
lint.select = [
51-
"E", # pycodestyle errors
52-
"W", # pycodestyle warnings
53-
"F", # pyflakes
54-
"I", # isort
55-
"N", # pep8-naming
56-
"UP", # pyupgrade
57-
"B", # flake8-bugbear
58-
"A", # flake8-builtins
59-
"C4", # flake8-comprehensions
60-
"PT", # flake8-pytest-style
49+
"E", # pycodestyle errors
50+
"W", # pycodestyle warnings
51+
"F", # pyflakes
52+
"I", # isort
53+
"N", # pep8-naming
54+
"UP", # pyupgrade
55+
"B", # flake8-bugbear
56+
"A", # flake8-builtins
57+
"C4", # flake8-comprehensions
58+
"PT", # flake8-pytest-style
6159
]
6260

6361
[tool.ruff.lint.per-file-ignores]
64-
"__init__.py" = ["F401"] # Allow unused imports in __init__.py
62+
"__init__.py" = ["F401"] # Allow unused imports in __init__.py
6563

6664
[tool.pytest.ini_options]
6765
testpaths = ["tests"]
@@ -72,11 +70,5 @@ markers = [
7270
"integration: marks tests as integration tests requiring real tracker files",
7371
"e2e: marks tests as end-to-end tests (extraction + cleaning)",
7472
]
75-
addopts = [
76-
"--cov=src/a4d",
77-
"--cov-report=term-missing",
78-
"--cov-report=html",
79-
]
80-
filterwarnings = [
81-
"ignore::RuntimeWarning:google_crc32c",
82-
]
73+
addopts = ["--cov=src/a4d", "--cov-report=term-missing", "--cov-report=html"]
74+
filterwarnings = ["ignore::RuntimeWarning:google_crc32c"]

r-archive/scripts-root/logs.ipynb

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
"import pandas as pd\n",
1212
"from collections import defaultdict\n",
1313
"\n",
14-
"pd.set_option('display.max_colwidth', None)\n",
14+
"pd.set_option(\"display.max_colwidth\", None)\n",
1515
"pd.set_option(\"display.max_rows\", 1000)\n",
16-
"pd.set_option('display.width', None)"
16+
"pd.set_option(\"display.width\", None)"
1717
]
1818
},
1919
{
@@ -60,11 +60,11 @@
6060
" warn = None\n",
6161
" if \"ERROR\" in line:\n",
6262
" error = line.split(\"\\t\")[-1]\n",
63-
" \n",
63+
"\n",
6464
" if \"WARN\" in line:\n",
6565
" warn = line.split(\"\\t\")[-1]\n",
66-
" \n",
67-
" if error or warn: \n",
66+
"\n",
67+
" if error or warn:\n",
6868
" data[\"file\"].append(file.name)\n",
6969
" data[\"error\"].append(error)\n",
7070
" data[\"warning\"].append(warn)"
@@ -505,8 +505,7 @@
505505
],
506506
"source": [
507507
"# how many files could not be processed in %?\n",
508-
"len(df[~df.error.isna()]) / len(files) * 100\n",
509-
" "
508+
"len(df[~df.error.isna()]) / len(files) * 100"
510509
]
511510
},
512511
{
@@ -597,7 +596,12 @@
597596
],
598597
"source": [
599598
"# month list is empty?\n",
600-
"print(\"\\n\".join(l.split(\".\")[0] for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()))"
599+
"print(\n",
600+
" \"\\n\".join(\n",
601+
" l.split(\".\")[0]\n",
602+
" for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()\n",
603+
" )\n",
604+
")"
601605
]
602606
},
603607
{
@@ -627,7 +631,7 @@
627631
],
628632
"source": [
629633
"# cannot find patient data\n",
630-
"for year in range(2017,2023):\n",
634+
"for year in range(2017, 2023):\n",
631635
" subdf = df[(~df.error.isna()) & (df.error.str.contains(\"readxl::cell_limits\"))]\n",
632636
" print(subdf[subdf.file.str.startswith(str(year))].file.tolist())"
633637
]
@@ -854,7 +858,7 @@
854858
}
855859
],
856860
"source": [
857-
"df[~df.warning.isna()].drop_duplicates()\n"
861+
"df[~df.warning.isna()].drop_duplicates()"
858862
]
859863
},
860864
{
@@ -925,12 +929,17 @@
925929
}
926930
],
927931
"source": [
928-
"missed_names = df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))].warning.str.strip(\"Extra columns in patient data:\").drop_duplicates().to_list()\n",
932+
"missed_names = (\n",
933+
" df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))]\n",
934+
" .warning.str.strip(\"Extra columns in patient data:\")\n",
935+
" .drop_duplicates()\n",
936+
" .to_list()\n",
937+
")\n",
929938
"\n",
930939
"names = set()\n",
931940
"for x in missed_names:\n",
932941
" names.update(x for x in x.split(\",\") if x)\n",
933-
" \n",
942+
"\n",
934943
"names"
935944
]
936945
},
@@ -1877,7 +1886,7 @@
18771886
"source": [
18781887
"subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"Found invalid value\", regex=False))]\n",
18791888
"\n",
1880-
"subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0,3]].drop_duplicates()"
1889+
"subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0, 3]].drop_duplicates()"
18811890
]
18821891
},
18831892
{
@@ -3192,7 +3201,10 @@
31923201
}
31933202
],
31943203
"source": [
3195-
"subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))]\n",
3204+
"subdf = df[\n",
3205+
" (~df.warning.isna())\n",
3206+
" & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))\n",
3207+
"]\n",
31963208
"\n",
31973209
"subdf.warning.str.split(expand=True)[[3, 6]].drop_duplicates()"
31983210
]

r-archive/scripts-root/python/main.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
# "pandas>=3.0.1",
77
# ]
88
# ///
9-
""" Small cli helper tool to replace patient names with patient ids in excel files.
9+
"""Small cli helper tool to replace patient names with patient ids in excel files.
1010
1111
This script is used to replace patient names with patient ids in excel files.
1212
The script will look for excel files in the source directory and replace the names
13-
with the ids. The script will create a new directory called 'output' next to the source directory
13+
with the ids. The script will create a new directory called 'output' next to the source directory
1414
and save the changed files there.
15-
The source directory is specified by the user via prompt.
15+
The source directory is specified by the user via prompt.
1616
The output directory is specified by the user via option --output, and defaults to "output".
17-
The script is logging to a file called 'main_replace_patient_names.log'
17+
The script is logging to a file called 'main_replace_patient_names.log'
1818
in a subdirectory called 'logs' inside the output directory.
1919
2020
Example call:
@@ -78,9 +78,7 @@ def replace_name_with_id(src: Path, output: str):
7878

7979
logger.info("Start processing %s excel files.", len(excel_files))
8080
for i, excel_file in enumerate(excel_files):
81-
logger.info(
82-
"Start processing %s (%s/%s).", excel_file.name, i + 1, len(excel_files)
83-
)
81+
logger.info("Start processing %s (%s/%s).", excel_file.name, i + 1, len(excel_files))
8482

8583
try:
8684
wb = openpyxl.load_workbook(str(excel_file), data_only=True)
@@ -98,9 +96,7 @@ def replace_name_with_id(src: Path, output: str):
9896
patient_data = pd.DataFrame(
9997
[
10098
(b.value, c.value)
101-
for b, c in wb["Patient List"][
102-
PATIENT_DATA_RANGE[0] : PATIENT_DATA_RANGE[1]
103-
]
99+
for b, c in wb["Patient List"][PATIENT_DATA_RANGE[0] : PATIENT_DATA_RANGE[1]]
104100
if b.value and c.value
105101
],
106102
columns=["id", "name"],
@@ -129,9 +125,7 @@ def replace_name_with_id(src: Path, output: str):
129125
if not all(patients_replaced.values()):
130126
logger.warning(
131127
"Not all patient names were replaced. Missing patients: %s",
132-
", ".join(
133-
name for name, replaced in patients_replaced.items() if not replaced
134-
),
128+
", ".join(name for name, replaced in patients_replaced.items() if not replaced),
135129
)
136130

137131
wb.save(output_dir / excel_file.name)

r-archive/scripts-root/python/sort_yaml.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,25 @@
55

66

77
def parse_args():
8-
parser = ArgumentParser(description='Sort YAML file')
9-
parser.add_argument('file', type=str, help='YAML file to sort')
8+
parser = ArgumentParser(description="Sort YAML file")
9+
parser.add_argument("file", type=str, help="YAML file to sort")
1010
return parser.parse_args()
1111

1212

1313
def sort_yaml():
1414
args = parse_args()
1515
yaml_file = Path(args.file).resolve()
1616
if not yaml_file.is_file():
17-
print(f'File not found: {yaml_file}')
17+
print(f"File not found: {yaml_file}")
1818
return
19-
20-
with open(yaml_file, 'r') as f:
19+
20+
with open(yaml_file, "r") as f:
2121
data = yaml.safe_load(f)
22-
23-
with open(yaml_file, 'w') as f:
22+
23+
with open(yaml_file, "w") as f:
2424
yaml.dump(data, f, sort_keys=True)
2525
print("YAML file sorted")
26-
27-
if __name__ == '__main__':
28-
sort_yaml()
2926

27+
28+
if __name__ == "__main__":
29+
sort_yaml()

src/a4d/clean/date_parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ def parse_date_flexible(date_str: str | None, error_val: str = "9999-09-09") ->
116116
return result
117117
except (ValueError, date_parser.ParserError) as e:
118118
# If parsing fails, log warning and return error date
119-
logger.bind(error_code="invalid_value").warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
119+
logger.bind(error_code="invalid_value").warning(
120+
f"Could not parse date '{date_str}': {e}. Returning error value {error_val}"
121+
)
120122
try:
121123
return datetime.strptime(error_val, "%Y-%m-%d").date()
122124
except ValueError:

src/a4d/clean/transformers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,9 @@ def fix_value(value: str | None) -> str | None:
324324

325325
# Log warning if any ranges were found
326326
if has_ranges:
327-
logger.bind(error_code="invalid_value").warning("Found ranges in testing_frequency column. Replacing with mean values.")
327+
logger.bind(error_code="invalid_value").warning(
328+
"Found ranges in testing_frequency column. Replacing with mean values."
329+
)
328330

329331
return df
330332

src/a4d/cli.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,9 @@ def download_reference_data_cmd() -> None:
554554
console.print("Downloading clinic_data.xlsx from Google Drive...")
555555
path = download_clinic_data(reference_dir)
556556
size_kb = path.stat().st_size / 1024
557-
console.print(f" [bold green]✓[/bold green] clinic_data.xlsx ({size_kb:.1f} KB) -> {path}\n")
557+
console.print(
558+
f" [bold green]✓[/bold green] clinic_data.xlsx ({size_kb:.1f} KB) -> {path}\n"
559+
)
558560
except Exception as e:
559561
console.print(f" [bold red]✗ Download failed: {e}[/bold red]\n")
560562
raise typer.Exit(1) from e
@@ -629,7 +631,9 @@ def run_pipeline_cmd(
629631
console.print(f"Workers: {_workers}")
630632
console.print(f"Project: {settings.project_id}")
631633
console.print(f"Dataset: {settings.dataset}")
632-
console.print(f"Drive: {'yes' if not skip_drive_download else 'skipped (--skip-drive-download)'}")
634+
console.print(
635+
f"Drive: {'yes' if not skip_drive_download else 'skipped (--skip-drive-download)'}"
636+
)
633637
console.print(f"Download: {'yes' if not skip_download else 'skipped (--skip-download)'}")
634638
console.print(f"Upload: {'yes' if not skip_upload else 'skipped (--skip-upload)'}")
635639
console.print()
@@ -714,7 +718,9 @@ def run_pipeline_cmd(
714718
uploaded += upload_output(source_dir=tables_dir, prefix=f"{run_ts}/tables")
715719
if logs_dir.exists():
716720
uploaded += upload_output(source_dir=logs_dir, prefix=f"{run_ts}/logs")
717-
console.print(f" ✓ Uploaded {len(uploaded)} files to gs://{settings.upload_bucket}/{run_ts}/\n")
721+
console.print(
722+
f" ✓ Uploaded {len(uploaded)} files to gs://{settings.upload_bucket}/{run_ts}/\n"
723+
)
718724
except Exception as e:
719725
console.print(f"\n[bold red]Error during GCS upload: {e}[/bold red]\n")
720726
raise typer.Exit(1) from e

src/a4d/extract/patient.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,9 @@ def extract_patient_data(
510510
if not valid_cols:
511511
if close_wb:
512512
workbook.close()
513-
logger.bind(error_code="invalid_tracker").warning(f"No valid headers found in sheet '{sheet_name}'")
513+
logger.bind(error_code="invalid_tracker").warning(
514+
f"No valid headers found in sheet '{sheet_name}'"
515+
)
514516
return pl.DataFrame()
515517

516518
data = read_patient_rows(ws, data_start_row, len(headers))
@@ -689,7 +691,9 @@ def read_all_patient_sheets(
689691
df_sheet = extract_patient_data(tracker_file, sheet_name, year, mapper=mapper, workbook=wb)
690692

691693
if df_sheet.is_empty():
692-
logger.bind(error_code="invalid_tracker").warning(f"Sheet '{sheet_name}' has no data, skipping")
694+
logger.bind(error_code="invalid_tracker").warning(
695+
f"Sheet '{sheet_name}' has no data, skipping"
696+
)
693697
continue
694698

695699
df_sheet = harmonize_patient_data_columns(df_sheet, mapper=mapper, strict=False)
@@ -703,7 +707,9 @@ def read_all_patient_sheets(
703707
try:
704708
month_num = extract_tracker_month(sheet_name)
705709
except ValueError as e:
706-
logger.bind(error_code="invalid_tracker").warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
710+
logger.bind(error_code="invalid_tracker").warning(
711+
f"Could not extract month from '{sheet_name}': {e}, skipping"
712+
)
707713
continue
708714

709715
# Derived metadata (year, month) use Int64; text metadata (sheet_name, etc.) use String
@@ -843,7 +849,9 @@ def read_all_patient_sheets(
843849
else:
844850
logger.bind(error_code="invalid_tracker").warning("Patient List sheet is empty")
845851
except Exception as e:
846-
logger.bind(error_code="invalid_tracker").warning(f"Could not process Patient List sheet: {e}")
852+
logger.bind(error_code="invalid_tracker").warning(
853+
f"Could not process Patient List sheet: {e}"
854+
)
847855

848856
# Process Annual sheet if it exists (R: lines 132-160)
849857
if "Annual" in all_sheets:
@@ -884,11 +892,15 @@ def read_all_patient_sheets(
884892
)
885893
logger.info(f"Joined {len(annual_data)} Annual records")
886894
else:
887-
logger.bind(error_code="invalid_tracker").warning("Annual sheet has no 'patient_id' column after harmonization")
895+
logger.bind(error_code="invalid_tracker").warning(
896+
"Annual sheet has no 'patient_id' column after harmonization"
897+
)
888898
else:
889899
logger.bind(error_code="invalid_tracker").warning("Annual sheet is empty")
890900
except Exception as e:
891-
logger.bind(error_code="invalid_tracker").warning(f"Could not process Annual sheet: {e}")
901+
logger.bind(error_code="invalid_tracker").warning(
902+
f"Could not process Annual sheet: {e}"
903+
)
892904

893905
# Close workbook after all processing
894906
wb.close()

src/a4d/gcp/drive.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,7 @@ def download_clinic_data(destination: Path) -> Path:
3939

4040
logger.info(f"Downloading clinic_data.xlsx from Google Drive (file ID: {CLINIC_DATA_FILE_ID})")
4141

42-
credentials, _ = google.auth.default(
43-
scopes=["https://www.googleapis.com/auth/drive.readonly"]
44-
)
42+
credentials, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/drive.readonly"])
4543
session = google.auth.transport.requests.AuthorizedSession(credentials)
4644

4745
# clinic_data is a Google Sheets file — must use export endpoint, not alt=media.

src/a4d/pipeline/tracker.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,9 @@ def process_tracker_patient(
103103
)
104104

105105
except Exception as e:
106-
logger.bind(error_code="critical_abort").exception(f"Failed to process tracker: {tracker_file.name}")
106+
logger.bind(error_code="critical_abort").exception(
107+
f"Failed to process tracker: {tracker_file.name}"
108+
)
107109
return TrackerResult(
108110
tracker_file=tracker_file,
109111
tracker_name=tracker_name,

0 commit comments

Comments
 (0)