Add new error log table that combines all errors into a single Parquet file and prepares it for BigQuery upload

Michael Aydinbas · Michael Aydinbas · commit faa3c2801d4c · 2026-04-01T16:55:23.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,10 +4,8 @@ version = "2.0.0"
 description = "A4D Medical Tracker Data Processing Pipeline (Python)"
 readme = "README.md"
 requires-python = ">=3.14"
-authors = [
-    {name = "Michael Aydinbas", email = "michael.aydinbas@gmail.com"}
-]
-license = {text = "MIT"}
+authors = [{ name = "Michael Aydinbas", email = "michael.aydinbas@gmail.com" }]
+license = { text = "MIT" }
 
 dependencies = [
     "polars>=0.20.0",
@@ -48,20 +46,20 @@ build-backend = "hatchling.build"
 line-length = 100
 target-version = "py314"
 lint.select = [
-    "E",   # pycodestyle errors
-    "W",   # pycodestyle warnings
-    "F",   # pyflakes
-    "I",   # isort
-    "N",   # pep8-naming
-    "UP",  # pyupgrade
-    "B",   # flake8-bugbear
-    "A",   # flake8-builtins
-    "C4",  # flake8-comprehensions
-    "PT",  # flake8-pytest-style
+    "E",  # pycodestyle errors
+    "W",  # pycodestyle warnings
+    "F",  # pyflakes
+    "I",  # isort
+    "N",  # pep8-naming
+    "UP", # pyupgrade
+    "B",  # flake8-bugbear
+    "A",  # flake8-builtins
+    "C4", # flake8-comprehensions
+    "PT", # flake8-pytest-style
 ]
 
 [tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["F401"]  # Allow unused imports in __init__.py
+"__init__.py" = ["F401"] # Allow unused imports in __init__.py
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
@@ -72,11 +70,5 @@ markers = [
     "integration: marks tests as integration tests requiring real tracker files",
     "e2e: marks tests as end-to-end tests (extraction + cleaning)",
 ]
-addopts = [
-    "--cov=src/a4d",
-    "--cov-report=term-missing",
-    "--cov-report=html",
-]
-filterwarnings = [
-    "ignore::RuntimeWarning:google_crc32c",
-]
+addopts = ["--cov=src/a4d", "--cov-report=term-missing", "--cov-report=html"]
+filterwarnings = ["ignore::RuntimeWarning:google_crc32c"]
diff --git a/r-archive/scripts-root/logs.ipynb b/r-archive/scripts-root/logs.ipynb
@@ -11,9 +11,9 @@
     "import pandas as pd\n",
     "from collections import defaultdict\n",
     "\n",
-    "pd.set_option('display.max_colwidth', None)\n",
+    "pd.set_option(\"display.max_colwidth\", None)\n",
     "pd.set_option(\"display.max_rows\", 1000)\n",
-    "pd.set_option('display.width', None)"
+    "pd.set_option(\"display.width\", None)"
    ]
   },
   {
@@ -60,11 +60,11 @@
     "            warn = None\n",
     "            if \"ERROR\" in line:\n",
     "                error = line.split(\"\\t\")[-1]\n",
-    "                \n",
+    "\n",
     "            if \"WARN\" in line:\n",
     "                warn = line.split(\"\\t\")[-1]\n",
-    "            \n",
-    "            if error or warn: \n",
+    "\n",
+    "            if error or warn:\n",
     "                data[\"file\"].append(file.name)\n",
     "                data[\"error\"].append(error)\n",
     "                data[\"warning\"].append(warn)"
@@ -505,8 +505,7 @@
    ],
    "source": [
     "# how many files could not be processed in %?\n",
-    "len(df[~df.error.isna()]) / len(files) * 100\n",
-    "                                                        "
+    "len(df[~df.error.isna()]) / len(files) * 100"
    ]
   },
   {
@@ -597,7 +596,12 @@
    ],
    "source": [
     "# month list is empty?\n",
-    "print(\"\\n\".join(l.split(\".\")[0] for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()))"
+    "print(\n",
+    "    \"\\n\".join(\n",
+    "        l.split(\".\")[0]\n",
+    "        for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()\n",
+    "    )\n",
+    ")"
    ]
   },
   {
@@ -627,7 +631,7 @@
    ],
    "source": [
     "# cannot find patient data\n",
-    "for year in range(2017,2023):\n",
+    "for year in range(2017, 2023):\n",
     "    subdf = df[(~df.error.isna()) & (df.error.str.contains(\"readxl::cell_limits\"))]\n",
     "    print(subdf[subdf.file.str.startswith(str(year))].file.tolist())"
    ]
@@ -854,7 +858,7 @@
     }
    ],
    "source": [
-    "df[~df.warning.isna()].drop_duplicates()\n"
+    "df[~df.warning.isna()].drop_duplicates()"
    ]
   },
   {
@@ -925,12 +929,17 @@
     }
    ],
    "source": [
-    "missed_names = df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))].warning.str.strip(\"Extra columns in patient data:\").drop_duplicates().to_list()\n",
+    "missed_names = (\n",
+    "    df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))]\n",
+    "    .warning.str.strip(\"Extra columns in patient data:\")\n",
+    "    .drop_duplicates()\n",
+    "    .to_list()\n",
+    ")\n",
     "\n",
     "names = set()\n",
     "for x in missed_names:\n",
     "    names.update(x for x in x.split(\",\") if x)\n",
-    "    \n",
+    "\n",
     "names"
    ]
   },
@@ -1877,7 +1886,7 @@
    "source": [
     "subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"Found invalid value\", regex=False))]\n",
     "\n",
-    "subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0,3]].drop_duplicates()"
+    "subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0, 3]].drop_duplicates()"
    ]
   },
   {
@@ -3192,7 +3201,10 @@
     }
    ],
    "source": [
-    "subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))]\n",
+    "subdf = df[\n",
+    "    (~df.warning.isna())\n",
+    "    & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))\n",
+    "]\n",
     "\n",
     "subdf.warning.str.split(expand=True)[[3, 6]].drop_duplicates()"
    ]
diff --git a/r-archive/scripts-root/python/main.py b/r-archive/scripts-root/python/main.py
@@ -6,15 +6,15 @@
 #     "pandas>=3.0.1",
 # ]
 # ///
-""" Small cli helper tool to replace patient names with patient ids in excel files.
+"""Small cli helper tool to replace patient names with patient ids in excel files.
 
 This script is used to replace patient names with patient ids in excel files.
 The script will look for excel files in the source directory and replace the names
-with the ids. The script will create a new directory called 'output' next to the source directory 
+with the ids. The script will create a new directory called 'output' next to the source directory
 and save the changed files there.
-The source directory is specified by the user via prompt. 
+The source directory is specified by the user via prompt.
 The output directory is specified by the user via option --output, and defaults to "output".
-The script is logging to a file called 'main_replace_patient_names.log' 
+The script is logging to a file called 'main_replace_patient_names.log'
 in a subdirectory called 'logs' inside the output directory.
 
 Example call:
@@ -78,9 +78,7 @@ def replace_name_with_id(src: Path, output: str):
 
     logger.info("Start processing %s excel files.", len(excel_files))
     for i, excel_file in enumerate(excel_files):
-        logger.info(
-            "Start processing %s (%s/%s).", excel_file.name, i + 1, len(excel_files)
-        )
+        logger.info("Start processing %s (%s/%s).", excel_file.name, i + 1, len(excel_files))
 
         try:
             wb = openpyxl.load_workbook(str(excel_file), data_only=True)
@@ -98,9 +96,7 @@ def replace_name_with_id(src: Path, output: str):
         patient_data = pd.DataFrame(
             [
                 (b.value, c.value)
-                for b, c in wb["Patient List"][
-                    PATIENT_DATA_RANGE[0] : PATIENT_DATA_RANGE[1]
-                ]
+                for b, c in wb["Patient List"][PATIENT_DATA_RANGE[0] : PATIENT_DATA_RANGE[1]]
                 if b.value and c.value
             ],
             columns=["id", "name"],
@@ -129,9 +125,7 @@ def replace_name_with_id(src: Path, output: str):
         if not all(patients_replaced.values()):
             logger.warning(
                 "Not all patient names were replaced. Missing patients: %s",
-                ", ".join(
-                    name for name, replaced in patients_replaced.items() if not replaced
-                ),
+                ", ".join(name for name, replaced in patients_replaced.items() if not replaced),
             )
 
         wb.save(output_dir / excel_file.name)
diff --git a/r-archive/scripts-root/python/sort_yaml.py b/r-archive/scripts-root/python/sort_yaml.py
@@ -5,25 +5,25 @@
 
 
 def parse_args():
-    parser = ArgumentParser(description='Sort YAML file')
-    parser.add_argument('file', type=str, help='YAML file to sort')
+    parser = ArgumentParser(description="Sort YAML file")
+    parser.add_argument("file", type=str, help="YAML file to sort")
     return parser.parse_args()
 
 
 def sort_yaml():
     args = parse_args()
     yaml_file = Path(args.file).resolve()
     if not yaml_file.is_file():
-        print(f'File not found: {yaml_file}')
+        print(f"File not found: {yaml_file}")
         return
-    
-    with open(yaml_file, 'r') as f:
+
+    with open(yaml_file, "r") as f:
         data = yaml.safe_load(f)
-    
-    with open(yaml_file, 'w') as f:
+
+    with open(yaml_file, "w") as f:
         yaml.dump(data, f, sort_keys=True)
         print("YAML file sorted")
-    
-if __name__ == '__main__':
-    sort_yaml()
 
+
+if __name__ == "__main__":
+    sort_yaml()
diff --git a/src/a4d/clean/date_parser.py b/src/a4d/clean/date_parser.py
@@ -116,7 +116,9 @@ def parse_date_flexible(date_str: str | None, error_val: str = "9999-09-09") ->
         return result
     except (ValueError, date_parser.ParserError) as e:
         # If parsing fails, log warning and return error date
-        logger.bind(error_code="invalid_value").warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
+        logger.bind(error_code="invalid_value").warning(
+            f"Could not parse date '{date_str}': {e}. Returning error value {error_val}"
+        )
         try:
             return datetime.strptime(error_val, "%Y-%m-%d").date()
         except ValueError:
diff --git a/src/a4d/clean/transformers.py b/src/a4d/clean/transformers.py
@@ -324,7 +324,9 @@ def fix_value(value: str | None) -> str | None:
 
     # Log warning if any ranges were found
     if has_ranges:
-        logger.bind(error_code="invalid_value").warning("Found ranges in testing_frequency column. Replacing with mean values.")
+        logger.bind(error_code="invalid_value").warning(
+            "Found ranges in testing_frequency column. Replacing with mean values."
+        )
 
     return df
 
diff --git a/src/a4d/cli.py b/src/a4d/cli.py
@@ -554,7 +554,9 @@ def download_reference_data_cmd() -> None:
         console.print("Downloading clinic_data.xlsx from Google Drive...")
         path = download_clinic_data(reference_dir)
         size_kb = path.stat().st_size / 1024
-        console.print(f"  [bold green]✓[/bold green] clinic_data.xlsx ({size_kb:.1f} KB) -> {path}\n")
+        console.print(
+            f"  [bold green]✓[/bold green] clinic_data.xlsx ({size_kb:.1f} KB) -> {path}\n"
+        )
     except Exception as e:
         console.print(f"  [bold red]✗ Download failed: {e}[/bold red]\n")
         raise typer.Exit(1) from e
@@ -629,7 +631,9 @@ def run_pipeline_cmd(
     console.print(f"Workers:     {_workers}")
     console.print(f"Project:     {settings.project_id}")
     console.print(f"Dataset:     {settings.dataset}")
-    console.print(f"Drive:       {'yes' if not skip_drive_download else 'skipped (--skip-drive-download)'}")
+    console.print(
+        f"Drive:       {'yes' if not skip_drive_download else 'skipped (--skip-drive-download)'}"
+    )
     console.print(f"Download:    {'yes' if not skip_download else 'skipped (--skip-download)'}")
     console.print(f"Upload:      {'yes' if not skip_upload else 'skipped (--skip-upload)'}")
     console.print()
@@ -714,7 +718,9 @@ def run_pipeline_cmd(
                 uploaded += upload_output(source_dir=tables_dir, prefix=f"{run_ts}/tables")
             if logs_dir.exists():
                 uploaded += upload_output(source_dir=logs_dir, prefix=f"{run_ts}/logs")
-            console.print(f"  ✓ Uploaded {len(uploaded)} files to gs://{settings.upload_bucket}/{run_ts}/\n")
+            console.print(
+                f"  ✓ Uploaded {len(uploaded)} files to gs://{settings.upload_bucket}/{run_ts}/\n"
+            )
         except Exception as e:
             console.print(f"\n[bold red]Error during GCS upload: {e}[/bold red]\n")
             raise typer.Exit(1) from e
diff --git a/src/a4d/extract/patient.py b/src/a4d/extract/patient.py
@@ -510,7 +510,9 @@ def extract_patient_data(
     if not valid_cols:
         if close_wb:
             workbook.close()
-        logger.bind(error_code="invalid_tracker").warning(f"No valid headers found in sheet '{sheet_name}'")
+        logger.bind(error_code="invalid_tracker").warning(
+            f"No valid headers found in sheet '{sheet_name}'"
+        )
         return pl.DataFrame()
 
     data = read_patient_rows(ws, data_start_row, len(headers))
@@ -689,7 +691,9 @@ def read_all_patient_sheets(
         df_sheet = extract_patient_data(tracker_file, sheet_name, year, mapper=mapper, workbook=wb)
 
         if df_sheet.is_empty():
-            logger.bind(error_code="invalid_tracker").warning(f"Sheet '{sheet_name}' has no data, skipping")
+            logger.bind(error_code="invalid_tracker").warning(
+                f"Sheet '{sheet_name}' has no data, skipping"
+            )
             continue
 
         df_sheet = harmonize_patient_data_columns(df_sheet, mapper=mapper, strict=False)
@@ -703,7 +707,9 @@ def read_all_patient_sheets(
         try:
             month_num = extract_tracker_month(sheet_name)
         except ValueError as e:
-            logger.bind(error_code="invalid_tracker").warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
+            logger.bind(error_code="invalid_tracker").warning(
+                f"Could not extract month from '{sheet_name}': {e}, skipping"
+            )
             continue
 
         # Derived metadata (year, month) use Int64; text metadata (sheet_name, etc.) use String
@@ -843,7 +849,9 @@ def read_all_patient_sheets(
             else:
                 logger.bind(error_code="invalid_tracker").warning("Patient List sheet is empty")
         except Exception as e:
-            logger.bind(error_code="invalid_tracker").warning(f"Could not process Patient List sheet: {e}")
+            logger.bind(error_code="invalid_tracker").warning(
+                f"Could not process Patient List sheet: {e}"
+            )
 
     # Process Annual sheet if it exists (R: lines 132-160)
     if "Annual" in all_sheets:
@@ -884,11 +892,15 @@ def read_all_patient_sheets(
                     )
                     logger.info(f"Joined {len(annual_data)} Annual records")
                 else:
-                    logger.bind(error_code="invalid_tracker").warning("Annual sheet has no 'patient_id' column after harmonization")
+                    logger.bind(error_code="invalid_tracker").warning(
+                        "Annual sheet has no 'patient_id' column after harmonization"
+                    )
             else:
                 logger.bind(error_code="invalid_tracker").warning("Annual sheet is empty")
         except Exception as e:
-            logger.bind(error_code="invalid_tracker").warning(f"Could not process Annual sheet: {e}")
+            logger.bind(error_code="invalid_tracker").warning(
+                f"Could not process Annual sheet: {e}"
+            )
 
     # Close workbook after all processing
     wb.close()
diff --git a/src/a4d/gcp/drive.py b/src/a4d/gcp/drive.py
@@ -39,9 +39,7 @@ def download_clinic_data(destination: Path) -> Path:
 
     logger.info(f"Downloading clinic_data.xlsx from Google Drive (file ID: {CLINIC_DATA_FILE_ID})")
 
-    credentials, _ = google.auth.default(
-        scopes=["https://www.googleapis.com/auth/drive.readonly"]
-    )
+    credentials, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/drive.readonly"])
     session = google.auth.transport.requests.AuthorizedSession(credentials)
 
     # clinic_data is a Google Sheets file — must use export endpoint, not alt=media.
diff --git a/src/a4d/pipeline/tracker.py b/src/a4d/pipeline/tracker.py
@@ -103,7 +103,9 @@ def process_tracker_patient(
         )
 
     except Exception as e:
-        logger.bind(error_code="critical_abort").exception(f"Failed to process tracker: {tracker_file.name}")
+        logger.bind(error_code="critical_abort").exception(
+            f"Failed to process tracker: {tracker_file.name}"
+        )
         return TrackerResult(
             tracker_file=tracker_file,
             tracker_name=tracker_name,
diff --git a/src/a4d/tables/errors.py b/src/a4d/tables/errors.py
diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py