Add error_code classification to pipeline logs

Michael Aydinbas · Michael Aydinbas · commit 7e345dd0f488 · 2026-03-01T04:28:30.000+01:00
Adds a structured error_code field to all loguru log records, matching the R
pipeline's errorCode/warningCode concept so the dashboard can filter on it.

- tables/logs.py: extract error_code from loguru extra dict, add to schema
- logging.py: fix silent bug (logger.exception kwarg silently discarded);
  use logger.bind(error_code=...) pattern throughout
- 22 call sites across pipeline/tracker.py, clean/patient.py,
  clean/transformers.py, clean/date_parser.py, reference/synonyms.py,
  extract/patient.py, tagged with the codes:
  critical_abort, invalid_value, missing_value, missing_column, invalid_tracker
diff --git a/a4d-python/src/a4d/clean/date_parser.py b/a4d-python/src/a4d/clean/date_parser.py
@@ -116,7 +116,7 @@ def parse_date_flexible(date_str: str | None, error_val: str = "9999-09-09") ->
         return result
     except (ValueError, date_parser.ParserError) as e:
         # If parsing fails, log warning and return error date
-        logger.warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
+        logger.bind(error_code="invalid_value").warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
         try:
             return datetime.strptime(error_val, "%Y-%m-%d").date()
         except ValueError:
diff --git a/a4d-python/src/a4d/clean/patient.py b/a4d-python/src/a4d/clean/patient.py
@@ -671,7 +671,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
         calc_age = row["_calc_age"]
 
         if excel_age is None or (excel_age == settings.error_val_numeric):
-            logger.warning(
+            logger.bind(error_code="missing_value").warning(
                 f"Patient {patient_id}: age is missing. "
                 f"Using calculated age {calc_age} instead of original age."
             )
@@ -686,7 +686,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
             )
             ages_missing += 1
         elif calc_age < 0:
-            logger.warning(
+            logger.bind(error_code="invalid_value").warning(
                 f"Patient {patient_id}: calculated age is negative ({calc_age}). "
                 f"Please check this manually. Using error value instead."
             )
@@ -701,7 +701,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
             )
             ages_negative += 1
         else:
-            logger.warning(
+            logger.bind(error_code="invalid_value").warning(
                 f"Patient {patient_id}: age {excel_age} is different "
                 f"from calculated age {calc_age}. "
                 f"Using calculated age instead of original age."
@@ -831,7 +831,7 @@ def _validate_dates(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.Dat
             original_date = row.get(col)
             tracker_year = row.get("tracker_year")
 
-            logger.warning(
+            logger.bind(error_code="invalid_value").warning(
                 f"Patient {patient_id}: {col} = {original_date} "
                 f"is beyond tracker year {tracker_year}. "
                 f"Replacing with error date."
diff --git a/a4d-python/src/a4d/clean/transformers.py b/a4d-python/src/a4d/clean/transformers.py
@@ -324,7 +324,7 @@ def fix_value(value: str | None) -> str | None:
 
     # Log warning if any ranges were found
     if has_ranges:
-        logger.warning("Found ranges in testing_frequency column. Replacing with mean values.")
+        logger.bind(error_code="invalid_value").warning("Found ranges in testing_frequency column. Replacing with mean values.")
 
     return df
 
@@ -367,7 +367,7 @@ def split_bp_in_sys_and_dias(df: pl.DataFrame) -> pl.DataFrame:
     has_errors = df.filter(pl.col("blood_pressure_mmhg") == error_pattern).height > 0
 
     if has_errors:
-        logger.warning(
+        logger.bind(error_code="invalid_value").warning(
             "Found invalid values for column blood_pressure_mmhg "
             f"that do not follow the format X/Y. "
             f"Values were replaced with {error_val_int}."
diff --git a/a4d-python/src/a4d/extract/patient.py b/a4d-python/src/a4d/extract/patient.py
@@ -510,7 +510,7 @@ def extract_patient_data(
     if not valid_cols:
         if close_wb:
             workbook.close()
-        logger.warning(f"No valid headers found in sheet '{sheet_name}'")
+        logger.bind(error_code="invalid_tracker").warning(f"No valid headers found in sheet '{sheet_name}'")
         return pl.DataFrame()
 
     data = read_patient_rows(ws, data_start_row, len(headers))
@@ -689,21 +689,21 @@ def read_all_patient_sheets(
         df_sheet = extract_patient_data(tracker_file, sheet_name, year, mapper=mapper, workbook=wb)
 
         if df_sheet.is_empty():
-            logger.warning(f"Sheet '{sheet_name}' has no data, skipping")
+            logger.bind(error_code="invalid_tracker").warning(f"Sheet '{sheet_name}' has no data, skipping")
             continue
 
         df_sheet = harmonize_patient_data_columns(df_sheet, mapper=mapper, strict=False)
 
         if "patient_id" not in df_sheet.columns:
-            logger.warning(
+            logger.bind(error_code="invalid_tracker").warning(
                 f"Sheet '{sheet_name}' has no 'patient_id' column after harmonization, skipping"
             )
             continue
 
         try:
             month_num = extract_tracker_month(sheet_name)
         except ValueError as e:
-            logger.warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
+            logger.bind(error_code="invalid_tracker").warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
             continue
 
         # Derived metadata (year, month) use Int64; text metadata (sheet_name, etc.) use String
@@ -735,7 +735,7 @@ def read_all_patient_sheets(
     missing_count = len(missing_patient_id_rows)
 
     if missing_count > 0:
-        logger.error(
+        logger.bind(error_code="invalid_value").error(
             f"Found {missing_count} rows with missing patient_id in {tracker_file.name} - "
             f"these rows will be excluded from processing"
         )
@@ -837,13 +837,13 @@ def read_all_patient_sheets(
                     )
                     logger.info(f"Joined {len(patient_list)} Patient List records")
                 else:
-                    logger.warning(
+                    logger.bind(error_code="invalid_tracker").warning(
                         "Patient List sheet has no 'patient_id' column after harmonization"
                     )
             else:
-                logger.warning("Patient List sheet is empty")
+                logger.bind(error_code="invalid_tracker").warning("Patient List sheet is empty")
         except Exception as e:
-            logger.warning(f"Could not process Patient List sheet: {e}")
+            logger.bind(error_code="invalid_tracker").warning(f"Could not process Patient List sheet: {e}")
 
     # Process Annual sheet if it exists (R: lines 132-160)
     if "Annual" in all_sheets:
@@ -884,11 +884,11 @@ def read_all_patient_sheets(
                     )
                     logger.info(f"Joined {len(annual_data)} Annual records")
                 else:
-                    logger.warning("Annual sheet has no 'patient_id' column after harmonization")
+                    logger.bind(error_code="invalid_tracker").warning("Annual sheet has no 'patient_id' column after harmonization")
             else:
-                logger.warning("Annual sheet is empty")
+                logger.bind(error_code="invalid_tracker").warning("Annual sheet is empty")
         except Exception as e:
-            logger.warning(f"Could not process Annual sheet: {e}")
+            logger.bind(error_code="invalid_tracker").warning(f"Could not process Annual sheet: {e}")
 
     # Close workbook after all processing
     wb.close()
diff --git a/a4d-python/src/a4d/logging.py b/a4d-python/src/a4d/logging.py
@@ -165,7 +165,7 @@ def file_logger(
             yield
         except Exception:
             # Log exception with full traceback
-            logger.exception("Processing failed", error_code="critical_abort")
+            logger.bind(error_code="critical_abort").exception("Processing failed")
             raise
         finally:
             # Remove the handler
diff --git a/a4d-python/src/a4d/pipeline/tracker.py b/a4d-python/src/a4d/pipeline/tracker.py
@@ -102,7 +102,7 @@ def process_tracker_patient(
         )
 
     except Exception as e:
-        logger.exception(f"Failed to process tracker: {tracker_file.name}")
+        logger.bind(error_code="critical_abort").exception(f"Failed to process tracker: {tracker_file.name}")
         return TrackerResult(
             tracker_file=tracker_file,
             tracker_name=tracker_name,
diff --git a/a4d-python/src/a4d/reference/synonyms.py b/a4d-python/src/a4d/reference/synonyms.py
@@ -117,7 +117,7 @@ def _build_lookup(self) -> dict[str, str]:
                 sanitized_key = sanitize_str(synonym)
 
                 if sanitized_key in lookup:
-                    logger.warning(
+                    logger.bind(error_code="invalid_tracker").warning(
                         f"Duplicate sanitized synonym '{sanitized_key}' "
                         f"(from '{synonym}') found for both "
                         f"'{lookup[sanitized_key]}' and '{standard_name}'. "
@@ -209,7 +209,7 @@ def rename_columns(
                     "These columns do not appear in the synonym file."
                 )
             else:
-                logger.warning(
+                logger.bind(error_code="missing_column").warning(
                     f"Keeping {len(unmapped_columns)} unmapped columns as-is: {unmapped_columns}"
                 )
 
@@ -221,7 +221,7 @@ def rename_columns(
 
         if any(count > 1 for count in target_counts.values()):
             duplicates = {t: c for t, c in target_counts.items() if c > 1}
-            logger.warning(
+            logger.bind(error_code="invalid_tracker").warning(
                 f"Multiple source columns map to same target name: {duplicates}. "
                 "Keeping first occurrence only. "
                 "This is an edge case from discontinued 2023 format."
diff --git a/a4d-python/src/a4d/tables/logs.py b/a4d-python/src/a4d/tables/logs.py
@@ -60,11 +60,12 @@ def parse_log_file(log_file: Path) -> pl.DataFrame:
                     line = record_data.get("line", 0)
                     module = record_data.get("module", "")
 
-                    # Extract context fields (file_name, tracker_year, tracker_month)
+                    # Extract context fields (file_name, tracker_year, tracker_month, error_code)
                     extra = record_data.get("extra", {})
                     file_name = extra.get("file_name")
                     tracker_year = extra.get("tracker_year")
                     tracker_month = extra.get("tracker_month")
+                    error_code = extra.get("error_code")
 
                     # Extract process info (useful for debugging parallel processing)
                     process_data = record_data.get("process", {})
@@ -86,6 +87,7 @@ def parse_log_file(log_file: Path) -> pl.DataFrame:
                             "timestamp": timestamp,
                             "level": level,
                             "message": message,
+                            "error_code": error_code,
                             "log_file": log_file.name,
                             "file_name": file_name,
                             "tracker_year": tracker_year,
@@ -169,6 +171,7 @@ def create_table_logs(logs_dir: Path, output_dir: Path) -> Path:
                 "timestamp": pl.Datetime,
                 "level": pl.Categorical,
                 "message": pl.Utf8,
+                "error_code": pl.Utf8,
                 "log_file": pl.Categorical,
                 "file_name": pl.Utf8,
                 "tracker_year": pl.Int32,

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ def process_tracker_patient(`
`102`	`102`	`)`
`103`	`103`
`104`	`104`	`except Exception as e:`
`105`		`- logger.exception(f"Failed to process tracker: {tracker_file.name}")`
	`105`	`+ logger.bind(error_code="critical_abort").exception(f"Failed to process tracker: {tracker_file.name}")`
`106`	`106`	`return TrackerResult(`
`107`	`107`	`tracker_file=tracker_file,`
`108`	`108`	`tracker_name=tracker_name,`