Skip to content

Commit 7e345dd

Browse files
Michael Aydinbas authored and committed
Add error_code classification to pipeline logs
Adds a structured error_code field to all loguru log records, matching the R pipeline's errorCode/warningCode concept for dashboard filtering.
- tables/logs.py: extract error_code from the loguru extra dict and add it to the schema
- logging.py: fix silent bug (logger.exception kwarg silently discarded); use the logger.bind(error_code=...) pattern throughout
- 22 call sites across tracker.py, patient.py, transformers.py, date_parser.py, synonyms.py, extract/patient.py with codes: critical_abort, invalid_value, missing_value, missing_column, invalid_tracker
1 parent c18fb70 commit 7e345dd

8 files changed

Lines changed: 27 additions & 24 deletions

File tree

a4d-python/src/a4d/clean/date_parser.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ def parse_date_flexible(date_str: str | None, error_val: str = "9999-09-09") ->
116116
return result
117117
except (ValueError, date_parser.ParserError) as e:
118118
# If parsing fails, log warning and return error date
119-
logger.warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
119+
logger.bind(error_code="invalid_value").warning(f"Could not parse date '{date_str}': {e}. Returning error value {error_val}")
120120
try:
121121
return datetime.strptime(error_val, "%Y-%m-%d").date()
122122
except ValueError:

a4d-python/src/a4d/clean/patient.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -671,7 +671,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
671671
calc_age = row["_calc_age"]
672672

673673
if excel_age is None or (excel_age == settings.error_val_numeric):
674-
logger.warning(
674+
logger.bind(error_code="missing_value").warning(
675675
f"Patient {patient_id}: age is missing. "
676676
f"Using calculated age {calc_age} instead of original age."
677677
)
@@ -686,7 +686,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
686686
)
687687
ages_missing += 1
688688
elif calc_age < 0:
689-
logger.warning(
689+
logger.bind(error_code="invalid_value").warning(
690690
f"Patient {patient_id}: calculated age is negative ({calc_age}). "
691691
f"Please check this manually. Using error value instead."
692692
)
@@ -701,7 +701,7 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
701701
)
702702
ages_negative += 1
703703
else:
704-
logger.warning(
704+
logger.bind(error_code="invalid_value").warning(
705705
f"Patient {patient_id}: age {excel_age} is different "
706706
f"from calculated age {calc_age}. "
707707
f"Using calculated age instead of original age."
@@ -831,7 +831,7 @@ def _validate_dates(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.Dat
831831
original_date = row.get(col)
832832
tracker_year = row.get("tracker_year")
833833

834-
logger.warning(
834+
logger.bind(error_code="invalid_value").warning(
835835
f"Patient {patient_id}: {col} = {original_date} "
836836
f"is beyond tracker year {tracker_year}. "
837837
f"Replacing with error date."

a4d-python/src/a4d/clean/transformers.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -324,7 +324,7 @@ def fix_value(value: str | None) -> str | None:
324324

325325
# Log warning if any ranges were found
326326
if has_ranges:
327-
logger.warning("Found ranges in testing_frequency column. Replacing with mean values.")
327+
logger.bind(error_code="invalid_value").warning("Found ranges in testing_frequency column. Replacing with mean values.")
328328

329329
return df
330330

@@ -367,7 +367,7 @@ def split_bp_in_sys_and_dias(df: pl.DataFrame) -> pl.DataFrame:
367367
has_errors = df.filter(pl.col("blood_pressure_mmhg") == error_pattern).height > 0
368368

369369
if has_errors:
370-
logger.warning(
370+
logger.bind(error_code="invalid_value").warning(
371371
"Found invalid values for column blood_pressure_mmhg "
372372
f"that do not follow the format X/Y. "
373373
f"Values were replaced with {error_val_int}."

a4d-python/src/a4d/extract/patient.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -510,7 +510,7 @@ def extract_patient_data(
510510
if not valid_cols:
511511
if close_wb:
512512
workbook.close()
513-
logger.warning(f"No valid headers found in sheet '{sheet_name}'")
513+
logger.bind(error_code="invalid_tracker").warning(f"No valid headers found in sheet '{sheet_name}'")
514514
return pl.DataFrame()
515515

516516
data = read_patient_rows(ws, data_start_row, len(headers))
@@ -689,21 +689,21 @@ def read_all_patient_sheets(
689689
df_sheet = extract_patient_data(tracker_file, sheet_name, year, mapper=mapper, workbook=wb)
690690

691691
if df_sheet.is_empty():
692-
logger.warning(f"Sheet '{sheet_name}' has no data, skipping")
692+
logger.bind(error_code="invalid_tracker").warning(f"Sheet '{sheet_name}' has no data, skipping")
693693
continue
694694

695695
df_sheet = harmonize_patient_data_columns(df_sheet, mapper=mapper, strict=False)
696696

697697
if "patient_id" not in df_sheet.columns:
698-
logger.warning(
698+
logger.bind(error_code="invalid_tracker").warning(
699699
f"Sheet '{sheet_name}' has no 'patient_id' column after harmonization, skipping"
700700
)
701701
continue
702702

703703
try:
704704
month_num = extract_tracker_month(sheet_name)
705705
except ValueError as e:
706-
logger.warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
706+
logger.bind(error_code="invalid_tracker").warning(f"Could not extract month from '{sheet_name}': {e}, skipping")
707707
continue
708708

709709
# Derived metadata (year, month) use Int64; text metadata (sheet_name, etc.) use String
@@ -735,7 +735,7 @@ def read_all_patient_sheets(
735735
missing_count = len(missing_patient_id_rows)
736736

737737
if missing_count > 0:
738-
logger.error(
738+
logger.bind(error_code="invalid_value").error(
739739
f"Found {missing_count} rows with missing patient_id in {tracker_file.name} - "
740740
f"these rows will be excluded from processing"
741741
)
@@ -837,13 +837,13 @@ def read_all_patient_sheets(
837837
)
838838
logger.info(f"Joined {len(patient_list)} Patient List records")
839839
else:
840-
logger.warning(
840+
logger.bind(error_code="invalid_tracker").warning(
841841
"Patient List sheet has no 'patient_id' column after harmonization"
842842
)
843843
else:
844-
logger.warning("Patient List sheet is empty")
844+
logger.bind(error_code="invalid_tracker").warning("Patient List sheet is empty")
845845
except Exception as e:
846-
logger.warning(f"Could not process Patient List sheet: {e}")
846+
logger.bind(error_code="invalid_tracker").warning(f"Could not process Patient List sheet: {e}")
847847

848848
# Process Annual sheet if it exists (R: lines 132-160)
849849
if "Annual" in all_sheets:
@@ -884,11 +884,11 @@ def read_all_patient_sheets(
884884
)
885885
logger.info(f"Joined {len(annual_data)} Annual records")
886886
else:
887-
logger.warning("Annual sheet has no 'patient_id' column after harmonization")
887+
logger.bind(error_code="invalid_tracker").warning("Annual sheet has no 'patient_id' column after harmonization")
888888
else:
889-
logger.warning("Annual sheet is empty")
889+
logger.bind(error_code="invalid_tracker").warning("Annual sheet is empty")
890890
except Exception as e:
891-
logger.warning(f"Could not process Annual sheet: {e}")
891+
logger.bind(error_code="invalid_tracker").warning(f"Could not process Annual sheet: {e}")
892892

893893
# Close workbook after all processing
894894
wb.close()

a4d-python/src/a4d/logging.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,7 @@ def file_logger(
165165
yield
166166
except Exception:
167167
# Log exception with full traceback
168-
logger.exception("Processing failed", error_code="critical_abort")
168+
logger.bind(error_code="critical_abort").exception("Processing failed")
169169
raise
170170
finally:
171171
# Remove the handler

a4d-python/src/a4d/pipeline/tracker.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ def process_tracker_patient(
102102
)
103103

104104
except Exception as e:
105-
logger.exception(f"Failed to process tracker: {tracker_file.name}")
105+
logger.bind(error_code="critical_abort").exception(f"Failed to process tracker: {tracker_file.name}")
106106
return TrackerResult(
107107
tracker_file=tracker_file,
108108
tracker_name=tracker_name,

a4d-python/src/a4d/reference/synonyms.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,7 @@ def _build_lookup(self) -> dict[str, str]:
117117
sanitized_key = sanitize_str(synonym)
118118

119119
if sanitized_key in lookup:
120-
logger.warning(
120+
logger.bind(error_code="invalid_tracker").warning(
121121
f"Duplicate sanitized synonym '{sanitized_key}' "
122122
f"(from '{synonym}') found for both "
123123
f"'{lookup[sanitized_key]}' and '{standard_name}'. "
@@ -209,7 +209,7 @@ def rename_columns(
209209
"These columns do not appear in the synonym file."
210210
)
211211
else:
212-
logger.warning(
212+
logger.bind(error_code="missing_column").warning(
213213
f"Keeping {len(unmapped_columns)} unmapped columns as-is: {unmapped_columns}"
214214
)
215215

@@ -221,7 +221,7 @@ def rename_columns(
221221

222222
if any(count > 1 for count in target_counts.values()):
223223
duplicates = {t: c for t, c in target_counts.items() if c > 1}
224-
logger.warning(
224+
logger.bind(error_code="invalid_tracker").warning(
225225
f"Multiple source columns map to same target name: {duplicates}. "
226226
"Keeping first occurrence only. "
227227
"This is an edge case from discontinued 2023 format."

a4d-python/src/a4d/tables/logs.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,11 +60,12 @@ def parse_log_file(log_file: Path) -> pl.DataFrame:
6060
line = record_data.get("line", 0)
6161
module = record_data.get("module", "")
6262

63-
# Extract context fields (file_name, tracker_year, tracker_month)
63+
# Extract context fields (file_name, tracker_year, tracker_month, error_code)
6464
extra = record_data.get("extra", {})
6565
file_name = extra.get("file_name")
6666
tracker_year = extra.get("tracker_year")
6767
tracker_month = extra.get("tracker_month")
68+
error_code = extra.get("error_code")
6869

6970
# Extract process info (useful for debugging parallel processing)
7071
process_data = record_data.get("process", {})
@@ -86,6 +87,7 @@ def parse_log_file(log_file: Path) -> pl.DataFrame:
8687
"timestamp": timestamp,
8788
"level": level,
8889
"message": message,
90+
"error_code": error_code,
8991
"log_file": log_file.name,
9092
"file_name": file_name,
9193
"tracker_year": tracker_year,
@@ -169,6 +171,7 @@ def create_table_logs(logs_dir: Path, output_dir: Path) -> Path:
169171
"timestamp": pl.Datetime,
170172
"level": pl.Categorical,
171173
"message": pl.Utf8,
174+
"error_code": pl.Utf8,
172175
"log_file": pl.Categorical,
173176
"file_name": pl.Utf8,
174177
"tracker_year": pl.Int32,

0 commit comments

Comments
 (0)