fix: calculate age/t1d_diagnosis_age from DOB, handle Excel errors

Michael Aydinbas · claude · Michael Aydinbas · commit 1a8c7b85ebee · 2025-12-28T22:40:25.000+01:00
- Add _fix_t1d_diagnosis_age() to calculate t1d_diagnosis_age from dob and diagnosis date
- Update _fix_age_from_dob() to skip error dates (9999-09-09)
- Clean Excel errors (#NUM! etc.) from Patient List and Annual sheets
- Normalize missing-value strings (N/A, -, etc.) to null before conversion
- Add tests for age calculation functions
- Fix integration tests: blood pressure column names, critical columns check
- Add R validation exceptions for North Okkalapa tracker

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/a4d-python/src/a4d/clean/converters.py b/a4d-python/src/a4d/clean/converters.py
@@ -71,6 +71,22 @@ def safe_convert_column(
     if column not in df.columns:
         return df
 
+    # Normalize empty/whitespace/missing-value strings to null BEFORE conversion
+    # This ensures missing data stays null rather than becoming error values
+    # Matches R behavior where these values → NA (not conversion error)
+    if df[column].dtype in (pl.Utf8, pl.String):
+        # Common missing value representations to treat as null
+        missing_values = ["", "N/A", "NA", "n/a", "na", "-", ".", "None", "none", "NULL", "null"]
+        df = df.with_columns(
+            pl.when(
+                pl.col(column).str.strip_chars().is_in(missing_values)
+                | (pl.col(column).str.strip_chars().str.len_chars() == 0)
+            )
+            .then(None)
+            .otherwise(pl.col(column))
+            .alias(column)
+        )
+
     # Store original values for error reporting
     df = df.with_columns(pl.col(column).alias(f"_orig_{column}"))
 
diff --git a/a4d-python/src/a4d/clean/patient.py b/a4d-python/src/a4d/clean/patient.py
@@ -84,6 +84,10 @@ def clean_patient_data(
     # Must happen before range validation so validated age is correct
     df = _fix_age_from_dob(df, error_collector)
 
+    # Step 5.5b: Calculate t1d_diagnosis_age from dob and t1d_diagnosis_date
+    # Replaces any existing value (including Excel errors like #NUM!)
+    df = _fix_t1d_diagnosis_age(df)
+
     # Step 5.6: Validate dates (replace future dates with error value)
     # Must happen after type conversions so dates are proper date types
     df = _validate_dates(df, error_collector)
@@ -634,11 +638,16 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
 
     logger.info("Fixing age values from DOB (matching R pipeline logic)")
 
+    error_date = pl.lit(settings.error_val_date).str.to_date()
+
+    # Only calculate if dob is valid (not null, not error date)
+    valid_dob = pl.col("dob").is_not_null() & (pl.col("dob") != error_date)
+
     # Calculate age from DOB
     # calc_age = tracker_year - year(dob)
     # if tracker_month < month(dob): calc_age -= 1
     df = df.with_columns(
-        pl.when(pl.col("dob").is_not_null())
+        pl.when(valid_dob)
         .then(
             pl.col("tracker_year")
             - pl.col("dob").dt.year()
@@ -734,6 +743,49 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
     return df
 
 
+def _fix_t1d_diagnosis_age(df: pl.DataFrame) -> pl.DataFrame:
+    """Calculate t1d_diagnosis_age from dob and t1d_diagnosis_date.
+
+    If both dates are valid (not null, not error date), calculates age at diagnosis.
+    If either date is missing or is error date, result is null.
+
+    Args:
+        df: DataFrame with dob, t1d_diagnosis_date, t1d_diagnosis_age columns
+
+    Returns:
+        DataFrame with calculated t1d_diagnosis_age
+    """
+    required_cols = ["dob", "t1d_diagnosis_date", "t1d_diagnosis_age"]
+    if not all(col in df.columns for col in required_cols):
+        return df
+
+    error_date = pl.lit(settings.error_val_date).str.to_date()
+
+    # Only calculate if both dates are valid (not null, not error date)
+    valid_dob = pl.col("dob").is_not_null() & (pl.col("dob") != error_date)
+    valid_diagnosis = pl.col("t1d_diagnosis_date").is_not_null() & (
+        pl.col("t1d_diagnosis_date") != error_date
+    )
+
+    # Calculate age at diagnosis: year(diagnosis_date) - year(dob)
+    # Adjust if birthday hasn't occurred yet in diagnosis year
+    df = df.with_columns(
+        pl.when(valid_dob & valid_diagnosis)
+        .then(
+            pl.col("t1d_diagnosis_date").dt.year()
+            - pl.col("dob").dt.year()
+            - pl.when(pl.col("t1d_diagnosis_date").dt.month() < pl.col("dob").dt.month())
+            .then(1)
+            .otherwise(0)
+        )
+        .otherwise(None)
+        .cast(pl.Int32)
+        .alias("t1d_diagnosis_age")
+    )
+
+    return df
+
+
 def _validate_dates(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.DataFrame:
     """Validate date columns and replace future dates with error value.
 
diff --git a/a4d-python/src/a4d/extract/patient.py b/a4d-python/src/a4d/extract/patient.py
@@ -799,6 +799,7 @@ def read_all_patient_sheets(
                 tracker_file, "Patient List", year, mapper=mapper, workbook=wb
             )
             if not patient_list.is_empty():
+                patient_list = clean_excel_errors(patient_list)
                 patient_list = harmonize_patient_data_columns(
                     patient_list, mapper=mapper, strict=False
                 )
@@ -851,6 +852,7 @@ def read_all_patient_sheets(
                 tracker_file, "Annual", year, mapper=mapper, workbook=wb
             )
             if not annual_data.is_empty():
+                annual_data = clean_excel_errors(annual_data)
                 annual_data = harmonize_patient_data_columns(
                     annual_data, mapper=mapper, strict=False
                 )
diff --git a/a4d-python/tests/test_clean/test_patient.py b/a4d-python/tests/test_clean/test_patient.py
@@ -1,8 +1,16 @@
 """Unit tests for patient cleaning functions."""
 
+from datetime import date
+
 import polars as pl
 
-from a4d.clean.patient import _apply_preprocessing
+from a4d.clean.patient import (
+    _apply_preprocessing,
+    _fix_age_from_dob,
+    _fix_t1d_diagnosis_age,
+)
+from a4d.config import settings
+from a4d.errors import ErrorCollector
 
 
 class TestPatientIdNormalization:
@@ -201,3 +209,210 @@ def test_preserve_hyphen_in_other_columns(self):
         # These columns are not in the insulin list, so '-' is preserved
         assert result["clinic_visit"][0] == "-"
         assert result["active"][0] == "-"
+
+
+class TestFixAgeFromDob:
+    """Tests for age calculation from DOB."""
+
+    def test_calculates_age_from_dob(self):
+        """Should calculate age from DOB and tracker date."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "age": [None],
+                "dob": [date(2010, 6, 15)],
+                "tracker_year": [2025],
+                "tracker_month": [1],
+            }
+        )
+        collector = ErrorCollector()
+
+        result = _fix_age_from_dob(df, collector)
+
+        # 2025 - 2010 = 15, but Jan < June so 15 - 1 = 14
+        assert result["age"][0] == 14
+
+    def test_birthday_already_passed(self):
+        """Should not subtract 1 if birthday already passed in tracker year."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "age": [None],
+                "dob": [date(2010, 3, 15)],
+                "tracker_year": [2025],
+                "tracker_month": [6],
+            }
+        )
+        collector = ErrorCollector()
+
+        result = _fix_age_from_dob(df, collector)
+
+        # 2025 - 2010 = 15, June > March so no adjustment
+        assert result["age"][0] == 15
+
+    def test_missing_dob_keeps_null(self):
+        """Should keep null age if DOB is missing."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "age": [None],
+                "dob": pl.Series([None], dtype=pl.Date),
+                "tracker_year": [2025],
+                "tracker_month": [1],
+            }
+        )
+        collector = ErrorCollector()
+
+        result = _fix_age_from_dob(df, collector)
+
+        assert result["age"][0] is None
+
+    def test_error_date_dob_keeps_null(self):
+        """Should keep null age if DOB is error date."""
+        error_date = date.fromisoformat(settings.error_val_date)
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "age": [None],
+                "dob": [error_date],
+                "tracker_year": [2025],
+                "tracker_month": [1],
+            }
+        )
+        collector = ErrorCollector()
+
+        result = _fix_age_from_dob(df, collector)
+
+        assert result["age"][0] is None
+
+    def test_corrects_wrong_excel_age(self):
+        """Should replace wrong Excel age with calculated age."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "age": [99.0],  # Wrong value from Excel
+                "dob": [date(2010, 6, 15)],
+                "tracker_year": [2025],
+                "tracker_month": [8],
+            }
+        )
+        collector = ErrorCollector()
+
+        result = _fix_age_from_dob(df, collector)
+
+        # Should be corrected to 15
+        assert result["age"][0] == 15
+
+
+class TestFixT1dDiagnosisAge:
+    """Tests for t1d_diagnosis_age calculation from DOB and diagnosis date."""
+
+    def test_calculates_diagnosis_age(self):
+        """Should calculate age at diagnosis from DOB and diagnosis date."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [date(2005, 8, 20)],
+                "t1d_diagnosis_date": [date(2020, 3, 15)],
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        # 2020 - 2005 = 15, but March < August so 15 - 1 = 14
+        assert result["t1d_diagnosis_age"][0] == 14
+
+    def test_birthday_passed_before_diagnosis(self):
+        """Should not subtract 1 if birthday passed before diagnosis."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [date(2005, 3, 20)],
+                "t1d_diagnosis_date": [date(2020, 8, 15)],
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        # 2020 - 2005 = 15, August > March so no adjustment
+        assert result["t1d_diagnosis_age"][0] == 15
+
+    def test_missing_dob_returns_null(self):
+        """Should return null if DOB is missing."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": pl.Series([None], dtype=pl.Date),
+                "t1d_diagnosis_date": [date(2020, 3, 15)],
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        assert result["t1d_diagnosis_age"][0] is None
+
+    def test_missing_diagnosis_date_returns_null(self):
+        """Should return null if diagnosis date is missing."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [date(2005, 8, 20)],
+                "t1d_diagnosis_date": pl.Series([None], dtype=pl.Date),
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        assert result["t1d_diagnosis_age"][0] is None
+
+    def test_error_date_dob_returns_null(self):
+        """Should return null if DOB is error date."""
+        error_date = date.fromisoformat(settings.error_val_date)
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [error_date],
+                "t1d_diagnosis_date": [date(2020, 3, 15)],
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        assert result["t1d_diagnosis_age"][0] is None
+
+    def test_error_date_diagnosis_returns_null(self):
+        """Should return null if diagnosis date is error date."""
+        error_date = date.fromisoformat(settings.error_val_date)
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [date(2005, 8, 20)],
+                "t1d_diagnosis_date": [error_date],
+                "t1d_diagnosis_age": [None],
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        assert result["t1d_diagnosis_age"][0] is None
+
+    def test_replaces_excel_error_value(self):
+        """Should replace Excel error (#NUM!) that became 999999 with calculated value."""
+        df = pl.DataFrame(
+            {
+                "patient_id": ["P001"],
+                "dob": [date(2005, 8, 20)],
+                "t1d_diagnosis_date": [date(2020, 3, 15)],
+                "t1d_diagnosis_age": [999999],  # Error value from Excel
+            }
+        )
+
+        result = _fix_t1d_diagnosis_age(df)
+
+        # Should be calculated as 14
+        assert result["t1d_diagnosis_age"][0] == 14
diff --git a/a4d-python/tests/test_integration/test_clean_integration.py b/a4d-python/tests/test_integration/test_clean_integration.py
@@ -52,8 +52,8 @@ def test_clean_creates_derived_columns(self, tracker_2024_penang):
         # Check derived columns exist
         assert "insulin_type" in df_clean.columns
         assert "insulin_subtype" in df_clean.columns
-        assert "systolic_bp" in df_clean.columns
-        assert "diastolic_bp" in df_clean.columns
+        assert "blood_pressure_sys_mmhg" in df_clean.columns
+        assert "blood_pressure_dias_mmhg" in df_clean.columns
 
     def test_clean_tracks_errors(self, tracker_2024_penang):
         """Should track data quality errors in ErrorCollector."""
diff --git a/a4d-python/tests/test_integration/test_e2e.py b/a4d-python/tests/test_integration/test_e2e.py
@@ -86,22 +86,31 @@ def test_e2e_full_pipeline(self, tracker_2024_penang):
         # Validate clinic_id
         assert df_clean["clinic_id"].unique().to_list() == ["PNG"]
 
-    def test_e2e_key_columns_populated(self, tracker_2024_penang):
-        """Validate that key columns have data after pipeline."""
+    def test_e2e_critical_columns_populated(self, tracker_2024_penang):
+        """Validate that critical columns are fully populated after pipeline."""
         skip_if_missing(tracker_2024_penang)
 
-        # Full pipeline
         df_raw = read_all_patient_sheets(tracker_2024_penang)
         collector = ErrorCollector()
         df_clean = clean_patient_data(df_raw, collector)
 
-        # Check that insulin_type has some non-null values
-        insulin_type_count = df_clean["insulin_type"].is_not_null().sum()
-        assert insulin_type_count > 0, "insulin_type should have some values"
-
-        # Check that insulin_total_units has some non-null values
-        insulin_total_count = df_clean["insulin_total_units"].is_not_null().sum()
-        assert insulin_total_count > 0, "insulin_total_units should have some values"
+        # These columns must be 100% populated for every row
+        required_full = [
+            "patient_id",
+            "status",
+            "clinic_id",
+            "tracker_year",
+            "tracker_month",
+        ]
+        for col in required_full:
+            null_count = df_clean[col].is_null().sum()
+            assert null_count == 0, f"{col} has {null_count} null values, expected 0"
+
+        # These columns should have high population (allow some nulls)
+        required_partial = ["age", "last_clinic_visit_date"]
+        for col in required_partial:
+            non_null = df_clean[col].is_not_null().sum()
+            assert non_null > len(df_clean) * 0.9, f"{col} has <90% population"
 
 
 class TestE2ECrosYearConsistency:
diff --git a/a4d-python/tests/test_integration/test_r_validation.py b/a4d-python/tests/test_integration/test_r_validation.py

Original file line number	Diff line number	Diff line change
`@@ -799,6 +799,7 @@ def read_all_patient_sheets(`
`799`	`799`	`tracker_file, "Patient List", year, mapper=mapper, workbook=wb`
`800`	`800`	`)`
`801`	`801`	`if not patient_list.is_empty():`
	`802`	`+ patient_list = clean_excel_errors(patient_list)`
`802`	`803`	`patient_list = harmonize_patient_data_columns(`
`803`	`804`	`patient_list, mapper=mapper, strict=False`
`804`	`805`	`)`
`@@ -851,6 +852,7 @@ def read_all_patient_sheets(`
`851`	`852`	`tracker_file, "Annual", year, mapper=mapper, workbook=wb`
`852`	`853`	`)`
`853`	`854`	`if not annual_data.is_empty():`
	`855`	`+ annual_data = clean_excel_errors(annual_data)`
`854`	`856`	`annual_data = harmonize_patient_data_columns(`
`855`	`857`	`annual_data, mapper=mapper, strict=False`
`856`	`858`	`)`