Skip to content

Commit 1a8c7b8

Browse files
Michael Aydinbas and claude committed
fix: calculate age/t1d_diagnosis_age from DOB, handle Excel errors
- Add _fix_t1d_diagnosis_age() to calculate from dob and diagnosis date
- Update _fix_age_from_dob() to skip error dates (9999-09-09)
- Clean Excel errors (#NUM! etc) from Patient List and Annual sheets
- Normalize missing-value strings (N/A, -, etc) to null before conversion
- Add tests for age calculation functions
- Fix integration tests: blood pressure column names, critical columns check
- Add R validation exceptions for North Okkalapa tracker

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 7018884 commit 1a8c7b8

7 files changed

Lines changed: 312 additions & 14 deletions

File tree

a4d-python/src/a4d/clean/converters.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,22 @@ def safe_convert_column(
7171
if column not in df.columns:
7272
return df
7373

74+
# Normalize empty/whitespace/missing-value strings to null BEFORE conversion
75+
# This ensures missing data stays null rather than becoming error values
76+
# Matches R behavior where these values → NA (not conversion error)
77+
if df[column].dtype in (pl.Utf8, pl.String):
78+
# Common missing value representations to treat as null
79+
missing_values = ["", "N/A", "NA", "n/a", "na", "-", ".", "None", "none", "NULL", "null"]
80+
df = df.with_columns(
81+
pl.when(
82+
pl.col(column).str.strip_chars().is_in(missing_values)
83+
| (pl.col(column).str.strip_chars().str.len_chars() == 0)
84+
)
85+
.then(None)
86+
.otherwise(pl.col(column))
87+
.alias(column)
88+
)
89+
7490
# Store original values for error reporting
7591
df = df.with_columns(pl.col(column).alias(f"_orig_{column}"))
7692

a4d-python/src/a4d/clean/patient.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ def clean_patient_data(
8484
# Must happen before range validation so validated age is correct
8585
df = _fix_age_from_dob(df, error_collector)
8686

87+
# Step 5.5b: Calculate t1d_diagnosis_age from dob and t1d_diagnosis_date
88+
# Replaces any existing value (including Excel errors like #NUM!)
89+
df = _fix_t1d_diagnosis_age(df)
90+
8791
# Step 5.6: Validate dates (replace future dates with error value)
8892
# Must happen after type conversions so dates are proper date types
8993
df = _validate_dates(df, error_collector)
@@ -634,11 +638,16 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
634638

635639
logger.info("Fixing age values from DOB (matching R pipeline logic)")
636640

641+
error_date = pl.lit(settings.error_val_date).str.to_date()
642+
643+
# Only calculate if dob is valid (not null, not error date)
644+
valid_dob = pl.col("dob").is_not_null() & (pl.col("dob") != error_date)
645+
637646
# Calculate age from DOB
638647
# calc_age = tracker_year - year(dob)
639648
# if tracker_month < month(dob): calc_age -= 1
640649
df = df.with_columns(
641-
pl.when(pl.col("dob").is_not_null())
650+
pl.when(valid_dob)
642651
.then(
643652
pl.col("tracker_year")
644653
- pl.col("dob").dt.year()
@@ -734,6 +743,49 @@ def _fix_age_from_dob(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.D
734743
return df
735744

736745

746+
def _fix_t1d_diagnosis_age(df: pl.DataFrame) -> pl.DataFrame:
    """Calculate t1d_diagnosis_age from dob and t1d_diagnosis_date.

    Any existing value in ``t1d_diagnosis_age`` is overwritten. When both
    dates are valid (not null and not the configured error date) the result
    is the number of completed years between ``dob`` and
    ``t1d_diagnosis_date``. Otherwise the result is null.

    The "birthday not reached yet" adjustment compares month AND day, so a
    diagnosis in the birth month but before the birth day still counts as
    one year younger.

    Args:
        df: DataFrame with dob, t1d_diagnosis_date, t1d_diagnosis_age columns.

    Returns:
        DataFrame with t1d_diagnosis_age recalculated as Int32. The input is
        returned unchanged if any of the three columns is missing.
    """
    required_cols = ("dob", "t1d_diagnosis_date", "t1d_diagnosis_age")
    if not all(col in df.columns for col in required_cols):
        return df

    error_date = pl.lit(settings.error_val_date).str.to_date()
    dob = pl.col("dob")
    diagnosis = pl.col("t1d_diagnosis_date")

    # Only calculate if both dates are valid (not null, not error date)
    valid_dob = dob.is_not_null() & (dob != error_date)
    valid_diagnosis = diagnosis.is_not_null() & (diagnosis != error_date)

    # The birthday has not occurred yet in the diagnosis year when the
    # diagnosis month is earlier, or it is the same month but an earlier day.
    birthday_pending = (diagnosis.dt.month() < dob.dt.month()) | (
        (diagnosis.dt.month() == dob.dt.month())
        & (diagnosis.dt.day() < dob.dt.day())
    )

    # Age at diagnosis: year(diagnosis_date) - year(dob), minus one when the
    # birthday has not occurred yet in the diagnosis year.
    df = df.with_columns(
        pl.when(valid_dob & valid_diagnosis)
        .then(
            diagnosis.dt.year()
            - dob.dt.year()
            - pl.when(birthday_pending).then(1).otherwise(0)
        )
        .otherwise(None)
        .cast(pl.Int32)
        .alias("t1d_diagnosis_age")
    )

    return df
787+
788+
737789
def _validate_dates(df: pl.DataFrame, error_collector: ErrorCollector) -> pl.DataFrame:
738790
"""Validate date columns and replace future dates with error value.
739791

a4d-python/src/a4d/extract/patient.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,7 @@ def read_all_patient_sheets(
799799
tracker_file, "Patient List", year, mapper=mapper, workbook=wb
800800
)
801801
if not patient_list.is_empty():
802+
patient_list = clean_excel_errors(patient_list)
802803
patient_list = harmonize_patient_data_columns(
803804
patient_list, mapper=mapper, strict=False
804805
)
@@ -851,6 +852,7 @@ def read_all_patient_sheets(
851852
tracker_file, "Annual", year, mapper=mapper, workbook=wb
852853
)
853854
if not annual_data.is_empty():
855+
annual_data = clean_excel_errors(annual_data)
854856
annual_data = harmonize_patient_data_columns(
855857
annual_data, mapper=mapper, strict=False
856858
)

a4d-python/tests/test_clean/test_patient.py

Lines changed: 216 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
"""Unit tests for patient cleaning functions."""
22

3+
from datetime import date
4+
35
import polars as pl
46

5-
from a4d.clean.patient import _apply_preprocessing
7+
from a4d.clean.patient import (
8+
_apply_preprocessing,
9+
_fix_age_from_dob,
10+
_fix_t1d_diagnosis_age,
11+
)
12+
from a4d.config import settings
13+
from a4d.errors import ErrorCollector
614

715

816
class TestPatientIdNormalization:
@@ -201,3 +209,210 @@ def test_preserve_hyphen_in_other_columns(self):
201209
# These columns are not in the insulin list, so '-' is preserved
202210
assert result["clinic_visit"][0] == "-"
203211
assert result["active"][0] == "-"
212+
213+
214+
class TestFixAgeFromDob:
    """Tests for age calculation from DOB."""

    @staticmethod
    def _fixed_age(dob, tracker_month, age=None):
        """Run _fix_age_from_dob on a one-row 2025 tracker frame and return the age.

        ``dob`` is passed straight through as the column data, so it may be a
        one-element list or an explicitly typed ``pl.Series``.
        """
        frame = pl.DataFrame(
            {
                "patient_id": ["P001"],
                "age": [age],
                "dob": dob,
                "tracker_year": [2025],
                "tracker_month": [tracker_month],
            }
        )
        fixed = _fix_age_from_dob(frame, ErrorCollector())
        return fixed["age"][0]

    def test_calculates_age_from_dob(self):
        """Age is derived from DOB and the tracker year/month."""
        # 2025 - 2010 = 15, but Jan < June so 15 - 1 = 14
        assert self._fixed_age([date(2010, 6, 15)], tracker_month=1) == 14

    def test_birthday_already_passed(self):
        """No year is subtracted once the birthday month has passed."""
        # 2025 - 2010 = 15, June > March so no adjustment
        assert self._fixed_age([date(2010, 3, 15)], tracker_month=6) == 15

    def test_missing_dob_keeps_null(self):
        """A null DOB leaves the age null."""
        no_dob = pl.Series([None], dtype=pl.Date)
        assert self._fixed_age(no_dob, tracker_month=1) is None

    def test_error_date_dob_keeps_null(self):
        """A DOB equal to the error date leaves the age null."""
        sentinel = date.fromisoformat(settings.error_val_date)
        assert self._fixed_age([sentinel], tracker_month=1) is None

    def test_corrects_wrong_excel_age(self):
        """A wrong age coming from Excel is replaced by the calculated one."""
        # 99.0 is the wrong value from Excel; it should be corrected to 15
        assert self._fixed_age([date(2010, 6, 15)], tracker_month=8, age=99.0) == 15
305+
306+
307+
class TestFixT1dDiagnosisAge:
    """Tests for t1d_diagnosis_age calculation from DOB and diagnosis date."""

    @staticmethod
    def _diagnosis_age(dob, diagnosis_date, existing=None):
        """Run _fix_t1d_diagnosis_age on a one-row frame and return the result.

        ``dob`` and ``diagnosis_date`` are passed straight through as column
        data, so each may be a one-element list or a typed ``pl.Series``.
        ``existing`` is the value already sitting in t1d_diagnosis_age.
        """
        frame = pl.DataFrame(
            {
                "patient_id": ["P001"],
                "dob": dob,
                "t1d_diagnosis_date": diagnosis_date,
                "t1d_diagnosis_age": [existing],
            }
        )
        return _fix_t1d_diagnosis_age(frame)["t1d_diagnosis_age"][0]

    def test_calculates_diagnosis_age(self):
        """Age at diagnosis is derived from DOB and diagnosis date."""
        # 2020 - 2005 = 15, but March < August so 15 - 1 = 14
        assert self._diagnosis_age([date(2005, 8, 20)], [date(2020, 3, 15)]) == 14

    def test_birthday_passed_before_diagnosis(self):
        """No year is subtracted when the birthday precedes the diagnosis."""
        # 2020 - 2005 = 15, August > March so no adjustment
        assert self._diagnosis_age([date(2005, 3, 20)], [date(2020, 8, 15)]) == 15

    def test_missing_dob_returns_null(self):
        """A null DOB yields a null diagnosis age."""
        no_dob = pl.Series([None], dtype=pl.Date)
        assert self._diagnosis_age(no_dob, [date(2020, 3, 15)]) is None

    def test_missing_diagnosis_date_returns_null(self):
        """A null diagnosis date yields a null diagnosis age."""
        no_diagnosis = pl.Series([None], dtype=pl.Date)
        assert self._diagnosis_age([date(2005, 8, 20)], no_diagnosis) is None

    def test_error_date_dob_returns_null(self):
        """A DOB equal to the error date yields a null diagnosis age."""
        sentinel = date.fromisoformat(settings.error_val_date)
        assert self._diagnosis_age([sentinel], [date(2020, 3, 15)]) is None

    def test_error_date_diagnosis_returns_null(self):
        """A diagnosis date equal to the error date yields a null diagnosis age."""
        sentinel = date.fromisoformat(settings.error_val_date)
        assert self._diagnosis_age([date(2005, 8, 20)], [sentinel]) is None

    def test_replaces_excel_error_value(self):
        """An Excel error (#NUM!) that became 999999 is replaced by the calculated value."""
        # 999999 is the error value from Excel; it should be recalculated as 14
        recalculated = self._diagnosis_age(
            [date(2005, 8, 20)], [date(2020, 3, 15)], existing=999999
        )
        assert recalculated == 14

a4d-python/tests/test_integration/test_clean_integration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def test_clean_creates_derived_columns(self, tracker_2024_penang):
5252
# Check derived columns exist
5353
assert "insulin_type" in df_clean.columns
5454
assert "insulin_subtype" in df_clean.columns
55-
assert "systolic_bp" in df_clean.columns
56-
assert "diastolic_bp" in df_clean.columns
55+
assert "blood_pressure_sys_mmhg" in df_clean.columns
56+
assert "blood_pressure_dias_mmhg" in df_clean.columns
5757

5858
def test_clean_tracks_errors(self, tracker_2024_penang):
5959
"""Should track data quality errors in ErrorCollector."""

a4d-python/tests/test_integration/test_e2e.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -86,22 +86,31 @@ def test_e2e_full_pipeline(self, tracker_2024_penang):
8686
# Validate clinic_id
8787
assert df_clean["clinic_id"].unique().to_list() == ["PNG"]
8888

89-
def test_e2e_key_columns_populated(self, tracker_2024_penang):
90-
"""Validate that key columns have data after pipeline."""
89+
def test_e2e_critical_columns_populated(self, tracker_2024_penang):
90+
"""Validate that critical columns are fully populated after pipeline."""
9191
skip_if_missing(tracker_2024_penang)
9292

93-
# Full pipeline
9493
df_raw = read_all_patient_sheets(tracker_2024_penang)
9594
collector = ErrorCollector()
9695
df_clean = clean_patient_data(df_raw, collector)
9796

98-
# Check that insulin_type has some non-null values
99-
insulin_type_count = df_clean["insulin_type"].is_not_null().sum()
100-
assert insulin_type_count > 0, "insulin_type should have some values"
101-
102-
# Check that insulin_total_units has some non-null values
103-
insulin_total_count = df_clean["insulin_total_units"].is_not_null().sum()
104-
assert insulin_total_count > 0, "insulin_total_units should have some values"
97+
# These columns must be 100% populated for every row
98+
required_full = [
99+
"patient_id",
100+
"status",
101+
"clinic_id",
102+
"tracker_year",
103+
"tracker_month",
104+
]
105+
for col in required_full:
106+
null_count = df_clean[col].is_null().sum()
107+
assert null_count == 0, f"{col} has {null_count} null values, expected 0"
108+
109+
# These columns should have high population (allow some nulls)
110+
required_partial = ["age", "last_clinic_visit_date"]
111+
for col in required_partial:
112+
non_null = df_clean[col].is_not_null().sum()
113+
assert non_null > len(df_clean) * 0.9, f"{col} has <90% population"
105114

106115

107116
class TestE2ECrosYearConsistency:

0 commit comments

Comments
 (0)