Health-Informatics-UoN · AndyRae · Feb 21, 2026 · Feb 21, 2026 · Feb 21, 2026 · Feb 21, 2026
@@ -11,8 +11,8 @@ wheels/
 .venv
 
 # Output files
-*.xlsx
 *.csv
+tests/output/
 *.jsonl
 *.jsonl.gz
 *.jsonl.bz2

@@ -0,0 +1,32 @@
+import shutil
+from pathlib import Path
+
+import pytest
+
+# Root folder for kept e2e artefacts; the output_path fixture only writes
+# beneath it when pytest is invoked with --save-output.
+OUTPUT_DIR = Path(__file__).parent / "output"
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--save-output",
+        action="store_true",
+        default=False,
+        help="Write e2e output files to tests/output/<test-name>/ for inspection.",
+    )
+
+
+@pytest.fixture
+def output_path(request: pytest.FixtureRequest, tmp_path: Path) -> Path:
+    """Directory for e2e test output files.
+
+    Normally delegates to pytest's tmp_path (ephemeral).
+    Pass --save-output to write to tests/output/<test-name>/ instead,
+    so you can open the files afterwards for manual inspection.
+    """
+    if request.config.getoption("--save-output"):
+        path = OUTPUT_DIR / request.node.name
+        if path.exists():
+            shutil.rmtree(path)
+        path.mkdir(parents=True)
+        return path
+    return tmp_path
@@ -0,0 +1,209 @@
+"""End-to-end tests for shift_excel_dates using tests/data/patients.xlsx.
+
+Run normally for CI:
+    uv run pytest tests/test_e2e_date_shift.py
+
+Run with --save-output to keep the Excel files for manual inspection:
+    uv run pytest tests/test_e2e_date_shift.py --save-output
+    # files written to tests/output/<test-name>/
+"""
+
+from pathlib import Path
+from typing import TypedDict, Unpack
+
+import pandas as pd
+
+from nuh_helper import shift_excel_dates
+
+
+class _ShiftKwargs(TypedDict, total=False):
+    """Optional kwargs forwarded to shift_excel_dates in e2e tests.
+
+    ``total=False`` makes every key optional, so callers of ``run_shift`` may
+    pass any subset. The field notes below are inferred from how the tests in
+    this module use them -- confirm against ``shift_excel_dates`` itself.
+    """
+
+    # Tests run twice with the same seed and expect identical output sheets.
+    seed: int | None
+    # Tests expect every shift in the linking table to lie within
+    # [min_shift_days, max_shift_days], bounds included.
+    min_shift_days: int
+    max_shift_days: int
+
+
+# Fixture locations, resolved relative to this test module.
+DATA_DIR = Path(__file__).parent / "data"
+INPUT_FILE = DATA_DIR / "patients.xlsx"
+
+# Config matching the structure of patients.xlsx
+# Keys are sheet names; each entry names that sheet's patient-ID column and
+# the columns handed to shift_excel_dates as date columns.
+SHEET_CONFIGS = {
+    "patients": {
+        "patient_id_col": "patient_id",
+        "date_columns": ["dob", "last_alive"],
+    },
+    "results": {
+        "patient_id_col": "patient_id",
+        "date_columns": ["date_result"],
+    },
+}
+
+
+def run_shift(base: Path, **kwargs: Unpack[_ShiftKwargs]) -> tuple[Path, Path]:
+    """Call shift_excel_dates with defaults for e2e tests.
+
+    Returns (output_xlsx_path, linking_table_csv_path).
+    Always writes a linking table so tests don't pollute the working directory.
+    """
+    output = base / "output.xlsx"
+    linking = base / "linking.csv"
+    shift_excel_dates(
+        str(INPUT_FILE),
+        str(output),
+        patient_sheet="patients",
+        patient_id_col="patient_id",
+        sheet_configs=SHEET_CONFIGS,
+        linking_table_output=str(linking),
+        **kwargs,
+    )
+    return output, linking
+
+
+def read_sheet(path: Path, sheet: str) -> pd.DataFrame:
+    return pd.read_excel(str(path), sheet_name=sheet)
+
+
+class TestOutputStructure:
+    def test_output_file_is_created(self, output_path: Path) -> None:
+        output, _ = run_shift(output_path, seed=42)
+        assert output.exists()
+
+    def test_linking_table_is_created_with_one_row_per_patient(
+        self, output_path: Path
+    ) -> None:
+        _, linking = run_shift(output_path, seed=42)
+        df = pd.read_csv(linking)
+        assert set(df.columns) == {"patient_id", "shift_days"}
+        assert len(df) == 5
+
+    def test_output_sheets_match_input_sheets(self, output_path: Path) -> None:
+        output, _ = run_shift(output_path, seed=42)
+        input_sheets = pd.ExcelFile(str(INPUT_FILE)).sheet_names
+        output_sheets = pd.ExcelFile(str(output)).sheet_names
+        assert output_sheets == input_sheets
+
+    def test_non_date_columns_are_unchanged(self, output_path: Path) -> None:
+        output, _ = run_shift(output_path, seed=42)
+        pd.testing.assert_series_equal(
+            read_sheet(INPUT_FILE, "patients")["name"],
+            read_sheet(output, "patients")["name"],
+        )
+        pd.testing.assert_series_equal(
+            read_sheet(INPUT_FILE, "results")["measurement"],
+            read_sheet(output, "results")["measurement"],
+        )
+
+
+class TestDateShifting:
+    def test_dates_are_shifted_by_amounts_in_linking_table(
+        self, output_path: Path
+    ) -> None:
+        output, linking = run_shift(output_path, seed=42)
+
+        shift_dict = dict(
+            zip(
+                pd.read_csv(linking)["patient_id"],
+                pd.read_csv(linking)["shift_days"],
+                strict=True,
+            )
+        )
+        input_df = read_sheet(INPUT_FILE, "patients")
+        output_df = read_sheet(output, "patients")
+
+        for pid, expected_days in shift_dict.items():
+            in_dob = pd.Timestamp(
+                input_df.loc[input_df["patient_id"] == pid, "dob"].iloc[0]
+            )
+            out_dob = pd.Timestamp(
+                output_df.loc[output_df["patient_id"] == pid, "dob"].iloc[0]
+            )
+            assert (out_dob - in_dob).days == expected_days, (
+                f"Patient {pid}: expected shift of {expected_days} days, "
+                f"got {(out_dob - in_dob).days}"
+            )
+
+    def test_shift_is_consistent_across_sheets(self, output_path: Path) -> None:
+        """Each patient's dates shift by the same number of days in every sheet."""
+        output, linking = run_shift(output_path, seed=42)
+
+        shift_dict = dict(
+            zip(
+                pd.read_csv(linking)["patient_id"],
+                pd.read_csv(linking)["shift_days"],
+                strict=True,
+            )
+        )
+        input_results = read_sheet(INPUT_FILE, "results")
+        output_results = read_sheet(output, "results")
+
+        # Test5 has "unknown" as date_result so skip it here
+        for pid in ["Test1", "Test2", "Test3", "Test4"]:
+            in_date = pd.Timestamp(
+                input_results.loc[
+                    input_results["patient_id"] == pid, "date_result"
+                ].iloc[0]
+            )
+            out_date = pd.Timestamp(
+                output_results.loc[
+                    output_results["patient_id"] == pid, "date_result"
+                ].iloc[0]
+            )
+            assert (out_date - in_date).days == shift_dict[pid], (
+                f"Patient {pid}: results sheet shift differs from linking table"
+            )
+
+    def test_shifts_within_specified_range(self, output_path: Path) -> None:
+        _, linking = run_shift(
+            output_path, seed=42, min_shift_days=-7, max_shift_days=7
+        )
+        shifts = pd.read_csv(linking)["shift_days"]
+        assert shifts.between(-7, 7).all()
+
+    def test_placeholder_date_becomes_null_in_output(self, output_path: Path) -> None:
+        """Test5 has "unknown" as date_result — should be null after shifting."""
+        output, _ = run_shift(output_path, seed=42)
+        output_results = read_sheet(output, "results")
+        test5_date = output_results.loc[
+            output_results["patient_id"] == "Test5", "date_result"
+        ].iloc[0]
+        assert pd.isna(test5_date)
+
+
+class TestReproducibility:
+    def test_same_seed_produces_identical_output(self, output_path: Path) -> None:
+        run1 = output_path / "run1"
+        run2 = output_path / "run2"
+        run1.mkdir()
+        run2.mkdir()
+        output1, _ = run_shift(run1, seed=42)
+        output2, _ = run_shift(run2, seed=42)
+
+        for sheet in SHEET_CONFIGS:
+            pd.testing.assert_frame_equal(
+                read_sheet(output1, sheet),
+                read_sheet(output2, sheet),
+            )
+
+    def test_linking_table_reproduces_same_shifts_on_new_file(
+        self, output_path: Path
+    ) -> None:
+        """Saving a linking table and reloading it should produce identical dates."""
+        output1, linking = run_shift(output_path, seed=42)
+
+        output2 = output_path / "output2.xlsx"
+        shift_excel_dates(
+            str(INPUT_FILE),
+            str(output2),
+            patient_sheet="patients",
+            patient_id_col="patient_id",
+            sheet_configs=SHEET_CONFIGS,
+            linking_table_path=str(linking),
+            linking_table_output=str(output_path / "linking2.csv"),
+        )
+
+        for sheet in SHEET_CONFIGS:
+            pd.testing.assert_frame_equal(
+                read_sheet(output1, sheet),
+                read_sheet(output2, sheet),
+            )