Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
2 changes: 1 addition & 1 deletion .github/workflows/check.quality.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ jobs:
uses: actions/checkout@v4

- name: Run Ruff
uses: astral-sh/ruff-action@39f75e526a505e26a302f8796977b50c13720edf # v3.2.1
uses: astral-sh/ruff-action@4919ec5cf1f49eff0871dbcea0da843445b837e6 # v3.6.1

7 changes: 7 additions & 0 deletions .vscode/extensions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"recommendations": [
"astral-sh.ty",
"ms-python.python",
"charliermarsh.ruff"
]
}
7 changes: 6 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ CI runs the full test suite; ensure `uv run pytest` passes locally before openin

## Commits and PRs

- Use **Angular-style semantic commit messages** (e.g. `feat: add X`, `fix: handle Y`, `docs: update Z`). CI checks this.
- Use **Angular-style semantic PR titles** (e.g. `feat: add X`, `fix: handle Y`, `docs: update Z`). CI checks this.
- Open a PR against the default branch. A code owner will review and approve before merge.

## Releases

Releases are published automatically by `semantic-release`, which relies on the PR titles.

Thank you for contributing.

6 changes: 2 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
with hardcoded test inputs.
"""

from typing import Any, Dict

from nuh_helper import shift_excel_dates


def main():
def main() -> None:
"""
Main entry point with hardcoded inputs for dev/testing.
"""
Expand All @@ -22,7 +20,7 @@ def main():
# Currently based on test.xlsx structure:
# - 'patients' sheet: patient_id, gender, dob (date column)
# - 'labs' sheet: patient_id, test_date (date column), result
sheet_configs: Dict[str, Dict[str, Any]] = {
sheet_configs: dict[str, dict[str, str | list[str] | int]] = {
"patients": {
"patient_id_col": "patient_id",
"date_columns": ["dob", "date_of_diagnosis"],
Expand Down
71 changes: 39 additions & 32 deletions nuh_helper/date_shift/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
import random
from datetime import date, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast
from typing import Any, cast

import pandas as pd


def generate_shift_mappings(
patient_ids: List[str],
patient_ids: list[str],
min_shift_days: int = -15,
max_shift_days: int = 15,
seed: Optional[int] = None,
seed: int | None = None,
) -> pd.DataFrame:
"""
Generate random shift mappings for patient IDs.
Expand Down Expand Up @@ -55,14 +55,17 @@ def load_shift_mappings(csv_path: str) -> pd.DataFrame:
return df


def _parse_date_value(value: Any) -> Optional[pd.Timestamp]:
def _parse_date_value(
value: str | float | int | datetime | date | pd.Timestamp | None,
) -> pd.Timestamp | None:
"""Parse a value into a pandas Timestamp if possible."""
if value is None or (isinstance(value, float) and pd.isna(value)):
return None

# Already datetime-like
if isinstance(value, (pd.Timestamp, datetime, date)):
return pd.to_datetime(value, errors="coerce")
result = pd.to_datetime(value, errors="coerce")
return result if pd.notna(result) else None

if isinstance(value, str):
v = value.strip()
Expand All @@ -79,16 +82,14 @@ def _parse_date_value(value: Any) -> Optional[pd.Timestamp]:
pass

# Fallback: let pandas try with dayfirst to handle ambiguous strings
parsed = pd.to_datetime(
v, errors="coerce", dayfirst=True, infer_datetime_format=True
)
parsed = pd.to_datetime(v, errors="coerce", dayfirst=True)
return parsed if pd.notna(parsed) else None

# Anything else: no parse
return None


def _normalize_patient_id(value: Any) -> Optional[str]:
def _normalize_patient_id(value: str | float | int | None) -> str | None:
"""Normalize patient IDs by stripping whitespace and converting to string."""
if value is None or (isinstance(value, float) and pd.isna(value)):
return None
Expand All @@ -104,15 +105,15 @@ def _normalize_patient_id(value: Any) -> Optional[str]:
def apply_date_shifts(
df: pd.DataFrame,
patient_id_col: str,
date_columns: List[str],
date_columns: list[str],
shift_mappings: pd.DataFrame,
date_format: Optional[str] = None,
date_format: str | None = None,
) -> pd.DataFrame:
"""
Apply date shifts to specified columns in a DataFrame.

Args:
df: DataFrame containing patient data.
df: pd.DataFrame containing patient data.
patient_id_col: Name of the column containing patient IDs.
date_columns: List of column names containing dates to shift.
shift_mappings: DataFrame with 'patient_id' and 'shift_days' columns.
Expand All @@ -128,22 +129,28 @@ def apply_date_shifts(
# Normalize patient IDs in the working DataFrame to align with mapping keys
df[patient_id_col] = df[patient_id_col].apply(_normalize_patient_id)

shift_dict = dict(zip(shift_mappings["patient_id"], shift_mappings["shift_days"]))
shift_dict = dict(
zip(
shift_mappings["patient_id"],
shift_mappings["shift_days"],
strict=True,
)
)

for date_col in date_columns:
if date_col not in df.columns:
continue

# Parse flexible date strings (handles YYYY-DD-MM and placeholders like "Unknown")
# Parse flexible date strings (handles YYYY-DD-MM and placeholders "Unknown")
df[date_col] = df[date_col].apply(_parse_date_value)

# Apply shifts
df[date_col] = df.apply(
lambda row: (
row[date_col]
row[date_col] # noqa: B023
+ pd.Timedelta(days=shift_dict.get(row[patient_id_col], 0))
if row[date_col] is not None and row[patient_id_col] in shift_dict
else row[date_col]
if row[date_col] is not None and row[patient_id_col] in shift_dict # noqa: B023
else row[date_col] # noqa: B023
),
axis=1,
)
Expand All @@ -163,15 +170,15 @@ def shift_excel_dates(
output_file: str,
patient_sheet: str,
patient_id_col: str,
sheet_configs: Dict[str, Dict[str, Any]],
sheet_configs: dict[str, dict[str, Any]],
min_shift_days: int = -15,
max_shift_days: int = 15,
linking_table_path: Optional[str] = None,
linking_table_output: Optional[str] = None,
seed: Optional[int] = None,
linking_table_path: str | None = None,
linking_table_output: str | None = None,
seed: int | None = None,
patient_header_row: int = 0,
patient_skip_rows: Optional[List[int]] = None,
date_format: Optional[str] = None,
patient_skip_rows: list[int] | None = None,
date_format: str | None = None,
) -> None:
"""
Shift dates in an Excel file for patient IDs consistently across sheets.
Expand All @@ -198,13 +205,13 @@ def shift_excel_dates(
date_format: Optional Excel date format string (e.g., 'YYYY-MM-DD', 'yyyy-mm-dd').
If None, Excel's default date format is used.
Common formats: 'YYYY-MM-DD', 'MM/DD/YYYY', 'DD-MM-YYYY', etc.
"""
""" # noqa: E501

def _read_sheet_with_structure(
excel_file: pd.ExcelFile,
sheet_name: str,
header_row: int = 0,
) -> Tuple[pd.DataFrame, pd.DataFrame, List[List[Any]]]:
) -> tuple[pd.DataFrame, pd.DataFrame, list[list[Any]]]:
"""
Read a sheet preserving description rows and structure.

Expand All @@ -219,7 +226,7 @@ def _read_sheet_with_structure(

if header_row == 0:
# No description rows, header is first row
description_rows: List[List[Any]] = []
description_rows: list[list[Any]] = []
description_df = pd.DataFrame()
# Use first row as header
data_df = pd.read_excel(excel_file, sheet_name=sheet_name, header=0)
Expand All @@ -239,10 +246,10 @@ def _write_sheet_with_structure(
writer: pd.ExcelWriter,
sheet_name: str,
data_df: pd.DataFrame,
description_rows: List[List[Any]],
description_rows: list[list[Any]],
header_row: int,
date_columns: Optional[List[str]] = None,
date_format: Optional[str] = None,
date_columns: list[str] | None = None,
date_format: str | None = None,
) -> None:
"""
Write a sheet preserving description rows and structure.
Expand Down Expand Up @@ -372,13 +379,13 @@ def _write_sheet_with_structure(
header_row = default_header_row

# Track date columns for formatting
sheet_date_columns: Optional[List[str]] = None
sheet_date_columns: list[str] | None = None

# Check if this sheet needs date shifting
if sheet_name in sheet_configs:
config = sheet_configs[cast(str, sheet_name)]
sheet_patient_id_col: str = cast(str, config["patient_id_col"])
date_columns: List[str] = cast(List[str], config["date_columns"])
date_columns: list[str] = cast(list[str], config["date_columns"])
header_row = cast(int, config.get("header_row", header_row))
sheet_date_columns = date_columns

Expand All @@ -392,7 +399,7 @@ def _write_sheet_with_structure(
if sheet_name in sheet_configs:
if sheet_patient_id_col not in df.columns:
raise ValueError(
f"Patient ID column '{sheet_patient_id_col}' not found in sheet '{sheet_name}'"
f"Patient ID column '{sheet_patient_id_col}' not found in sheet '{sheet_name}'" # noqa: E501
)

df = apply_date_shifts(
Expand Down
28 changes: 18 additions & 10 deletions nuh_helper/profile/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import csv
from pathlib import Path
from collections import Counter, defaultdict
from openpyxl import Workbook
from datetime import datetime
from pathlib import Path

from openpyxl import Workbook

SCAN_REPORT_FILE_NAME = "ScanReport.xlsx"

Expand All @@ -25,7 +26,7 @@
]


def index_table_names(table_names):
def index_table_names(table_names: list[str]) -> dict[str, str]:
indexed = {}
counts = defaultdict(int)

Expand All @@ -36,13 +37,15 @@ def index_table_names(table_names):
return indexed


def read_csv_header(csv_path: str) -> list[str]:
    """Return the header row of a CSV file.

    Args:
        csv_path: Path to the CSV file; it is opened as UTF-8 text.

    Returns:
        The fields of the first row, or an empty list when the file
        contains no rows at all.
    """
    # newline="" leaves line-ending handling to the csv module.
    with open(csv_path, newline="", encoding="utf-8") as f:
        # Default to [] so an empty file does not leak a bare StopIteration.
        return next(csv.reader(f), [])


def scan_csv_values(csv_path, min_cell_count):
def scan_csv_values(
csv_path: str, min_cell_count: int
) -> tuple[dict[str, list[tuple[str, int]]], int]:
counters = defaultdict(Counter)
row_count = 0

Expand All @@ -64,20 +67,25 @@ def scan_csv_values(csv_path, min_cell_count):


def generate_scan_report(
csv_files, output_path=SCAN_REPORT_FILE_NAME, min_cell_count=1
):
csv_files: list[str],
output_path: str = SCAN_REPORT_FILE_NAME,
min_cell_count: int = 1,
) -> str:
tables = []

for csv_file in csv_files:
csv_file = Path(csv_file)
header = read_csv_header(csv_file)
tables.append({"name": csv_file.name, "path": csv_file, "fields": header})
header = read_csv_header(csv_file.as_posix())
tables.append(
{"name": csv_file.name, "path": csv_file.as_posix(), "fields": header}
)

tables.sort(key=lambda t: t["name"])
indexed_names = index_table_names([t["name"] for t in tables])

wb = Workbook()
wb.remove(wb.active)
if wb.active:
wb.remove(wb.active)

# FIELD_OVERVIEW
field_sheet = wb.create_sheet("Field Overview")
Expand Down
12 changes: 11 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "nuh-helper"
dynamic = ["version"]
description = "Helper functions for enabling studies"
description = "Helper library for enabling studies"
readme = "README.md"
requires-python = ">=3.13"

Expand Down Expand Up @@ -34,3 +34,13 @@ dev = [

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.lint]
select = ["E", "F", "ANN", "B", "UP", "I", "SIM", "C4"]

[tool.coverage.run]
branch = true

[tool.pytest.ini_options]
addopts = "-ra"
testpaths = ["tests"]
Loading
Loading