Skip to content

Commit 92b500a

Browse files
authored
feat: Add logging (#25)
1 parent 768d321 commit 92b500a

3 files changed

Lines changed: 51 additions & 6 deletions

File tree

nuh_helper/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@
66
- **Dataset profiling**: profile a dataset into a Scan Report
77
"""
88

9-
from nuh_helper.date_shift import (
9+
import logging
10+
11+
logging.getLogger(__name__).addHandler(logging.NullHandler())
12+
13+
from nuh_helper.date_shift import ( # noqa: E402
1014
apply_date_shifts,
1115
generate_shift_mappings,
1216
load_shift_mappings,
1317
shift_excel_dates,
1418
)
15-
from nuh_helper.profile import generate_scan_report
19+
from nuh_helper.profile import generate_scan_report # noqa: E402
1620

1721
__all__ = [
1822
"shift_excel_dates",

nuh_helper/date_shift/__init__.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
in an Excel file, with support for reproducible shifts using a linking table.
66
"""
77

8+
import logging
89
import random
910
from datetime import date, datetime
1011
from pathlib import Path
1112
from typing import Any, cast
1213

1314
import pandas as pd
1415

16+
logger = logging.getLogger(__name__)
17+
1518

1619
def generate_shift_mappings(
1720
patient_ids: list[str],
@@ -52,6 +55,7 @@ def load_shift_mappings(csv_path: str) -> pd.DataFrame:
5255
df = pd.read_csv(csv_path)
5356
if "patient_id" not in df.columns or "shift_days" not in df.columns:
5457
raise ValueError("CSV must contain 'patient_id' and 'shift_days' columns")
58+
logger.info("Loaded %d shift mapping(s) from '%s'", len(df), csv_path)
5559
return df
5660

5761

@@ -139,10 +143,21 @@ def apply_date_shifts(
139143

140144
for date_col in date_columns:
141145
if date_col not in df.columns:
146+
logger.warning(
147+
"Date column '%s' not found in DataFrame, skipping", date_col
148+
)
142149
continue
143150

144151
# Parse flexible date strings (handles YYYY-DD-MM and placeholders "Unknown")
152+
non_null_before = df[date_col].notna().sum()
145153
df[date_col] = df[date_col].apply(_parse_date_value)
154+
parse_failures = non_null_before - sum(x is not None for x in df[date_col])
155+
if parse_failures > 0:
156+
logger.debug(
157+
"Column '%s': %d value(s) could not be parsed as dates",
158+
date_col,
159+
parse_failures,
160+
)
146161

147162
# Apply shifts
148163
df[date_col] = df.apply(
@@ -206,6 +221,10 @@ def shift_excel_dates(
206221
If None, Excel's default date format is used.
207222
Common formats: 'YYYY-MM-DD', 'MM/DD/YYYY', 'DD-MM-YYYY', etc.
208223
""" # noqa: E501
224+
logger.info("Shifting dates: '%s' → '%s'", input_file, output_file)
225+
logger.debug(
226+
"Shift range: %d to %d days, seed=%s", min_shift_days, max_shift_days, seed
227+
)
209228

210229
def _read_sheet_with_structure(
211230
excel_file: pd.ExcelFile,
@@ -341,21 +360,30 @@ def _write_sheet_with_structure(
341360
.unique()
342361
.tolist()
343362
)
363+
logger.info(
364+
"Found %d patient(s) in sheet '%s'", len(patient_ids), patient_sheet
365+
)
344366

345367
# Generate or load shift mappings
346368
if linking_table_path and Path(linking_table_path).exists():
369+
logger.info("Loading shift mappings from '%s'", linking_table_path)
347370
shift_mappings = load_shift_mappings(linking_table_path)
348371
# Filter to only include patient IDs that exist in the data
349372
shift_mappings = shift_mappings[shift_mappings["patient_id"].isin(patient_ids)]
350373
# Add any missing patient IDs with random shifts
351374
existing_ids = set(shift_mappings["patient_id"])
352375
missing_ids = [pid for pid in patient_ids if pid not in existing_ids]
353376
if missing_ids:
377+
logger.warning(
378+
"%d patient(s) had no entry in the linking table; new shifts generated",
379+
len(missing_ids),
380+
)
354381
new_shifts = generate_shift_mappings(
355382
missing_ids, min_shift_days, max_shift_days, seed
356383
)
357384
shift_mappings = pd.concat([shift_mappings, new_shifts], ignore_index=True)
358385
else:
386+
logger.info("Generating shift mappings for %d patient(s)", len(patient_ids))
359387
shift_mappings = generate_shift_mappings(
360388
patient_ids, min_shift_days, max_shift_days, seed
361389
)
@@ -388,6 +416,11 @@ def _write_sheet_with_structure(
388416
date_columns: list[str] = cast(list[str], config["date_columns"])
389417
header_row = cast(int, config.get("header_row", header_row))
390418
sheet_date_columns = date_columns
419+
logger.info(
420+
"Shifting %d date column(s) in sheet '%s'",
421+
len(date_columns),
422+
sheet_name,
423+
)
391424

392425
# Read sheet preserving structure
393426
df, description_df, description_rows = _read_sheet_with_structure(
@@ -421,11 +454,12 @@ def _write_sheet_with_structure(
421454
date_format=date_format,
422455
)
423456

457+
logger.info("Output written to '%s'", output_file)
458+
424459
# Save linking table
425-
if linking_table_output:
426-
shift_mappings.to_csv(linking_table_output, index=False)
427-
else:
428-
shift_mappings.to_csv("shift_mappings.csv", index=False)
460+
linking_path = linking_table_output or "shift_mappings.csv"
461+
shift_mappings.to_csv(linking_path, index=False)
462+
logger.info("Linking table saved to '%s'", linking_path)
429463

430464

431465
__all__ = [

nuh_helper/profile/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import csv
2+
import logging
23
from collections import Counter, defaultdict
34
from datetime import datetime
45
from pathlib import Path
56

67
from openpyxl import Workbook
78

9+
logger = logging.getLogger(__name__)
10+
811
SCAN_REPORT_FILE_NAME = "ScanReport.xlsx"
912

1013
FIELD_OVERVIEW_HEADERS = [
@@ -71,11 +74,14 @@ def generate_scan_report(
7174
output_path: str = SCAN_REPORT_FILE_NAME,
7275
min_cell_count: int = 1,
7376
) -> str:
77+
logger.info("Generating scan report for %d table(s)", len(csv_files))
78+
7479
tables = []
7580

7681
for csv_file in csv_files:
7782
csv_file = Path(csv_file)
7883
header = read_csv_header(csv_file.as_posix())
84+
logger.info("Scanning '%s' (%d field(s))", csv_file.name, len(header))
7985
tables.append(
8086
{"name": csv_file.name, "path": csv_file.as_posix(), "fields": header}
8187
)
@@ -151,4 +157,5 @@ def generate_scan_report(
151157
meta_sheet.append(["minCellCount", min_cell_count])
152158

153159
wb.save(output_path)
160+
logger.info("Scan report written to '%s'", output_path)
154161
return output_path

0 commit comments

Comments
 (0)