|
5 | 5 | in an Excel file, with support for reproducible shifts using a linking table. |
6 | 6 | """ |
7 | 7 |
|
| 8 | +import logging |
8 | 9 | import random |
9 | 10 | from datetime import date, datetime |
10 | 11 | from pathlib import Path |
11 | 12 | from typing import Any, cast |
12 | 13 |
|
13 | 14 | import pandas as pd |
14 | 15 |
|
| 16 | +logger = logging.getLogger(__name__) |
| 17 | + |
15 | 18 |
|
16 | 19 | def generate_shift_mappings( |
17 | 20 | patient_ids: list[str], |
@@ -52,6 +55,7 @@ def load_shift_mappings(csv_path: str) -> pd.DataFrame: |
52 | 55 | df = pd.read_csv(csv_path) |
53 | 56 | if "patient_id" not in df.columns or "shift_days" not in df.columns: |
54 | 57 | raise ValueError("CSV must contain 'patient_id' and 'shift_days' columns") |
| 58 | + logger.info("Loaded %d shift mapping(s) from '%s'", len(df), csv_path) |
55 | 59 | return df |
56 | 60 |
|
57 | 61 |
|
@@ -139,10 +143,21 @@ def apply_date_shifts( |
139 | 143 |
|
140 | 144 | for date_col in date_columns: |
141 | 145 | if date_col not in df.columns: |
| 146 | + logger.warning( |
| 147 | + "Date column '%s' not found in DataFrame, skipping", date_col |
| 148 | + ) |
142 | 149 | continue |
143 | 150 |
|
144 | 151 | # Parse flexible date strings (handles YYYY-DD-MM and placeholders "Unknown") |
| 152 | + non_null_before = df[date_col].notna().sum() |
145 | 153 | df[date_col] = df[date_col].apply(_parse_date_value) |
| 154 | + parse_failures = non_null_before - sum(x is not None for x in df[date_col]) |
| 155 | + if parse_failures > 0: |
| 156 | + logger.debug( |
| 157 | + "Column '%s': %d value(s) could not be parsed as dates", |
| 158 | + date_col, |
| 159 | + parse_failures, |
| 160 | + ) |
146 | 161 |
|
147 | 162 | # Apply shifts |
148 | 163 | df[date_col] = df.apply( |
@@ -206,6 +221,10 @@ def shift_excel_dates( |
206 | 221 | If None, Excel's default date format is used. |
207 | 222 | Common formats: 'YYYY-MM-DD', 'MM/DD/YYYY', 'DD-MM-YYYY', etc. |
208 | 223 | """ # noqa: E501 |
| 224 | + logger.info("Shifting dates: '%s' → '%s'", input_file, output_file) |
| 225 | + logger.debug( |
| 226 | + "Shift range: %d to %d days, seed=%s", min_shift_days, max_shift_days, seed |
| 227 | + ) |
209 | 228 |
|
210 | 229 | def _read_sheet_with_structure( |
211 | 230 | excel_file: pd.ExcelFile, |
@@ -341,21 +360,30 @@ def _write_sheet_with_structure( |
341 | 360 | .unique() |
342 | 361 | .tolist() |
343 | 362 | ) |
| 363 | + logger.info( |
| 364 | + "Found %d patient(s) in sheet '%s'", len(patient_ids), patient_sheet |
| 365 | + ) |
344 | 366 |
|
345 | 367 | # Generate or load shift mappings |
346 | 368 | if linking_table_path and Path(linking_table_path).exists(): |
| 369 | + logger.info("Loading shift mappings from '%s'", linking_table_path) |
347 | 370 | shift_mappings = load_shift_mappings(linking_table_path) |
348 | 371 | # Filter to only include patient IDs that exist in the data |
349 | 372 | shift_mappings = shift_mappings[shift_mappings["patient_id"].isin(patient_ids)] |
350 | 373 | # Add any missing patient IDs with random shifts |
351 | 374 | existing_ids = set(shift_mappings["patient_id"]) |
352 | 375 | missing_ids = [pid for pid in patient_ids if pid not in existing_ids] |
353 | 376 | if missing_ids: |
| 377 | + logger.warning( |
| 378 | + "%d patient(s) had no entry in the linking table; new shifts generated", |
| 379 | + len(missing_ids), |
| 380 | + ) |
354 | 381 | new_shifts = generate_shift_mappings( |
355 | 382 | missing_ids, min_shift_days, max_shift_days, seed |
356 | 383 | ) |
357 | 384 | shift_mappings = pd.concat([shift_mappings, new_shifts], ignore_index=True) |
358 | 385 | else: |
| 386 | + logger.info("Generating shift mappings for %d patient(s)", len(patient_ids)) |
359 | 387 | shift_mappings = generate_shift_mappings( |
360 | 388 | patient_ids, min_shift_days, max_shift_days, seed |
361 | 389 | ) |
@@ -388,6 +416,11 @@ def _write_sheet_with_structure( |
388 | 416 | date_columns: list[str] = cast(list[str], config["date_columns"]) |
389 | 417 | header_row = cast(int, config.get("header_row", header_row)) |
390 | 418 | sheet_date_columns = date_columns |
| 419 | + logger.info( |
| 420 | + "Shifting %d date column(s) in sheet '%s'", |
| 421 | + len(date_columns), |
| 422 | + sheet_name, |
| 423 | + ) |
391 | 424 |
|
392 | 425 | # Read sheet preserving structure |
393 | 426 | df, description_df, description_rows = _read_sheet_with_structure( |
@@ -421,11 +454,12 @@ def _write_sheet_with_structure( |
421 | 454 | date_format=date_format, |
422 | 455 | ) |
423 | 456 |
|
| 457 | + logger.info("Output written to '%s'", output_file) |
| 458 | + |
424 | 459 | # Save linking table |
425 | | - if linking_table_output: |
426 | | - shift_mappings.to_csv(linking_table_output, index=False) |
427 | | - else: |
428 | | - shift_mappings.to_csv("shift_mappings.csv", index=False) |
| 460 | + linking_path = linking_table_output or "shift_mappings.csv" |
| 461 | + shift_mappings.to_csv(linking_path, index=False) |
| 462 | + logger.info("Linking table saved to '%s'", linking_path) |
429 | 463 |
|
430 | 464 |
|
431 | 465 | __all__ = [ |
|
0 commit comments