-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprototype_mapper.py
More file actions
102 lines (80 loc) · 3.66 KB
/
Copy pathprototype_mapper.py
File metadata and controls
102 lines (80 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
"""
PCGL Data Mapper - Generic Study Processor
Configuration-driven data mapping framework for processing study data using YAML
entity configurations and the generic mapper factory pattern.
"""
import sys
import argparse
import logging
from pathlib import Path
# Add parent directories to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
# Import StudyDataMapper from core
from core.mappers import StudyDataMapper
# Configure logging: INFO and above is sent to a log file and to a stream handler.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        # mode='w' truncates the log file, so each run starts with a fresh log
        logging.FileHandler('prototype_mapper.log', mode='w'),
        # No stream argument: StreamHandler falls back to its default (stderr)
        logging.StreamHandler(),
    ],
)
# Module-level logger used by main() below
logger = logging.getLogger(__name__)
def main(argv=None):
    """
    Main entry point for generic study data mapper.

    Parses the command line, validates the input path, then drives
    StudyDataMapper in single-file (--input_csv) or multi-file (--input_dir)
    mode and asks it to save the mapped results and a summary report.
    Entity discovery and loading of study-specific functions are delegated
    to StudyDataMapper (not visible in this file).

    Args:
        argv: Optional list of argument strings to parse instead of
            sys.argv[1:]. None (the default) parses the process arguments,
            which is what the script entry point relies on.

    Raises:
        SystemExit: Status 2 when argparse rejects the arguments; status 1
            when the input path is invalid or the mapper raises.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='PCGL Data Mapper - Generic Study Processor',
        epilog='Example: %(prog)s --study_id StudyA --input_csv data/source/input.csv --output_dir data/mapped/'
    )
    parser.add_argument('--study_id', help='Study identifier')
    # Support both single-file (backward compatible) and multi-file (new) modes;
    # exactly one of the two options must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('--input_csv', type=Path, help='Path to source CSV file (single-file mode)')
    input_group.add_argument('--input_dir', type=Path, help='Directory containing source CSV files (multi-file mode)')
    parser.add_argument('--output_dir', type=Path, help='Output directory for mapped files')
    parser.add_argument('--study_config_dir', type=Path, help='Root directory containing study configs (default: ./studies/)')
    args = parser.parse_args(argv)

    study_id = args.study_id
    input_csv_path = args.input_csv
    input_dir_path = args.input_dir
    output_dir = args.output_dir
    study_config_dir = args.study_config_dir

    # Validate input. An option that was not given is None; compare against
    # None explicitly rather than relying on Path truthiness.
    if input_csv_path is not None:
        if not input_csv_path.exists():
            logger.error(f"Input file not found: {input_csv_path}")
            sys.exit(1)
        # exists() alone would accept a directory; reject it here with a
        # clear message instead of failing later inside the mapper.
        if input_csv_path.is_dir():
            logger.error(f"Input path is a directory, not a file: {input_csv_path}")
            sys.exit(1)
    if input_dir_path is not None:
        if not input_dir_path.exists():
            logger.error(f"Input directory not found: {input_dir_path}")
            sys.exit(1)
        # exists() alone would accept a regular file for the directory option.
        if not input_dir_path.is_dir():
            logger.error(f"Input path is not a directory: {input_dir_path}")
            sys.exit(1)

    try:
        # Initialize mapper (auto-discovers entities, dynamically loads study module)
        mapper = StudyDataMapper(study_id=study_id, study_root=study_config_dir)

        # Determine mode and process accordingly
        if input_csv_path is not None:
            # Single-file mode (backward compatible)
            logger.info(f"Running in single-file mode with: {input_csv_path}")
            source_df = mapper.load_source_data(input_csv_path)
            mapper.process_all_entities(source_df)
        else:
            # Multi-file mode (new)
            logger.info(f"Running in multi-file mode with directory: {input_dir_path}")
            mapper.set_input_directory(input_dir_path)
            mapper.process_all_entities_multifile()

        # Save results
        mapper.save_results(output_dir)
        # Generate and save summary report
        mapper.save_summary_report(output_dir)

        logger.info(f"Data mapper completed successfully for {study_id}!")
        logger.info(f"Auto-discovered and processed {len(mapper.entities)} entities")
    except Exception as e:
        # Top-level boundary: log the full traceback and exit with a failure status.
        logger.error(f"Data mapper failed: {e}", exc_info=True)
        sys.exit(1)


if __name__ == '__main__':
    main()