|
| 1 | +import pandas as pd |
| 2 | +import sys |
| 3 | +import os |
| 4 | +import time |
| 5 | +import logging |
| 6 | +from datetime import datetime |
| 7 | + |
def setup_logging(log_dir="logs"):
    """Configure root logging to write to stdout and to a time-stamped file.

    The log file is named ``file_transform_<YYYYmmdd_HHMMSS>.log`` and lives
    inside ``log_dir``. The directory, including missing parents, is created
    when it does not exist yet.

    Args:
        log_dir: Directory that receives the log file.
    """
    # exist_ok=True replaces a separate "exists?" check, which is racy: another
    # process could create the folder between the check and the makedirs call.
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"file_transform_{timestamp}.log")

    # Everything of INFO level and above goes to both handlers.
    # NOTE: logging.basicConfig is a no-op when the root logger already has
    # handlers, so this only takes effect the first time logging is configured.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            # File handler: a fresh file per run (mode 'w').
            logging.FileHandler(log_file, mode='w', encoding='utf-8'),
            # Console handler: mirrors the same records on standard output.
            logging.StreamHandler(sys.stdout),
        ],
    )
    logging.info(f"Logging initiated. All output directed to console and file: {log_file}")
    logging.info("-" * 50)
| 30 | + |
def apply_enrollment_transforms(df: pd.DataFrame):
    """Apply the Enrollment-specific formatting rules to ``df``.

    Steps, in order:
      1. wrap selected text columns in double quotes;
      2. wrap ``TX_SERV_SUPP`` in ``"'...'"``;
      3. render selected date columns as ``YYYY-MM-DD`` (unparseable values
         become missing);
      4. zero-pad selected code columns to a fixed width, dropping anything
         after the first ``.``;
      5. strip comma separators from selected numeric-text columns.

    Columns that are absent from ``df`` are skipped, and missing cells are
    left untouched by steps 1, 2, 4 and 5.

    Args:
        df: Enrollment data. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _rewrite_non_null(col, transform):
        """Replace the non-null cells of ``df[col]`` with ``transform(cells as str)``."""
        if col not in df.columns:
            return
        mask = df[col].notna()
        if not mask.any():
            return
        # Stringify from the column's original dtype so numbers render exactly
        # as they did before (e.g. 12.0 -> "12.0").
        new_values = transform(df.loc[mask, col].astype(str))
        # Writing strings into a numeric column relies on silent upcasting,
        # which pandas has deprecated; make the column object-typed first.
        df[col] = df[col].astype(object)
        df.loc[mask, col] = new_values

    logging.info("Applying Enrollment-specific transformations...")

    # --- 1. Put text in double quotes for a list of columns ---
    double_quote_cols = [
        'CUSTOMER_NAME', 'CUSTOMER_SERVICE_ADDRESS', 'CUSTOMER_SERVICE_CITY_STATE_ZIP',
        'TX_TAR_SHORT_DESC', 'TX_TAR_SCH_DESC'
    ]
    logging.info(f"Encapsulating text in double quotes for columns: {', '.join(double_quote_cols)}")
    for col in double_quote_cols:
        _rewrite_non_null(col, lambda s: '"' + s + '"')

    # --- 2. Put text in custom quotes for a specific column ---
    triple_quote_col = 'TX_SERV_SUPP'
    # '{{' / '}}' emit literal braces inside the f-string.
    logging.info(f"Applying custom encapsulation ('\"\\'{{value}}\\'\"') to column: {triple_quote_col}")
    _rewrite_non_null(triple_quote_col, lambda s: '"\'' + s + '\'"')

    # --- 3. Format dates to "YYYY-MM-DD" for a list of columns ---
    date_cols = ['DT_EFF', 'CUST_ENR_START_DATE', 'CUST_EDI_DROP_DATE', 'LAST_UPDATE']
    logging.info(f"Formatting dates to YYYY-MM-DD for columns: {', '.join(date_cols)}")
    for col in date_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaT instead of raising.
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d')

    # --- 4. Zero-padding for specific columns ---
    padding_cols = {'CITY_GATE': 4, 'KY_MTR_BILL_GRP': 2, 'CD_SERV_SUPP': 4}
    logging.info(f"Applying zero-padding for columns: {padding_cols}")
    for col, length in padding_cols.items():
        # Keep only the part before the first '.', so "12.0" pads as "12".
        _rewrite_non_null(
            col,
            lambda s, width=length: s.str.split('.').str[0].str.zfill(width),
        )

    # --- 5. Remove comma separators from specific columns ---
    comma_removal_cols = [
        'TOT_ANNUAL_USAGE', 'CUST_PEAK_DAY', 'CUST_BASE_LOAD', 'CUST_THERMAL_RESPONSE'
    ]
    logging.info(f"Removing commas from columns: {', '.join(comma_removal_cols)}")
    for col in comma_removal_cols:
        # regex=False: the comma is a literal, independent of the pandas default.
        _rewrite_non_null(col, lambda s: s.str.replace(',', '', regex=False))

    return df
| 80 | + |
def apply_usage_transforms(df: pd.DataFrame):
    """Apply the Usage-specific formatting rules to ``df``.

    Steps, in order:
      1. wrap selected text columns in double quotes;
      2. render selected date columns as ``YYYY-MM-DD`` (unparseable values
         become missing);
      3. zero-pad selected code columns to a fixed width, dropping anything
         after the first ``.``;
      4. strip comma separators from ``USAGE`` and ``QY_BTU_FACTOR``.

    Columns that are absent from ``df`` are skipped, and missing cells are
    left untouched by steps 1, 3 and 4.

    Args:
        df: Usage data. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _rewrite_non_null(col, transform):
        """Replace the non-null cells of ``df[col]`` with ``transform(cells as str)``."""
        if col not in df.columns:
            return
        mask = df[col].notna()
        if not mask.any():
            return
        # Stringify from the column's original dtype so numbers render exactly
        # as they did before (e.g. 3.0 -> "3.0").
        new_values = transform(df.loc[mask, col].astype(str))
        # Writing strings into a numeric column relies on silent upcasting,
        # which pandas has deprecated; make the column object-typed first.
        df[col] = df[col].astype(object)
        df.loc[mask, col] = new_values

    logging.info("Applying Usage-specific transformations...")

    # --- 1. Put text in double quotes for a list of columns ---
    double_quote_cols = [
        'CUST_NAME', 'CUST_SERV_ADDR', 'CUST_SERV_CITY_ST_ZIP', 'CUST_POOL_ID'
    ]
    logging.info(f"Encapsulating text in double quotes for columns: {', '.join(double_quote_cols)}")
    for col in double_quote_cols:
        _rewrite_non_null(col, lambda s: '"' + s + '"')

    # --- 2. Format dates to "YYYY-MM-DD" for a list of columns ---
    date_cols = ['DT_LST_BLLD', 'DT_RDG_FROM', 'DT_RDG_TO', 'DT_ENTERED']
    logging.info(f"Formatting dates to YYYY-MM-DD for columns: {', '.join(date_cols)}")
    for col in date_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaT instead of raising.
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d')

    # --- 3. Zero-padding for specific columns ---
    logging.info("Applying zero-padding for various columns...")
    # Target width per column; one table instead of a copy-pasted stanza each.
    padding_cols = {
        'KY_MTR_BILL_GRP': 2,
        'CITY_GATE': 4,
        'CD_BILL_PRCS_INSTR': 4,
        'CD_SERV_SUPP': 4,
    }
    for col, length in padding_cols.items():
        # Keep only the part before the first '.', so "3.0" pads as "3".
        _rewrite_non_null(
            col,
            lambda s, width=length: s.str.split('.').str[0].str.zfill(width),
        )

    # --- 4. Remove comma separators from specific columns ---
    comma_removal_cols = ['USAGE', 'QY_BTU_FACTOR']
    logging.info(f"Removing commas from columns: {', '.join(comma_removal_cols)}")
    for col in comma_removal_cols:
        # regex=False: the comma is a literal, independent of the pandas default.
        _rewrite_non_null(col, lambda s: s.str.replace(',', '', regex=False))

    return df
| 134 | + |
def transform_excel_data(input_file: str, output_file: str) -> bool:
    """Read an Excel workbook, transform it, and save the result as CSV.

    The transformation set is picked from the input file's base name
    (case-insensitive): names containing ``enrollment`` use
    ``apply_enrollment_transforms``, names containing ``usage`` use
    ``apply_usage_transforms``. Any other name is skipped.

    Args:
        input_file: Path of the Excel workbook to read.
        output_file: Path of the CSV file to write (no index column).

    Returns:
        True when the CSV was written. False when the file could not be read,
        its name matched neither keyword, a transformation failed, or the CSV
        could not be written. Every failure is logged rather than raised.
    """
    file_name_lower = os.path.basename(input_file).lower()

    # openpyxl cannot open legacy .xls workbooks, so let pandas choose the
    # engine for those; .xlsx keeps the explicit openpyxl engine.
    engine = None if file_name_lower.endswith('.xls') else 'openpyxl'

    try:
        df = pd.read_excel(input_file, engine=engine)
        logging.info(f"Successfully read file: {input_file}")
    except FileNotFoundError:
        logging.error(f"Error: The input file '{input_file}' was not found.")
        return False
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {e}")
        return False

    # Dispatch to the correct transformation function based on keywords
    try:
        if 'enrollment' in file_name_lower:
            df = apply_enrollment_transforms(df)
        elif 'usage' in file_name_lower:
            df = apply_usage_transforms(df)
        else:
            logging.warning("Filename does not contain 'Enrollment' or 'Usage'. Skipping transformations.")
            return False
    except KeyError as e:
        # A column the transformation logic indexed directly is missing.
        logging.error("Transformation Failed: A required column was not found in the Excel file.")
        logging.error(f"Missing column: {e}. Please ensure the input file schema is correct.")
        return False
    except Exception as e:
        # Any other unexpected error raised while transforming the data.
        logging.error(f"Transformation Failed unexpectedly during data processing: {e}")
        return False

    # A write failure (locked file, missing folder, permissions) must not abort
    # the caller's batch; report it like every other failure.
    try:
        df.to_csv(output_file, index=False)
    except OSError as e:
        logging.error(f"Failed to write intermediate CSV '{output_file}': {e}")
        return False

    logging.info(f"Excel transformation complete! Data saved to intermediate CSV: {output_file}")
    return True
| 174 | + |
def find_and_replace_quotes(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, collapsing ``\"\"\"`` into ``"``.

    Every run of three double quotes in the input is replaced by a single
    double quote; all other characters, including line endings, are written
    unchanged. Failures are logged, not raised.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write.

    Returns:
        True when the output file was written, False when the input file was
        missing or an error occurred.
    """
    try:
        if not os.path.exists(input_file):
            logging.error(f"Error: The intermediate input file '{input_file}' was not found for quote replacement.")
            return False

        logging.info(f"Starting quote cleanup on intermediate file: {input_file}")
        # newline='' disables newline translation, so "\r\n" sequences (line
        # terminators or embedded in quoted fields) survive the round trip.
        with open(input_file, 'r', encoding='utf-8', newline='') as f:
            content = f.read()

        # The core cleanup step: replaces three double quotes (""") with one double quote (")
        modified_content = content.replace('"""', '"')

        with open(output_file, 'w', encoding='utf-8', newline='') as f:
            f.write(modified_content)

        logging.info(f"Final find-and-replace process complete! Data saved to final CSV: {output_file}")
        return True
    except Exception as e:
        # Best-effort step: report the problem and let the caller carry on.
        logging.error(f"An error occurred during find and replace: {e}")
        return False
| 197 | + |
if __name__ == '__main__':
    # Logging has to be ready before any progress is reported.
    setup_logging()

    # --- Automatic Folder Scan Mode ---

    # Step 1: locate the "FileTransform" folder. Prefer one next to the script;
    # if that is absent, accept one in the script's parent directory.
    try:
        script_location = os.path.abspath(sys.argv[0])
        script_dir = os.path.dirname(script_location)
        target_dir = os.path.join(script_dir, 'FileTransform')

        if not os.path.exists(target_dir):
            candidate_in_parent = os.path.join(os.path.dirname(script_dir), 'FileTransform')
            if os.path.exists(candidate_in_parent):
                target_dir = candidate_in_parent
    except IndexError:
        # No script path available (sys.argv is empty): search from the
        # current working directory instead, then one level above it.
        working_dir = os.getcwd()
        target_dir = os.path.join(working_dir, 'FileTransform')
        if not os.path.exists(target_dir):
            target_dir = os.path.join(working_dir, os.pardir, 'FileTransform')

    separator = "=" * 50
    logging.info("\n" + separator)
    logging.info(f"Starting automatic scan for Excel files in: {target_dir}")
    logging.info("Files containing 'Enrollment' or 'Usage' will be processed.")
    logging.info(separator)

    if not os.path.exists(target_dir):
        logging.error(f"Target directory not found: {target_dir}")
        logging.error("Please ensure the 'FileTransform' folder is located either in the same directory as the script or one level up.")
        sys.exit(1)

    processed_count = 0

    for entry in os.listdir(target_dir):
        # Step 2: full path of the directory entry.
        entry_path = os.path.join(target_dir, entry)
        entry_lower = entry.lower()

        # Step 3: only regular Excel workbooks are of interest; names starting
        # with '~' are skipped as temporary files.
        has_excel_suffix = entry_lower.endswith(('.xlsx', '.xls'))
        if os.path.isdir(entry_path) or entry.startswith('~') or not has_excel_suffix:
            logging.debug(f"Skipping non-Excel file or directory: {entry}")
            continue

        logging.info(f"\n--- Processing File: {entry} ---")

        # Step 4: derive the intermediate and final CSV paths from the stem.
        stem, _extension = os.path.splitext(entry)
        intermediate_csv = os.path.join(target_dir, f"{stem}_intermediate.csv")
        final_csv = os.path.join(target_dir, f"{stem}_final.csv")

        # Step 5: report the detected file type (informational only).
        if 'enrollment' in entry_lower:
            file_type = "Enrollment"
        elif 'usage' in entry_lower:
            file_type = "Usage"
        else:
            file_type = "Unknown"
        logging.info(f"File Type Detected: {file_type}")

        # Step 6: transform, then clean up quotes; a failed transform ends the
        # work for this file.
        if not transform_excel_data(entry_path, intermediate_csv):
            continue

        find_and_replace_quotes(intermediate_csv, final_csv)
        processed_count += 1

        # The intermediate CSV is only a hand-off between the two steps.
        try:
            logging.info("Starting cleanup of intermediate file...")
            os.remove(intermediate_csv)
            logging.info(f"Intermediate file '{intermediate_csv}' removed successfully.")
        except OSError as e:
            logging.error(f"Error removing intermediate file: {e}")

    logging.info("\n" + separator)
    logging.info(f"Automatic folder scan finished. {processed_count} files processed.")
    logging.info(separator + "\n")
0 commit comments