Skip to content

Commit cd738b4

Browse files
authored
Initial Commit
1 parent d4c2d7c commit cd738b4

3 files changed

Lines changed: 320 additions & 0 deletions

File tree

file_transform_20250927_004932.log

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2025-09-27 00:49:32 - INFO - Logging initiated. All output directed to console and file: logs\file_transform_20250927_004932.log
2+
2025-09-27 00:49:32 - INFO - --------------------------------------------------
3+
2025-09-27 00:49:32 - INFO -
4+
==================================================
5+
2025-09-27 00:49:32 - INFO - Starting automatic scan for Excel files in: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform
6+
2025-09-27 00:49:32 - INFO - Files containing 'Enrollment' or 'Usage' will be processed.
7+
2025-09-27 00:49:32 - INFO - ==================================================
8+
2025-09-27 00:49:32 - INFO -
9+
--- Processing File: 2025.11 Enrollment File Part 1.5.xlsx ---
10+
2025-09-27 00:49:32 - INFO - File Type Detected: Enrollment
11+
2025-09-27 00:49:32 - INFO - Successfully read file: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\2025.11 Enrollment File Part 1.5.xlsx
12+
2025-09-27 00:49:32 - INFO - Applying Enrollment-specific transformations...
13+
2025-09-27 00:49:32 - INFO - Encapsulating text in double quotes for columns: CUSTOMER_NAME, CUSTOMER_SERVICE_ADDRESS, CUSTOMER_SERVICE_CITY_STATE_ZIP, TX_TAR_SHORT_DESC, TX_TAR_SCH_DESC
14+
2025-09-27 00:49:32 - INFO - Applying custom encapsulation ('"\'{value}\'"') to column: TX_SERV_SUPP
15+
2025-09-27 00:49:32 - INFO - Formatting dates to YYYY-MM-DD for columns: DT_EFF, CUST_ENR_START_DATE, CUST_EDI_DROP_DATE, LAST_UPDATE
16+
2025-09-27 00:49:32 - INFO - Applying zero-padding for columns: {'CITY_GATE': 4, 'KY_MTR_BILL_GRP': 2, 'CD_SERV_SUPP': 4}
17+
2025-09-27 00:49:32 - INFO - Removing commas from columns: TOT_ANNUAL_USAGE, CUST_PEAK_DAY, CUST_BASE_LOAD, CUST_THERMAL_RESPONSE
18+
2025-09-27 00:49:32 - INFO - Excel transformation complete! Data saved to intermediate CSV: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\2025.11 Enrollment File Part 1.5_intermediate.csv
19+
2025-09-27 00:49:32 - INFO - Starting quote cleanup on intermediate file: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\2025.11 Enrollment File Part 1.5_intermediate.csv
20+
2025-09-27 00:49:32 - INFO - Final find-and-replace process complete! Data saved to final CSV: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\2025.11 Enrollment File Part 1.5_final.csv
21+
2025-09-27 00:49:32 - INFO - Starting cleanup of intermediate file...
22+
2025-09-27 00:49:32 - INFO - Intermediate file 'c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\2025.11 Enrollment File Part 1.5_intermediate.csv' removed successfully.
23+
2025-09-27 00:49:32 - INFO -
24+
--- Processing File: z_usage_1.xlsx ---
25+
2025-09-27 00:49:32 - INFO - File Type Detected: Usage
26+
2025-09-27 00:49:32 - INFO - Successfully read file: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\z_usage_1.xlsx
27+
2025-09-27 00:49:32 - INFO - Applying Usage-specific transformations...
28+
2025-09-27 00:49:32 - INFO - Encapsulating text in double quotes for columns: CUST_NAME, CUST_SERV_ADDR, CUST_SERV_CITY_ST_ZIP, CUST_POOL_ID
29+
2025-09-27 00:49:32 - INFO - Formatting dates to YYYY-MM-DD for columns: DT_LST_BLLD, DT_RDG_FROM, DT_RDG_TO, DT_ENTERED
30+
2025-09-27 00:49:32 - INFO - Applying zero-padding for various columns...
31+
2025-09-27 00:49:32 - INFO - Removing commas from columns: USAGE, QY_BTU_FACTOR
32+
2025-09-27 00:49:32 - INFO - Excel transformation complete! Data saved to intermediate CSV: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\z_usage_1_intermediate.csv
33+
2025-09-27 00:49:32 - INFO - Starting quote cleanup on intermediate file: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\z_usage_1_intermediate.csv
34+
2025-09-27 00:49:32 - INFO - Final find-and-replace process complete! Data saved to final CSV: c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\z_usage_1_final.csv
35+
2025-09-27 00:49:32 - INFO - Starting cleanup of intermediate file...
36+
2025-09-27 00:49:32 - INFO - Intermediate file 'c:\Users\SaptarshiChakraborty\Documents\TestFileEdit\FileTransform\z_usage_1_intermediate.csv' removed successfully.
37+
2025-09-27 00:49:32 - INFO -
38+
==================================================
39+
2025-09-27 00:49:32 - INFO - Automatic folder scan finished. 2 files processed.
40+
2025-09-27 00:49:32 - INFO - ==================================================
41+

manual_transform.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
import pandas as pd
2+
import sys
3+
import os
4+
import time
5+
import logging
6+
from datetime import datetime
7+
8+
def setup_logging(log_dir="logs"):
    """Configure root logging to write to stdout and a time-stamped file.

    Args:
        log_dir: Directory in which the log file is created. The directory
            (including missing parents) is created when it does not exist.

    Returns:
        The path of the log file created for this run.
    """
    # exist_ok avoids the race between a separate exists() check and the
    # actual directory creation.
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"file_transform_{timestamp}.log")

    # Basic configuration: logs everything of INFO level and above.
    # NOTE: basicConfig does nothing when the root logger already has
    # handlers, so this is expected to be called once, early in the run.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            # 1. File handler: writes to the log file (truncating it).
            logging.FileHandler(log_file, mode='w', encoding='utf-8'),
            # 2. Console handler: prints to standard output.
            logging.StreamHandler(sys.stdout),
        ],
    )
    logging.info(f"Logging initiated. All output directed to console and file: {log_file}")
    logging.info("-" * 50)
    return log_file
30+
31+
def apply_enrollment_transforms(df: pd.DataFrame):
    """Apply the Enrollment-specific column transformations to ``df``.

    Columns that are absent from ``df`` are skipped. In the text-rewriting
    steps only non-null cells are converted to text; null cells are left
    untouched.

    Steps, in order:
      1. Wrap selected text columns in double quotes.
      2. Wrap ``TX_SERV_SUPP`` as ``"'value'"``.
      3. Reformat selected date columns as ``YYYY-MM-DD`` (values that
         cannot be parsed become null).
      4. Zero-pad selected code columns, first dropping anything after the
         first ``.`` (so ``12.0`` becomes ``0012`` for width 4).
      5. Strip comma separators from selected columns.

    Args:
        df: Frame to transform. It is modified in place.

    Returns:
        The same ``df`` object, after transformation.
    """
    logging.info("Applying Enrollment-specific transformations...")

    def rewrite_non_null(col, to_text):
        """Replace the non-null cells of ``col`` with ``to_text(cells as str)``."""
        if col not in df.columns:
            return
        mask = df[col].notna()
        # Stringify from the column's original dtype, then widen the column
        # to object before writing back: assigning text into a numeric or
        # datetime column through .loc is deprecated in recent pandas.
        new_values = to_text(df.loc[mask, col].astype(str))
        df[col] = df[col].astype(object)
        df.loc[mask, col] = new_values

    # --- 1. Put text in double quotes for a list of columns ---
    double_quote_cols = [
        'CUSTOMER_NAME', 'CUSTOMER_SERVICE_ADDRESS', 'CUSTOMER_SERVICE_CITY_STATE_ZIP',
        'TX_TAR_SHORT_DESC', 'TX_TAR_SCH_DESC'
    ]
    logging.info(f"Encapsulating text in double quotes for columns: {', '.join(double_quote_cols)}")
    for col in double_quote_cols:
        rewrite_non_null(col, lambda text: text.map(lambda x: f'"{x}"'))

    # --- 2. Put text in custom quotes for a specific column ---
    triple_quote_col = 'TX_SERV_SUPP'
    # Doubled braces render a literal "{value}" in the logged pattern.
    logging.info(f"Applying custom encapsulation ('\"\\'{{value}}\\'\"') to column: {triple_quote_col}")
    rewrite_non_null(triple_quote_col, lambda text: text.map(lambda x: f'"\'{x}\'"'))

    # --- 3. Format dates to "YYYY-MM-DD" for a list of columns ---
    date_cols = ['DT_EFF', 'CUST_ENR_START_DATE', 'CUST_EDI_DROP_DATE', 'LAST_UPDATE']
    logging.info(f"Formatting dates to YYYY-MM-DD for columns: {', '.join(date_cols)}")
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d')

    # --- 4. Zero-padding for specific columns ---
    padding_cols = {'CITY_GATE': 4, 'KY_MTR_BILL_GRP': 2, 'CD_SERV_SUPP': 4}
    logging.info(f"Applying zero-padding for columns: {padding_cols}")
    for col, length in padding_cols.items():
        # The lambda is invoked immediately, so it sees this iteration's length.
        rewrite_non_null(col, lambda text: text.str.split('.').str[0].str.zfill(length))

    # --- 5. Remove comma separators from specific columns ---
    comma_removal_cols = [
        'TOT_ANNUAL_USAGE', 'CUST_PEAK_DAY', 'CUST_BASE_LOAD', 'CUST_THERMAL_RESPONSE'
    ]
    logging.info(f"Removing commas from columns: {', '.join(comma_removal_cols)}")
    for col in comma_removal_cols:
        # regex=False: the comma is a literal, independent of pandas defaults.
        rewrite_non_null(col, lambda text: text.str.replace(',', '', regex=False))

    return df
80+
81+
def apply_usage_transforms(df: pd.DataFrame):
    """Apply the Usage-specific column transformations to ``df``.

    Columns that are absent from ``df`` are skipped. In the text-rewriting
    steps only non-null cells are converted to text; null cells are left
    untouched.

    Steps, in order:
      1. Wrap selected text columns in double quotes.
      2. Reformat selected date columns as ``YYYY-MM-DD`` (values that
         cannot be parsed become null).
      3. Zero-pad selected code columns, first dropping anything after the
         first ``.``.
      4. Strip comma separators from selected columns.

    Args:
        df: Frame to transform. It is modified in place.

    Returns:
        The same ``df`` object, after transformation.
    """
    logging.info("Applying Usage-specific transformations...")

    def rewrite_non_null(col, to_text):
        """Replace the non-null cells of ``col`` with ``to_text(cells as str)``."""
        if col not in df.columns:
            return
        mask = df[col].notna()
        # Stringify from the column's original dtype, then widen the column
        # to object before writing back: assigning text into a numeric or
        # datetime column through .loc is deprecated in recent pandas.
        new_values = to_text(df.loc[mask, col].astype(str))
        df[col] = df[col].astype(object)
        df.loc[mask, col] = new_values

    # --- 1. Put text in double quotes for a list of columns ---
    double_quote_cols = [
        'CUST_NAME', 'CUST_SERV_ADDR', 'CUST_SERV_CITY_ST_ZIP', 'CUST_POOL_ID'
    ]
    logging.info(f"Encapsulating text in double quotes for columns: {', '.join(double_quote_cols)}")
    for col in double_quote_cols:
        rewrite_non_null(col, lambda text: text.map(lambda x: f'"{x}"'))

    # --- 2. Format dates to "YYYY-MM-DD" for a list of columns ---
    date_cols = ['DT_LST_BLLD', 'DT_RDG_FROM', 'DT_RDG_TO', 'DT_ENTERED']
    logging.info(f"Formatting dates to YYYY-MM-DD for columns: {', '.join(date_cols)}")
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d')

    # --- 3. Zero-padding for specific columns ---
    logging.info("Applying zero-padding for various columns...")
    # Column -> required width, in the order the columns are processed.
    padding_cols = {
        'KY_MTR_BILL_GRP': 2,
        'CITY_GATE': 4,
        'CD_BILL_PRCS_INSTR': 4,
        'CD_SERV_SUPP': 4,
    }
    for col, length in padding_cols.items():
        # The lambda is invoked immediately, so it sees this iteration's length.
        rewrite_non_null(col, lambda text: text.str.split('.').str[0].str.zfill(length))

    # --- 4. Remove comma separators from specific columns ---
    comma_removal_cols = ['USAGE', 'QY_BTU_FACTOR']
    logging.info(f"Removing commas from columns: {', '.join(comma_removal_cols)}")
    for col in comma_removal_cols:
        # regex=False: the comma is a literal, independent of pandas defaults.
        rewrite_non_null(col, lambda text: text.str.replace(',', '', regex=False))

    return df
134+
135+
def transform_excel_data(input_file: str, output_file: str):
    """Read an Excel file, apply the matching transforms, and write a CSV.

    The transform set is chosen from the input file's base name
    (case-insensitive): names containing ``enrollment`` get the Enrollment
    transforms, names containing ``usage`` get the Usage transforms. Any
    other name is skipped without reading the workbook.

    Args:
        input_file: Path of the Excel workbook to read.
        output_file: Path of the CSV file to write (without the index).

    Returns:
        True when the CSV was written; False when the file was skipped or
        any step failed (the reason is logged).
    """
    file_name_lower = os.path.basename(input_file).lower()

    # Decide the file type up front so an unsupported file is rejected
    # before the (potentially large) workbook is parsed.
    if 'enrollment' in file_name_lower:
        file_kind = 'enrollment'
    elif 'usage' in file_name_lower:
        file_kind = 'usage'
    else:
        logging.warning("Filename does not contain 'Enrollment' or 'Usage'. Skipping transformations.")
        return False

    # openpyxl cannot read legacy .xls workbooks; for anything other than
    # the newer formats let pandas choose the engine from the file itself.
    engine = 'openpyxl' if file_name_lower.endswith(('.xlsx', '.xlsm')) else None

    try:
        df = pd.read_excel(input_file, engine=engine)
        logging.info(f"Successfully read file: {input_file}")
    except FileNotFoundError:
        logging.error(f"Error: The input file '{input_file}' was not found.")
        return False
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {e}")
        return False

    # Dispatch to the correct transformation function.
    try:
        if file_kind == 'enrollment':
            df = apply_enrollment_transforms(df)
        else:
            df = apply_usage_transforms(df)
    except KeyError as e:
        # A column expected by the transformation logic is missing.
        logging.error("Transformation Failed: A required column was not found in the Excel file.")
        logging.error(f"Missing column: {e}. Please ensure the input file schema is correct.")
        return False
    except Exception as e:
        # Any other unexpected error during the transformation logic.
        logging.error(f"Transformation Failed unexpectedly during data processing: {e}")
        return False

    try:
        df.to_csv(output_file, index=False)
    except OSError as e:
        # e.g. the destination is locked, read-only, or its folder is gone.
        logging.error(f"An error occurred while writing the intermediate CSV '{output_file}': {e}")
        return False

    logging.info(f"Excel transformation complete! Data saved to intermediate CSV: {output_file}")
    return True
174+
175+
def find_and_replace_quotes(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` collapsing ``\"\"\"`` into ``"``.

    Every run of three double quotes in the input is replaced with a single
    double quote; all other content, including line endings, is written
    unchanged. Both files are handled as UTF-8 text.

    Args:
        input_file: Path of the intermediate file to read.
        output_file: Path of the cleaned file to write.

    Returns:
        True when the output file was written; False when the input file
        is missing or an error occurred (the reason is logged).
    """
    try:
        if not os.path.exists(input_file):
            logging.error(f"Error: The intermediate input file '{input_file}' was not found for quote replacement.")
            return False

        logging.info(f"Starting quote cleanup on intermediate file: {input_file}")
        # newline='' disables newline translation so line endings (including
        # those embedded inside quoted cells) survive the round trip as-is.
        with open(input_file, 'r', encoding='utf-8', newline='') as f:
            content = f.read()

        # The core cleanup step: replaces three double quotes (""") with one double quote (")
        modified_content = content.replace('"""', '"')

        with open(output_file, 'w', encoding='utf-8', newline='') as f:
            f.write(modified_content)

        logging.info(f"Final find-and-replace process complete! Data saved to final CSV: {output_file}")
        return True
    except Exception as e:
        # Best-effort step: report the failure instead of aborting the run.
        logging.error(f"An error occurred during find and replace: {e}")
        return False
197+
198+
def _resolve_target_dir(folder_name='FileTransform'):
    """Return the path of the folder to scan for Excel files.

    The folder is looked up next to the running script first and, when it
    is not there, one level above the script's directory. If neither
    exists, the path next to the script is returned so the caller can
    report it. When the script path cannot be determined, the current
    working directory (then its parent) is used instead.
    """
    try:
        # Absolute path of the directory containing the running script.
        script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    except IndexError:
        # Fallback for non-standard execution where sys.argv is empty.
        current_dir = os.getcwd()
        target_dir = os.path.join(current_dir, folder_name)
        if not os.path.exists(target_dir):
            target_dir = os.path.join(current_dir, os.pardir, folder_name)
        return target_dir

    # Preferred location: a child of the script's directory.
    target_dir = os.path.join(script_dir, folder_name)
    if not os.path.exists(target_dir):
        # Fallback: a sibling of the script's directory (one level up).
        parent_candidate = os.path.join(os.path.dirname(script_dir), folder_name)
        if os.path.exists(parent_candidate):
            target_dir = parent_candidate
    return target_dir


def main():
    """Scan the 'FileTransform' folder and convert every Excel file in it.

    For each ``.xlsx``/``.xls`` file (directories and names starting with
    ``~`` are skipped) the workbook is transformed into an intermediate
    CSV, the quote cleanup produces the final CSV, and the intermediate
    file is removed. Exits with status 1 when the target folder is missing.
    """
    # Setup logging first
    setup_logging()

    # --- Automatic Folder Scan Mode ---
    target_dir = _resolve_target_dir()

    logging.info("\n" + "="*50)
    logging.info(f"Starting automatic scan for Excel files in: {target_dir}")
    logging.info("Files containing 'Enrollment' or 'Usage' will be processed.")
    logging.info("="*50)

    # isdir (not exists): a plain file with that name cannot be scanned.
    if not os.path.isdir(target_dir):
        logging.error(f"Target directory not found: {target_dir}")
        logging.error("Please ensure the 'FileTransform' folder is located either in the same directory as the script or one level up.")
        sys.exit(1)

    processed_count = 0

    # os.listdir returns entries in arbitrary order; sort so that the
    # processing order (and therefore the log) is reproducible.
    for filename in sorted(os.listdir(target_dir)):
        input_path = os.path.join(target_dir, filename)

        # Skip directories, temporary files, and non-Excel files
        if os.path.isdir(input_path) or filename.startswith('~') or not filename.lower().endswith(('.xlsx', '.xls')):
            logging.debug(f"Skipping non-Excel file or directory: {filename}")
            continue

        logging.info(f"\n--- Processing File: {filename} ---")

        # Output files are written next to the input file.
        base_name = os.path.splitext(filename)[0]
        intermediate_csv_filename = os.path.join(target_dir, f"{base_name}_intermediate.csv")
        final_output_filename = os.path.join(target_dir, f"{base_name}_final.csv")

        # Determine file type for logging
        file_name_lower = filename.lower()
        if 'enrollment' in file_name_lower:
            file_type = "Enrollment"
        elif 'usage' in file_name_lower:
            file_type = "Usage"
        else:
            file_type = "Unknown"
        logging.info(f"File Type Detected: {file_type}")

        # Execute the transformation and cleanup pipeline
        if not transform_excel_data(input_path, intermediate_csv_filename):
            continue

        find_and_replace_quotes(intermediate_csv_filename, final_output_filename)
        processed_count += 1

        # Cleanup: failing to delete the intermediate file is logged but
        # does not stop the scan.
        try:
            logging.info("Starting cleanup of intermediate file...")
            os.remove(intermediate_csv_filename)
            logging.info(f"Intermediate file '{intermediate_csv_filename}' removed successfully.")
        except OSError as e:
            logging.error(f"Error removing intermediate file: {e}")

    logging.info("\n" + "="*50)
    logging.info(f"Automatic folder scan finished. {processed_count} files processed.")
    logging.info("="*50 + "\n")


if __name__ == '__main__':
    main()

requirements.txt

278 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)