-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAR_Files.py
More file actions
444 lines (436 loc) · 23.2 KB
/
AR_Files.py
File metadata and controls
444 lines (436 loc) · 23.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# Standard library
import os
import re
import sys
from collections import defaultdict  # NOTE(review): appears unused in this file — confirm before removing

# Third-party
import pandas as pd
import pdfplumber
# ---------------------------------------------------------------------------
# Resolve the input PDF path.
# Priority: 1) command-line argument (lets a GUI wrapper pass a selected
# file), 2) tkinter file-picker dialog, 3) console prompt.
# ---------------------------------------------------------------------------
pdf_path = None

if len(sys.argv) > 1:
    pdf_path = sys.argv[1]

try:
    # Only show a file dialog if no path was provided via command line.
    if not pdf_path:
        import tkinter as tk
        from tkinter import filedialog

        root = tk.Tk()
        root.withdraw()  # hide the empty root window; we only want the dialog
        pdf_path = filedialog.askopenfilename(
            title="Select AR PDF",
            filetypes=[["PDF files", "*.pdf"], ["All files", "*.*"]],
        )
        try:
            root.update()
        except Exception:
            pass
        finally:
            # Always tear down the hidden Tk root, even if update() failed.
            try:
                root.destroy()
            except Exception:
                pass
except Exception:
    # tkinter may be unavailable (e.g. headless server); fall back to input below.
    pass

if not pdf_path:
    try:
        pdf_path = input("Enter full path to the PDF file: ").strip().strip('"')
    except Exception:
        # No usable stdin either; downstream validation reports the empty path.
        pdf_path = ""
rows = []  # accumulated document rows; one dict per parsed report line
# Date token: ISO (YYYY-MM-DD) or US-style (M/D/YY or M/D/YYYY), "-" or "/" separators.
date_re = re.compile(r"(?:\d{4}[-\/]\d{2}[-\/]\d{2}|\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})")
# Money token: optional parentheses (accounting negatives), optional sign and "$",
# optional thousands commas, exactly two decimals.
money_re = re.compile(r"\(?-?\$?\d{1,3}(?:,\d{3})*\.\d{2}\)?")
# Document IDs: line-leading run (2+) of uppercase alphanumerics, dash or underscore.
docid_re = re.compile(r"^[A-Z0-9\-_]{2,}")
# Customer header patterns:
# 1) Both ID and Name on the same line (with optional label variants)
cust_header_both_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s{2,}(.+)$")
# Be a bit more permissive for spacing between ID and name
cust_header_both_loose_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s+(.+)$")
# 2) ID only on this line; name likely on the next line
cust_header_id_only_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s*$")
# ---------------------------------------------------------------------------
# Helpers (defined before the main try-block so an early parsing failure can
# never leave them undefined for the reporting code further down).
# ---------------------------------------------------------------------------

def parse_money(x):
    """Convert a report money string to float.

    Handles "$", thousands commas and accounting-style parentheses for
    negatives (e.g. "(1,234.56)" -> -1234.56).  Blank/None/unparseable
    values become ``pd.NA`` so pandas treats them as missing.
    """
    try:
        if x is None or x == "":
            return pd.NA
        return float(str(x).replace("(", "-").replace(")", "").replace("$", "").replace(",", ""))
    except Exception:
        return pd.NA


def _money_or_none(s):
    """Like parse_money but returns None on failure (used for zero checks)."""
    try:
        return float(str(s).replace("(", "-").replace(")", "").replace("$", "").replace(",", ""))
    except Exception:
        return None


def _positional_buckets(monies):
    """Map money substrings positionally to the 8 report columns.

    Pads with "" so the result is always
    (amount, discount, writeoff, 0-30, 31-45, 45-60, 60-90, 91+).
    """
    padded = list(monies) + [""] * 8
    return tuple(padded[:8])


def _bucket_column(doc_date, report_end_date):
    """Return the aging-bucket column name for doc_date as of report_end_date.

    Returns None when either date cannot be parsed.  Future-dated documents
    (negative age) land in "0-30", matching the original bucketing.
    """
    doc_dt = pd.to_datetime(doc_date, errors='coerce')
    rep_dt = pd.to_datetime(report_end_date, errors='coerce') if report_end_date is not None else pd.NaT
    if pd.isna(doc_dt) or pd.isna(rep_dt):
        return None
    days = int((rep_dt.normalize() - doc_dt.normalize()).days)
    if days <= 30:
        return "0-30"
    if days <= 45:
        return "31-45"
    if days <= 60:
        return "45-60"
    if days <= 90:
        return "60-90"
    return "91+"


def _detect_report_end_date(pdf, path):
    """Best-effort report 'as of' date.

    Tries, in order: a long-form date in the filename
    ("April 30, 2025 Aged AR Listing.pdf"), a "User Date: MM/DD/YYYY" on the
    first page, then today's date.  May return NaT if a candidate string
    fails to parse under errors='coerce' (quirk preserved; pd.isna handles it).
    """
    end_date = None
    try:
        fname = os.path.basename(path)
        m_fname = re.search(
            r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}",
            fname, flags=re.IGNORECASE)
        if m_fname:
            end_date = pd.to_datetime(m_fname.group(0), errors='coerce')
        if end_date is None and len(pdf.pages) > 0:
            first_text = pdf.pages[0].extract_text() or ""
            m_userdate = re.search(r"User\s*Date\s*:\s*(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})", first_text, flags=re.IGNORECASE)
            if m_userdate:
                end_date = pd.to_datetime(m_userdate.group(1), errors='coerce')
    except Exception:
        end_date = None
    if end_date is None:
        try:
            end_date = pd.Timestamp.today().normalize()
        except Exception:
            end_date = None
    return end_date


def _col_width(col, series, money):
    """Excel column width: widest rendered cell (or the header) plus padding."""
    def cell_len(v):
        if money:
            try:
                if pd.isna(v):
                    return 0
                # "$1,234.56" rendering, +1 char for a leading minus sign.
                return len(f"${abs(float(v)):,.2f}") + (1 if float(v) < 0 else 0)
            except Exception:
                return len(str(v))
        return 0 if pd.isna(v) else len(str(v))
    return max(len(str(col)), max((cell_len(v) for v in series), default=0)) + 2


def write_formatted_excel(df_, path_, money_cols, sheet_name="Sheet1"):
    """Write df_ to path_ with currency formats, sized/centered columns and a
    bold, top-bordered totals row (assumed to be the last row of df_).

    Prefers the xlsxwriter engine; falls back to openpyxl if anything goes
    wrong (e.g. xlsxwriter not installed).
    """
    try:
        with pd.ExcelWriter(path_, engine="xlsxwriter") as writer:
            df_.to_excel(writer, index=False, sheet_name=sheet_name)
            workbook = writer.book
            worksheet = writer.sheets[sheet_name]
            cur_fmt = workbook.add_format({"num_format": "$#,##0.00;[Red]-$#,##0.00"})
            center_fmt = workbook.add_format({"align": "center"})
            bold_top_fmt = workbook.add_format({"bold": True, "top": 2})  # bold + medium top border
            for idx, col in enumerate(df_.columns):
                if col in money_cols:
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], True), cur_fmt)
                elif col in ("Page", "Type", "Date"):
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], False), center_fmt)
                else:
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], False))
            # Header occupies sheet row 0, so the totals row index equals len(df_).
            worksheet.set_row(len(df_), None, bold_top_fmt)
        return
    except Exception:
        with pd.ExcelWriter(path_, engine="openpyxl") as writer:
            df_.to_excel(writer, index=False, sheet_name=sheet_name)
            ws = writer.sheets[sheet_name]
            from openpyxl.utils import get_column_letter
            from openpyxl.styles import Alignment, Border, Side, Font
            # 1-based rows incl. header: data spans rows 2..len+1; totals is last.
            totals_row_num = len(df_) + 1
            for j, col in enumerate(df_.columns, start=1):
                if col in money_cols:
                    num_fmt = '"$"#,##0.00;[Red]-"$"#,##0.00'
                    for i in range(2, len(df_) + 2):
                        ws.cell(row=i, column=j).number_format = num_fmt
                    ws.column_dimensions[get_column_letter(j)].width = _col_width(col, df_[col], True)
                else:
                    ws.column_dimensions[get_column_letter(j)].width = _col_width(col, df_[col], False)
                if col in ("Page", "Type", "Date"):
                    for i in range(2, len(df_) + 2):
                        ws.cell(row=i, column=j).alignment = Alignment(horizontal="center")
            # Make the totals row distinct: bold with a medium top border.
            top_side = Side(style="medium")
            for j in range(1, len(df_.columns) + 1):
                cell = ws.cell(row=totals_row_num, column=j)
                cell.font = Font(bold=True)
                # Preserve the money number format applied above for this column.
                if ws.cell(row=2, column=j).number_format:
                    cell.number_format = ws.cell(row=2, column=j).number_format
                b = cell.border or Border()
                cell.border = Border(top=top_side, left=b.left, right=b.right, bottom=b.bottom)


def try_save(df_, xlsx):
    """Write df_ to xlsx with the standard money-column formatting."""
    write_formatted_excel(df_, xlsx, ["Amount", "0-30", "31-45", "45-60", "60-90", "91+"])


# ---------------------------------------------------------------------------
# Main flow: parse the PDF into rows, shape the DataFrame, export to Excel.
# Any failure lands in the final handler and is reported via `result`.
# ---------------------------------------------------------------------------

# Header/footer noise patterns.  The first (used while awaiting a customer
# name) deliberately omits "Customer:" so labeled name lines can be captured.
_HDR_FOOTER_NAME_RE = r"(Receivables Management|DETAIL HISTORICAL AGED TRIAL BALANCE|Page:|System:|User ID:|Customer Type:|Totals:|====|---|Document:|User Date:|Ranges:|Exclude:)"
_HDR_FOOTER_LINE_RE = r"(Receivables Management|DETAIL HISTORICAL AGED TRIAL BALANCE|Page:|System:|User ID:|Customer Type:|Totals:|====|---|Customer:|Document:|User Date:|Ranges:|Exclude:)"

try:
    # Validate the selected/entered PDF path before processing.
    if not pdf_path or not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found or not specified: {pdf_path}")

    with pdfplumber.open(pdf_path) as pdf:
        current_customer_id = None
        current_customer_name = None
        awaiting_name = False  # True after an ID-only header; name expected on a later line
        report_end_date = _detect_report_end_date(pdf, pdf_path)

        for pagenum, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            for line in text.split("\n"):
                line_stripped = line.strip()
                if not line_stripped:
                    continue

                has_money = bool(money_re.search(line))
                has_date_in_line = bool(date_re.search(line))

                # Customer headers never carry dates/amounts; dashed/equals
                # separator lines are never headers.
                if not has_money and not has_date_in_line and not re.fullmatch(r"[-=\s]{3,}", line_stripped):
                    # First: ID and Name on the same line.
                    m_both = cust_header_both_re.match(line_stripped) or cust_header_both_loose_re.match(line_stripped)
                    if m_both:
                        current_customer_id = m_both.group(1).strip()
                        current_customer_name = m_both.group(2).strip()
                        awaiting_name = False
                        continue
                    # Second: ID only; the name should follow on a later line.
                    m_id_only = cust_header_id_only_re.match(line_stripped)
                    if m_id_only:
                        current_customer_id = m_id_only.group(1).strip()
                        current_customer_name = None
                        awaiting_name = True
                        continue

                if awaiting_name:
                    # Labeled form: "Customer Name: XYZ Co".
                    m_name_label = re.match(r"^Customer\s*(?:Name)?\s*:\s*(.+)$", line_stripped, flags=re.IGNORECASE)
                    if m_name_label:
                        current_customer_name = m_name_label.group(1).strip()
                        awaiting_name = False
                        continue
                    toks2 = line_stripped.split()
                    looks_like_doc_row = bool(toks2) and docid_re.match(toks2[0])
                    if looks_like_doc_row or has_money or has_date_in_line:
                        # Not a name: stop waiting and parse this line normally below.
                        awaiting_name = False
                    elif not re.search(_HDR_FOOTER_NAME_RE, line_stripped):
                        current_customer_name = line_stripped
                        awaiting_name = False
                        continue
                    # Header/footer lines fall through to the skip below and
                    # leave awaiting_name set for the next line.

                # Skip page headers/footers.
                if re.search(_HDR_FOOTER_LINE_RE, line):
                    continue

                tokens = line.split()
                if not tokens:
                    continue
                # Doc rows are parsed even before a customer header is seen;
                # missing IDs/names are backfilled after extraction.
                if not docid_re.match(tokens[0]):
                    continue

                # Locate the document date: per-token scan first, then a
                # whole-line search mapped back to its token.
                date_idx = None
                doc_date = None
                for i, tok in enumerate(tokens):
                    mdate = date_re.search(tok)
                    if mdate:
                        date_idx = i
                        doc_date = mdate.group(0)
                        break
                if date_idx is None:
                    mline = date_re.search(line)
                    if mline:
                        doc_date = mline.group(0)
                        for i, tok in enumerate(tokens):
                            if doc_date in tok:
                                date_idx = i
                                break
                if date_idx is None:
                    continue

                doc_number = tokens[0]
                doc_type = " ".join(tokens[1:date_idx]).strip()
                if doc_date is None:  # fallback safety
                    doc_date = tokens[date_idx].strip()

                # Keep original money substrings (preserves parentheses/sign);
                # numeric parsing happens later via parse_money.
                monies = money_re.findall(line)
                discount = ""
                writeoff = ""
                buckets = {"0-30": "", "31-45": "", "45-60": "", "60-90": "", "91+": ""}
                if len(monies) >= 2:
                    amount = monies[0]
                    nonzero_others = [v for v in monies[1:] if _money_or_none(v) not in (None, 0.0)]
                    if len(nonzero_others) == 1:
                        # Single meaningful value after Amount: many GP rows omit
                        # Discount/Writeoff, so place it in the aging bucket implied
                        # by document age; fall back to positional layout otherwise.
                        try:
                            col = _bucket_column(doc_date, report_end_date)
                        except Exception:
                            col = None
                        if col is not None:
                            buckets[col] = nonzero_others[0]
                        else:
                            (_, discount, writeoff, buckets["0-30"], buckets["31-45"],
                             buckets["45-60"], buckets["60-90"], buckets["91+"]) = _positional_buckets(monies)
                    else:
                        # Richer lines (real Discount/Writeoff shown): positional mapping.
                        (_, discount, writeoff, buckets["0-30"], buckets["31-45"],
                         buckets["45-60"], buckets["60-90"], buckets["91+"]) = _positional_buckets(monies)
                else:
                    # Single amount (or none): bucket purely by document age.
                    amount = monies[0] if len(monies) == 1 else ""
                    try:
                        col = _bucket_column(doc_date, report_end_date)
                    except Exception:
                        col = None
                    if col is not None:
                        buckets[col] = amount

                rows.append({
                    "Customer ID": current_customer_id,
                    "Customer Name": current_customer_name,
                    "Document Number": doc_number,
                    "Type": doc_type,
                    "Date": doc_date,
                    "Amount": amount,
                    "Discount": discount,
                    "Writeoff": writeoff,
                    "0-30": buckets["0-30"],
                    "31-45": buckets["31-45"],
                    "45-60": buckets["45-60"],
                    "60-90": buckets["60-90"],
                    "91+": buckets["91+"],
                    "Page": pagenum,
                    "RawLine": line_stripped[:200],
                })

    df = pd.DataFrame(rows)

    # Backfill missing customer names within each customer-ID group.
    try:
        if not df.empty and 'Customer ID' in df.columns and 'Customer Name' in df.columns:
            df['Customer Name'] = df.groupby('Customer ID')['Customer Name'].transform(lambda s: s.ffill().bfill())
    except Exception:
        pass

    # Parse money columns to numeric.
    for c in ["Amount", "Discount", "Writeoff", "0-30", "31-45", "45-60", "60-90", "91+"]:
        if c in df.columns:
            df[c] = df[c].apply(parse_money)

    # Remove unwanted columns before export.
    for drop_col in ["RawLine", "Discount", "Writeoff"]:
        if drop_col in df.columns:
            try:
                df = df.drop(columns=[drop_col])
            except Exception:
                pass

    # Totals row at the bottom for money columns; label it in the ID column.
    money_cols_keep = [c for c in ["Amount", "0-30", "31-45", "45-60", "60-90", "91+"] if c in df.columns]
    totals_row = {col: (df[col].sum(skipna=True) if col in money_cols_keep else "") for col in df.columns}
    if "Customer ID" in totals_row:
        totals_row["Customer ID"] = "Totals"
    elif "Document Number" in totals_row:
        totals_row["Document Number"] = "Totals"
    try:
        df = pd.concat([df, pd.DataFrame([totals_row])], ignore_index=True)
    except Exception:
        pass

    # Output next to the input PDF, with "_Parsed" appended to the base name.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    out_dir = os.path.dirname(pdf_path) or "."
    out_path = os.path.join(out_dir, f"{base_name}_Parsed.xlsx")

    # Save; if the target file is open/locked, retry with a numeric suffix.
    try:
        try_save(df, out_path)
        result = {"excel": out_path, "rows": len(df)}
    except PermissionError:
        base = os.path.splitext(out_path)[0]
        for i in range(1, 100):
            alt_xlsx = f"{base}_v{i}.xlsx"
            try:
                try_save(df, alt_xlsx)
                result = {"excel": alt_xlsx, "rows": len(df), "note": "Original file locked; saved with version suffix."}
                break
            except PermissionError:
                continue
        else:
            # All candidates locked too: re-raise into the outer handler.
            raise
    except OSError as e:
        # Some environments surface permission denials as generic OSError.
        if hasattr(e, 'winerror') or 'denied' in str(e).lower():
            base = os.path.splitext(out_path)[0]
            for i in range(1, 100):
                alt_xlsx = f"{base}_v{i}.xlsx"
                try:
                    try_save(df, alt_xlsx)
                    result = {"excel": alt_xlsx, "rows": len(df), "note": "Original file locked; saved with version suffix."}
                    break
                except Exception:
                    continue
            else:
                # Previously fell through silently, leaving `result` unset.
                raise
        else:
            raise
except Exception as e:
    result = {"error": str(e)}

# Print a concise summary so the user knows what happened.
try:
    if isinstance(result, dict):
        if "error" in result:
            print(f"Error: {result['error']}")
        else:
            print(f"Exported Excel: {result['excel']}")
            print(f"Rows exported: {result['rows']}")
    else:
        print(result)
except Exception:
    # Printing should never crash the script.
    pass

# py -m PyInstaller --noconsole --onefile --name "GP Export Extraction" AR_Files.py