-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAR_Files.py
More file actions
444 lines (436 loc) · 23.2 KB
/
AR_Files.py
File metadata and controls
444 lines (436 loc) · 23.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# Standard library
import os
import re
import sys
from collections import defaultdict  # NOTE(review): appears unused in this file — confirm before removing

# Third-party
import pandas as pd
import pdfplumber
# ---------------------------------------------------------------------------
# Resolve the input PDF path.
# Priority: 1) command-line argument (lets a GUI wrapper pass a selected
# file), 2) tkinter file-picker dialog, 3) console prompt.
# ---------------------------------------------------------------------------
pdf_path = None

if len(sys.argv) > 1:
    pdf_path = sys.argv[1]

try:
    # Only show a file dialog if no path was provided via command line.
    if not pdf_path:
        import tkinter as tk
        from tkinter import filedialog

        root = tk.Tk()
        root.withdraw()  # hide the empty root window; we only want the dialog
        pdf_path = filedialog.askopenfilename(
            title="Select AR PDF",
            filetypes=[["PDF files", "*.pdf"], ["All files", "*.*"]],
        )
        try:
            root.update()
        except Exception:
            pass
        finally:
            # Always tear down the hidden Tk root, even if update() failed.
            try:
                root.destroy()
            except Exception:
                pass
except Exception:
    # tkinter may be unavailable (e.g. headless server); fall back to input below.
    pass

if not pdf_path:
    try:
        pdf_path = input("Enter full path to the PDF file: ").strip().strip('"')
    except Exception:
        # No usable stdin either; downstream validation reports the empty path.
        pdf_path = ""
rows = []  # accumulated document rows; one dict per parsed report line
# Date token: ISO (YYYY-MM-DD) or US-style (M/D/YY or M/D/YYYY), "-" or "/" separators.
date_re = re.compile(r"(?:\d{4}[-\/]\d{2}[-\/]\d{2}|\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})")
# Money token: optional parentheses (accounting negatives), optional sign and "$",
# optional thousands commas, exactly two decimals.
money_re = re.compile(r"\(?-?\$?\d{1,3}(?:,\d{3})*\.\d{2}\)?")
# Document IDs: line-leading run (2+) of uppercase alphanumerics, dash or underscore.
docid_re = re.compile(r"^[A-Z0-9\-_]{2,}")
# Customer header patterns:
# 1) Both ID and Name on the same line (with optional label variants)
cust_header_both_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s{2,}(.+)$")
# Be a bit more permissive for spacing between ID and name
cust_header_both_loose_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s+(.+)$")
# 2) ID only on this line; name likely on the next line
cust_header_id_only_re = re.compile(r"^(?:Customer\s*(?:Number|No\.?|ID)?\s*:\s*)?([A-Z0-9][A-Z0-9\-_]{1,})\s*$")
# ---------------------------------------------------------------------------
# Helpers (defined before the main try-block so an early parsing failure can
# never leave them undefined for the reporting code further down).
# ---------------------------------------------------------------------------

def parse_money(x):
    """Convert a report money string to float.

    Handles "$", thousands commas and accounting-style parentheses for
    negatives (e.g. "(1,234.56)" -> -1234.56).  Blank/None/unparseable
    values become ``pd.NA`` so pandas treats them as missing.
    """
    try:
        if x is None or x == "":
            return pd.NA
        return float(str(x).replace("(", "-").replace(")", "").replace("$", "").replace(",", ""))
    except Exception:
        return pd.NA


def _money_or_none(s):
    """Like parse_money but returns None on failure (used for zero checks)."""
    try:
        return float(str(s).replace("(", "-").replace(")", "").replace("$", "").replace(",", ""))
    except Exception:
        return None


def _positional_buckets(monies):
    """Map money substrings positionally to the 8 report columns.

    Pads with "" so the result is always
    (amount, discount, writeoff, 0-30, 31-45, 45-60, 60-90, 91+).
    """
    padded = list(monies) + [""] * 8
    return tuple(padded[:8])


def _bucket_column(doc_date, report_end_date):
    """Return the aging-bucket column name for doc_date as of report_end_date.

    Returns None when either date cannot be parsed.  Future-dated documents
    (negative age) land in "0-30", matching the original bucketing.
    """
    doc_dt = pd.to_datetime(doc_date, errors='coerce')
    rep_dt = pd.to_datetime(report_end_date, errors='coerce') if report_end_date is not None else pd.NaT
    if pd.isna(doc_dt) or pd.isna(rep_dt):
        return None
    days = int((rep_dt.normalize() - doc_dt.normalize()).days)
    if days <= 30:
        return "0-30"
    if days <= 45:
        return "31-45"
    if days <= 60:
        return "45-60"
    if days <= 90:
        return "60-90"
    return "91+"


def _detect_report_end_date(pdf, path):
    """Best-effort report 'as of' date.

    Tries, in order: a long-form date in the filename
    ("April 30, 2025 Aged AR Listing.pdf"), a "User Date: MM/DD/YYYY" on the
    first page, then today's date.  May return NaT if a candidate string
    fails to parse under errors='coerce' (quirk preserved; pd.isna handles it).
    """
    end_date = None
    try:
        fname = os.path.basename(path)
        m_fname = re.search(
            r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}",
            fname, flags=re.IGNORECASE)
        if m_fname:
            end_date = pd.to_datetime(m_fname.group(0), errors='coerce')
        if end_date is None and len(pdf.pages) > 0:
            first_text = pdf.pages[0].extract_text() or ""
            m_userdate = re.search(r"User\s*Date\s*:\s*(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})", first_text, flags=re.IGNORECASE)
            if m_userdate:
                end_date = pd.to_datetime(m_userdate.group(1), errors='coerce')
    except Exception:
        end_date = None
    if end_date is None:
        try:
            end_date = pd.Timestamp.today().normalize()
        except Exception:
            end_date = None
    return end_date


def _col_width(col, series, money):
    """Excel column width: widest rendered cell (or the header) plus padding."""
    def cell_len(v):
        if money:
            try:
                if pd.isna(v):
                    return 0
                # "$1,234.56" rendering, +1 char for a leading minus sign.
                return len(f"${abs(float(v)):,.2f}") + (1 if float(v) < 0 else 0)
            except Exception:
                return len(str(v))
        return 0 if pd.isna(v) else len(str(v))
    return max(len(str(col)), max((cell_len(v) for v in series), default=0)) + 2


def write_formatted_excel(df_, path_, money_cols, sheet_name="Sheet1"):
    """Write df_ to path_ with currency formats, sized/centered columns and a
    bold, top-bordered totals row (assumed to be the last row of df_).

    Prefers the xlsxwriter engine; falls back to openpyxl if anything goes
    wrong (e.g. xlsxwriter not installed).
    """
    try:
        with pd.ExcelWriter(path_, engine="xlsxwriter") as writer:
            df_.to_excel(writer, index=False, sheet_name=sheet_name)
            workbook = writer.book
            worksheet = writer.sheets[sheet_name]
            cur_fmt = workbook.add_format({"num_format": "$#,##0.00;[Red]-$#,##0.00"})
            center_fmt = workbook.add_format({"align": "center"})
            bold_top_fmt = workbook.add_format({"bold": True, "top": 2})  # bold + medium top border
            for idx, col in enumerate(df_.columns):
                if col in money_cols:
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], True), cur_fmt)
                elif col in ("Page", "Type", "Date"):
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], False), center_fmt)
                else:
                    worksheet.set_column(idx, idx, _col_width(col, df_[col], False))
            # Header occupies sheet row 0, so the totals row index equals len(df_).
            worksheet.set_row(len(df_), None, bold_top_fmt)
        return
    except Exception:
        with pd.ExcelWriter(path_, engine="openpyxl") as writer:
            df_.to_excel(writer, index=False, sheet_name=sheet_name)
            ws = writer.sheets[sheet_name]
            from openpyxl.utils import get_column_letter
            from openpyxl.styles import Alignment, Border, Side, Font
            # 1-based rows incl. header: data spans rows 2..len+1; totals is last.
            totals_row_num = len(df_) + 1
            for j, col in enumerate(df_.columns, start=1):
                if col in money_cols:
                    num_fmt = '"$"#,##0.00;[Red]-"$"#,##0.00'
                    for i in range(2, len(df_) + 2):
                        ws.cell(row=i, column=j).number_format = num_fmt
                    ws.column_dimensions[get_column_letter(j)].width = _col_width(col, df_[col], True)
                else:
                    ws.column_dimensions[get_column_letter(j)].width = _col_width(col, df_[col], False)
                if col in ("Page", "Type", "Date"):
                    for i in range(2, len(df_) + 2):
                        ws.cell(row=i, column=j).alignment = Alignment(horizontal="center")
            # Make the totals row distinct: bold with a medium top border.
            top_side = Side(style="medium")
            for j in range(1, len(df_.columns) + 1):
                cell = ws.cell(row=totals_row_num, column=j)
                cell.font = Font(bold=True)
                # Preserve the money number format applied above for this column.
                if ws.cell(row=2, column=j).number_format:
                    cell.number_format = ws.cell(row=2, column=j).number_format
                b = cell.border or Border()
                cell.border = Border(top=top_side, left=b.left, right=b.right, bottom=b.bottom)


def try_save(df_, xlsx):
    """Write df_ to xlsx with the standard money-column formatting."""
    write_formatted_excel(df_, xlsx, ["Amount", "0-30", "31-45", "45-60", "60-90", "91+"])


# ---------------------------------------------------------------------------
# Main flow: parse the PDF into rows, shape the DataFrame, export to Excel.
# Any failure lands in the final handler and is reported via `result`.
# ---------------------------------------------------------------------------

# Header/footer noise patterns.  The first (used while awaiting a customer
# name) deliberately omits "Customer:" so labeled name lines can be captured.
_HDR_FOOTER_NAME_RE = r"(Receivables Management|DETAIL HISTORICAL AGED TRIAL BALANCE|Page:|System:|User ID:|Customer Type:|Totals:|====|---|Document:|User Date:|Ranges:|Exclude:)"
_HDR_FOOTER_LINE_RE = r"(Receivables Management|DETAIL HISTORICAL AGED TRIAL BALANCE|Page:|System:|User ID:|Customer Type:|Totals:|====|---|Customer:|Document:|User Date:|Ranges:|Exclude:)"

try:
    # Validate the selected/entered PDF path before processing.
    if not pdf_path or not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found or not specified: {pdf_path}")

    with pdfplumber.open(pdf_path) as pdf:
        current_customer_id = None
        current_customer_name = None
        awaiting_name = False  # True after an ID-only header; name expected on a later line
        report_end_date = _detect_report_end_date(pdf, pdf_path)

        for pagenum, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            for line in text.split("\n"):
                line_stripped = line.strip()
                if not line_stripped:
                    continue

                has_money = bool(money_re.search(line))
                has_date_in_line = bool(date_re.search(line))

                # Customer headers never carry dates/amounts; dashed/equals
                # separator lines are never headers.
                if not has_money and not has_date_in_line and not re.fullmatch(r"[-=\s]{3,}", line_stripped):
                    # First: ID and Name on the same line.
                    m_both = cust_header_both_re.match(line_stripped) or cust_header_both_loose_re.match(line_stripped)
                    if m_both:
                        current_customer_id = m_both.group(1).strip()
                        current_customer_name = m_both.group(2).strip()
                        awaiting_name = False
                        continue
                    # Second: ID only; the name should follow on a later line.
                    m_id_only = cust_header_id_only_re.match(line_stripped)
                    if m_id_only:
                        current_customer_id = m_id_only.group(1).strip()
                        current_customer_name = None
                        awaiting_name = True
                        continue

                if awaiting_name:
                    # Labeled form: "Customer Name: XYZ Co".
                    m_name_label = re.match(r"^Customer\s*(?:Name)?\s*:\s*(.+)$", line_stripped, flags=re.IGNORECASE)
                    if m_name_label:
                        current_customer_name = m_name_label.group(1).strip()
                        awaiting_name = False
                        continue
                    toks2 = line_stripped.split()
                    looks_like_doc_row = bool(toks2) and docid_re.match(toks2[0])
                    if looks_like_doc_row or has_money or has_date_in_line:
                        # Not a name: stop waiting and parse this line normally below.
                        awaiting_name = False
                    elif not re.search(_HDR_FOOTER_NAME_RE, line_stripped):
                        current_customer_name = line_stripped
                        awaiting_name = False
                        continue
                    # Header/footer lines fall through to the skip below and
                    # leave awaiting_name set for the next line.

                # Skip page headers/footers.
                if re.search(_HDR_FOOTER_LINE_RE, line):
                    continue

                tokens = line.split()
                if not tokens:
                    continue
                # Doc rows are parsed even before a customer header is seen;
                # missing IDs/names are backfilled after extraction.
                if not docid_re.match(tokens[0]):
                    continue

                # Locate the document date: per-token scan first, then a
                # whole-line search mapped back to its token.
                date_idx = None
                doc_date = None
                for i, tok in enumerate(tokens):
                    mdate = date_re.search(tok)
                    if mdate:
                        date_idx = i
                        doc_date = mdate.group(0)
                        break
                if date_idx is None:
                    mline = date_re.search(line)
                    if mline:
                        doc_date = mline.group(0)
                        for i, tok in enumerate(tokens):
                            if doc_date in tok:
                                date_idx = i
                                break
                if date_idx is None:
                    continue

                doc_number = tokens[0]
                doc_type = " ".join(tokens[1:date_idx]).strip()
                if doc_date is None:  # fallback safety
                    doc_date = tokens[date_idx].strip()

                # Keep original money substrings (preserves parentheses/sign);
                # numeric parsing happens later via parse_money.
                monies = money_re.findall(line)
                discount = ""
                writeoff = ""
                buckets = {"0-30": "", "31-45": "", "45-60": "", "60-90": "", "91+": ""}
                if len(monies) >= 2:
                    amount = monies[0]
                    nonzero_others = [v for v in monies[1:] if _money_or_none(v) not in (None, 0.0)]
                    if len(nonzero_others) == 1:
                        # Single meaningful value after Amount: many GP rows omit
                        # Discount/Writeoff, so place it in the aging bucket implied
                        # by document age; fall back to positional layout otherwise.
                        try:
                            col = _bucket_column(doc_date, report_end_date)
                        except Exception:
                            col = None
                        if col is not None:
                            buckets[col] = nonzero_others[0]
                        else:
                            (_, discount, writeoff, buckets["0-30"], buckets["31-45"],
                             buckets["45-60"], buckets["60-90"], buckets["91+"]) = _positional_buckets(monies)
                    else:
                        # Richer lines (real Discount/Writeoff shown): positional mapping.
                        (_, discount, writeoff, buckets["0-30"], buckets["31-45"],
                         buckets["45-60"], buckets["60-90"], buckets["91+"]) = _positional_buckets(monies)
                else:
                    # Single amount (or none): bucket purely by document age.
                    amount = monies[0] if len(monies) == 1 else ""
                    try:
                        col = _bucket_column(doc_date, report_end_date)
                    except Exception:
                        col = None
                    if col is not None:
                        buckets[col] = amount

                rows.append({
                    "Customer ID": current_customer_id,
                    "Customer Name": current_customer_name,
                    "Document Number": doc_number,
                    "Type": doc_type,
                    "Date": doc_date,
                    "Amount": amount,
                    "Discount": discount,
                    "Writeoff": writeoff,
                    "0-30": buckets["0-30"],
                    "31-45": buckets["31-45"],
                    "45-60": buckets["45-60"],
                    "60-90": buckets["60-90"],
                    "91+": buckets["91+"],
                    "Page": pagenum,
                    "RawLine": line_stripped[:200],
                })

    df = pd.DataFrame(rows)

    # Backfill missing customer names within each customer-ID group.
    try:
        if not df.empty and 'Customer ID' in df.columns and 'Customer Name' in df.columns:
            df['Customer Name'] = df.groupby('Customer ID')['Customer Name'].transform(lambda s: s.ffill().bfill())
    except Exception:
        pass

    # Parse money columns to numeric.
    for c in ["Amount", "Discount", "Writeoff", "0-30", "31-45", "45-60", "60-90", "91+"]:
        if c in df.columns:
            df[c] = df[c].apply(parse_money)

    # Remove unwanted columns before export.
    for drop_col in ["RawLine", "Discount", "Writeoff"]:
        if drop_col in df.columns:
            try:
                df = df.drop(columns=[drop_col])
            except Exception:
                pass

    # Totals row at the bottom for money columns; label it in the ID column.
    money_cols_keep = [c for c in ["Amount", "0-30", "31-45", "45-60", "60-90", "91+"] if c in df.columns]
    totals_row = {col: (df[col].sum(skipna=True) if col in money_cols_keep else "") for col in df.columns}
    if "Customer ID" in totals_row:
        totals_row["Customer ID"] = "Totals"
    elif "Document Number" in totals_row:
        totals_row["Document Number"] = "Totals"
    try:
        df = pd.concat([df, pd.DataFrame([totals_row])], ignore_index=True)
    except Exception:
        pass

    # Output next to the input PDF, with "_Parsed" appended to the base name.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    out_dir = os.path.dirname(pdf_path) or "."
    out_path = os.path.join(out_dir, f"{base_name}_Parsed.xlsx")

    # Save; if the target file is open/locked, retry with a numeric suffix.
    try:
        try_save(df, out_path)
        result = {"excel": out_path, "rows": len(df)}
    except PermissionError:
        base = os.path.splitext(out_path)[0]
        for i in range(1, 100):
            alt_xlsx = f"{base}_v{i}.xlsx"
            try:
                try_save(df, alt_xlsx)
                result = {"excel": alt_xlsx, "rows": len(df), "note": "Original file locked; saved with version suffix."}
                break
            except PermissionError:
                continue
        else:
            # All candidates locked too: re-raise into the outer handler.
            raise
    except OSError as e:
        # Some environments surface permission denials as generic OSError.
        if hasattr(e, 'winerror') or 'denied' in str(e).lower():
            base = os.path.splitext(out_path)[0]
            for i in range(1, 100):
                alt_xlsx = f"{base}_v{i}.xlsx"
                try:
                    try_save(df, alt_xlsx)
                    result = {"excel": alt_xlsx, "rows": len(df), "note": "Original file locked; saved with version suffix."}
                    break
                except Exception:
                    continue
            else:
                # Previously fell through silently, leaving `result` unset.
                raise
        else:
            raise
except Exception as e:
    result = {"error": str(e)}

# Print a concise summary so the user knows what happened.
try:
    if isinstance(result, dict):
        if "error" in result:
            print(f"Error: {result['error']}")
        else:
            print(f"Exported Excel: {result['excel']}")
            print(f"Rows exported: {result['rows']}")
    else:
        print(result)
except Exception:
    # Printing should never crash the script.
    pass

# py -m PyInstaller --noconsole --onefile --name "GP Export Extraction" AR_Files.py