PDF imports gehärtet

Pitastic · Pitastic · commit 74767786c142 · 2025-10-29T22:29:40.000+01:00
diff --git a/app/static/js/index.js b/app/static/js/index.js
@@ -172,9 +172,9 @@ function uploadFile() {
             alert('File upload failed: ' + '(' + error + ')' + responseText);
 
         } else {
-            alert('File uploaded successfully!' + responseText);
-            window.location.href = '/' + iban;
-
+            if (confirm('File uploaded successfully!' + responseText + '\nKonto aufrufen?')) {
+                window.location.href = '/' + iban;
+            }
         }
     }, true);
 }
diff --git a/reader/Comdirect.py b/reader/Comdirect.py
@@ -47,17 +47,18 @@ def from_csv(self, filepath):
                     # Skippe offene Buchungen
                     continue
 
+                betrag = float(row['Umsatz in EUR'].replace(',', '.'))    
                 date_tx = datetime.datetime.strptime(
                             date_tx, date_format
                         ).replace(tzinfo=datetime.timezone.utc).timestamp()
                 valuta = datetime.datetime.strptime(
                             row['Wertstellung (Valuta)'], date_format
                         ).replace(tzinfo=datetime.timezone.utc).timestamp()
-                betrag = float(row['Umsatz in EUR'].replace(',', '.'))
+
                 text_tx = row['Buchungstext']
                 match = rx.match(text_tx)
 
-                result.append({
+                line = {
                     'date_tx': date_tx,
                     'valuta': valuta,
                     'art': row['Vorgang'],
@@ -68,7 +69,12 @@ def from_csv(self, filepath):
                     'parsed': {},
                     'category': None,
                     'tags': None
-                })
+                }
+
+                if not line['betrag']:
+                    continue  # Skip Null-Buchungen
+
+                result.append(line)
 
         return result
 
@@ -86,7 +92,6 @@ def from_pdf(self, filepath):
             filepath,
             pages="2-end",
             flavor="stream",
-            #strip_text="\n",
             row_tol=10,
             columns=["115,187,305,500"]
         )
@@ -123,6 +128,7 @@ def from_pdf(self, filepath):
             if row[0] == 'Buchungstag\nValuta':
                 continue  # Skip Header Rows
 
+            betrag = float(row[4].replace('.', '').replace(',', '.'))
             date_format = "%d.%m.%Y"
             date_row = row[0].replace('\n', '')
 
@@ -135,7 +141,7 @@ def from_pdf(self, filepath):
                     ).replace(tzinfo=datetime.timezone.utc).timestamp(),
                 'art': row[1].replace('\n', '').replace(' ', ''),
                 'text_tx': self._newline_replace(row[3]),
-                'betrag': float(row[4].replace('.', '').replace(',', '.')),
+                'betrag': betrag,
                 'gegenkonto': self._newline_replace(row[2]),
                 'currency': "EUR",
                 'parsed': {},
@@ -151,6 +157,9 @@ def from_pdf(self, filepath):
                 i, row = next(enumerated_table)
                 line['text_tx'] += ' ' + self._newline_replace(row[3])
 
+            if not line['betrag']:
+                continue  # Skip Null-Buchungen
+
             result.append(line)
 
         return result
@@ -167,7 +176,8 @@ def _newline_replace(self, text_in:str) -> str:
         Args:
             text_in, str:   Input Text aus Kontoauszug
         Return:
-            str: Text ohne Newlines"""
+            str: Text ohne Newlines
+        """
         if not '\n' in text_in:
             return text_in
 
diff --git a/reader/Commerzbank.py b/reader/Commerzbank.py
@@ -42,31 +42,31 @@ def from_pdf(self, filepath):
         )
 
         # Tabellen aller Seiten zusammenfügen
-        all_rows = []
+        self.all_rows = []
         for t in tables:
             if not t.data:
                 continue
 
-            all_rows.extend(t.data)
+            self.all_rows.extend(t.data)
 
         # Start bei den Kontoumsätzen
         start_index = 0
-        end_index = len(all_rows)
-        for row in all_rows:
+        end_index = len(self.all_rows)
+        for row in self.all_rows:
 
             if row[0].replace(' ', '').lower().startswith('angabenzudenumsätzen'):
                 # Last row before transactions
-                start_index = all_rows.index(row) + 1
+                start_index = self.all_rows.index(row) + 1
 
             if 'Kreditlinie' in row[0]:
                 # First row after transactions
-                end_index = all_rows.index(row)
+                end_index = self.all_rows.index(row)
                 break
 
         result = []
         date_tx = 0
         date_tx_year = "1970"  # Default Year if not found yet
-        enumerated_table = enumerate(all_rows[start_index:end_index])
+        enumerated_table = enumerate(self.all_rows[start_index:end_index])
         for i, row in enumerated_table:
 
             if row[0].startswith('Buchungsdatum: '):
@@ -86,7 +86,7 @@ def from_pdf(self, filepath):
                         f"{row[1]}.{date_tx_year}", "%d.%m.%Y"
                     ).replace(tzinfo=datetime.timezone.utc).timestamp(),
                 'art': "",
-                'text_tx': row[0],
+                'text_tx':  row[0],
                 'betrag': float(betrag.replace('.', '').replace(',', '.')),
                 'gegenkonto': row[0],
                 'currency': "EUR",
@@ -95,20 +95,49 @@ def from_pdf(self, filepath):
                 'tags': None
             }
 
-            while start_index + i + 1 < end_index and \
-               all_rows[start_index + i + 1][1] == '' and \
-               not all_rows[start_index + i + 1][0].startswith('Buchungsdatum: '):
+            while self._check_next_line_available(start_index, end_index, i):
                 # 1. There are more lines in the table
                 # 2. Next line belongs to this transaction
                 # (no new date but text continuation - last line in this block is 'art')
+                prev_line_len = len(row[0])
                 i, row = next(enumerated_table)
 
-                line['text_tx'] += ' ' + row[0]
                 line['art'] = row[0] # Overwrite to keep value of last line in block
 
+                if self._check_next_line_available(start_index, end_index, i):
+                    # Line overflow or intentional line break?
+                    glue = ' ' if prev_line_len < 35 else ''
+                    line['text_tx'] += glue + row[0]
+
+            if not line['betrag']:
+                continue  # Skip Null-Buchungen
+
             result.append(line)
 
         return result
 
     def from_http(self, url):
         raise NotImplementedError("from_http is not implemented yet for Commerzbank Reader")
+
+    def _check_next_line_available(self, start_index, end_index, i):
+        """
+        Report whether the row after the current one exists and is a
+        continuation line of the transaction currently being assembled.
+
+        Args:
+            start_index (int): Index in self.all_rows of the first transaction row
+            end_index (int): Index in self.all_rows of the first row after the transactions
+            i (int): Current position of the iteration, relative to start_index
+        Returns:
+            bool: False if past end_index, column 1 is filled or a 'Buchungsdatum: ' row follows
+        """
+        if start_index + i + 1 >= end_index:  # next row would lie outside the transaction range
+            return False
+
+        if self.all_rows[start_index + i + 1][1] != '':  # column 1 filled: not a continuation row
+            return False
+        
+        if self.all_rows[start_index + i + 1][0].startswith('Buchungsdatum: '):  # date header row
+            return False
+
+        return True
diff --git a/reader/Generic.py b/reader/Generic.py
@@ -36,14 +36,15 @@ def from_csv(self, filepath):
             date_format = "%d.%m.%Y"
 
             for row in reader:
+
                 betrag = float(row['Betrag'].replace('.', '').replace(',', '.'))
                 date_tx = datetime.datetime.strptime(
                             row['Buchungstag'], date_format
                         ).replace(tzinfo=datetime.timezone.utc).timestamp()
                 valuta = datetime.datetime.strptime(
                             row['Wertstellung'], date_format
                         ).replace(tzinfo=datetime.timezone.utc).timestamp()
-                result.append({
+                line = {
                     'date_tx': date_tx,
                     'valuta': valuta,
                     'art': row['Umsatzart'],
@@ -54,7 +55,12 @@ def from_csv(self, filepath):
                     'parsed': {},
                     'category': None,
                     'tags': None
-                })
+                }
+
+                if not line['betrag']:
+                    continue  # Skip Null-Buchungen
+
+                result.append(line)
 
         return result
 

Original file line number	Diff line number	Diff line change
`@@ -172,9 +172,9 @@ function uploadFile() {`
`172`	`172`	`alert('File upload failed: ' + '(' + error + ')' + responseText);`
`173`	`173`
`174`	`174`	`} else {`
`175`		`- alert('File uploaded successfully!' + responseText);`
`176`		`- window.location.href = '/' + iban;`
`177`		`-`
	`175`	`+ if (confirm('File uploaded successfully!' + responseText + '\nKonto aufrufen?')) {`
	`176`	`+ window.location.href = '/' + iban;`
	`177`	`+ }`
`178`	`178`	`}`
`179`	`179`	`}, true);`
`180`	`180`	`}`