Skip to content

Commit 7476778

Browse files
committed
PDF imports gehärtet
1 parent 718d3b6 commit 7476778

4 files changed

Lines changed: 68 additions & 23 deletions

File tree

app/static/js/index.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,9 @@ function uploadFile() {
172172
alert('File upload failed: ' + '(' + error + ')' + responseText);
173173

174174
} else {
175-
alert('File uploaded successfully!' + responseText);
176-
window.location.href = '/' + iban;
177-
175+
if (confirm('File uploaded successfully!' + responseText + '\nKonto aufrufen?')) {
176+
window.location.href = '/' + iban;
177+
}
178178
}
179179
}, true);
180180
}

reader/Comdirect.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,18 @@ def from_csv(self, filepath):
4747
# Skippe offene Buchungen
4848
continue
4949

50+
betrag = float(row['Umsatz in EUR'].replace(',', '.'))
5051
date_tx = datetime.datetime.strptime(
5152
date_tx, date_format
5253
).replace(tzinfo=datetime.timezone.utc).timestamp()
5354
valuta = datetime.datetime.strptime(
5455
row['Wertstellung (Valuta)'], date_format
5556
).replace(tzinfo=datetime.timezone.utc).timestamp()
56-
betrag = float(row['Umsatz in EUR'].replace(',', '.'))
57+
5758
text_tx = row['Buchungstext']
5859
match = rx.match(text_tx)
5960

60-
result.append({
61+
line = {
6162
'date_tx': date_tx,
6263
'valuta': valuta,
6364
'art': row['Vorgang'],
@@ -68,7 +69,12 @@ def from_csv(self, filepath):
6869
'parsed': {},
6970
'category': None,
7071
'tags': None
71-
})
72+
}
73+
74+
if not line['betrag']:
75+
continue # Skip Null-Buchungen
76+
77+
result.append(line)
7278

7379
return result
7480

@@ -86,7 +92,6 @@ def from_pdf(self, filepath):
8692
filepath,
8793
pages="2-end",
8894
flavor="stream",
89-
#strip_text="\n",
9095
row_tol=10,
9196
columns=["115,187,305,500"]
9297
)
@@ -123,6 +128,7 @@ def from_pdf(self, filepath):
123128
if row[0] == 'Buchungstag\nValuta':
124129
continue # Skip Header Rows
125130

131+
betrag = float(row[4].replace('.', '').replace(',', '.'))
126132
date_format = "%d.%m.%Y"
127133
date_row = row[0].replace('\n', '')
128134

@@ -135,7 +141,7 @@ def from_pdf(self, filepath):
135141
).replace(tzinfo=datetime.timezone.utc).timestamp(),
136142
'art': row[1].replace('\n', '').replace(' ', ''),
137143
'text_tx': self._newline_replace(row[3]),
138-
'betrag': float(row[4].replace('.', '').replace(',', '.')),
144+
'betrag': betrag,
139145
'gegenkonto': self._newline_replace(row[2]),
140146
'currency': "EUR",
141147
'parsed': {},
@@ -151,6 +157,9 @@ def from_pdf(self, filepath):
151157
i, row = next(enumerated_table)
152158
line['text_tx'] += ' ' + self._newline_replace(row[3])
153159

160+
if not line['betrag']:
161+
continue # Skip Null-Buchungen
162+
154163
result.append(line)
155164

156165
return result
@@ -167,7 +176,8 @@ def _newline_replace(self, text_in:str) -> str:
167176
Args:
168177
text_in, str: Input Text aus Kontoauszug
169178
Return:
170-
str: Text ohne Newlines"""
179+
str: Text ohne Newlines
180+
"""
171181
if not '\n' in text_in:
172182
return text_in
173183

reader/Commerzbank.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,31 +42,31 @@ def from_pdf(self, filepath):
4242
)
4343

4444
# Tabellen aller Seiten zusammenfügen
45-
all_rows = []
45+
self.all_rows = []
4646
for t in tables:
4747
if not t.data:
4848
continue
4949

50-
all_rows.extend(t.data)
50+
self.all_rows.extend(t.data)
5151

5252
# Start bei den Kontoumsätzen
5353
start_index = 0
54-
end_index = len(all_rows)
55-
for row in all_rows:
54+
end_index = len(self.all_rows)
55+
for row in self.all_rows:
5656

5757
if row[0].replace(' ', '').lower().startswith('angabenzudenumsätzen'):
5858
# Last row before transactions
59-
start_index = all_rows.index(row) + 1
59+
start_index = self.all_rows.index(row) + 1
6060

6161
if 'Kreditlinie' in row[0]:
6262
# First row after transactions
63-
end_index = all_rows.index(row)
63+
end_index = self.all_rows.index(row)
6464
break
6565

6666
result = []
6767
date_tx = 0
6868
date_tx_year = "1970" # Default Year if not found yet
69-
enumerated_table = enumerate(all_rows[start_index:end_index])
69+
enumerated_table = enumerate(self.all_rows[start_index:end_index])
7070
for i, row in enumerated_table:
7171

7272
if row[0].startswith('Buchungsdatum: '):
@@ -86,7 +86,7 @@ def from_pdf(self, filepath):
8686
f"{row[1]}.{date_tx_year}", "%d.%m.%Y"
8787
).replace(tzinfo=datetime.timezone.utc).timestamp(),
8888
'art': "",
89-
'text_tx': row[0],
89+
'text_tx': row[0],
9090
'betrag': float(betrag.replace('.', '').replace(',', '.')),
9191
'gegenkonto': row[0],
9292
'currency': "EUR",
@@ -95,20 +95,49 @@ def from_pdf(self, filepath):
9595
'tags': None
9696
}
9797

98-
while start_index + i + 1 < end_index and \
99-
all_rows[start_index + i + 1][1] == '' and \
100-
not all_rows[start_index + i + 1][0].startswith('Buchungsdatum: '):
98+
while self._check_next_line_available(start_index, end_index, i):
10199
# 1. There are more lines in the table
102100
# 2. Next line belongs to this transaction
103101
# (no new date but text continuation - last line in this block is 'art')
102+
prev_line_len = len(row[0])
104103
i, row = next(enumerated_table)
105104

106-
line['text_tx'] += ' ' + row[0]
107105
line['art'] = row[0] # Overwrite to keep value of last line in block
108106

107+
if self._check_next_line_available(start_index, end_index, i):
108+
# Line overflow or intentional line break?
109+
glue = ' ' if prev_line_len < 35 else ''
110+
line['text_tx'] += glue + row[0]
111+
112+
if not line['betrag']:
113+
continue # Skip Null-Buchungen
114+
109115
result.append(line)
110116

111117
return result
112118

113119
def from_http(self, url):
114120
raise NotImplementedError("from_http is not implemented yet for Commerzbank Reader")
121+
122+
def _check_next_line_available(self, start_index, end_index, i):
    """
    Check whether the row following the current one is still available.

    Helper for walking the rows read from the PDF document: it reports
    whether the next row exists and can be consumed as a continuation of
    the row currently being processed.

    Args:
        start_index (int): Start index of the account transactions; it is
            added to ``i`` to address a row in ``self.all_rows``.
        end_index (int): End index of the account transactions (exclusive
            upper bound for the addressed row).
        i (int): Current index of the iteration, relative to
            ``start_index``.

    Returns:
        bool: True if a next row exists before ``end_index``, its second
            cell is an empty string and its first cell does not start with
            'Buchungsdatum: '; otherwise False.
    """
    next_index = start_index + i + 1
    if next_index >= end_index:
        # No further row inside the transaction range.
        return False

    next_row = self.all_rows[next_index]
    if next_row[1] != '':
        # A non-empty second cell means the next row is not a continuation.
        return False

    # A 'Buchungsdatum: ' marker row is never a continuation either.
    return not next_row[0].startswith('Buchungsdatum: ')

reader/Generic.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,15 @@ def from_csv(self, filepath):
3636
date_format = "%d.%m.%Y"
3737

3838
for row in reader:
39+
3940
betrag = float(row['Betrag'].replace('.', '').replace(',', '.'))
4041
date_tx = datetime.datetime.strptime(
4142
row['Buchungstag'], date_format
4243
).replace(tzinfo=datetime.timezone.utc).timestamp()
4344
valuta = datetime.datetime.strptime(
4445
row['Wertstellung'], date_format
4546
).replace(tzinfo=datetime.timezone.utc).timestamp()
46-
result.append({
47+
line = {
4748
'date_tx': date_tx,
4849
'valuta': valuta,
4950
'art': row['Umsatzart'],
@@ -54,7 +55,12 @@ def from_csv(self, filepath):
5455
'parsed': {},
5556
'category': None,
5657
'tags': None
57-
})
58+
}
59+
60+
if not line['betrag']:
61+
continue # Skip Null-Buchungen
62+
63+
result.append(line)
5864

5965
return result
6066

0 commit comments

Comments
 (0)