@@ -42,31 +42,31 @@ def from_pdf(self, filepath):
4242 )
4343
4444 # Tabellen aller Seiten zusammenfügen
45- all_rows = []
45+ self . all_rows = []
4646 for t in tables :
4747 if not t .data :
4848 continue
4949
50- all_rows .extend (t .data )
50+ self . all_rows .extend (t .data )
5151
5252 # Start bei den Kontoumsätzen
5353 start_index = 0
54- end_index = len (all_rows )
55- for row in all_rows :
54+ end_index = len (self . all_rows )
55+ for row in self . all_rows :
5656
5757 if row [0 ].replace (' ' , '' ).lower ().startswith ('angabenzudenumsätzen' ):
5858 # Last row before transactions
59- start_index = all_rows .index (row ) + 1
59+ start_index = self . all_rows .index (row ) + 1
6060
6161 if 'Kreditlinie' in row [0 ]:
6262 # First row after transactions
63- end_index = all_rows .index (row )
63+ end_index = self . all_rows .index (row )
6464 break
6565
6666 result = []
6767 date_tx = 0
6868 date_tx_year = "1970" # Default Year if not found yet
69- enumerated_table = enumerate (all_rows [start_index :end_index ])
69+ enumerated_table = enumerate (self . all_rows [start_index :end_index ])
7070 for i , row in enumerated_table :
7171
7272 if row [0 ].startswith ('Buchungsdatum: ' ):
@@ -86,7 +86,7 @@ def from_pdf(self, filepath):
8686 f"{ row [1 ]} .{ date_tx_year } " , "%d.%m.%Y"
8787 ).replace (tzinfo = datetime .timezone .utc ).timestamp (),
8888 'art' : "" ,
89- 'text_tx' : row [0 ],
89+ 'text_tx' : row [0 ],
9090 'betrag' : float (betrag .replace ('.' , '' ).replace (',' , '.' )),
9191 'gegenkonto' : row [0 ],
9292 'currency' : "EUR" ,
@@ -95,20 +95,49 @@ def from_pdf(self, filepath):
9595 'tags' : None
9696 }
9797
98- while start_index + i + 1 < end_index and \
99- all_rows [start_index + i + 1 ][1 ] == '' and \
100- not all_rows [start_index + i + 1 ][0 ].startswith ('Buchungsdatum: ' ):
98+ while self ._check_next_line_available (start_index , end_index , i ):
10199 # 1. There are more lines in the table
102100 # 2. Next line belongs to this transaction
103101 # (no new date but text continuation - last line in this block is 'art')
102+ prev_line_len = len (row [0 ])
104103 i , row = next (enumerated_table )
105104
106- line ['text_tx' ] += ' ' + row [0 ]
107105 line ['art' ] = row [0 ] # Overwrite to keep value of last line in block
108106
107+ if self ._check_next_line_available (start_index , end_index , i ):
108+ # Line overflow or intentional line break?
109+ glue = ' ' if prev_line_len < 35 else ''
110+ line ['text_tx' ] += glue + row [0 ]
111+
112+ if not line ['betrag' ]:
113+ continue # Skip Null-Buchungen
114+
109115 result .append (line )
110116
111117 return result
112118
def from_http(self, url):
    """Placeholder for reading statements over HTTP; always raises.

    Args:
        url: Accepted for interface compatibility; it is never used.

    Raises:
        NotImplementedError: Always, since this reader has no HTTP support.
    """
    message = "from_http is not implemented yet for Commerzbank Reader"
    raise NotImplementedError(message)
121+
def _check_next_line_available(self, start_index, end_index, i):
    """Tell whether the row following the current one can be consumed.

    The following row is looked up in ``self.all_rows`` at position
    ``start_index + i + 1``. It counts as available only when all of
    these hold:

    1. its position lies before ``end_index``,
    2. its second cell is an empty string, and
    3. its first cell does not start with ``'Buchungsdatum: '``.

    Args:
        start_index (int): Offset of the first transaction row in
            ``self.all_rows``.
        end_index (int): Exclusive upper bound of the transaction rows.
        i (int): Current iteration index, relative to ``start_index``.

    Returns:
        bool: True if the following row passes all three checks,
        otherwise False.
    """
    upcoming = start_index + i + 1
    if upcoming >= end_index:
        # Past the transaction section; do not touch all_rows at all.
        return False

    candidate = self.all_rows[upcoming]
    # The second cell is inspected before the first one, as in the original.
    return candidate[1] == '' and not candidate[0].startswith('Buchungsdatum: ')
0 commit comments