33#
44
55import logging
6+ import warnings
67from io import IOBase
78from pathlib import Path
89from typing import Any , Dict , Iterable , Mapping , Optional , Tuple , Union
1718)
1819from airbyte_cdk .sources .file_based .exceptions import (
1920 ConfigValidationError ,
21+ ExcelCalamineParsingError ,
2022 FileBasedSourceError ,
2123 RecordParseError ,
2224)
@@ -64,7 +66,7 @@ async def infer_schema(
6466 fields : Dict [str , str ] = {}
6567
6668 with stream_reader .open_file (file , self .file_read_mode , self .ENCODING , logger ) as fp :
67- df = self .open_and_parse_file (fp )
69+ df = self .open_and_parse_file (fp , logger , file )
6870 for column , df_type in df .dtypes .items ():
6971 # Choose the broadest data type if the column's data type differs in dataframes
7072 prev_frame_column_type = fields .get (column ) # type: ignore [call-overload]
@@ -92,7 +94,7 @@ def parse_records(
9294 discovered_schema : Optional [Mapping [str , SchemaType ]] = None ,
9395 ) -> Iterable [Dict [str , Any ]]:
9496 """
95- Parses records from an Excel file based on the provided configuration .
97+ Parses records from an Excel file with fallback error handling .
9698
9799 Args:
98100 config (FileBasedStreamConfig): Configuration for the file-based stream.
@@ -111,7 +113,7 @@ def parse_records(
111113 try :
112114 # Open and parse the file using the stream reader
113115 with stream_reader .open_file (file , self .file_read_mode , self .ENCODING , logger ) as fp :
114- df = self .open_and_parse_file (fp )
116+ df = self .open_and_parse_file (fp , logger , file )
115117 # Yield records as dictionaries
116118 # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
117119 # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
@@ -180,15 +182,93 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
180182 logger .info (f"Expected ExcelFormat, got { excel_format } " )
181183 raise ConfigValidationError (FileBasedSourceError .CONFIG_VALIDATION_ERROR )
182184
183- @staticmethod
184- def open_and_parse_file (fp : Union [IOBase , str , Path ]) -> pd .DataFrame :
def _open_and_parse_file_with_calamine(
    self,
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file: RemoteFile,
) -> pd.DataFrame:
    """Open and parse an Excel file with the Calamine engine.

    Args:
        fp: File pointer (or path) to the Excel file.
        logger: Logger used to report a Calamine parsing failure.
        file: Remote file information, used for log and error context.

    Returns:
        pd.DataFrame: Data parsed from the Excel file.

    Raises:
        ExcelCalamineParsingError: If Calamine fails and the failure message
            mentions ``ValueError`` (i.e. invalid cell values). Callers may
            catch this to retry with a different engine.
        BaseException: Any other failure is re-raised unchanged.
    """
    try:
        return pd.ExcelFile(fp, engine="calamine").parse()  # type: ignore [arg-type, call-overload, no-any-return]
    except (KeyboardInterrupt, SystemExit):
        # Interpreter-control exceptions are never parsing failures; they must
        # propagate untouched regardless of what their message contains.
        raise
    except BaseException as exc:
        # Calamine raises PanicException (a child of BaseException, not
        # Exception) when it fails to parse the file, hence the broad catch.
        # Only a message mentioning ValueError is treated as a parsing error
        # caused by invalid cell values; everything else is re-raised as-is.
        if "ValueError" not in str(exc):
            # Bare raise keeps the original traceback intact.
            raise
        logger.warning(
            f"Calamine parsing failed for {file.file_uri_for_logging} , falling back to openpyxl: {exc} "
        )
        raise ExcelCalamineParsingError(
            f"Calamine engine failed to parse {file.file_uri_for_logging} ",
            filename=file.uri,
        ) from exc
def _open_and_parse_file_with_openpyxl(
    self,
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file: RemoteFile,
) -> pd.DataFrame:
    """Open and parse an Excel file with the Openpyxl engine.

    The stream is rewound to its start first when it exposes ``seek``; a
    failed rewind is logged and parsing continues from the current position.
    Warnings emitted while parsing are captured and forwarded to ``logger``.

    Args:
        fp: File pointer (or path) to the Excel file.
        logger: Logger receiving rewind notices and captured warnings.
        file: Remote file information, used for log context.

    Returns:
        pd.DataFrame: Data parsed from the Excel file.
    """
    # Not every file-like object can be rewound, so a failed seek is tolerated.
    if hasattr(fp, "seek"):
        try:
            fp.seek(0)  # type: ignore [union-attr]
        except OSError as seek_error:
            logger.info(
                f"Could not rewind stream for {file.file_uri_for_logging} ; "
                f"proceeding with openpyxl from current position: {seek_error} "
            )

    # Record every warning raised during parsing instead of letting it reach
    # the default warning machinery.
    with warnings.catch_warnings(record=True) as captured:
        warnings.simplefilter("always")
        workbook = pd.ExcelFile(fp, engine="openpyxl")  # type: ignore [arg-type, call-overload]
        frame = workbook.parse()

    for warning in captured:
        logger.warning(f"Openpyxl warning for {file.file_uri_for_logging} : {warning.message} ")

    return frame  # type: ignore [no-any-return]
254+
def open_and_parse_file(
    self,
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file: RemoteFile,
) -> pd.DataFrame:
    """Parse the Excel file, trying Calamine first and Openpyxl second.

    Args:
        fp: File pointer (or path) to the Excel file.
        logger: Logger passed through to the engine-specific parsers.
        file: Remote file information passed through for log context.

    Returns:
        pd.DataFrame: Data parsed from the Excel file.
    """
    try:
        frame = self._open_and_parse_file_with_calamine(fp, logger, file)
    except ExcelCalamineParsingError:
        # Only a Calamine parsing error triggers the fallback; any other
        # exception propagates to the caller.
        frame = self._open_and_parse_file_with_openpyxl(fp, logger, file)
    return frame
0 commit comments