77import re
88import tempfile
99from collections .abc import Callable
10- from typing import BinaryIO
10+ from io import BytesIO
1111
1212import docx2txt
1313from epub2txt import epub2txt
14- from fastapi import UploadFile
1514from langchain_unstructured import UnstructuredLoader
1615from odfdo import Document
1716from pandas import read_csv , read_excel
1817from pypdf import PdfReader
1918from pypdf .errors import FileNotDecryptedError as PdfFileNotDecryptedError
2019from striprtf import striprtf
2120
21+ from ...types import SourceItem
22+
# Module-level logger shared by every loader in this file.
logger = logging.getLogger('ccb.doc_loader')
2324
24- def _temp_file_wrapper (file : BinaryIO , loader : Callable , sep : str = '\n ' ) -> str :
25+ def _temp_file_wrapper (file : BytesIO , loader : Callable , sep : str = '\n ' ) -> str :
2526 raw_bytes = file .read ()
2627 with tempfile .NamedTemporaryFile (mode = 'wb' ) as tmp :
2728 tmp .write (raw_bytes )
@@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
3536
3637# -- LOADERS -- #
3738
def _load_pdf(file: BytesIO) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        file: In-memory binary stream holding the PDF document.

    Returns:
        The stripped text of each page, with pages separated by an empty line.

    Errors raised by ``PdfReader`` or ``extract_text`` are not handled here
    and propagate to the caller.
    """
    pdf_reader = PdfReader(file)
    # A generator is enough for str.join; no intermediate list is needed.
    return '\n\n'.join(page.extract_text().strip() for page in pdf_reader.pages)
4142
4243
43- def _load_csv (file : BinaryIO ) -> str :
44+ def _load_csv (file : BytesIO ) -> str :
4445 return read_csv (file ).to_string (header = False , na_rep = '' )
4546
4647
def _load_epub(file: BytesIO) -> str:
    """Return the stripped text of an EPUB document.

    The stream is handed to ``_temp_file_wrapper`` together with ``epub2txt``
    as the loader callable.

    Args:
        file: In-memory binary stream holding the EPUB document.

    Returns:
        The text produced by the wrapper, stripped of surrounding whitespace.
    """
    return _temp_file_wrapper(file, epub2txt).strip()
4950
5051
def _load_docx(file: BytesIO) -> str:
    """Return the stripped text of a DOCX document via ``docx2txt.process``.

    Args:
        file: In-memory binary stream holding the DOCX document.

    Returns:
        The extracted text without leading or trailing whitespace.
    """
    return docx2txt.process(file).strip()
5354
5455
def _load_odt(file: BytesIO) -> str:
    """Return the stripped formatted text of an ODT document.

    The stream is passed through ``_temp_file_wrapper``; the loader builds an
    odfdo ``Document`` from the value the wrapper supplies and returns its
    formatted text.

    Args:
        file: In-memory binary stream holding the ODT document.

    Returns:
        The text produced by the wrapper, stripped of surrounding whitespace.
    """
    return _temp_file_wrapper(
        file,
        lambda fp: Document(fp).get_formatted_text(),
    ).strip()
5758
5859
def _load_ppt_x(file: BytesIO) -> str:
    """Return the stripped text of a PowerPoint document.

    The stream is passed through ``_temp_file_wrapper``; the loader calls
    ``UnstructuredLoader(fp).load()`` on the value the wrapper supplies.

    Args:
        file: In-memory binary stream holding the presentation.

    Returns:
        The text produced by the wrapper, stripped of surrounding whitespace.
    """
    return _temp_file_wrapper(
        file,
        lambda fp: UnstructuredLoader(fp).load(),
    ).strip()
6162
6263
def _load_rtf(file: BytesIO) -> str:
    """Return the plain text of an RTF document.

    Args:
        file: In-memory binary stream holding the RTF document.

    Returns:
        The text produced by ``striprtf.rtf_to_text``, stripped of surrounding
        whitespace.
    """
    # Bytes that are not valid UTF-8 are dropped rather than raising.
    rtf_source = file.read().decode('utf-8', 'ignore')
    return striprtf.rtf_to_text(rtf_source).strip()
6566
6667
67- def _load_xml (file : BinaryIO ) -> str :
68+ def _load_xml (file : BytesIO ) -> str :
6869 data = file .read ().decode ('utf-8' , 'ignore' )
6970 data = re .sub (r'</.+>' , '' , data )
7071 return data .strip ()
7172
7273
def _load_xlsx(file: BytesIO) -> str:
    """Render an Excel workbook as a plain-text table.

    Args:
        file: In-memory binary stream holding the workbook.

    Returns:
        The table returned by ``read_excel`` formatted by
        ``DataFrame.to_string`` without the header line.
    """
    # na_filter=False stops pandas from converting cells to NaN on read.
    frame = read_excel(file, na_filter=False)
    return frame.to_string(header=False, na_rep='')
7576
7677
77- def _load_email (file : BinaryIO , ext : str = 'eml' ) -> str | None :
78+ def _load_email (file : BytesIO , ext : str = 'eml' ) -> str | None :
7879 # NOTE: msg format is not tested
7980 if ext not in ['eml' , 'msg' ]:
8081 return None
@@ -115,30 +116,34 @@ def attachment_partitioner(
115116}
116117
117118
def decode_source(source: SourceItem) -> str | None:
    """Decode the content of a source item into text.

    The loader registered in ``_loader_map`` for the item's mimetype is used
    when there is one; otherwise the raw bytes are decoded as UTF-8 with
    undecodable bytes ignored.

    Args:
        source: Item to decode. Its ``title``, ``type``, ``content`` and
            ``reference`` attributes are read; ``content`` may be a ``str`` or
            a binary stream.

    Returns:
        The decoded text, or ``None`` when the item is a ``.pot`` file, has no
        mimetype, is an encrypted PDF, the loader yields nothing, or any other
        error occurs while decoding. Errors are logged, never raised.
    """
    io_obj: BytesIO | None = None
    try:
        # .pot files are powerpoint templates but also plain text files,
        # so we skip them to prevent decoding errors
        if source.title.endswith('.pot'):
            return None

        mimetype = source.type
        if mimetype is None:
            return None

        # Loaders expect a binary stream, so wrap textual content.
        if isinstance(source.content, str):
            io_obj = BytesIO(source.content.encode('utf-8', 'ignore'))
        else:
            io_obj = source.content

        loader = _loader_map.get(mimetype)
        if loader:
            result = loader(io_obj)
            # Some loaders are annotated as returning None; treat that as
            # "nothing to decode" instead of failing on result.encode().
            if result is None:
                return None
            # Round-trip through UTF-8 to drop unencodable characters.
            return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

        return io_obj.read().decode('utf-8', 'ignore')
    except PdfFileNotDecryptedError:
        logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read')
        return None
    except Exception:
        logger.exception(f'Error decoding source file ({source.reference})', stack_info=True)
        return None
    finally:
        # Close the stream on every exit path, including a caller-supplied one.
        if io_obj is not None:
            io_obj.close()
0 commit comments