nextcloud
diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py
Lines changed: 29 additions & 24 deletions
@@ -7,21 +7,22 @@
 import re
 import tempfile
 from collections.abc import Callable
-from typing import BinaryIO
+from io import BytesIO
 
 import docx2txt
 from epub2txt import epub2txt
-from fastapi import UploadFile
 from langchain_unstructured import UnstructuredLoader
 from odfdo import Document
 from pandas import read_csv, read_excel
 from pypdf import PdfReader
 from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError
 from striprtf import striprtf
 
+from ...types import SourceItem
+
 logger = logging.getLogger('ccb.doc_loader')
 
-def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str:
+def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str:
 	raw_bytes = file.read()
 	with tempfile.NamedTemporaryFile(mode='wb') as tmp:
 		tmp.write(raw_bytes)
@@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
 
 # -- LOADERS -- #
 
-def _load_pdf(file: BinaryIO) -> str:
+def _load_pdf(file: BytesIO) -> str:
 	pdf_reader = PdfReader(file)
 	return '\n\n'.join([page.extract_text().strip() for page in pdf_reader.pages])
 
 
-def _load_csv(file: BinaryIO) -> str:
+def _load_csv(file: BytesIO) -> str:
 	return read_csv(file).to_string(header=False, na_rep='')
 
 
-def _load_epub(file: BinaryIO) -> str:
+def _load_epub(file: BytesIO) -> str:
 	return _temp_file_wrapper(file, epub2txt).strip()
 
 
-def _load_docx(file: BinaryIO) -> str:
+def _load_docx(file: BytesIO) -> str:
 	return docx2txt.process(file).strip()
 
 
-def _load_odt(file: BinaryIO) -> str:
+def _load_odt(file: BytesIO) -> str:
 	return _temp_file_wrapper(file, lambda fp: Document(fp).get_formatted_text()).strip()
 
 
-def _load_ppt_x(file: BinaryIO) -> str:
+def _load_ppt_x(file: BytesIO) -> str:
 	return _temp_file_wrapper(file, lambda fp: UnstructuredLoader(fp).load()).strip()
 
 
-def _load_rtf(file: BinaryIO) -> str:
+def _load_rtf(file: BytesIO) -> str:
 	return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip()
 
 
-def _load_xml(file: BinaryIO) -> str:
+def _load_xml(file: BytesIO) -> str:
 	data = file.read().decode('utf-8', 'ignore')
 	data = re.sub(r'</.+>', '', data)
 	return data.strip()
 
 
-def _load_xlsx(file: BinaryIO) -> str:
+def _load_xlsx(file: BytesIO) -> str:
 	return read_excel(file, na_filter=False).to_string(header=False, na_rep='')
 
 
-def _load_email(file: BinaryIO, ext: str = 'eml') -> str | None:
+def _load_email(file: BytesIO, ext: str = 'eml') -> str | None:
 	# NOTE: msg format is not tested
 	if ext not in ['eml', 'msg']:
 		return None
@@ -115,30 +116,34 @@ def attachment_partitioner(
 }
 
 
-def decode_source(source: UploadFile) -> str | None:
+def decode_source(source: SourceItem) -> str | None:
+	io_obj: BytesIO | None = None
 	try:
 		# .pot files are powerpoint templates but also plain text files,
 		# so we skip them to prevent decoding errors
-		if source.headers['title'].endswith('.pot'):
+		if source.title.endswith('.pot'):
 			return None
 
-		mimetype = source.headers['type']
+		mimetype = source.type
 		if mimetype is None:
 			return None
 
+		if isinstance(source.content, str):
+			io_obj = BytesIO(source.content.encode('utf-8', 'ignore'))
+		else:
+			io_obj = source.content
+
 		if _loader_map.get(mimetype):
-			result = _loader_map[mimetype](source.file)
-			source.file.close()
+			result = _loader_map[mimetype](io_obj)
 			return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
 
-		result = source.file.read().decode('utf-8', 'ignore')
-		source.file.close()
-		return result
+		return io_obj.read().decode('utf-8', 'ignore')
 	except PdfFileNotDecryptedError:
-		logger.warning(f'PDF file ({source.filename}) is encrypted and cannot be read')
+		logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read')
 		return None
 	except Exception:
-		logger.exception(f'Error decoding source file ({source.filename})', stack_info=True)
+		logger.exception(f'Error decoding source file ({source.reference})', stack_info=True)
 		return None
 	finally:
-		source.file.close()  # Ensure file is closed after processing
+		if io_obj is not None:
+			io_obj.close()