Skip to content

Commit ea5208a

Browse files
committed
wip: migrate the indexing process
Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
1 parent c282f3d commit ea5208a

8 files changed

Lines changed: 659 additions & 254 deletions

File tree

context_chat_backend/chain/ingest/doc_loader.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,22 @@
77
import re
88
import tempfile
99
from collections.abc import Callable
10-
from typing import BinaryIO
10+
from io import BytesIO
1111

1212
import docx2txt
1313
from epub2txt import epub2txt
14-
from fastapi import UploadFile
1514
from langchain_unstructured import UnstructuredLoader
1615
from odfdo import Document
1716
from pandas import read_csv, read_excel
1817
from pypdf import PdfReader
1918
from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError
2019
from striprtf import striprtf
2120

21+
from ...types import SourceItem
22+
2223
logger = logging.getLogger('ccb.doc_loader')
2324

24-
def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str:
25+
def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str:
2526
raw_bytes = file.read()
2627
with tempfile.NamedTemporaryFile(mode='wb') as tmp:
2728
tmp.write(raw_bytes)
@@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
3536

3637
# -- LOADERS -- #
3738

def _load_pdf(file: BytesIO) -> str:
    """Extract the text of a PDF, page by page.

    Each page's extracted text is stripped of surrounding whitespace and the
    pages are joined with a blank line between them.
    """
    reader = PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text().strip())
    return '\n\n'.join(page_texts)
4142

4243

43-
def _load_csv(file: BinaryIO) -> str:
44+
def _load_csv(file: BytesIO) -> str:
4445
return read_csv(file).to_string(header=False, na_rep='')
4546

4647

def _load_epub(file: BytesIO) -> str:
    """Convert an EPUB to text by passing it to `epub2txt` through `_temp_file_wrapper`.

    The result is stripped of surrounding whitespace.
    """
    text = _temp_file_wrapper(file, epub2txt)
    return text.strip()
4950

5051

def _load_docx(file: BytesIO) -> str:
    """Extract the text of a DOCX document with `docx2txt`, stripped of surrounding whitespace."""
    extracted = docx2txt.process(file)
    return extracted.strip()
5354

5455

def _load_odt(file: BytesIO) -> str:
    """Extract the formatted text of an ODT document.

    The document is opened with `odfdo.Document` via `_temp_file_wrapper`
    and the resulting text is stripped of surrounding whitespace.
    """
    def _read_formatted_text(fp):
        return Document(fp).get_formatted_text()

    return _temp_file_wrapper(file, _read_formatted_text).strip()
5758

5859

def _load_ppt_x(file: BytesIO) -> str:
    """Extract the text of a PowerPoint file using `UnstructuredLoader`.

    The loader is run through `_temp_file_wrapper`, and the text it returns
    is stripped of surrounding whitespace.
    """
    def _load_documents(fp):
        return UnstructuredLoader(fp).load()

    return _temp_file_wrapper(file, _load_documents).strip()
6162

6263

def _load_rtf(file: BytesIO) -> str:
    """Convert RTF content to plain text.

    The bytes are decoded as UTF-8 (undecodable bytes are ignored), converted
    with `striprtf`, and stripped of surrounding whitespace.
    """
    rtf_source = file.read().decode('utf-8', 'ignore')
    plain_text = striprtf.rtf_to_text(rtf_source)
    return plain_text.strip()
6566

6667

67-
def _load_xml(file: BinaryIO) -> str:
68+
def _load_xml(file: BytesIO) -> str:
6869
data = file.read().decode('utf-8', 'ignore')
6970
data = re.sub(r'</.+>', '', data)
7071
return data.strip()
7172

7273

def _load_xlsx(file: BytesIO) -> str:
    """Render spreadsheet content as plain text.

    The workbook is read with pandas with NA detection disabled
    (``na_filter=False``) and printed without the column header row.
    """
    sheet = read_excel(file, na_filter=False)
    return sheet.to_string(header=False, na_rep='')
7576

7677

77-
def _load_email(file: BinaryIO, ext: str = 'eml') -> str | None:
78+
def _load_email(file: BytesIO, ext: str = 'eml') -> str | None:
7879
# NOTE: msg format is not tested
7980
if ext not in ['eml', 'msg']:
8081
return None
@@ -115,30 +116,34 @@ def attachment_partitioner(
115116
}
116117

117118

def decode_source(source: SourceItem) -> str | None:
    """Decode a source item into plain text.

    Reads ``source.title``, ``source.type``, ``source.content`` and
    ``source.reference``. String content is encoded to UTF-8 and wrapped in a
    ``BytesIO``; any other content is used as the binary stream directly
    (assumed to be a ``BytesIO``-like object -- confirm against ``SourceItem``).

    If ``_loader_map`` has a loader for the item's type, that loader produces
    the text; otherwise the raw bytes are decoded as UTF-8 with undecodable
    bytes ignored.

    Returns:
        The decoded text, or ``None`` when the item is a ``.pot`` file, has no
        type, is an encrypted PDF, yields no text from its loader, or fails
        to decode for any other reason (the failure is logged).

    Note:
        The stream is closed before returning, including when it is the
        object supplied in ``source.content``.
    """
    io_obj: BytesIO | None = None
    try:
        # .pot files are powerpoint templates but also plain text files,
        # so we skip them to prevent decoding errors
        if source.title.endswith('.pot'):
            return None

        mimetype = source.type
        if mimetype is None:
            return None

        if isinstance(source.content, str):
            io_obj = BytesIO(source.content.encode('utf-8', 'ignore'))
        else:
            io_obj = source.content

        loader = _loader_map.get(mimetype)
        if loader:
            result = loader(io_obj)
            # A loader may legitimately return None (e.g. one annotated
            # `str | None`); treat that as "no text" instead of letting
            # `.encode` raise and be logged as a decoding error.
            if result is None:
                return None
            # Round-trip through UTF-8 to drop characters that cannot be encoded.
            return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

        return io_obj.read().decode('utf-8', 'ignore')
    except PdfFileNotDecryptedError:
        logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read')
        return None
    except Exception:
        logger.exception(f'Error decoding source file ({source.reference})', stack_info=True)
        return None
    finally:
        if io_obj is not None:
            io_obj.close()

0 commit comments

Comments
 (0)