Skip to content

Commit be82b42

Browse files
authored
fix: utf-8 encoding fixes (#118)
fixes #71
2 parents c3cc44b + f88c798 commit be82b42

3 files changed

Lines changed: 19 additions & 11 deletions

File tree

context_chat_backend/chain/ingest/doc_loader.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
3131
os.remove(tmp.name)
3232

3333
if isinstance(docs, str) or isinstance(docs, bytes):
34-
return docs.decode('utf-8') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType]
34+
return docs.decode('utf-8', 'ignore') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType]
3535

3636
return sep.join(d.page_content for d in docs)
3737

@@ -64,11 +64,11 @@ def _load_ppt_x(file: BinaryIO) -> str:
6464

6565

6666
def _load_rtf(file: BinaryIO) -> str:
    """Return the plain text of an RTF file.

    The raw bytes are decoded as UTF-8, silently dropping any bytes that
    are not valid UTF-8, then converted to text with ``striprtf`` and
    stripped of leading/trailing whitespace.
    """
    rtf_source = file.read().decode('utf-8', 'ignore')
    plain_text = striprtf.rtf_to_text(rtf_source)
    return plain_text.strip()
6868

6969

7070
def _load_xml(file: BinaryIO) -> str:
71-
data = file.read().decode('utf-8')
71+
data = file.read().decode('utf-8', 'ignore')
7272
data = re.sub(r'</.+>', '', data)
7373
return data.strip()
7474

@@ -122,10 +122,10 @@ def decode_source(source: UploadFile) -> str | None:
122122
try:
123123
# .pot files are powerpoint templates but also plain text files,
124124
# so we skip them to prevent decoding errors
125-
if source.headers.get('title', '').endswith('.pot'):
125+
if source.headers['title'].endswith('.pot'):
126126
return None
127127

128-
mimetype = source.headers.get('type')
128+
mimetype = source.headers['type']
129129
if mimetype is None:
130130
return None
131131

@@ -134,7 +134,7 @@ def decode_source(source: UploadFile) -> str | None:
134134
source.file.close()
135135
return result
136136

137-
result = source.file.read().decode('utf-8')
137+
result = source.file.read().decode('utf-8', 'ignore')
138138
source.file.close()
139139
return result
140140
except Exception:

context_chat_backend/chain/ingest/injest.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
7676

7777
metadata = {
7878
'source': source.filename,
79-
'title': source.headers['title'],
79+
'title': _decode_latin_1(source.headers['title']),
8080
'type': source.headers['type'],
8181
}
8282
doc = Document(page_content=content, metadata=metadata)
@@ -86,7 +86,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
8686

8787
indocuments.append(InDocument(
8888
documents=split_docs,
89-
userIds=source.headers['userIds'].split(','),
89+
userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))),
9090
source_id=source.filename, # pyright: ignore[reportArgumentType]
9191
provider=source.headers['provider'],
9292
modified=to_int(source.headers['modified']),
@@ -114,7 +114,7 @@ def _process_sources(
114114
try:
115115
vectordb.update_access(
116116
UpdateAccessOp.allow,
117-
source.headers['userIds'].split(','),
117+
list(map(_decode_latin_1, source.headers['userIds'].split(','))),
118118
source.filename, # pyright: ignore[reportArgumentType]
119119
)
120120
except SafeDbException as e:
@@ -141,6 +141,14 @@ def _process_sources(
141141
return added_sources
142142

143143

144+
def _decode_latin_1(s: str) -> str:
145+
try:
146+
return s.encode('latin-1').decode('utf-8')
147+
except UnicodeDecodeError:
148+
print('Failed to decode latin-1:', s, flush=True)
149+
return s
150+
151+
144152
def embed_sources(
145153
vectordb_loader: VectorDBLoader,
146154
config: TConfig,
@@ -155,7 +163,7 @@ def embed_sources(
155163

156164
print(
157165
'Embedding sources:\n' +
158-
'\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]),
166+
'\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]),
159167
flush=True,
160168
)
161169

context_chat_backend/ocs_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def _verify_signature(headers: Headers) -> str | None:
4444
)
4545
return None
4646

47-
auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8')
47+
auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8', 'ignore')
4848
username, app_secret = auth_aa.split(':', maxsplit=1)
4949

5050
if app_secret != getenv('APP_SECRET'):

0 commit comments

Comments
 (0)