@@ -76,7 +76,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
7676
7777 metadata = {
7878 'source' : source .filename ,
79- 'title' : source .headers ['title' ],
79+ 'title' : _decode_latin_1 ( source .headers ['title' ]) ,
8080 'type' : source .headers ['type' ],
8181 }
8282 doc = Document (page_content = content , metadata = metadata )
@@ -86,7 +86,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
8686
8787 indocuments .append (InDocument (
8888 documents = split_docs ,
89- userIds = source .headers ['userIds' ].split (',' ),
89+ userIds = list ( map ( _decode_latin_1 , source .headers ['userIds' ].split (',' )) ),
9090 source_id = source .filename , # pyright: ignore[reportArgumentType]
9191 provider = source .headers ['provider' ],
9292 modified = to_int (source .headers ['modified' ]),
@@ -114,7 +114,7 @@ def _process_sources(
114114 try :
115115 vectordb .update_access (
116116 UpdateAccessOp .allow ,
117- source .headers ['userIds' ].split (',' ),
117+ list ( map ( _decode_latin_1 , source .headers ['userIds' ].split (',' )) ),
118118 source .filename , # pyright: ignore[reportArgumentType]
119119 )
120120 except SafeDbException as e :
@@ -141,6 +141,14 @@ def _process_sources(
141141 return added_sources
142142
143143
144+ def _decode_latin_1 (s : str ) -> str :
145+ try :
146+ return s .encode ('latin-1' ).decode ('utf-8' )
147+ except UnicodeDecodeError :
148+ print ('Failed to decode latin-1:' , s , flush = True )
149+ return s
150+
151+
144152def embed_sources (
145153 vectordb_loader : VectorDBLoader ,
146154 config : TConfig ,
@@ -155,7 +163,7 @@ def embed_sources(
155163
156164 print (
157165 'Embedding sources:\n ' +
158- '\n ' .join ([f'{ source .filename } ({ source .headers ["title" ]} )' for source in sources_filtered ]),
166+ '\n ' .join ([f'{ source .filename } ({ _decode_latin_1 ( source .headers ["title" ]) } )' for source in sources_filtered ]),
159167 flush = True ,
160168 )
161169
0 commit comments