Skip to content

Commit 15a9922

Browse files
authored
fix: improve loadSources error handling (#288)
Companion PR: nextcloud/context_chat#228
---------
Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
1 parent cbca6e8 commit 15a9922

2 files changed

Lines changed: 29 additions & 9 deletions

File tree

context_chat_backend/controller.py

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -343,9 +343,15 @@ def _(sources: list[UploadFile]):
343343
if len(sources) == 0:
344344
return JSONResponse('No sources provided', 400)
345345

346+
filtered_sources = []
347+
346348
for source in sources:
347349
if not value_of(source.filename):
348-
return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400)
350+
logger.warning('Skipping source with invalid source_id', extra={
351+
'source_id': source.filename,
352+
'title': source.headers.get('title'),
353+
})
354+
continue
349355

350356
with index_lock:
351357
if source.filename in _indexing:
@@ -364,12 +370,14 @@ def _(sources: list[UploadFile]):
364370
and source.headers['modified'].isdigit()
365371
and value_of(source.headers.get('provider'))
366372
):
367-
logger.error('Invalid/missing headers received', extra={
373+
logger.warning('Skipping source with invalid/missing headers', extra={
368374
'source_id': source.filename,
369375
'title': source.headers.get('title'),
370376
'headers': source.headers,
371377
})
372-
return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400)
378+
continue
379+
380+
filtered_sources.append(source)
373381

374382
# wait for 10 minutes before failing the request
375383
semres = doc_parse_semaphore.acquire(block=True, timeout=10*60)
@@ -381,27 +389,27 @@ def _(sources: list[UploadFile]):
381389
)
382390

383391
with index_lock:
384-
for source in sources:
392+
for source in filtered_sources:
385393
_indexing[source.filename] = source.size
386394

387395
try:
388396
loaded_sources, not_added_sources = exec_in_proc(
389397
target=embed_sources,
390-
args=(vectordb_loader, app.extra['CONFIG'], sources)
398+
args=(vectordb_loader, app.extra['CONFIG'], filtered_sources)
391399
)
392400
except (DbException, EmbeddingException):
393401
raise
394402
except Exception as e:
395403
raise DbException('Error: failed to load sources') from e
396404
finally:
397405
with index_lock:
398-
for source in sources:
406+
for source in filtered_sources:
399407
_indexing.pop(source.filename, None)
400408
doc_parse_semaphore.release()
401409

402-
if len(loaded_sources) != len(sources):
410+
if len(loaded_sources) != len(filtered_sources):
403411
logger.debug('Some sources were not loaded', extra={
404-
'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}',
412+
'Count of loaded sources': f'{len(loaded_sources)}/{len(filtered_sources)}',
405413
'source_ids': loaded_sources,
406414
})
407415

context_chat_backend/vectordb/pgvector.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import os
77
from datetime import datetime
88

9+
import psycopg
910
import sqlalchemy as sa
1011
import sqlalchemy.dialects.postgresql as postgresql_dialects
1112
import sqlalchemy.orm as orm
@@ -155,7 +156,18 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis
155156
chunks=chunk_ids,
156157
)
157158
session.add(doc)
158-
session.commit()
159+
try:
160+
session.commit()
161+
except sa.exc.IntegrityError as ie: # pyright: ignore[reportAttributeAccessIssue]
162+
if not isinstance(ie.orig, psycopg.errors.UniqueViolation):
163+
raise
164+
165+
# it's already in the db, continue updating the access
166+
logger.debug(
167+
'Unique violation: document already exists in the database',
168+
exc_info=ie,
169+
extra={ 'source_id': indoc.source_id },
170+
)
159171

160172
self.decl_update_access(indoc.userIds, indoc.source_id, session)
161173
added_sources.append(indoc.source_id)

0 commit comments

Comments
 (0)