From 6659c40d4447a1a4501e248a8091adcf0f592f03 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 26 May 2025 15:39:59 +0530 Subject: [PATCH 1/2] feat: add doc search endpoint Signed-off-by: Anupam Kumar --- context_chat_backend/chain/context.py | 52 ++++++++++++++++++++++++++- context_chat_backend/chain/types.py | 6 ++++ context_chat_backend/controller.py | 19 +++++++++- 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py index 2e0219d..8480239 100644 --- a/context_chat_backend/chain/context.py +++ b/context_chat_backend/chain/context.py @@ -6,8 +6,9 @@ from langchain.schema import Document +from ..dyn_loader import VectorDBLoader from ..vectordb.base import BaseVectorDB -from .types import ContextException, ScopeType +from .types import ContextException, ScopeType, SearchResult logger = logging.getLogger('ccb.chain') @@ -39,3 +40,52 @@ def get_context_chunks(context_docs: list[Document]) -> list[str]: context_chunks.append(doc.page_content) return context_chunks + + +def do_doc_search( + user_id: str, + query: str, + vectordb_loader: VectorDBLoader, + ctx_limit: int = 20, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, +) -> list[SearchResult]: + """ + Raises + ------ + ContextException + If the scope type is provided but the scope list is empty or not provided + """ + db = vectordb_loader.load() + augmented_limit = ctx_limit * 2 # to account for duplicate sources + docs = get_context_docs(user_id, query, db, augmented_limit, scope_type, scope_list) + if len(docs) == 0: + raise ContextException('No documents retrieved, please index a few documents first') + + sources_cache = {} + results: list[SearchResult] = [] + for doc in docs: + source_id = doc.metadata.get('source') + if not source_id: + logger.warning('Document without source id encountered in doc search, skipping', extra={ + 'doc': doc, + }) + continue + if source_id in sources_cache: + continue + if len(results) >= ctx_limit: + break + + sources_cache[source_id] = None + results.append(SearchResult( + source_id=source_id, + title=doc.metadata.get('title', ''), + )) + + logger.debug('do_doc_search', extra={ + 'len(docs)': len(docs), + 'len(results)': len(results), + 'scope_type': scope_type, + 'scope_list': scope_list, + }) + return results diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index 4d9b2ab..b006ad1 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -36,3 +36,9 @@ class ContextException(Exception): class LLMOutput(TypedDict): output: str sources: list[str] + # todo: add "titles" field + + +class SearchResult(TypedDict): + source_id: str + title: str diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index a9970d1..19c236d 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -4,7 +4,7 @@ # # isort: off -from .chain.types import ContextException, LLMOutput, ScopeType +from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult from .types import LoaderException, EmbeddingException from .vectordb.types import DbException, SafeDbException, UpdateAccessOp # isort: on @@ -26,6 +26,7 @@ from nc_py_api.ex_app import persistent_storage, set_handlers from pydantic import BaseModel, ValidationInfo, field_validator +from .chain.context import do_doc_search from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query, process_query from .config_parser import get_config @@ -315,12 +316,14 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') + @app.post('/countIndexedDocuments') @enabled_guard(app) def _(): counts = exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) return JSONResponse(counts) + @app.put('/loadSources') @enabled_guard(app) def _(sources: list[UploadFile]): @@ -467,3 +470,17 @@ def _(query: Query) -> LLMOutput: with llm_lock: return execute_query(query, in_proc=False) + + +@app.post('/docSearch') +@enabled_guard(app) +def _(query: Query) -> list[SearchResult]: + # useContext from Query is not used here + return exec_in_proc(target=do_doc_search, args=( + query.userId, + query.query, + vectordb_loader, + query.ctxLimit, + query.scopeType, + query.scopeList, + )) From bc0773a305a6f0719262f1d1593605858334c3c3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 2 Jun 2025 17:11:13 +0530 Subject: [PATCH 2/2] fix: do not return an error on empty doc retrieval Signed-off-by: Anupam Kumar --- context_chat_backend/chain/context.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py index 8480239..adbac2d 100644 --- a/context_chat_backend/chain/context.py +++ b/context_chat_backend/chain/context.py @@ -60,7 +60,8 @@ def do_doc_search( augmented_limit = ctx_limit * 2 # to account for duplicate sources docs = get_context_docs(user_id, query, db, augmented_limit, scope_type, scope_list) if len(docs) == 0: - raise ContextException('No documents retrieved, please index a few documents first') + logger.warning('No documents retrieved, please index a few documents first') + return [] sources_cache = {} results: list[SearchResult] = []