From 6659c40d4447a1a4501e248a8091adcf0f592f03 Mon Sep 17 00:00:00 2001
From: Anupam Kumar <kyteinsky@gmail.com>
Date: Mon, 26 May 2025 15:39:59 +0530
Subject: [PATCH 1/2] feat: add doc search endpoint

Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
---
 context_chat_backend/chain/context.py | 52 ++++++++++++++++++++++++++-
 context_chat_backend/chain/types.py   |  6 ++++
 context_chat_backend/controller.py    | 19 +++++++++-
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py
index 2e0219d..8480239 100644
--- a/context_chat_backend/chain/context.py
+++ b/context_chat_backend/chain/context.py
@@ -6,8 +6,9 @@
 
 from langchain.schema import Document
 
+from ..dyn_loader import VectorDBLoader
 from ..vectordb.base import BaseVectorDB
-from .types import ContextException, ScopeType
+from .types import ContextException, ScopeType, SearchResult
 
 logger = logging.getLogger('ccb.chain')
 
@@ -39,3 +40,52 @@ def get_context_chunks(context_docs: list[Document]) -> list[str]:
 		context_chunks.append(doc.page_content)
 
 	return context_chunks
+
+
+def do_doc_search(
+	user_id: str,
+	query: str,
+	vectordb_loader: VectorDBLoader,
+	ctx_limit: int = 20,  # maximum number of results returned
+	scope_type: ScopeType | None = None,
+	scope_list: list[str] | None = None,
+) -> list[SearchResult]:  # one entry per distinct source id, in the order docs were returned
+	"""
+	Raises
+	------
+	ContextException
+		If the scope type is provided but the scope list is empty or not provided
+	"""
+	db = vectordb_loader.load()
+	augmented_limit = ctx_limit * 2 # to account for duplicate sources
+	docs = get_context_docs(user_id, query, db, augmented_limit, scope_type, scope_list)
+	if len(docs) == 0:
+		raise ContextException('No documents retrieved, please index a few documents first')
+
+	sources_cache = {}
+	results: list[SearchResult] = []
+	for doc in docs:
+		source_id = doc.metadata.get('source')
+		if not source_id:
+			logger.warning('Document without source id encountered in doc search, skipping', extra={
+				'doc': doc,
+			})
+			continue
+		if source_id in sources_cache:  # keep only the first doc seen per source
+			continue
+		if len(results) >= ctx_limit:  # docs were over-fetched; cap the output at ctx_limit
+			break
+
+		sources_cache[source_id] = None  # dict used as a set: only key membership is checked
+		results.append(SearchResult(
+			source_id=source_id,
+			title=doc.metadata.get('title', ''),  # empty title when the metadata has none
+		))
+
+	logger.debug('do_doc_search', extra={
+		'len(docs)': len(docs),
+		'len(results)': len(results),
+		'scope_type': scope_type,
+		'scope_list': scope_list,
+	})
+	return results
diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py
index 4d9b2ab..b006ad1 100644
--- a/context_chat_backend/chain/types.py
+++ b/context_chat_backend/chain/types.py
@@ -36,3 +36,9 @@ class ContextException(Exception):
 class LLMOutput(TypedDict):
 	output: str
 	sources: list[str]
+	# todo: add "titles" field
+
+
+class SearchResult(TypedDict):  # one hit returned by the doc search
+	source_id: str  # taken from the document's 'source' metadata
+	title: str  # taken from the document's 'title' metadata, '' when absent
diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py
index a9970d1..19c236d 100644
--- a/context_chat_backend/controller.py
+++ b/context_chat_backend/controller.py
@@ -4,7 +4,7 @@
 #
 
 # isort: off
-from .chain.types import ContextException, LLMOutput, ScopeType
+from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult
 from .types import LoaderException, EmbeddingException
 from .vectordb.types import DbException, SafeDbException, UpdateAccessOp
 # isort: on
@@ -26,6 +26,7 @@
 from nc_py_api.ex_app import persistent_storage, set_handlers
 from pydantic import BaseModel, ValidationInfo, field_validator
 
+from .chain.context import do_doc_search
 from .chain.ingest.injest import embed_sources
 from .chain.one_shot import process_context_query, process_query
 from .config_parser import get_config
@@ -315,12 +316,14 @@ def _(userId: str = Body(embed=True)):
 
 	return JSONResponse('User deleted')
 
+
 @app.post('/countIndexedDocuments')
 @enabled_guard(app)
 def _():
 	counts = exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,))
 	return JSONResponse(counts)
 
+
 @app.put('/loadSources')
 @enabled_guard(app)
 def _(sources: list[UploadFile]):
@@ -467,3 +470,17 @@ def _(query: Query) -> LLMOutput:
 
 	with llm_lock:
 		return execute_query(query, in_proc=False)
+
+
+@app.post('/docSearch')
+@enabled_guard(app)
+def _(query: Query) -> list[SearchResult]:
+	# useContext from Query is not used here
+	return exec_in_proc(target=do_doc_search, args=(  # positional, in do_doc_search's parameter order
+		query.userId,
+		query.query,
+		vectordb_loader,
+		query.ctxLimit,
+		query.scopeType,
+		query.scopeList,
+	))

From bc0773a305a6f0719262f1d1593605858334c3c3 Mon Sep 17 00:00:00 2001
From: Anupam Kumar <kyteinsky@gmail.com>
Date: Mon, 2 Jun 2025 17:11:13 +0530
Subject: [PATCH 2/2] fix: do not return an error on empty doc retrieval

Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
---
 context_chat_backend/chain/context.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py
index 8480239..adbac2d 100644
--- a/context_chat_backend/chain/context.py
+++ b/context_chat_backend/chain/context.py
@@ -60,7 +60,8 @@ def do_doc_search(
 	augmented_limit = ctx_limit * 2 # to account for duplicate sources
 	docs = get_context_docs(user_id, query, db, augmented_limit, scope_type, scope_list)
 	if len(docs) == 0:
-		raise ContextException('No documents retrieved, please index a few documents first')
+		logger.warning('No documents retrieved, please index a few documents first')
+		return []
 
 	sources_cache = {}
 	results: list[SearchResult] = []