
Commit 02a4d28

Fix: query timeout when too many sources/chunks are passed to a Postgres IN clause (#166)
When a context chat has too many documents/chunks (more than the 65535 query-parameter limit) passed into certain IN statements, the query times out. See https://www.postgresql.org/docs/current/limits.html

Changes:
1. doc_search now gets the user's accessible chunks in one SQL statement using a join on the access list instead of a separate query plus an IN over source_ids.
2. _similarity_search now batches the embedding search in batches of 50k chunks.
3. Document insertion now happens in batches of 10k chunks.
2 parents 82f1aef + 267aa44 commit 02a4d28
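
Background for the fix (an editor's sketch, not part of the commit): PostgreSQL accepts at most 65535 bind parameters per statement, and an expression like column.in_(ids) binds one parameter per list element, so oversized ID lists have to be sliced before they reach the database. The slicing pattern the commit inlines in both code paths looks like the following; batched is a hypothetical helper name used only for illustration:

    PG_MAX_QUERY_PARAMS = 65535  # per-statement cap, per the Postgres limits page

    def batched(items: list, batch_size: int):
        # yield successive slices of at most batch_size items
        for i in range(0, len(items), batch_size):
            yield items[i:i + batch_size]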

1 file changed: 42 additions & 29 deletions

context_chat_backend/vectordb/pgvector.py
@@ -26,6 +26,7 @@
 COLLECTION_NAME = 'ccb_store'
 DOCUMENTS_TABLE_NAME = 'docs'
 ACCESS_LIST_TABLE_NAME = 'access_list'
+PG_BATCH_SIZE = 50000
 
 logger = logging.getLogger('ccb.vectordb')
 
@@ -130,11 +131,17 @@ def get_users(self) -> list[str]:
     def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], list[str]]:
         added_sources = []
         not_added_sources = []
+        batch_size = PG_BATCH_SIZE // 5
 
         with self.session_maker() as session:
             for indoc in indocuments:
                 try:
-                    chunk_ids = self.client.add_documents(indoc.documents)
+                    # query parameters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
+                    # so we chunk the documents into (5 values * 10k) chunks
+                    # change the chunk size when there are more inserted values per document
+                    chunk_ids = []
+                    for i in range(0, len(indoc.documents), batch_size):
+                        chunk_ids.extend(self.client.add_documents(indoc.documents[i:i+batch_size]))
 
                     doc = DocumentsStore(
                         source_id=indoc.source_id,
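
A note on the batch size above (my arithmetic from the "5 values * 10k" comment; the per-row value count is an assumption, not stated elsewhere in the commit): if each inserted chunk row binds about 5 values, a batch of PG_BATCH_SIZE // 5 = 10000 rows stays near 50k bound parameters, under the 65535 cap:

    PG_BATCH_SIZE = 50000
    VALUES_PER_CHUNK_ROW = 5  # assumed from the '5 values * 10k' comment
    batch_size = PG_BATCH_SIZE // VALUES_PER_CHUNK_ROW  # 10000 rows per insert batch
    assert batch_size * VALUES_PER_CHUNK_ROW <= 65535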
@@ -497,24 +504,17 @@ def doc_search(
 
         try:
             with self.session_maker() as session:
-                # get user's access list
-                stmt = (
-                    sa.select(AccessListStore.source_id)
-                    .filter(AccessListStore.uid == user_id)
-                )
-                result = session.execute(stmt).fetchall()
-                source_ids = [r.source_id for r in result]
-
-                doc_filters = [DocumentsStore.source_id.in_(source_ids)]
+                doc_filters = [AccessListStore.uid == user_id]
                 match scope_type:
                     case ScopeType.PROVIDER:
                         doc_filters.append(DocumentsStore.provider.in_(scope_list))  # pyright: ignore[reportArgumentType]
                     case ScopeType.SOURCE:
                         doc_filters.append(DocumentsStore.source_id.in_(scope_list))  # pyright: ignore[reportArgumentType]
 
-                # get chunks associated with the source_ids
+                # get chunks associated with the user
                 stmt = (
                     sa.select(DocumentsStore.chunks)
+                    .join(AccessListStore, AccessListStore.source_id==DocumentsStore.source_id)
                     .filter(*doc_filters)
                 )
                 result = session.execute(stmt).fetchall()
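
The point of the join (a sketch restating the change, not extra commit code): the old path ran one query to fetch the user's source_ids and then bound one parameter per source via DocumentsStore.source_id.in_(source_ids), which runs into the same parameter limit for users with many accessible sources; the new statement binds a single uid parameter no matter how many sources the user can access:

    # before: 1 bind parameter per accessible source
    # doc_filters = [DocumentsStore.source_id.in_(source_ids)]
    # after: exactly 1 bind parameter for the user id
    stmt = (
        sa.select(DocumentsStore.chunks)
        .join(AccessListStore, AccessListStore.source_id == DocumentsStore.source_id)
        .filter(AccessListStore.uid == user_id)
    )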
@@ -538,30 +538,43 @@ def _similarity_search(
             if not collection:
                 raise DbException('Collection not found')
 
-            filter_by = [
-                self.client.EmbeddingStore.collection_id == collection.uuid,
-                self.client.EmbeddingStore.id.in_(chunk_ids),
-            ]
+            # Initialize results list to store all potential matches
+            all_results = []
+            # Process chunk_ids in batches to prevent db errors
+            # query parameters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
+            for i in range(0, len(chunk_ids), PG_BATCH_SIZE):
+                batch_chunk_ids = chunk_ids[i:i+PG_BATCH_SIZE]
 
-            results = (
-                session.query(
-                    self.client.EmbeddingStore,
-                    self.client.distance_strategy(embedding).label('distance'),
-                )
-                .filter(*filter_by)
-                .order_by(sa.asc('distance'))
-                .join(
-                    self.client.CollectionStore,
-                    self.client.EmbeddingStore.collection_id == self.client.CollectionStore.uuid,
+                filter_by = [
+                    self.client.EmbeddingStore.collection_id == collection.uuid,
+                    self.client.EmbeddingStore.id.in_(batch_chunk_ids),
+                ]
+
+                batch_results = (
+                    session.query(
+                        self.client.EmbeddingStore,
+                        self.client.distance_strategy(embedding).label('distance'),
+                    )
+                    .filter(*filter_by)
+                    .join(
+                        self.client.CollectionStore,
+                        self.client.EmbeddingStore.collection_id == self.client.CollectionStore.uuid,
+                    )
+                    .order_by(sa.asc('distance'))
+                    .limit(k)  # Get up to k results from the batch
                 )
-                .limit(k)
-                .all()
-            )
+
+                all_results.extend(batch_results)
+
+            # Sort all collected results by distance and take top k
+            if len(chunk_ids) > PG_BATCH_SIZE:
+                all_results.sort(key=lambda x: x.distance)
+            top_k_results = all_results[:k]
 
             return [
                 Document(
                     id=str(result.EmbeddingStore.id),
                     page_content=result.EmbeddingStore.document,
                     metadata=result.EmbeddingStore.cmetadata,
-                ) for result in results
+                ) for result in top_k_results
             ]
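
Why per-batch .limit(k) plus a final sort is sufficient (reasoning, with an equivalent reformulation rather than the commit's code): any row in the global top k must also rank in the top k of its own batch, so the union of per-batch results always contains the true answer; taking the k smallest distances from that union yields the global result. heapq expresses the final selection without fully sorting:

    import heapq

    def merge_top_k(all_results: list, k: int) -> list:
        # 'distance' is the label the query attaches to each row
        return heapq.nsmallest(k, all_results, key=lambda r: r.distance)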
