Skip to content

Commit 1ed2f82

Browse files
committed
fix: limit params in insert query too
Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
1 parent 8c9f089 commit 1ed2f82

1 file changed

Lines changed: 12 additions & 5 deletions

File tree

context_chat_backend/vectordb/pgvector.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
COLLECTION_NAME = 'ccb_store'
2727
DOCUMENTS_TABLE_NAME = 'docs'
2828
ACCESS_LIST_TABLE_NAME = 'access_list'
29+
PG_BATCH_SIZE = 50000
2930

3031
logger = logging.getLogger('ccb.vectordb')
3132

@@ -130,11 +131,17 @@ def get_users(self) -> list[str]:
130131
def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], list[str]]:
131132
added_sources = []
132133
not_added_sources = []
134+
batch_size = PG_BATCH_SIZE // 5
133135

134136
with self.session_maker() as session:
135137
for indoc in indocuments:
136138
try:
137-
chunk_ids = self.client.add_documents(indoc.documents)
139+
# query parameters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
140+
# so we insert the documents in batches of 10k (5 values per document * 10k documents = 50k parameters)
141+
# change the chunk size when there are more inserted values per document
142+
chunk_ids = []
143+
for i in range(0, len(indoc.documents), batch_size):
144+
chunk_ids.extend(self.client.add_documents(indoc.documents[i:i+batch_size]))
138145

139146
doc = DocumentsStore(
140147
source_id=indoc.source_id,
@@ -533,10 +540,10 @@ def _similarity_search(
533540

534541
# Initialize results list to store all potential matches
535542
all_results = []
536-
batch_size = 50000
537543
# Process chunk_ids in batches to prevent db errors
538-
for i in range(0, len(chunk_ids), batch_size):
539-
batch_chunk_ids = chunk_ids[i:i+batch_size]
544+
# query parameters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
545+
for i in range(0, len(chunk_ids), PG_BATCH_SIZE):
546+
batch_chunk_ids = chunk_ids[i:i+PG_BATCH_SIZE]
540547

541548
filter_by = [
542549
self.client.EmbeddingStore.collection_id == collection.uuid,
@@ -559,7 +566,7 @@ def _similarity_search(
559566
all_results.extend(batch_results)
560567

561568
# Sort all collected results by distance and take top k
562-
if len(chunk_ids) > batch_size:
569+
if len(chunk_ids) > PG_BATCH_SIZE:
563570
all_results.sort(key=lambda x: x.distance)
564571
top_k_results = all_results[:k]
565572

0 commit comments

Comments
 (0)