2626COLLECTION_NAME = 'ccb_store'
2727DOCUMENTS_TABLE_NAME = 'docs'
2828ACCESS_LIST_TABLE_NAME = 'access_list'
29+ PG_BATCH_SIZE = 50000
2930
3031logger = logging .getLogger ('ccb.vectordb' )
3132
@@ -130,11 +131,17 @@ def get_users(self) -> list[str]:
130131 def add_indocuments (self , indocuments : list [InDocument ]) -> tuple [list [str ], list [str ]]:
131132 added_sources = []
132133 not_added_sources = []
134+ batch_size = PG_BATCH_SIZE // 5
133135
134136 with self .session_maker () as session :
135137 for indoc in indocuments :
136138 try :
137- chunk_ids = self .client .add_documents (indoc .documents )
139+ # the query parameter limit in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
140+ # so we insert the documents in batches of 10k (5 values per document, i.e. 50k parameters per query)
141+ # reduce the batch size if more values are inserted per document
142+ chunk_ids = []
143+ for i in range (0 , len (indoc .documents ), batch_size ):
144+ chunk_ids .extend (self .client .add_documents (indoc .documents [i :i + batch_size ]))
138145
139146 doc = DocumentsStore (
140147 source_id = indoc .source_id ,
@@ -533,10 +540,10 @@ def _similarity_search(
533540
534541 # Initialize results list to store all potential matches
535542 all_results = []
536- batch_size = 50000
537543 # Process chunk_ids in batches to prevent db errors
538- for i in range (0 , len (chunk_ids ), batch_size ):
539- batch_chunk_ids = chunk_ids [i :i + batch_size ]
544+ # the query parameter limit in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html)
545+ for i in range (0 , len (chunk_ids ), PG_BATCH_SIZE ):
546+ batch_chunk_ids = chunk_ids [i :i + PG_BATCH_SIZE ]
540547
541548 filter_by = [
542549 self .client .EmbeddingStore .collection_id == collection .uuid ,
@@ -559,7 +566,7 @@ def _similarity_search(
559566 all_results .extend (batch_results )
560567
561568 # Sort all collected results by distance and take top k
562- if len (chunk_ids ) > batch_size :
569+ if len (chunk_ids ) > PG_BATCH_SIZE :
563570 all_results .sort (key = lambda x : x .distance )
564571 top_k_results = all_results [:k ]
565572
0 commit comments