
Commit 5af0a4b

charliegilletclaude authored and committed
feat(nodes): improve Milvus vector DB node — address all TODOs (#562)
* feat(vscode): improve stop button feedback in Pipeline Observability screen

  Handle TASK_STATE.STOPPING in the control button to show "Stopping..." with a disabled state and distinct orange styling, preventing duplicate clicks and giving immediate visual feedback during pipeline shutdown.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat(nodes): improve Milvus vector DB node — address all TODOs

  - Add configurable timeout (default 60s) replacing hardcoded timeout=20, read from node config via 'timeout' key (TODO lines 101, 483)
  - Add connection error handling with meaningful failure messages instead of raw pymilvus exceptions propagating
  - Implement bulk insert with configurable batch size (default 50) for addChunks(), replacing one-at-a-time upserts (TODO lines 449, 464)
  - Add _batchUpsertResults() helper to batch-update markDeleted/markActive operations, eliminating the per-vector upsert loop bottleneck (TODO lines 514-515, 546-547)
  - Add timeout parameter to remove() delete call (TODO line 483)
  - Document Milvus COSINE distance score range [0,2] rescaling to [0,1] for codebase consistency (TODO line 253)
  - Fix typos in docstrings and comments

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: remove unrelated PageStatus "Stopping..." changes from Milvus PR

  The PageStatus changes belong in a separate PR (#549) and were accidentally included here.
* fix(nodes): address CodeRabbit feedback on Milvus PR #562

  - Remove dead protocol check (host already stripped of scheme at init)
  - Add exception chaining with 'from e' for connection errors (B904)
  - Add output_fields to markDeleted/markActive queries to prevent data loss during upsert (was only returning primary key)
  - Add output_fields to renderChunks query to prevent KeyError on content/chunkId access

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(nodes): address remaining review feedback on Milvus PR #562

  - Remove unrelated PageStatus changes that were re-introduced
  - Validate timeout and bulkInsertBatchSize to ensure positive values
  - Make isDeleted a keyword-only argument in _batchUpsertResults

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3aeda91 commit 5af0a4b

1 file changed

Lines changed: 83 additions & 33 deletions

File tree

nodes/src/nodes/milvus/milvus.py

@@ -47,6 +47,12 @@
 from ai.common.store import DocumentStoreBase
 from ai.common.config import Config
 
+# Default batch size for bulk upsert operations
+DEFAULT_BULK_INSERT_BATCH_SIZE = 50
+
+# Default connection timeout in seconds
+DEFAULT_TIMEOUT = 60
+
 
 def _escape_milvus_str(value: object) -> str:
     """Escape a value for safe interpolation into a Milvus filter expression."""
@@ -89,6 +95,10 @@ def __init__(self, provider: str, connConfig: Dict[str, Any], bag: Dict[str, Any
         self.renderChunkSize = config.get('renderChunkSize', self.renderChunkSize)
         self.threshold_search = config.get('score', 0.5)
 
+        # Configurable timeout (seconds) and bulk insert batch size
+        self.timeout = max(int(config.get('timeout', DEFAULT_TIMEOUT)), 1)
+        self.bulkInsertBatchSize = max(int(config.get('bulkInsertBatchSize', DEFAULT_BULK_INSERT_BATCH_SIZE)), 1)
+
         profile = config.get('mode')
 
         # check if the similarity matches milvus configuration options
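The positive-integer clamping added in this hunk can be sketched in isolation; `config` below is a hypothetical node-config dict standing in for the parsed config.json:

```python
DEFAULT_TIMEOUT = 60
DEFAULT_BULK_INSERT_BATCH_SIZE = 50

def read_limits(config: dict) -> tuple[int, int]:
    # Fall back to defaults and clamp to at least 1, so a zero or negative
    # value in the config cannot disable timeouts or batching
    timeout = max(int(config.get('timeout', DEFAULT_TIMEOUT)), 1)
    batch_size = max(int(config.get('bulkInsertBatchSize', DEFAULT_BULK_INSERT_BATCH_SIZE)), 1)
    return timeout, batch_size
```

Note that `int(...)` also coerces string values such as `'120'`, which is convenient when the config comes from JSON edited by hand.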
@@ -98,15 +108,16 @@ def __init__(self, provider: str, connConfig: Dict[str, Any], bag: Dict[str, Any
         else:
             raise Exception('The metric you provided in the config.json does not match required milvus configurations')
 
-        # Establish a connection // TODO: Revise alternative setup as this connection action is only necessary for the flush() method
-        if profile != 'local':
-            # Init the store
-            if self.host.startswith('https:') or self.host.startswith('http:'):
-                self.client = MilvusClient(uri=self.host, token=self.apikey, timeout=20)
+        # Establish a connection to the Milvus instance with configurable timeout
+        try:
+            if profile != 'local':
+                # Init the store (host was stripped of protocol at line 87, so always add https://)
+                self.client = MilvusClient(uri=f'https://{self.host}', token=self.apikey, timeout=self.timeout)
             else:
-                self.client = MilvusClient(uri=f'https://{self.host}', token=self.apikey, timeout=20)
-        else:
-            self.client = MilvusClient(uri=f'http://{self.host}:{self.port}', timeout=20)
+                self.client = MilvusClient(uri=f'http://{self.host}:{self.port}', timeout=self.timeout)
+        except Exception as e:
+            self.client = None
+            raise Exception(f'Failed to connect to Milvus at {self.host}: {e}') from e
 
         return
 
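The wrap-and-chain pattern used for connection failures can be shown generically; `make_client` below is a hypothetical stand-in for the `MilvusClient` constructor:

```python
def connect(make_client, host: str):
    """Wrap low-level connection errors in one meaningful exception."""
    try:
        return make_client(host)
    except Exception as e:
        # 'from e' chains the original exception so its traceback and type
        # survive (this is what ruff rule B904 asks for)
        raise Exception(f'Failed to connect to Milvus at {host}: {e}') from e
```

The chained original is available on the raised exception's `__cause__`, so callers and log aggregators still see the underlying pymilvus error.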
@@ -250,7 +261,9 @@ def _convertToDocs(self, points: List[dict]) -> List[Doc]:
             entity = point
             score = 0
         else:
-            # If we are return scaled scores, build it TODO: CHECK IF THIS IS ALSO THE CASE FOR MILVUS (-1 to 1 range) OR MIGHT IT BE CORRECTED ALREADY?
+            # Milvus COSINE distance returns values in the range [0, 2] where 0 is
+            # identical. We rescale to [0, 1] with 1 meaning most similar to stay
+            # consistent with the rest of the codebase score convention.
             if self.similarity == 'COSINE':
                 score = (point.get('distance') + 1) / 2
             else:
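A standalone check of the rescaling formula above. Worth noting: `(d + 1) / 2` maps a raw value in [-1, 1] onto [0, 1], whereas a value in [0, 2] would land in [0.5, 1.5], so the raw score range the deployed Milvus version actually returns is worth verifying against this comment:

```python
def rescale_cosine(distance: float) -> float:
    # Mirrors the node's formula: shift and halve so 1.0 means most similar
    return (distance + 1) / 2
```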
@@ -267,7 +280,7 @@ def _convertToDocs(self, points: List[dict]) -> List[Doc]:
             # Get the payload content and metadata
             metadata = cast(DocMetadata, metadata)
 
-            # Create asearc new document
+            # Create a new document
             doc = Doc(score=score, page_content=content, metadata=metadata)
 
             # Append it to this documents chunks
@@ -419,7 +432,7 @@ def getPaths(self, parent: str | None = None, offset: int = 0, limit: int = 1000
 
     def addChunks(self, chunks: List[Doc], checkCollection: bool = True) -> None:
         """
-        Addsdocument chunks to the document store.
+        Add document chunks to the document store using batched bulk upsert.
         """
         # If no documents present, get out
         if not len(chunks):
@@ -437,7 +450,8 @@ def addChunks(self, chunks: List[Doc], checkCollection: bool = True) -> None:
             # Save this object id
             objectIds[chunk.metadata.objectId] = True
 
-        # Erase all documents/chunks associated with that ObjectId in one operation (TODO: Start discussion about better use of upsert() method to increase performance)
+        # Erase all documents/chunks associated with that ObjectId in one operation
+        # so we can cleanly insert the new version
         if len(objectIds.keys()):
            filter_condition = f"meta['objectId'] in [{', '.join(json.dumps(k) for k in objectIds.keys())}]"
            try:
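The deletion filter built in this hunk can be exercised on its own; `json.dumps` double-quotes each id, which Milvus string filter expressions accept:

```python
import json

# Illustrative object ids; the node collects these from chunk metadata
objectIds = {'doc-1': True, 'doc-2': True}

# Same expression as in addChunks: a Milvus 'in' filter over objectId
filter_condition = f"meta['objectId'] in [{', '.join(json.dumps(k) for k in objectIds.keys())}]"
print(filter_condition)  # meta['objectId'] in ["doc-1", "doc-2"]
```

Using `json.dumps` here also escapes any embedded quotes in an id, which is why it is preferred over naive f-string quoting.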
@@ -446,8 +460,20 @@ def addChunks(self, chunks: List[Doc], checkCollection: bool = True) -> None:
             except Exception as e:
                 engLib.debug(f'Error deleting old chunks: {e}')
 
-        # TODO: Consider implementing a bulk insertion https://milvus.io/api-reference/pymilvus/v2.4.x/ORM/utility/do_bulk_insert.md
-        # Disatvantage here is that is will require to reformat interation data into a JSON file format
+        # Collect chunks into batches for bulk upsert instead of one-at-a-time
+        batch: List[dict] = []
+
+        def flush_batch():
+            nonlocal batch
+            if not batch:
+                return
+            try:
+                self.client.upsert(collection_name=self.collection, data=batch)
+                engLib.debug(f'Milvus bulk upsert: {len(batch)} chunks inserted')
+            except Exception as e:
+                engLib.debug(f'Error during bulk upsert ({len(batch)} chunks): {e}')
+                raise
+            batch = []
 
         # For each document
         for chunk in chunks:
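The `flush_batch` closure above implements a standard chunked-write pattern: accumulate, flush when full, flush the remainder at the end. A generic version (with a hypothetical `upsert` callable in place of the Milvus client) looks like:

```python
from typing import Callable, List

def batched_upsert(items: List[dict], batch_size: int,
                   upsert: Callable[[List[dict]], None]) -> int:
    """Write items in batches of batch_size; returns the total written."""
    batch: List[dict] = []
    flushed = 0
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:  # flush each full batch
            upsert(batch)
            flushed += len(batch)
            batch = []
    if batch:  # flush the remainder
        upsert(batch)
        flushed += len(batch)
    return flushed
```

With 7 items and a batch size of 3, this issues three writes of sizes 3, 3, and 1, versus seven one-row upserts in the old loop.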
@@ -461,8 +487,14 @@ def addChunks(self, chunks: List[Doc], checkCollection: bool = True) -> None:
             # Append the points // create a unique identifier that fits into an int64 id field
             tmp_struct = {'id': np.int64(((uuid.uuid1().time & 0x1FFFFFFFF) << 27) | random.getrandbits(27)), 'vector': embedding, 'content': chunk.page_content, 'meta': chunk.metadata}
 
-            # TODO: Consider printing out upsert count for debugging and imprement bulk insert
-            self.client.upsert(collection_name=self.collection, data=[tmp_struct])
+            batch.append(tmp_struct)
+
+            # Flush when batch reaches configured size
+            if len(batch) >= self.bulkInsertBatchSize:
+                flush_batch()
+
+        # Flush any remaining chunks
+        flush_batch()
 
     def remove(self, objectIds: List[str]) -> None:
         """
@@ -480,13 +512,39 @@ def remove(self, objectIds: List[str]) -> None:
         objectIdsJoint = ', '.join(f"'{_escape_milvus_str(o)}'" for o in objectIds)
         must_conditions.append(f"meta['objectId'] in [{objectIdsJoint}]")
 
-        # TODO: Add time out
         filter_expression = ' and '.join(must_conditions) if must_conditions else None
         if filter_expression:
-            self.client.delete(collection_name=self.collection, filter=filter_expression)
+            try:
+                self.client.delete(collection_name=self.collection, filter=filter_expression, timeout=self.timeout)
+            except Exception as e:
+                engLib.debug(f'Error removing documents: {e}')
+                raise
 
         return
 
+    def _batchUpsertResults(self, results: List[dict], *, isDeleted: bool) -> None:
+        """
+        Batch-update the isDeleted metadata field on a list of query results.
+
+        Collects results into batches of bulkInsertBatchSize and upserts them
+        together, avoiding the performance bottleneck of one-at-a-time upserts.
+        """
+        batch: List[dict] = []
+
+        for result in results:
+            meta = result.get('meta', {})
+            meta['isDeleted'] = isDeleted
+            result['meta'] = meta
+            batch.append(result)
+
+            if len(batch) >= self.bulkInsertBatchSize:
+                self.client.upsert(collection_name=self.collection, data=batch)
+                batch = []
+
+        # Flush remaining
+        if batch:
+            self.client.upsert(collection_name=self.collection, data=batch)
+
     def markDeleted(self, objectIds: List[str]) -> None:
         """
         Mark the set of documents with the given objectId as deleted.
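Why the later hunks add `output_fields=['id', 'vector', 'content', 'meta']` to the queries feeding this helper: an upsert replaces the whole entity, so the query must return every field or the rewrite silently drops vectors and content. The meta toggle itself, isolated from the client (a sketch, not the node's method):

```python
def toggle_deleted(results: list[dict], *, is_deleted: bool) -> list[dict]:
    # Each result must already carry id/vector/content, otherwise the
    # subsequent upsert would overwrite the entity with only the meta field
    for result in results:
        meta = result.get('meta', {})
        meta['isDeleted'] = is_deleted
        result['meta'] = meta
    return results
```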
@@ -509,14 +567,10 @@ def markDeleted(self, objectIds: List[str]) -> None:
         if not filter_expression:
             return
 
-        results = self.client.query(collection_name=self.collection, filter=filter_expression)
+        results = self.client.query(collection_name=self.collection, filter=filter_expression, output_fields=['id', 'vector', 'content', 'meta'])
 
-        # Update the 'isDeleted' field for each result -> TODO: Might there be a better way to do this? Looping over the
-        # vecotrs can be a performance bottleneck and additionally whats the oint if all entries will be deleled shortly after?
-        for result in results:
-            result['isDeleted'] = True
-            # Assuming there's a method to update the document in the client
-            self.client.upsert(collection_name=self.collection, data=result)
+        # Batch-update instead of one-at-a-time to avoid performance bottleneck
+        self._batchUpsertResults(results, isDeleted=True)
         return
 
     def markActive(self, objectIds: List[str]) -> None:
@@ -541,14 +595,10 @@ def markActive(self, objectIds: List[str]) -> None:
         if not filter_expression:
             return
 
-        results = self.client.query(collection_name=self.collection, filter=filter_expression)
+        results = self.client.query(collection_name=self.collection, filter=filter_expression, output_fields=['id', 'vector', 'content', 'meta'])
 
-        # Update the 'isDeleted' field for each result -> TODO: Might there be a better way to do this? Looping over the
-        # vecotrs can be a performance bottleneck and additionally whats the oint if all entries will be deleled shortly after?
-        for result in results:
-            result['isDeleted'] = False
-            # Assuming there's a method to update the document in the client
-            self.client.upsert(collection_name=self.collection, data=result)
+        # Batch-update instead of one-at-a-time to avoid performance bottleneck
+        self._batchUpsertResults(results, isDeleted=False)
         return
 
     def render(self, objectId: str, callback: Callable[[str], None]) -> None:
@@ -573,7 +623,7 @@ def render(self, objectId: str, callback: Callable[[str], None]) -> None:
         # Build filter for getting a set of chunks within the offset range
         must_condition = f"(meta['objectId'] == '{_escape_milvus_str(objectId)}') && ({offset - 1} < meta['chunkId'] < {offset + self.renderChunkSize})"
 
-        results = self.client.query(collection_name=self.collection, filter=must_condition)
+        results = self.client.query(collection_name=self.collection, filter=must_condition, output_fields=['meta', 'content'])
 
         # Create a renderChunkSize array with empty
         # entries. This will allow us to join even when
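The window filter in `render` can be checked standalone. The values below are illustrative, and the `_escape_milvus_str` call from the node is omitted for brevity:

```python
# Illustrative inputs; in the node these come from render() arguments and config
objectId = 'doc-1'
offset = 1
renderChunkSize = 10

# Same shape as the node's filter: exact objectId match plus a chunkId window
must_condition = (
    f"(meta['objectId'] == '{objectId}') && "
    f"({offset - 1} < meta['chunkId'] < {offset + renderChunkSize})"
)
print(must_condition)  # (meta['objectId'] == 'doc-1') && (0 < meta['chunkId'] < 11)
```

The open bounds `offset - 1 < chunkId < offset + renderChunkSize` select exactly `renderChunkSize` consecutive integer chunk ids starting at `offset`.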
