Commit d8765b2

fix(pgvector): fix ConcurrentInsertRunner for non-thread-safe DBs
For non-thread-safe DBs (e.g. PgVector), ConcurrentInsertRunner clamps
max_workers to 1, so there is always exactly one worker thread. There is no
need to deepcopy self.db per thread — the single worker can use self.db
directly via the connection already opened by task()'s `with self.db.init():`.

The original code called deepcopy(self.db) inside _get_thread_db() after
task() had already opened a live psycopg C-extension Connection on self.db.
C-extension objects cannot be deep-copied, causing:

    TypeError: no default __reduce__ due to non-trivial __cinit__

Fix: remove the deepcopy branch entirely. All workers (thread-safe or not)
now use self.db directly; thread-safety is guaranteed for non-thread-safe
DBs by the max_workers=1 clamp.

Also clean up stale comments in pgvector.py left over from
zilliztech#760/zilliztech#763.

Adds tests/test_pgvector.py with:

- unit test that reproduces the bug (fails on original, passes on fix)
- e2e regression test via ConcurrentInsertRunner + OpenAI 50K dataset

See also: zilliztech#756

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
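The failure is reproducible outside the runner entirely. A minimal sketch, assuming a reachable Postgres and psycopg 3 installed with its C extension (the DSN below is a placeholder matching the test config):

from copy import deepcopy

import psycopg

conn = psycopg.connect("host=localhost dbname=vectordb user=vectordb password=vectordb")
try:
    # Deep-copying any object graph that contains a live C-extension
    # Connection fails: the Cython type defines no __reduce__, so the
    # copy machinery cannot reconstruct it.
    deepcopy(conn)
except TypeError as e:
    print(e)  # no default __reduce__ due to non-trivial __cinit__
finally:
    conn.close()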
1 parent b3613ff commit d8765b2

4 files changed

Lines changed: 211 additions & 77 deletions

tests/pytest.ini

Lines changed: 3 additions & 0 deletions
@@ -3,3 +3,6 @@
 filterwarnings =
     ignore::UserWarning
     ignore::DeprecationWarning
+
+markers =
+    integration: tests that require external services or network access (deselect with -m "not integration")
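With the marker registered, the suite splits cleanly into fast unit tests and the Docker-backed e2e test (commands follow the marker description above):

pytest tests/ -m "not integration"              # skip tests needing external services
pytest tests/test_pgvector.py -m integration    # run only the e2e regression test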

tests/test_pgvector.py

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
"""Tests for PgVector client and ConcurrentInsertRunner.

Reproduces issue #756: insert fails with
    TypeError: no default __reduce__ due to non-trivial __cinit__
when ConcurrentInsertRunner deep-copies a PgVector instance that has a live
psycopg connection open (the connection is opened by `with self.db.init():`
inside task() before the deepcopy in _get_thread_db()).

Requires:
    docker run -d --name pgvector-test \
        -e POSTGRES_USER=vectordb -e POSTGRES_PASSWORD=vectordb \
        -e POSTGRES_DB=vectordb -p 5432:5432 \
        pgvector/pgvector:pg17

Usage:
    pytest tests/test_pgvector.py -v -s
"""

from __future__ import annotations

import logging
import pickle
from unittest.mock import MagicMock

import numpy as np
import pytest

from vectordb_bench.backend.clients import DB
from vectordb_bench.backend.clients.pgvector.config import PgVectorHNSWConfig
from vectordb_bench.backend.dataset import Dataset, DatasetSource
from vectordb_bench.backend.filter import Filter, FilterOp, non_filter
from vectordb_bench.backend.runner.concurrent_runner import ConcurrentInsertRunner

log = logging.getLogger(__name__)

# ── Connection config ────────────────────────────────────────────────────────

DB_CONFIG = {
    "connect_config": {
        "host": "localhost",
        "port": 5432,
        "dbname": "vectordb",
        "user": "vectordb",
        "password": "vectordb",
    },
    "table_name": "test_pgvector",
}

DIM = 128
COUNT = 500
RNG = np.random.default_rng(42)


# ── Helpers ──────────────────────────────────────────────────────────────────


def make_hnsw_config(**kwargs) -> PgVectorHNSWConfig:
    return PgVectorHNSWConfig(
        metric_type="COSINE",
        m=16,
        ef_construction=64,
        ef_search=64,
        **kwargs,
    )


def make_db(table_name: str = "test_pgvector", drop_old: bool = True) -> DB.PgVector.init_cls:
    cfg = dict(DB_CONFIG)
    cfg["table_name"] = table_name
    return DB.PgVector.init_cls(
        dim=DIM,
        db_config=cfg,
        db_case_config=make_hnsw_config(),
        drop_old=drop_old,
    )


def random_embeddings(n: int = COUNT, d: int = DIM) -> list[list[float]]:
    return RNG.random((n, d)).tolist()


# ── Basic client tests ────────────────────────────────────────────────────────


class TestPgVectorBasic:
    """Unit tests for the PgVector client (no subprocess)."""

    def test_insert_and_search(self):
        db = make_db("test_basic")
        embeddings = random_embeddings()
        metadata = list(range(COUNT))

        with db.init():
            count, err = db.insert_embeddings(embeddings=embeddings, metadata=metadata)
            assert err is None, f"Insert error: {err}"
            assert count == COUNT

        with db.init():
            db.optimize()

        with db.init():
            db.prepare_filter(Filter(type=FilterOp.NonFilter))
            results = db.search_embedding(query=embeddings[0], k=10)
            assert len(results) > 0

    def test_db_is_not_thread_safe(self):
        db = make_db("test_thread_safe")
        assert db.thread_safe is False

    def test_db_picklable_after_init(self):
        """PgVector instance must be picklable after __init__ (conn/cursor are None).

        This is required for ConcurrentInsertRunner which spawns a subprocess
        and pickles self (which includes self.db).
        """
        db = make_db("test_pickle")
        data = pickle.dumps(db)
        db2 = pickle.loads(data)  # noqa: S301
        assert db2.dim == DIM

    def test_get_thread_db_with_open_connection(self):
        """Regression test for issue #756.

        ConcurrentInsertRunner.task() opens `with self.db.init()` before calling
        workers. For non-thread-safe DBs the original _get_thread_db() then called
        deepcopy(self.db) — but the live psycopg C-extension Connection is not
        deep-copyable, causing TypeError.

        Fixed code returns self.db directly (no deepcopy), so this test must pass
        without raising.
        """
        db = make_db("test_get_thread_db")
        runner = ConcurrentInsertRunner(db=db, dataset=MagicMock(), normalize=False)

        with db.init():
            assert db.conn is not None
            result = runner._get_thread_db()  # TypeError here on original code

        assert result is db


# ── ConcurrentInsertRunner tests ──────────────────────────────────────────────


class TestPgVectorConcurrentInsert:
    """Tests for ConcurrentInsertRunner with PgVector (reproduces issue #756)."""

    @pytest.mark.integration
    def test_concurrent_insert_e2e(self):
        """E2E regression test for issue #756 using the OpenAI 50K dataset.

        Exercises the full pipeline:
            ProcessPoolExecutor(spawn) → pickle runner → subprocess task()
            → with self.db.init() → worker _get_thread_db() → insert batches

        FAILS on original code (TypeError: deepcopy of live psycopg connection).
        PASSES on fixed code.
        """
        dataset = Dataset.OPENAI.manager(50_000)
        dataset.prepare(DatasetSource.AliyunOSS)

        cfg = dict(DB_CONFIG)
        cfg["table_name"] = "test_e2e_insert"
        db = DB.PgVector.init_cls(
            dim=dataset.data.dim,
            db_config=cfg,
            db_case_config=PgVectorHNSWConfig(
                metric_type="COSINE",
                m=16,
                ef_construction=64,
                ef_search=64,
            ),
            drop_old=True,
        )

        runner = ConcurrentInsertRunner(db=db, dataset=dataset, normalize=True, filters=non_filter)
        count = runner.run()

        assert count == 50_000, f"Expected 50000 rows, got {count}"
        log.info(f"E2E insert completed: {count} rows")

vectordb_bench/backend/clients/pgvector/pgvector.py

Lines changed: 3 additions & 12 deletions
@@ -335,16 +335,9 @@ def _create_index(self):
 
         index_param = self.case_config.index_param()
         self._set_parallel_index_build_param()
-        # [FIX] The index access method name registered by the PostgreSQL pgvector extension is in
-        # lowercase (e.g., "hnsw", "ivfflat"), but the index type passed from the frontend UI is
-        # uppercase "HNSW" via IndexType.HNSW.value, causing SQL syntax "USING 'HNSW'" to fail
-        # with error "access method HNSW does not exist". Here we uniformly convert it to lowercase
-        # to match PostgreSQL's access method name.
+        # pgvector registers access methods in lowercase ("hnsw", "ivfflat") but
+        # IndexType enum values are uppercase; also IVFFlat maps to "ivfflat" (no underscore).
         index_type_lower = index_param["index_type"].lower()
-        # [FIX] The pgvector access method name is "ivfflat" (no underscore), but IndexType.IVFFlat.value
-        # produces "IVF_FLAT" which becomes "ivf_flat" after lowercase conversion, causing SQL syntax
-        # "USING 'ivf_flat'" to fail with error "access method 'ivf_flat' does not exist".
-        # Here we map "ivf_flat" → "ivfflat" to match PostgreSQL pgvector's registered access method name.
         if index_type_lower == "ivf_flat":
             index_type_lower = "ivfflat"
         log.info(f"index_type (original={index_param['index_type']}, normalized={index_type_lower})")
@@ -374,9 +367,8 @@ def _create_index(self):
                 if index_param["quantization_type"] == "bit"
                 else sql.Identifier("embedding")
             ),
-            # [FIX] Use lowercase index_type_lower instead of original index_param["index_type"]
             index_type=sql.Identifier(index_type_lower),
-            # This assumes that the quantization_type value matches the quantization function name
+            # quantization_type value matches the quantization function name
             quantization_type=sql.SQL(index_param["quantization_type"]),
             dim=self.dim,
             embedding_metric=sql.Identifier(index_param["metric"]),
@@ -390,7 +382,6 @@ def _create_index(self):
         ).format(
             index_name=sql.Identifier(self._index_name),
             table_name=sql.Identifier(self.table_name),
-            # [FIX] Use lowercase index_type_lower instead of original index_param["index_type"]
             index_type=sql.Identifier(index_type_lower),
             embedding_metric=sql.Identifier(index_param["metric"]),
         )
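Pulled out of the diff for readability, the retained normalization is a two-step mapping. A minimal sketch (standalone function for illustration; the real code inlines these lines in _create_index):

def normalize_index_type(index_type: str) -> str:
    # pgvector registers access methods in lowercase ("hnsw", "ivfflat").
    index_type_lower = index_type.lower()  # "HNSW" -> "hnsw", "IVF_FLAT" -> "ivf_flat"
    # pgvector's IVF access method name has no underscore.
    if index_type_lower == "ivf_flat":
        index_type_lower = "ivfflat"
    return index_type_lower


assert normalize_index_type("HNSW") == "hnsw"
assert normalize_index_type("IVF_FLAT") == "ivfflat"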

vectordb_bench/backend/runner/concurrent_runner.py

Lines changed: 25 additions & 65 deletions
@@ -13,7 +13,6 @@
 import multiprocessing as mp
 import threading
 import time
-from copy import deepcopy
 from enum import StrEnum
 from typing import TYPE_CHECKING
 
@@ -44,7 +43,7 @@ class ConcurrentInsertRunner:
     """Concurrent insert runner with pluggable executor backend.
 
     Thread-safety: If db.thread_safe is False, max_workers is clamped to 1
-    and each worker thread gets a deep-copied DB instance with its own connection.
+    so the single worker thread uses self.db directly (no deepcopy needed).
 
     Args:
         db: VectorDB instance.
@@ -78,57 +77,31 @@ def __init__(
             log.info(f"DB {db.name} is not thread-safe, falling back to max_workers=1")
             effective_workers = 1
         self.max_workers = effective_workers
+        assert db.thread_safe or self.max_workers == 1, (
+            "Non-thread-safe DBs must use max_workers=1 — "
+            "_get_thread_db() relies on this to avoid concurrent access to self.db"
+        )
 
     def __getstate__(self):
         """Exclude unpicklable thread-local state for ProcessPoolExecutor(spawn)."""
         state = self.__dict__.copy()
-        state.pop("_local", None)
-        state.pop("_ctx_lock", None)
-        state.pop("_thread_contexts", None)
         state.pop("_iter_lock", None)
         state.pop("_dataset_iter", None)
         return state
 
-    def __setstate__(self, state: dict):
-        self.__dict__.update(state)
-        self._local = threading.local()
-        self._ctx_lock = threading.Lock()
-        self._thread_contexts = []
-
     def _create_executor(self) -> TaskExecutor:
         if self.backend == ExecutorBackend.ASYNC:
             return AsyncExecutor(max_workers=self.max_workers)
         return ThreadExecutor(max_workers=self.max_workers)
 
     def _get_thread_db(self) -> api.VectorDB:
-        """Get or create a per-thread DB instance.
+        """Return self.db.
 
-        Thread-safe DBs reuse self.db (connection opened in task()).
-        Non-thread-safe DBs get a deep-copied instance with its own connection,
-        cached in thread-local storage so it is created once per thread.
+        All workers share the connection opened by task()'s `with self.db.init()`.
+        Thread-safe DBs share it across multiple workers. Non-thread-safe DBs are
+        clamped to max_workers=1, so there is never concurrent access.
         """
-        if not hasattr(self._local, "db"):
-            if self.db.thread_safe:
-                self._local.db = self.db
-            else:
-                db = deepcopy(self.db)
-                # Manual __enter__/__exit__ because enter and exit happen in
-                # different scopes (here vs _cleanup_thread_contexts).
-                ctx = db.init()
-                ctx.__enter__()
-                self._local.db = db
-                with self._ctx_lock:
-                    self._thread_contexts.append(ctx)
-        return self._local.db
-
-    def _cleanup_thread_contexts(self) -> None:
-        """Close per-thread DB connections opened for non-thread-safe clients."""
-        for ctx in self._thread_contexts:
-            try:
-                ctx.__exit__(None, None, None)
-            except Exception:
-                log.warning("Failed to close per-thread DB connection", exc_info=True)
-        self._thread_contexts.clear()
+        return self.db
 
     def _insert_batch_with_retry(
         self,
@@ -160,14 +133,7 @@ def _worker_insert(
         metadata: list[int],
         labels_data: list[str] | None = None,
     ) -> int:
-        """Worker function: insert a batch with retry.
-
-        Thread-safe DBs: reuse self.db whose connection is already open
-        via task()'s `with self.db.init()` — all threads share it safely.
-
-        Non-thread-safe DBs: use a per-thread deep-copied instance with
-        its own connection, cached via threading.local.
-        """
+        """Worker function: insert a batch with retry."""
         db = self._get_thread_db()
        return self._insert_batch_with_retry(db, embeddings, metadata, labels_data)
 
@@ -214,9 +180,6 @@ def _worker_loop(self) -> int:
     def task(self) -> int:
         """Insert entire dataset using concurrent executor. Runs in subprocess."""
         count = 0
-        self._local = threading.local()
-        self._ctx_lock = threading.Lock()
-        self._thread_contexts = []
         self._iter_lock = threading.Lock()
         self._dataset_iter = iter(self.dataset)
 
@@ -227,23 +190,20 @@ def task(self) -> int:
         )
         start = time.perf_counter()
 
-        try:
-            with self._create_executor() as executor:
-                for _ in range(self.max_workers):
-                    executor.submit(self._worker_loop)
-
-                batch_results = executor.wait_all()
-
-                # Log all errors, then raise the first one
-                errors = [r.error for r in batch_results if r.error is not None]
-                if errors:
-                    for err in errors:
-                        log.warning(f"Batch insert error: {err}")
-                    raise errors[0]
-
-                count = sum(r.value for r in batch_results)
-        finally:
-            self._cleanup_thread_contexts()
+        with self._create_executor() as executor:
+            for _ in range(self.max_workers):
+                executor.submit(self._worker_loop)
+
+            batch_results = executor.wait_all()
+
+            # Log all errors, then raise the first one
+            errors = [r.error for r in batch_results if r.error is not None]
+            if errors:
+                for err in errors:
+                    log.warning(f"Batch insert error: {err}")
+                raise errors[0]
+
+            count = sum(r.value for r in batch_results)
 
         log.info(
             f"({mp.current_process().name:16}) Finish concurrent insert, "
