Add session-scoped conversation cache and disable parallelization

jucor · claude · jucor · commit 4d12c2e81eef · 2026-03-26T23:51:39.000Z
Introduce a session-scoped pytest fixture in conftest.py that caches
computed Conversations across test files. Combined with test reordering,
this eliminates redundant computation when multiple test files need the
same datasets.

Key changes:
1. Session-scoped cache in conftest.py with single-dataset eviction
2. pytest_collection_modifyitems hook reorders tests to group by dataset
3. Disable pytest-xdist parallelization (-n0 in pyproject.toml)

How it works:
- Tests are reordered: all tests for dataset1 run first (across all files),
  then all tests for dataset2, etc.
- Cache holds ONE dataset at a time, evicting when switching datasets
- Peak memory: O(1 dataset) instead of O(N datasets)

Performance improvement for running legacy + discrepancy tests:
- Before: ~23s (each file computed conversations independently)
- After:  ~2.2s (compute once per dataset, reuse across files)
- Speedup: ~10x

Why disable parallelization:
- With parallelization: ~28s (each worker has separate process/cache)
- Without parallelization: ~2.2s (single process shares cache)
- Parallel workers can't share the session cache (separate processes)

Note: If PYTEST_ADDOPTS is set in the environment (e.g., "-n auto"),
it will override pyproject.toml's -n0. Either unset it or set it to "".

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/delphi/pyproject.toml b/delphi/pyproject.toml
@@ -124,8 +124,12 @@ include = [
 
 # Pytest configuration
 [tool.pytest.ini_options]
-# When using pytest-xdist (-n), group tests by xdist_group marker for efficient fixture sharing
-addopts = "--dist=loadgroup"
+# Force sequential execution (-n0) to leverage the session-scoped conversation cache.
+# The cache shares computed conversations across test files, but each xdist worker
+# has its own process with a separate cache. With only 2 datasets (biodiversity, vw),
+# sequential execution with caching is ~6x faster than parallel execution where each
+# worker recomputes the conversations independently.
+addopts = "-n0"
 filterwarnings = [
     # Ignore python_multipart deprecation warning from ddtrace (third-party)
     "ignore:Please use `import python_multipart`:PendingDeprecationWarning:ddtrace.internal.module",
diff --git a/delphi/tests/conftest.py b/delphi/tests/conftest.py
@@ -5,44 +5,92 @@
 - Command line options --include-local and --datasets for dataset selection
 - Fixtures for accessing dataset information
 - @pytest.mark.use_discovered_datasets for dynamic dataset parametrization
-- Helper functions for parallel test execution with xdist_group markers
+- Session-scoped conversation cache for efficient test execution
 """
 
+from copy import deepcopy
+
 import pytest
+
+from polismath.conversation.conversation import Conversation
+from polismath.regression import get_dataset_files
 from polismath.regression.datasets import (
     discover_datasets,
     list_regression_datasets,
     get_blob_variants,
 )
+from tests.common_utils import load_votes, load_comments
 
 
 # =============================================================================
-# Parallel Execution Helpers
+# Session-scoped Conversation Cache
 # =============================================================================
 
-def make_dataset_params(datasets: list[str]) -> list:
+_SESSION_CONV_CACHE: dict = {}
+
+
+@pytest.fixture(scope="session")
+def get_or_compute_conversation():
+    """Session-wide conversation cache shared across all test files.
+
+    Returns a function that computes a dataset's Conversation on first request
+    and returns a deepcopy of the cached dict each time to preserve test isolation.
+
+    Only ONE dataset is kept in memory at a time. When a different dataset
+    is requested, the previous one is evicted. This works because tests are
+    reordered by pytest_collection_modifyitems to group all tests for a
+    dataset together (across all test files).
     """
-    Create pytest.param objects with xdist_group markers for parallel execution.
+    import gc
 
-    When using pytest-xdist with --dist=loadgroup, tests with the same
-    xdist_group marker will run on the same worker. This ensures fixtures
-    are computed only once per dataset per worker.
+    def _get(dataset_name: str) -> dict:
+        if dataset_name not in _SESSION_CONV_CACHE:
+            # Evict previous dataset (we only keep one at a time)
+            for ds in list(_SESSION_CONV_CACHE.keys()):
+                _SESSION_CONV_CACHE.pop(ds, None)
+            Conversation._reset_conversion_cache()
+            gc.collect()
+
+            files = get_dataset_files(dataset_name, blob_type='incremental')
+            votes = load_votes(files['votes'])
+            comments = load_comments(files['comments'])
+
+            conv = Conversation(dataset_name)
+            conv = conv.update_votes(votes)
+            conv = conv.recompute()
+
+            _SESSION_CONV_CACHE[dataset_name] = {
+                'conv': conv,
+                'dataset_name': dataset_name,
+                'files': files,
+                'comments': comments,
+            }
+
+        return deepcopy(_SESSION_CONV_CACHE[dataset_name])
+
+    return _get
+
+
+# =============================================================================
+# Dataset Parametrization Helpers
+# =============================================================================
+
+def make_dataset_params(datasets: list[str]) -> list:
+    """
+    Create pytest.param objects for dataset parametrization.
 
     Args:
         datasets: List of dataset names (or "dataset-blob_type" composite IDs)
 
     Returns:
-        List of pytest.param objects with xdist_group markers
+        List of pytest.param objects
 
     Example:
         @pytest.mark.parametrize("dataset_name", make_dataset_params(["biodiversity", "vw"]))
         def test_something(dataset_name):
             ...
     """
-    return [
-        pytest.param(ds, marks=pytest.mark.xdist_group(ds))
-        for ds in datasets
-    ]
+    return [pytest.param(ds) for ds in datasets]
 
 
 def parse_dataset_blob_id(composite_id: str) -> tuple[str, str]:
@@ -137,7 +185,7 @@ def pytest_generate_tests(metafunc):
     With use_blobs=True, parametrize with 'dataset-blob_type' composite IDs
     (e.g., 'biodiversity-incremental', 'engage-cold_start') for each filled blob variant.
 
-    Uses xdist_group markers for efficient parallel execution with pytest-xdist.
+    Tests are later grouped by this parameter so the session-scoped conversation cache is reused.
     """
     markers = list(metafunc.definition.iter_markers("use_discovered_datasets"))
     if not markers:
@@ -169,6 +217,58 @@ def pytest_generate_tests(metafunc):
         metafunc.parametrize("dataset_name", make_dataset_params(datasets))
 
 
+# =============================================================================
+# Test Reordering for Cache Efficiency
+# =============================================================================
+
+def _extract_dataset_from_test(item) -> str:
+    """Extract the dataset name from a test item's parameters.
+
+    Handles both plain dataset names ('biodiversity') and composite IDs
+    ('biodiversity-incremental'). Returns empty string if no dataset parameter.
+    """
+    # Check callspec for parametrized values
+    if hasattr(item, 'callspec') and item.callspec.params:
+        for param_name in ('dataset_name', 'dataset_blob_id'):
+            if param_name in item.callspec.params:
+                value = item.callspec.params[param_name]
+                # Extract base dataset name from composite IDs
+                if '-incremental' in value:
+                    return value.replace('-incremental', '')
+                elif '-cold_start' in value:
+                    return value.replace('-cold_start', '')
+                return value
+    return ''
+
+
+def pytest_collection_modifyitems(session, config, items):
+    """Reorder tests to group by dataset for cache efficiency.
+
+    Groups all tests for a dataset together (across all test files) so that
+    the session-scoped conversation cache only needs to hold ONE dataset at
+    a time. This reduces peak memory from O(N datasets) to O(1 dataset).
+
+    Order: non-dataset tests, then dataset1[file1, file2, ...], dataset2[...], ...
+    (datasets sorted by name). Within each dataset, original test order is preserved.
+    """
+    # Separate tests into dataset-parametrized and non-parametrized
+    dataset_tests = []
+    other_tests = []
+
+    for item in items:
+        ds = _extract_dataset_from_test(item)
+        if ds:
+            dataset_tests.append((ds, item))
+        else:
+            other_tests.append(item)
+
+    # Sort dataset tests by dataset name (stable sort preserves order within dataset)
+    dataset_tests.sort(key=lambda x: x[0])
+
+    # Rebuild items list: non-parametrized first, then dataset tests grouped
+    items[:] = other_tests + [item for _, item in dataset_tests]
+
+
 # Provide summary of discovered datasets at start of test run
 def pytest_report_header(config):
     """Add dataset discovery info to pytest header."""
diff --git a/delphi/tests/test_discrepancy_fixes.py b/delphi/tests/test_discrepancy_fixes.py
@@ -50,7 +50,7 @@
 )
 from polismath.regression.datasets import discover_datasets
 from conftest import _get_requested_datasets, make_dataset_params, parse_dataset_blob_id
-from tests.common_utils import load_votes, load_comments, load_clojure_output
+from tests.common_utils import load_clojure_output
 
 
 # ---------------------------------------------------------------------------
@@ -85,55 +85,23 @@ def pytest_generate_tests(metafunc):
 # Shared fixtures
 # ---------------------------------------------------------------------------
 
-# Module-level caches — Conversation is keyed by dataset name (shared across
-# blob variants), blobs are keyed by composite ID.
-_CONV_CACHE: dict = {}
+# Module-level cache for blobs (keyed by composite ID)
 _BLOB_CACHE: dict = {}
 
 
-def _get_or_compute_conversation(dataset_name: str) -> dict:
-    """Compute (or retrieve cached) conversation for a dataset."""
-    import gc
-    if dataset_name in _CONV_CACHE:
-        return _CONV_CACHE[dataset_name]
-
-    # Evict other datasets
-    for ds in list(_CONV_CACHE.keys()):
-        if ds != dataset_name:
-            _CONV_CACHE.pop(ds, None)
-            Conversation._reset_conversion_cache()
-            gc.collect()
-
-    files = get_dataset_files(dataset_name, blob_type='incremental')
-    votes = load_votes(files['votes'])
-    comments = load_comments(files['comments'])
-
-    conv = Conversation(dataset_name)
-    conv = conv.update_votes(votes)
-    conv = conv.recompute()
-
-    data = {
-        'conv': conv,
-        'dataset_name': dataset_name,
-        'files': files,
-        'comments': comments,
-    }
-    _CONV_CACHE[dataset_name] = data
-    return data
-
-
 @pytest.fixture(scope="class")
-def conversation_data(dataset_name):
+def conversation_data(dataset_name, get_or_compute_conversation):
     """Class-scoped fixture: runs the full pipeline once per dataset+blob_type.
 
     dataset_name here is actually a composite 'dataset-blob_type' ID
-    (e.g., 'biodiversity-full'). The Conversation is shared across blob variants.
+    (e.g., 'biodiversity-full'). The Conversation is shared across blob variants
+    via the session-scoped get_or_compute_conversation fixture.
     """
     global _BLOB_CACHE
     ds_name, blob_type = parse_dataset_blob_id(dataset_name)
 
-    # Get or compute the conversation (shared across blob variants)
-    conv_data = _get_or_compute_conversation(ds_name)
+    # Get or compute the conversation (shared across blob variants via session cache)
+    conv_data = get_or_compute_conversation(ds_name)
 
     # Load the specific blob variant (cache per composite ID)
     if dataset_name not in _BLOB_CACHE:
diff --git a/delphi/tests/test_legacy_clojure_regression.py b/delphi/tests/test_legacy_clojure_regression.py
@@ -18,12 +18,10 @@
 
 import pytest
 import pytest_check as check
-import gc
 
-from polismath.conversation.conversation import Conversation
 from polismath.regression import get_dataset_files, get_blob_variants
 from polismath.regression.datasets import discover_datasets
-from tests.common_utils import load_votes, load_comments, load_clojure_output
+from tests.common_utils import load_clojure_output
 from conftest import _get_requested_datasets, make_dataset_params, parse_dataset_blob_id
 from polismath.regression.clojure_comparer import (
     ClojureComparer,
@@ -50,9 +48,7 @@ def _get_clojure_dataset_blob_ids(include_local: bool, requested: Optional[set[s
     return result
 
 
-# Module-level caches — Conversation is keyed by dataset name (shared across
-# blob variants of the same dataset), blobs are keyed by composite ID.
-_CONV_CACHE: dict = {}
+# Module-level cache for blobs (keyed by composite ID)
 _BLOB_CACHE: dict = {}
 
 
@@ -66,88 +62,17 @@ def pytest_generate_tests(metafunc):
         metafunc.parametrize("dataset_blob_id", params, scope="class")
 
 
-def _get_or_compute_conversation(dataset_name: str) -> dict:
-    """Get cached Conversation or compute it. Evicts other datasets for memory."""
-    global _CONV_CACHE
-    if dataset_name in _CONV_CACHE:
-        return _CONV_CACHE[dataset_name]
-
-    # Evict previous datasets
-    for ds in list(_CONV_CACHE.keys()):
-        if ds != dataset_name:
-            print(f"[{ds}] Cleaning up previous dataset...")
-            _CONV_CACHE.pop(ds, None)
-            Conversation._reset_conversion_cache()
-            gc.collect()
-
-    # Get dataset files (blob_type doesn't matter here — we only need votes/comments)
-    dataset_files = get_dataset_files(dataset_name, blob_type='incremental')
-
-    # Create and compute conversation
-    votes = load_votes(dataset_files['votes'])
-    comments = load_comments(dataset_files['comments'])
-
-    print(f"\n[{dataset_name}] Processing conversation with {len(votes['votes'])} votes and {len(comments['comments'])} comments")
-    conv = Conversation(dataset_name)
-    conv = conv.update_votes(votes)
-
-    print(f"[{dataset_name}] Recomputing conversation analysis...")
-    conv = conv.recompute()
-
-    # Extract key metrics for reporting
-    group_count = len(conv.group_clusters)
-    print(f"[{dataset_name}] Found {group_count} groups")
-    print(f"[{dataset_name}] Processed {conv.comment_count} comments")
-    print(f"[{dataset_name}] Found {conv.participant_count} participants")
-
-    if conv.repness and 'comment_repness' in conv.repness:
-        print(f"[{dataset_name}] Calculated representativeness for {len(conv.repness['comment_repness'])} comments")
-
-    # Print top representative comments for each group
-    if conv.repness and 'comment_repness' in conv.repness:
-        for group_id in range(group_count):
-            print(f"\n[{dataset_name}] Top representative comments for Group {group_id}:")
-            group_repness = [item for item in conv.repness['comment_repness'] if item['gid'] == group_id]
-
-            # Sort by representativeness
-            group_repness.sort(key=lambda x: abs(x['repness']), reverse=True)
-
-            # Print top 5 comments
-            for i, rep_item in enumerate(group_repness[:5]):
-                comment_id = rep_item['tid']
-                # Get the comment text if available
-                comment_txt = next((c['txt'] for c in comments['comments'] if str(c['tid']) == str(comment_id)), 'Unknown')
-                print(f"  {i+1}. Comment {comment_id} (Repness: {rep_item['repness']:.4f}): {comment_txt[:50]}...")
-
-    # Save the Python conversion results for manual inspection
-    import os
-    import json
-    data_dir = dataset_files['data_dir']
-    output_dir = os.path.join(os.path.dirname(data_dir), '.test_outputs', 'python_output', dataset_name)
-    os.makedirs(output_dir, exist_ok=True)
-
-    output_path = os.path.join(output_dir, 'conversation_result.json')
-    with open(output_path, 'w') as f:
-        json.dump(conv.to_dict(), f, indent=2)
-
-    print(f"[{dataset_name}] Saved results to {output_path}")
-
-    data = {'conv': conv, 'comments': comments}
-    _CONV_CACHE[dataset_name] = data
-    return data
-
-
 @pytest.fixture(scope="class")
-def conversation_data(dataset_blob_id):
+def conversation_data(dataset_blob_id, get_or_compute_conversation):
     """
     Class-scoped fixture computed once per dataset+blob_type.
-    Reuses the Conversation across blob variants of the same dataset.
+    Reuses the Conversation across blob variants via the session-scoped cache.
     """
     global _BLOB_CACHE
     dataset_name, blob_type = parse_dataset_blob_id(dataset_blob_id)
 
-    # Get or compute the conversation (shared across blob variants)
-    conv_data = _get_or_compute_conversation(dataset_name)
+    # Get or compute the conversation (shared across blob variants via session cache)
+    conv_data = get_or_compute_conversation(dataset_name)
 
     # Load the specific blob variant (cache per composite ID)
     if dataset_blob_id not in _BLOB_CACHE: