Test both incremental and cold-start Clojure blobs

jucor · claude · jucor · commit 1e96cb65b448 · 2026-03-30T17:46:24.000+01:00
Clojure comparison tests now run against both blob variants when available:
- incremental: result of progressive refinement as votes trickled in
- cold_start: computed from scratch in one pass on full dataset

Each dataset generates separate test IDs (e.g., biodiversity-incremental,
biodiversity-cold_start). Only blobs with meaningful content (PCA data or
non-empty clusters) are included.

Key changes:
- Add get_blob_variants() to discover filled blob variants per dataset
- Add _is_blob_filled() to check if a blob has meaningful content
- Extend get_dataset_files() with explicit blob_type parameter
- Add use_blobs=True option to @pytest.mark.use_discovered_datasets
- Add parse_dataset_blob_id() helper for composite ID parsing
- Update test_legacy_clojure_regression, test_discrepancy_fixes, and
  test_legacy_repness_comparison to parametrize by blob variant
- Conversation computation is shared across blob variants of the same dataset

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/delphi/polismath/regression/__init__.py b/delphi/polismath/regression/__init__.py
@@ -15,6 +15,7 @@
     get_dataset_info,
     get_dataset_files,
     get_dataset_report_id,
+    get_blob_variants,
 )
 __all__ = [
     'ConversationRecorder',
@@ -26,4 +27,5 @@
     'get_dataset_info',
     'get_dataset_files',
     'get_dataset_report_id',
+    'get_blob_variants',
 ]
diff --git a/delphi/polismath/regression/datasets.py b/delphi/polismath/regression/datasets.py
@@ -156,12 +156,15 @@ def get_dataset_report_id(name: str) -> str:
     return get_dataset_info(name).report_id
 
 
-def get_dataset_files(name: str, prefer_cold_start: bool = True) -> Dict[str, str]:
+def get_dataset_files(name: str, prefer_cold_start: bool = True, blob_type: Optional[str] = None) -> Dict[str, str]:
     """Get file paths for a dataset.
 
     Args:
         name: Dataset name
-        prefer_cold_start: If True (default), use cold-start blob when available
+        prefer_cold_start: If True (default), use cold-start blob when available.
+            Ignored if blob_type is specified.
+        blob_type: Explicit blob type to use: 'incremental' or 'cold_start'.
+            If specified, overrides prefer_cold_start.
     """
     info = get_dataset_info(name)
     rid = info.report_id
@@ -174,11 +177,19 @@ def find_file(pattern: str) -> str:
             raise ValueError(f"Multiple files matching {pattern} in {info.path}: {matches}")
         return str(matches[0].resolve())
 
-    # Check for cold-start blob first, fall back to original
+    # Determine which blob to use
     cold_start_blob = info.path / f"{rid}_math_blob_cold_start.json"
     original_blob = info.path / f"{rid}_math_blob.json"
 
-    if prefer_cold_start and cold_start_blob.exists():
+    if blob_type == 'cold_start':
+        if not cold_start_blob.exists():
+            raise FileNotFoundError(f"No cold-start blob for {name}")
+        math_blob_path = str(cold_start_blob)
+    elif blob_type == 'incremental':
+        if not original_blob.exists():
+            raise FileNotFoundError(f"No full blob for {name}")
+        math_blob_path = str(original_blob)
+    elif prefer_cold_start and cold_start_blob.exists():
         math_blob_path = str(cold_start_blob)
     else:
         math_blob_path = str(original_blob)
@@ -193,6 +204,44 @@ def find_file(pattern: str) -> str:
     }
 
 
+def _is_blob_filled(blob_path: Path) -> bool:
+    """Check if a math blob has meaningful content (PCA 'comps' or non-empty base-clusters)."""
+    import json
+    if not blob_path.exists():
+        return False
+    try:
+        with open(blob_path) as f:
+            data = json.load(f)
+        # A blob is "filled" if it has PCA data or non-empty base-clusters
+        has_pca = 'pca' in data and 'comps' in data.get('pca', {})
+        bc = data.get('base-clusters', {})
+        has_clusters = isinstance(bc, dict) and len(bc.get('id', [])) > 0
+        return has_pca or has_clusters
+    except (json.JSONDecodeError, OSError):
+        return False
+
+
+def get_blob_variants(name: str) -> List[str]:
+    """Get available filled blob variants for a dataset.
+
+    Returns a list of blob_type strings ('incremental', 'cold_start') for blobs
+    that exist and contain meaningful data.
+    """
+    info = get_dataset_info(name)
+    rid = info.report_id
+    variants = []
+
+    full_blob = info.path / f"{rid}_math_blob.json"
+    if _is_blob_filled(full_blob):
+        variants.append('incremental')
+
+    cold_start_blob = info.path / f"{rid}_math_blob_cold_start.json"
+    if _is_blob_filled(cold_start_blob):
+        variants.append('cold_start')
+
+    return variants
+
+
 # Legacy aliases
 def get_dataset_directory(report_id: str, dataset_name: Optional[str] = None) -> Path:
     """Find dataset directory by report_id."""
diff --git a/delphi/tests/conftest.py b/delphi/tests/conftest.py
@@ -12,6 +12,7 @@
 from polismath.regression.datasets import (
     discover_datasets,
     list_regression_datasets,
+    get_blob_variants,
 )
 
 
@@ -28,7 +29,7 @@ def make_dataset_params(datasets: list[str]) -> list:
     are computed only once per dataset per worker.
 
     Args:
-        datasets: List of dataset names
+        datasets: List of dataset names (or "dataset-blob_type" composite IDs)
 
     Returns:
         List of pytest.param objects with xdist_group markers
@@ -44,6 +45,24 @@ def test_something(dataset_name):
     ]
 
 
+def parse_dataset_blob_id(composite_id: str) -> tuple[str, str]:
+    """Parse a 'dataset-blob_type' composite ID into (dataset_name, blob_type).
+
+    Examples:
+        'biodiversity-incremental' -> ('biodiversity', 'incremental')
+        'bg2050-cold_start' -> ('bg2050', 'cold_start')
+    """
+    if composite_id.endswith('-cold_start'):
+        return composite_id[:-len('-cold_start')], 'cold_start'
+    elif composite_id.endswith('-incremental'):
+        return composite_id[:-len('-incremental')], 'incremental'
+    else:
+        raise ValueError(
+            f"Invalid composite dataset ID: {composite_id}. "
+            f"Expected format: 'dataset-incremental' or 'dataset-cold_start'"
+        )
+
+
 def pytest_addoption(parser):
     """Add custom command line options to pytest."""
     parser.addoption(
@@ -83,8 +102,10 @@ def pytest_configure(config):
     """Register custom markers."""
     config.addinivalue_line(
         "markers",
-        "use_discovered_datasets: dynamically parametrize with discovered "
-        "datasets, respecting --include-local and --datasets CLI options"
+        "use_discovered_datasets(use_blobs=False): dynamically parametrize with discovered "
+        "datasets, respecting --include-local and --datasets CLI options. "
+        "With use_blobs=True, parametrize with 'dataset-blob_type' composite IDs "
+        "(e.g., 'biodiversity-incremental', 'engage-cold_start') for each filled blob variant."
     )
 
 
@@ -113,19 +134,39 @@ def pytest_generate_tests(metafunc):
     These tests must declare a 'dataset_name' parameter. They will be parametrized
     with all regression datasets, filtered by --include-local and --datasets.
 
+    With use_blobs=True, parametrize with 'dataset-blob_type' composite IDs
+    (e.g., 'biodiversity-incremental', 'engage-cold_start') for each filled blob variant.
+
     Uses xdist_group markers for efficient parallel execution with pytest-xdist.
     """
-    if not list(metafunc.definition.iter_markers("use_discovered_datasets")):
+    markers = list(metafunc.definition.iter_markers("use_discovered_datasets"))
+    if not markers:
         return
 
     include_local = metafunc.config.getoption("--include-local")
     requested = _get_requested_datasets(metafunc.config)
 
-    datasets = list_regression_datasets(include_local=include_local)
-    if requested:
-        datasets = [d for d in datasets if d in requested]
-
-    metafunc.parametrize("dataset_name", make_dataset_params(datasets))
+    # Check if use_blobs=True was passed to the marker
+    use_blobs = any(m.kwargs.get('use_blobs', False) for m in markers)
+
+    if use_blobs:
+        # Parametrize with composite 'dataset-blob_type' IDs
+        datasets = discover_datasets(include_local=include_local)
+        blob_ids = []
+        for name, info in datasets.items():
+            if not (info.has_votes and info.has_comments and info.has_clojure_reference):
+                continue
+            if requested and name not in requested:
+                continue
+            for blob_type in get_blob_variants(name):
+                blob_ids.append(f"{name}-{blob_type}")
+        metafunc.parametrize("dataset_name", make_dataset_params(blob_ids))
+    else:
+        # Parametrize with plain dataset names
+        datasets = list_regression_datasets(include_local=include_local)
+        if requested:
+            datasets = [d for d in datasets if d in requested]
+        metafunc.parametrize("dataset_name", make_dataset_params(datasets))
 
 
 # Provide summary of discovered datasets at start of test run
diff --git a/delphi/tests/test_discrepancy_fixes.py b/delphi/tests/test_discrepancy_fixes.py
@@ -43,65 +43,68 @@
     repness_metric,
     finalize_cmt_stats,
 )
-from polismath.regression import get_dataset_files
+from polismath.regression import get_dataset_files, get_blob_variants
 from polismath.regression.clojure_comparer import (
     ClojureComparer,
     unfold_clojure_group_clusters,
 )
 from polismath.regression.datasets import discover_datasets
-from conftest import _get_requested_datasets, make_dataset_params
+from conftest import _get_requested_datasets, make_dataset_params, parse_dataset_blob_id
 from tests.common_utils import load_votes, load_comments, load_clojure_output
 
 
 # ---------------------------------------------------------------------------
-# Dataset parametrization (same pattern as test_legacy_clojure_regression.py)
+# Dataset+blob parametrization (same pattern as test_legacy_clojure_regression.py)
 # ---------------------------------------------------------------------------
 
-def _get_clojure_datasets(include_local: bool, requested: set[str] | None = None) -> list[str]:
-    """Get datasets that have Clojure math_blob for comparison."""
+def _get_clojure_dataset_blob_ids(include_local: bool, requested: set[str] | None = None) -> list[str]:
+    """Get composite 'dataset-blob_type' IDs for all filled blobs."""
     datasets = discover_datasets(include_local=include_local)
-    result = [
-        name for name, info in datasets.items()
-        if info.has_votes and info.has_comments and info.has_clojure_reference
-    ]
-    if requested:
-        result = [d for d in result if d in requested]
+    result = []
+    for name, info in datasets.items():
+        if not (info.has_votes and info.has_comments and info.has_clojure_reference):
+            continue
+        if requested and name not in requested:
+            continue
+        for blob_type in get_blob_variants(name):
+            result.append(f"{name}-{blob_type}")
     return result
 
 
 def pytest_generate_tests(metafunc):
-    """Parametrize tests with clojure datasets at collection time."""
+    """Parametrize tests with clojure dataset+blob_type at collection time."""
     if "dataset_name" in metafunc.fixturenames:
         include_local = metafunc.config.getoption("--include-local", default=False)
         requested = _get_requested_datasets(metafunc.config)
-        datasets = _get_clojure_datasets(include_local, requested)
-        params = make_dataset_params(datasets)
+        blob_ids = _get_clojure_dataset_blob_ids(include_local, requested)
+        params = make_dataset_params(blob_ids)
         metafunc.parametrize("dataset_name", params, scope="class")
 
 
 # ---------------------------------------------------------------------------
 # Shared fixtures
 # ---------------------------------------------------------------------------
 
-# Module-level cache — one Conversation at a time to manage memory
-_CACHE: dict = {}
+# Module-level caches — Conversation is keyed by dataset name (shared across
+# blob variants), blobs are keyed by composite ID.
+_CONV_CACHE: dict = {}
+_BLOB_CACHE: dict = {}
 
 
-def _get_conversation_data(dataset_name: str) -> dict:
-    """Compute (or retrieve cached) conversation + clojure blob for a dataset."""
+def _get_or_compute_conversation(dataset_name: str) -> dict:
+    """Compute (or retrieve cached) conversation for a dataset."""
     import gc
+    if dataset_name in _CONV_CACHE:
+        return _CONV_CACHE[dataset_name]
+
     # Evict other datasets
-    for ds in list(_CACHE.keys()):
+    for ds in list(_CONV_CACHE.keys()):
         if ds != dataset_name:
-            _CACHE.pop(ds, None)
+            _CONV_CACHE.pop(ds, None)
             Conversation._reset_conversion_cache()
             gc.collect()
 
-    if dataset_name in _CACHE:
-        return _CACHE[dataset_name]
-
-    files = get_dataset_files(dataset_name)
-    clojure = load_clojure_output(files['math_blob'])
+    files = get_dataset_files(dataset_name, blob_type='incremental')
     votes = load_votes(files['votes'])
     comments = load_comments(files['comments'])
 
@@ -111,19 +114,44 @@ def _get_conversation_data(dataset_name: str) -> dict:
 
     data = {
         'conv': conv,
-        'clojure': clojure,
         'dataset_name': dataset_name,
         'files': files,
         'comments': comments,
     }
-    _CACHE[dataset_name] = data
+    _CONV_CACHE[dataset_name] = data
     return data
 
 
 @pytest.fixture(scope="class")
 def conversation_data(dataset_name):
-    """Class-scoped fixture: runs the full pipeline once per dataset."""
-    return _get_conversation_data(dataset_name)
+    """Class-scoped fixture: runs the full pipeline once per dataset+blob_type.
+
+    dataset_name here is actually a composite 'dataset-blob_type' ID
+    (e.g., 'biodiversity-incremental'). The Conversation is shared across blob variants.
+    """
+    global _BLOB_CACHE
+    ds_name, blob_type = parse_dataset_blob_id(dataset_name)
+
+    # Get or compute the conversation (shared across blob variants)
+    conv_data = _get_or_compute_conversation(ds_name)
+
+    # Load the specific blob variant (cache per composite ID)
+    if dataset_name not in _BLOB_CACHE:
+        for bid in list(_BLOB_CACHE.keys()):
+            if not bid.startswith(ds_name + '-'):
+                _BLOB_CACHE.pop(bid, None)
+        files = get_dataset_files(ds_name, blob_type=blob_type)
+        clojure = load_clojure_output(files['math_blob'])
+        _BLOB_CACHE[dataset_name] = clojure
+
+    return {
+        'conv': conv_data['conv'],
+        'clojure': _BLOB_CACHE[dataset_name],
+        'dataset_name': ds_name,
+        'blob_type': blob_type,
+        'files': conv_data['files'],
+        'comments': conv_data['comments'],
+    }
 
 
 @pytest.fixture(scope="class")
diff --git a/delphi/tests/test_legacy_clojure_regression.py b/delphi/tests/test_legacy_clojure_regression.py
diff --git a/delphi/tests/test_legacy_repness_comparison.py b/delphi/tests/test_legacy_repness_comparison.py

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`get_dataset_info,`
`16`	`16`	`get_dataset_files,`
`17`	`17`	`get_dataset_report_id,`
	`18`	`+ get_blob_variants,`
`18`	`19`	`)`
`19`	`20`	`__all__ = [`
`20`	`21`	`'ConversationRecorder',`
`@@ -26,4 +27,5 @@`
`26`	`27`	`'get_dataset_info',`
`27`	`28`	`'get_dataset_files',`
`28`	`29`	`'get_dataset_report_id',`
	`30`	`+ 'get_blob_variants',`
`29`	`31`	`]`