4343 repness_metric ,
4444 finalize_cmt_stats ,
4545)
46- from polismath .regression import get_dataset_files
46+ from polismath .regression import get_dataset_files , get_blob_variants
4747from polismath .regression .clojure_comparer import (
4848 ClojureComparer ,
4949 unfold_clojure_group_clusters ,
5050)
5151from polismath .regression .datasets import discover_datasets
52- from conftest import _get_requested_datasets , make_dataset_params
52+ from conftest import _get_requested_datasets , make_dataset_params , parse_dataset_blob_id
5353from tests .common_utils import load_votes , load_comments , load_clojure_output
5454
5555
5656# ---------------------------------------------------------------------------
57- # Dataset parametrization (same pattern as test_legacy_clojure_regression.py)
57+ # Dataset+blob parametrization (same pattern as test_legacy_clojure_regression.py)
5858# ---------------------------------------------------------------------------
5959
def _get_clojure_dataset_blob_ids(include_local: bool, requested: set[str] | None = None) -> list[str]:
    """Build the composite 'dataset-blob_type' IDs used to parametrize tests.

    A dataset contributes IDs only when its discovered info reports votes,
    comments, and a Clojure reference. When ``requested`` is non-empty, only
    datasets named in it are kept. One ID is emitted per blob type returned
    by ``get_blob_variants`` for the dataset.

    Args:
        include_local: Forwarded to ``discover_datasets``.
        requested: Optional set of dataset names to restrict to; ``None`` or
            an empty set means no restriction.

    Returns:
        IDs of the form ``"<dataset>-<blob_type>"``, in discovery order.
    """
    discovered = discover_datasets(include_local=include_local)
    blob_ids: list[str] = []
    for ds_name, ds_info in discovered.items():
        comparable = (
            ds_info.has_votes
            and ds_info.has_comments
            and ds_info.has_clojure_reference
        )
        selected = not requested or ds_name in requested
        if comparable and selected:
            blob_ids.extend(
                f"{ds_name}-{variant}" for variant in get_blob_variants(ds_name)
            )
    return blob_ids
7072
7173
def pytest_generate_tests(metafunc):
    """Parametrize ``dataset_name`` with dataset+blob_type IDs at collection time.

    Only test functions that request the ``dataset_name`` fixture are
    parametrized. Each parameter is a composite 'dataset-blob_type' ID, and
    the parametrization is class-scoped.
    """
    if "dataset_name" not in metafunc.fixturenames:
        return

    config = metafunc.config
    include_local = config.getoption("--include-local", default=False)
    requested = _get_requested_datasets(config)
    blob_ids = _get_clojure_dataset_blob_ids(include_local, requested)
    metafunc.parametrize("dataset_name", make_dataset_params(blob_ids), scope="class")
8082
8183
8284# ---------------------------------------------------------------------------
8385# Shared fixtures
8486# ---------------------------------------------------------------------------
8587
86- # Module-level cache — one Conversation at a time to manage memory
87- _CACHE : dict = {}
88+ # Module-level caches — Conversation is keyed by dataset name (shared across
89+ # blob variants), blobs are keyed by composite ID.
90+ _CONV_CACHE : dict = {}
91+ _BLOB_CACHE : dict = {}
8892
8993
90- def _get_conversation_data (dataset_name : str ) -> dict :
91- """Compute (or retrieve cached) conversation + clojure blob for a dataset."""
94+ def _get_or_compute_conversation (dataset_name : str ) -> dict :
95+ """Compute (or retrieve cached) conversation for a dataset."""
9296 import gc
97+ if dataset_name in _CONV_CACHE :
98+ return _CONV_CACHE [dataset_name ]
99+
93100 # Evict other datasets
94- for ds in list (_CACHE .keys ()):
101+ for ds in list (_CONV_CACHE .keys ()):
95102 if ds != dataset_name :
96- _CACHE .pop (ds , None )
103+ _CONV_CACHE .pop (ds , None )
97104 Conversation ._reset_conversion_cache ()
98105 gc .collect ()
99106
100- if dataset_name in _CACHE :
101- return _CACHE [dataset_name ]
102-
103- files = get_dataset_files (dataset_name )
104- clojure = load_clojure_output (files ['math_blob' ])
107+ files = get_dataset_files (dataset_name , blob_type = 'incremental' )
105108 votes = load_votes (files ['votes' ])
106109 comments = load_comments (files ['comments' ])
107110
@@ -111,19 +114,44 @@ def _get_conversation_data(dataset_name: str) -> dict:
111114
112115 data = {
113116 'conv' : conv ,
114- 'clojure' : clojure ,
115117 'dataset_name' : dataset_name ,
116118 'files' : files ,
117119 'comments' : comments ,
118120 }
119- _CACHE [dataset_name ] = data
121+ _CONV_CACHE [dataset_name ] = data
120122 return data
121123
122124
@pytest.fixture(scope="class")
def conversation_data(dataset_name):
    """Class-scoped fixture pairing a dataset's conversation with one Clojure blob.

    ``dataset_name`` is actually a composite 'dataset-blob_type' ID
    (e.g., 'biodiversity-full'). The conversation is obtained per plain
    dataset name and is therefore shared by all blob variants of that
    dataset; only the Clojure blob is loaded per composite ID.

    Returns:
        A dict with keys 'conv', 'clojure', 'dataset_name' (the plain dataset
        name, without the blob type), 'blob_type', 'files', and 'comments'.
    """
    ds_name, blob_type = parse_dataset_blob_id(dataset_name)

    # Shared across every blob variant of the same dataset.
    conv_data = _get_or_compute_conversation(ds_name)

    if dataset_name not in _BLOB_CACHE:
        # Drop cached blobs whose ID does not start with this dataset's
        # prefix before loading a new one.
        keep_prefix = ds_name + '-'
        stale_ids = [key for key in _BLOB_CACHE if not key.startswith(keep_prefix)]
        for key in stale_ids:
            del _BLOB_CACHE[key]

        variant_files = get_dataset_files(ds_name, blob_type=blob_type)
        _BLOB_CACHE[dataset_name] = load_clojure_output(variant_files['math_blob'])

    # NOTE(review): 'files' is the mapping stored with the shared conversation,
    # which is looked up with blob_type='incremental', not the mapping for the
    # requested blob_type -- confirm no consumer reads files['math_blob'].
    return {
        'conv': conv_data['conv'],
        'clojure': _BLOB_CACHE[dataset_name],
        'dataset_name': ds_name,
        'blob_type': blob_type,
        'files': conv_data['files'],
        'comments': conv_data['comments'],
    }
127155
128156
129157@pytest .fixture (scope = "class" )
0 commit comments