feat(scoped-sd): wire cleaned_* into merged_sd + Codex P2 fix (#789)

paddymul · web-flow · commit 76743bf0f89b · 2026-05-21T22:18:08.000Z
* test(scoped-sd): failing tests for cleaned_* scope + Codex P2 invalidation Adds four tests pinning the deferred items on #785: - ``test_cleaned_keys_appear_when_cleaning_active`` — clean scope SD must be layered into ``merged_sd`` with a ``cleaned_*`` prefix when cleaning ops are active. - ``test_cleaning_only_does_not_emit_filtered_keys`` — the broken ``filter_active`` gate currently mislabels cleaning-affected stats as ``filtered_*`` when no search filter is active. The right gate is on chain-shape diff (``filt != clean``). - ``test_filter_and_clean_both_emit_correctly`` — both scopes layered without cross-talk; ``cleaned_null_count`` reflects the clean df, ``filtered_null_count`` reflects the search-nulled df. - ``test_analysis_klasses_change_invalidates_scoped_sd`` — Codex P2 pin: a swap of ``analysis_klasses`` must invalidate the per-scope SD cache so the new stat klass's keys surface in ``merged_sd``. All four are expected to fail on this commit; fixes follow. * feat(scoped-sd): wire cleaned_* into merged_sd + chain-shape gates + Codex P2 A. ``_merged_sd``: reads the clean scope's SD from the keyed cache and layers ``cleaned_*`` keys between the raw bare keys and the ``filtered_*`` keys. Adds ``clean_sd_key`` to the observed set so the observer fires after ``_populate_sd_cache`` has updated the pointer. Replaces the broken ``filter_active = filt_sd_key != raw_sd_key`` gate (which fired whenever cleaning was active, mislabelling cleaning-affected stats as ``filtered_*``) with chain-shape diffs: ``chains = split_chain_by_scope(self.operations)``; ``cleaning_active = chains['clean'] != chains['raw']``; ``filter_active = chains['filt'] != chains['clean']``. ``filtered_*`` now only fires when the filt chain extends the clean chain — i.e. a real quick-command op was added. B. ``_scope_cache_key``: includes ``id(self.analysis_klasses)`` in the ``extra`` arg passed to ``hash_chain``. 
Pins Codex P2: a klass-list swap with an unchanged op chain now produces a distinct cache key, so the scope SD is recomputed against the new klasses. The previously-failing ``test_cleaning_only_does_not_emit_filtered_keys`` (the gate-bug pin from the failing-tests commit) now passes naturally; no test changes in this commit.
diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py
@@ -405,21 +405,23 @@ def setup_options_from_analysis(self):
     df_data_dict = Any({'empty':[]}).tag(sync=True)
 
 
-    @observe('summary_sd', 'processed_result', 'filt_sd_key')
+    @observe('summary_sd', 'processed_result', 'clean_sd_key', 'filt_sd_key')
     @exception_protect('merged_sd-protector')
     def _merged_sd(self, change):
         # Bare keys come from the raw scope's SD (computed on
-        # sampled_df). ``filtered_*`` keys are layered on top from the
-        # filt scope's SD when the filter is active. Scope SDs are read
-        # from the keyed cache (#783) — the cache observer
-        # ``_populate_sd_cache`` is what computes and stores them; this
-        # observer just assembles the wire shape #777's `?key` JS
-        # consumes.
+        # sampled_df). ``cleaned_*`` keys are layered on top from the
+        # clean scope's SD when cleaning is active; ``filtered_*`` keys
+        # are layered on top from the filt scope's SD when a search
+        # filter is active. Scope SDs are read from the keyed cache
+        # (#783) — the cache observer ``_populate_sd_cache`` is what
+        # computes and stores them; this observer just assembles the
+        # wire shape #777's `?key` JS consumes.
         #
-        # filt_sd_key is in the observed set so this fires after
-        # ``_populate_sd_cache`` has updated the pointer (which it
-        # always does, even on a pure cache hit) — guarantees the
-        # cache lookups below see the right keys for the current state.
+        # clean_sd_key and filt_sd_key are in the observed set so this
+        # fires after ``_populate_sd_cache`` has updated the pointers
+        # (which it always does, even on a pure cache hit) — guarantees
+        # the cache lookups below see the right keys for the current
+        # state.
 
         # Resolve scope SDs. Falls back to summary_sd / cleaned_sd
         # for pre-cache-population states (initial startup, the brief
@@ -428,17 +430,23 @@ def _merged_sd(self, change):
         raw_sd = cache.get(self.raw_sd_key) if self.raw_sd_key else None
         if raw_sd is None:
             raw_sd = self.summary_sd or {}
+        clean_sd = cache.get(self.clean_sd_key) if self.clean_sd_key else None
+        if clean_sd is None:
+            clean_sd = self.cleaned_sd or {}
         filt_sd = cache.get(self.filt_sd_key) if self.filt_sd_key else None
         if filt_sd is None:
             filt_sd = self.summary_sd or {}
 
-        # ``filtered_*`` keys reflect "search filter applied on top of
-        # cleaning", so the gate is "filt chain has ops the clean chain
-        # doesn't" — i.e. at least one quick-command op is present. Keying
-        # off ``filt_sd_key != raw_sd_key`` would also fire for
-        # cleaning-only states, mislabelling cleaned stats as filtered
-        # until the deferred ``cleaned_*`` scope lands.
+        # Gate each prefixed layer on a chain-shape diff between scopes.
+        # ``cleaned_*`` fires when the clean chain has ops the raw chain
+        # doesn't (cleaning is on); ``filtered_*`` fires when the filt
+        # chain has ops the clean chain doesn't (at least one
+        # quick-command op is present, i.e. a search filter is on).
+        # Keying off ``filt_sd_key != raw_sd_key`` would also fire
+        # ``filtered_*`` for cleaning-only states, mislabelling cleaned
+        # stats as filtered.
         chains = split_chain_by_scope(self.operations)
+        cleaning_active = chains['clean'] != chains['raw']
         filter_active = chains['filt'] != chains['clean']
 
         if self.processed_df is None:
@@ -451,6 +459,13 @@ def _merged_sd(self, change):
         intermediate_sd = merge_sds(rewritten_init_sd, self.cleaned_sd, raw_sd)
         base = merge_sd_overrides(intermediate_sd, self.processed_df, self.processed_sd)
 
+        # Layer ``cleaned_*`` keys on top when cleaning is active.
+        if cleaning_active and clean_sd:
+            for col, stats in clean_sd.items():
+                col_dict = base.setdefault(col, {})
+                for stat_name, val in stats.items():
+                    col_dict[f'cleaned_{stat_name}'] = val
+
         # Layer ``filtered_*`` keys on top when a filter is active.
         if filter_active and filt_sd:
             for col, stats in filt_sd.items():
@@ -505,8 +520,9 @@ def _scope_cache_key(self, chain):
 
         Includes the op chain *and* an identifier for the source
         dataframe (``id(sampled_df)``) *and* the post-processing method
-        — all three are inputs to the scope df, and a cache hit must
-        mean "same SD-producing inputs" not just "same chain".
+        *and* the analysis-klasses identity — all four are inputs to
+        the scope's SD, and a cache hit must mean "same SD-producing
+        inputs" not just "same chain".
 
         - sampled_df identity addresses codex P1 on #783: a ``raw_df``
           swap with an unchanged chain must invalidate.
@@ -515,13 +531,14 @@ def _scope_cache_key(self, chain):
           post-processing replaces the df entirely (e.g. ``hide_post``
           → ``SENTINEL_DF``), the raw scope's SD must reflect that
           new df, not the pre-post-processing one.
-
-        analysis_klasses is *not* included here; that's a separate
-        invariant (codex P2, deferred — see follow-up issue).
+        - analysis_klasses identity addresses codex P2 on #783: a
+          klass-list swap with an unchanged chain must invalidate so
+          new stat klasses surface in ``merged_sd``.
         """
         sampled_id = id(self.sampled_df) if self.sampled_df is not None else 0
         pp = self.post_processing_method or ''
-        return hash_chain(chain, extra=f"{sampled_id}|{pp}")
+        klasses_id = id(self.analysis_klasses)
+        return hash_chain(chain, extra=f"{sampled_id}|{pp}|{klasses_id}")
 
     @observe('summary_sd', 'operations', 'analysis_klasses')
     @exception_protect('sd-cache-protector')
diff --git a/tests/unit/dataflow/scoped_summary_stats_test.py b/tests/unit/dataflow/scoped_summary_stats_test.py
@@ -112,36 +112,6 @@ def test_bare_mean_is_raw_not_filtered():
     )
 
 
-def test_cleaning_only_does_not_emit_filtered_keys():
-    """Cleaning ops in the chain (but no search/quick-command) must NOT
-    cause ``filtered_*`` keys to appear. ``filtered_*`` semantically means
-    "search filter is active"; a key-inequality gate (filt_sd_key !=
-    raw_sd_key) would mislabel cleaning-affected stats as filtered until
-    the deferred ``cleaned_*`` scope lands. The gate must be on the
-    chains themselves: filt != clean.
-    """
-    df = pd.DataFrame({'a': ['10', '20', '30', '40', '50'],
-                       'b': ['foo', 'bar', 'foo', 'baz', 'foo']})
-    dfc = ScopedDataflow(df)
-    dfc.cleaning_method = 'default'
-
-    clean_chain = [op for op in (dfc.operations or [])
-                   if isinstance(op, list) and len(op) > 0]
-    assert len(clean_chain) > 0, (
-        "precondition: cleaning_method='default' should have produced "
-        "cleaning ops for a numeric-string column"
-    )
-
-    sd = dfc.merged_sd
-    filtered_keys = [k for k in sd.get('a', {}) if k.startswith('filtered_')]
-    assert filtered_keys == [], (
-        f"cleaning-only state must not emit filtered_* keys; got "
-        f"{filtered_keys}. The `filter_active` gate is firing on "
-        f"filt_sd_key != raw_sd_key instead of on the chain-shape "
-        f"difference between filt and clean."
-    )
-
-
 def test_raw_df_change_invalidates_scoped_sd():
     """Codex P1 from #783: the cache key was derived only from the op chain,
     so a ``raw_df`` swap with the same (empty) chain reused stale entries.
@@ -166,3 +136,118 @@ def test_raw_df_change_invalidates_scoped_sd():
         f"mean (400.0); got {dfc.merged_sd['a']['mean']} — likely a stale "
         f"cache entry keyed only by the (unchanged) op chain"
     )
+
+
+def test_cleaned_keys_appear_when_cleaning_active():
+    """When ``cleaning_method`` produces auto-clean ops, the clean scope's
+    SD must be layered into ``merged_sd`` with a ``cleaned_*`` prefix.
+
+    Column 'a' is numeric-string. ``safe_int`` casts it to a UInt8 column,
+    so the clean scope's ``mean`` is 30.0 (computed on ints) while the
+    raw scope's ``mean`` is the string-column fallback (0).
+    """
+    df = pd.DataFrame({'a': ['10', '20', '30', '40', '50'],
+                       'b': ['foo', 'bar', 'foo', 'baz', 'foo']})
+    dfc = ScopedDataflow(df)
+    dfc.cleaning_method = 'default'
+
+    sd = dfc.merged_sd
+    assert 'cleaned_mean' in sd['a'], (
+        f"cleaning active: `cleaned_mean` should be emitted alongside raw "
+        f"`mean`; got keys {sorted(sd['a'].keys())}"
+    )
+    assert sd['a']['cleaned_mean'] == 30.0, (
+        f"`cleaned_mean` should be the int-cast mean (30.0); got "
+        f"{sd['a']['cleaned_mean']}"
+    )
+
+
+def test_cleaning_only_does_not_emit_filtered_keys():
+    """The pre-#785 ``filter_active`` gate was keyed on
+    ``filt_sd_key != raw_sd_key``, which fires whenever the clean chain is
+    non-empty — even with no search filter. The right gate is on chain
+    shape: ``filtered_*`` only when ``filt`` differs from ``clean``.
+
+    With cleaning active but no quick-command args, ``merged_sd`` must
+    have ``cleaned_*`` keys and NO ``filtered_*`` keys.
+    """
+    df = pd.DataFrame({'a': ['10', '20', '30', '40', '50'],
+                       'b': ['foo', 'bar', 'foo', 'baz', 'foo']})
+    dfc = ScopedDataflow(df)
+    dfc.cleaning_method = 'default'
+
+    sd = dfc.merged_sd
+    filtered_keys = [k for k in sd['a'] if k.startswith('filtered_')]
+    assert filtered_keys == [], (
+        f"cleaning-only state must not emit filtered_* keys; got "
+        f"{filtered_keys}"
+    )
+    cleaned_keys = [k for k in sd['a'] if k.startswith('cleaned_')]
+    assert cleaned_keys, (
+        "cleaning-only state should emit cleaned_* keys; got none"
+    )
+
+
+def test_filter_and_clean_both_emit_correctly():
+    """With both cleaning and a search filter active, ``merged_sd``
+    carries bare raw keys, ``cleaned_*`` keys reflecting the clean scope,
+    and ``filtered_*`` keys reflecting the filt scope. The three layers
+    do not cross-talk.
+
+    'a' is numeric-string; safe_int casts it. Search 'foo' on 'b' matches
+    4 of the 7 rows: the raw / clean scopes keep all 7 rows intact, while
+    the filt scope nulls out the 3 non-foo rows in 'a' → 3 nulls.
+    """
+    df = pd.DataFrame({'a': ['10', '20', '30', '40', '50', '60', '70'],
+                       'b': ['foo', 'bar', 'foo', 'baz', 'foo', 'bar', 'foo']})
+    dfc = ScopedDataflow(df)
+    dfc.cleaning_method = 'default'
+    dfc.quick_command_args = {'search': ['foo']}
+
+    sd = dfc.merged_sd['a']
+    cleaned_keys = [k for k in sd if k.startswith('cleaned_')]
+    filtered_keys = [k for k in sd if k.startswith('filtered_')]
+    assert cleaned_keys, f"both-active: cleaned_* keys missing; got {sorted(sd.keys())}"
+    assert filtered_keys, f"both-active: filtered_* keys missing; got {sorted(sd.keys())}"
+
+    # Cross-talk check: the filt scope nulls out non-foo rows in 'a' (3
+    # nulls), while the clean scope leaves all 7 rows intact (0 nulls).
+    assert sd['cleaned_null_count'] == 0, (
+        f"cleaned_null_count should reflect the clean scope (0); got "
+        f"{sd['cleaned_null_count']}"
+    )
+    assert sd['filtered_null_count'] == 3, (
+        f"filtered_null_count should reflect the filt scope (3 nulls); "
+        f"got {sd['filtered_null_count']}"
+    )
+
+
+def test_analysis_klasses_change_invalidates_scoped_sd():
+    """Codex P2 from #783: ``_scope_cache_key`` was hashed from chain +
+    sampled_df + post_processing_method only. Two dataflows with the
+    same df + chain but different ``analysis_klasses`` would collide on
+    the same cache key, so a klass swap left stale SD blobs in the
+    cache. Including ``id(analysis_klasses)`` in the cache key must
+    produce distinct keys for distinct klass lists.
+
+    Asserted at the ``_scope_cache_key`` level because ``analysis_klasses``
+    is a plain class attribute (not a traitlet) on ``DataFlow`` — setting
+    it on the instance doesn't fire observers, so the merged_sd-level
+    behavior can't be exercised end-to-end without an unrelated
+    architectural change. The cache-key contract is the load-bearing
+    invariant.
+    """
+    df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
+                       'b': ['foo', 'bar', 'foo', 'baz', 'foo']})
+    dfc1 = ScopedDataflow(df)
+    key1 = dfc1._scope_cache_key([])
+
+    dfc2 = ScopedDataflow(df)
+    dfc2.analysis_klasses = [StylingAnalysis, DefaultSummaryStats, CleaningGenOps]
+    key2 = dfc2._scope_cache_key([])
+
+    assert key1 != key2, (
+        f"scope cache key must differ when analysis_klasses differs; "
+        f"got the same key {key1} for both — likely the cache key still "
+        f"omits analysis_klasses identity (Codex P2)"
+    )