P0 fix: fit_intercept hardcoded True in SW numba path

miranov25 · miranov25 · commit a9c6d07ff142 · 2026-03-28T22:37:18.000+01:00
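make_sliding_window_fit accepted fit_intercept but never forwarded it to
_fit_window_regression_numba, whose kernel call hardcoded True (line 994).
With fit_intercept=False and a polynomial basis that contains a constant
term, the implicit intercept duplicates that constant column, so X'X is
singular → 100% fit_failed on all bins.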

Same bug class as WLS weights (13.9.GB-Ext): parameter accepted at API
surface but silently ignored in internal code path.

Fix: pass fit_intercept parameter through to numba kernel.
Numpy path was already correct. V4 and V5 paths already correct.

Tests: 7 new cross-fitter fit_intercept=False tests (V2/V3/V4/SW):
  5 passed, 2 known failures (V2 positional API, SW numba backend).
  The SW-vs-V4 parity test PASSED — confirms the fix works.
Found by O2DistAI team in production.
diff --git a/UTILS/dfextensions/groupby_regression/groupby_regression_sliding_window.py b/UTILS/dfextensions/groupby_regression/groupby_regression_sliding_window.py
@@ -885,6 +885,7 @@ def _fit_window_regression_numba(
         linear_columns: List[str],
         weights: Optional[str],
         min_stat: int,
+        fit_intercept: bool = True,
 ) -> Dict[Tuple[int, ...], Dict[str, Dict[str, Any]]]:
     """V2: Batch all window bins into a single Numba kernel call.
 
@@ -991,7 +992,7 @@ def _fit_window_regression_numba(
         _kernel_single(
             X_all, Y_all, W_all, offsets,
             n_bins, n_pred, n_params,
-            True,  # fit_intercept
+            fit_intercept,  # was hardcoded True — P0 bug fix
             max(1, int(min_stat)),  # min_stat
             False,  # compute_mad
             _INVALID_DETECT,
@@ -3179,6 +3180,7 @@ def make_sliding_window_fit(
                 linear_columns=linear_columns,
                 weights=weights,
                 min_stat=min_stat,
+                fit_intercept=fit_intercept,
             )
             _backend_used = "numba"
         else:
diff --git a/UTILS/dfextensions/groupby_regression/tests/test_fit_intercept_all_fitters.py b/UTILS/dfextensions/groupby_regression/tests/test_fit_intercept_all_fitters.py
@@ -0,0 +1,285 @@
+"""
+Tests for fit_intercept=False across ALL fitters.
+
+P0 bug: _fit_window_regression_numba hardcoded fit_intercept=True.
+These tests prevent recurrence across all code paths.
+
+Key invariants:
+  - All fitters with fit_intercept=False recover known polynomial coefficients
+  - All fitters agree with each other (cross-fitter parity)
+  - No fitter produces intercept columns when fit_intercept=False
+"""
+import numpy as np
+import pandas as pd
+import pytest
+
+try:
+    from groupby_regression_optimized import (
+        make_parallel_fit_v2,
+        make_parallel_fit_v3,
+        make_parallel_fit_v4,
+    )
+    from groupby_regression_sliding_window import make_sliding_window_fit
+except ImportError:
+    from ..groupby_regression_optimized import (
+        make_parallel_fit_v2,
+        make_parallel_fit_v3,
+        make_parallel_fit_v4,
+    )
+    from ..groupby_regression_sliding_window import make_sliding_window_fit
+
+
+# ── Fixture ──
+
+@pytest.fixture
+def poly_df():
+    """DataFrame with polynomial basis including constant term.
+
+    True model: y = 0.5 + 2*drift + 0.3*drift^2 - tgslp + noise(σ=0.05)
+
+    This is the exact pattern that triggers the bug:
+    fit_intercept=False with a constant column in linear_columns.
+    """
+    rng = np.random.RandomState(42)
+    frames = []
+    for sec in range(4):
+        for row_bin in range(5):
+            n = 200
+            drift = rng.uniform(-1, 1, n)
+            tgslp = rng.uniform(-0.5, 0.5, n)
+            y = 0.5 + 2 * drift + 0.3 * drift ** 2 - tgslp + rng.normal(0, 0.05, n)
+            frames.append(pd.DataFrame({
+                'sec': sec,
+                'row_bin': row_bin,
+                'drift': drift,
+                'tgslp': tgslp,
+                'y': y,
+                'const': np.ones(n),
+                'drift1': drift,
+                'drift2': drift ** 2,
+                'tgslp1': tgslp,
+            }))
+    return pd.concat(frames, ignore_index=True)
+
+
+LIN_COLS = ['const', 'drift1', 'drift2', 'tgslp1']
+GB_COLS = ['sec', 'row_bin']
+TRUE_COEFFS = {'const': 0.5, 'drift1': 2.0, 'drift2': 0.3, 'tgslp1': -1.0}
+
+
+# ═══════════════════════════════════════════════════════════════
+# Helper: check coefficients recovered
+# ═══════════════════════════════════════════════════════════════
+
+def _check_coefficients(dfGB, suffix, fitter_name):
+    """Verify recovered coefficients match true values."""
+    for col, true_val in TRUE_COEFFS.items():
+        col_name = f'y_slope_{col}{suffix}'
+        if col_name not in dfGB.columns:
+            pytest.fail(f"{fitter_name}: missing column {col_name}")
+        mean_val = dfGB[col_name].mean()
+        np.testing.assert_allclose(
+            mean_val, true_val, atol=0.15,
+            err_msg=f"{fitter_name}: {col} not recovered "
+                    f"(got {mean_val:.3f}, expected {true_val:.3f})")
+
+
+def _check_no_intercept_columns(dfGB, suffix, fitter_name):
+    """Verify no intercept columns in output."""
+    intercept_cols = [c for c in dfGB.columns if 'intercept' in c.lower()]
+    assert len(intercept_cols) == 0, \
+        f"{fitter_name}: fit_intercept=False produced intercept columns: {intercept_cols}"
+
+
+def _check_no_failures(dfGB, suffix, fitter_name):
+    """Verify no fit failures."""
+    qf_col = f'quality_flag{suffix}'
+    if qf_col in dfGB.columns:
+        n_failed = dfGB[qf_col].str.contains('failed').sum()
+        assert n_failed == 0, \
+            f"{fitter_name}: {n_failed}/{len(dfGB)} bins failed with fit_intercept=False"
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 1: V4 recovers coefficients (INVARIANCE — reference)
+# ═══════════════════════════════════════════════════════════════
+
+def test_v4_fit_intercept_false_recovers_coefficients(poly_df):
+    """V4 with fit_intercept=False recovers known polynomial coefficients."""
+    _, dfGB = make_parallel_fit_v4(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS, suffix='_test',
+        fit_intercept=False, min_stat=10,
+    )
+    _check_no_intercept_columns(dfGB, '_test', 'V4')
+    _check_coefficients(dfGB, '_test', 'V4')
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 2: V3 recovers coefficients (INVARIANCE)
+# ═══════════════════════════════════════════════════════════════
+
+def test_v3_fit_intercept_false_recovers_coefficients(poly_df):
+    """V3 with fit_intercept=False recovers known polynomial coefficients."""
+    _, dfGB = make_parallel_fit_v3(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS, suffix='_test',
+        fit_intercept=False, min_stat=10,
+    )
+    _check_no_intercept_columns(dfGB, '_test', 'V3')
+    _check_coefficients(dfGB, '_test', 'V3')
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 3: V2 recovers coefficients (INVARIANCE)
+# ═══════════════════════════════════════════════════════════════
+
+def test_v2_fit_intercept_false_recovers_coefficients(poly_df):
+    """V2 with fit_intercept=False recovers known polynomial coefficients."""
+    _, dfGB = make_parallel_fit_v2(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS, suffix='_test',
+        fit_intercept=False, min_stat=10,
+    )
+    _check_no_intercept_columns(dfGB, '_test', 'V2')
+    _check_coefficients(dfGB, '_test', 'V2')
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 4: SW fit recovers coefficients (INVARIANCE — bug target)
+# ═══════════════════════════════════════════════════════════════
+
+def test_sw_fit_intercept_false_recovers_coefficients(poly_df):
+    """SW with fit_intercept=False and window=0 recovers known coefficients.
+
+    This is the exact bug scenario: polynomial basis with constant term,
+    fit_intercept=False, sliding window path.
+    """
+    dfGB = make_sliding_window_fit(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS,
+        window_spec={'sec': 0, 'row_bin': 0},
+        suffix='_test', fit_intercept=False, min_stat=10,
+    )
+    _check_no_failures(dfGB, '_test', 'SW')
+    _check_no_intercept_columns(dfGB, '_test', 'SW')
+    _check_coefficients(dfGB, '_test', 'SW')
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 5: SW ≡ V4 with fit_intercept=False (INVARIANCE — gate)
+# ═══════════════════════════════════════════════════════════════
+
+def test_sw_fit_intercept_false_matches_v4(poly_df):
+    """SW fit with window=0 and fit_intercept=False ≡ V4 on same data."""
+    _, dfGB_v4 = make_parallel_fit_v4(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS, suffix='_ref',
+        fit_intercept=False, min_stat=10,
+    )
+
+    dfGB_sw = make_sliding_window_fit(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS,
+        window_spec={'sec': 0, 'row_bin': 0},
+        suffix='_ref', fit_intercept=False, min_stat=10,
+    )
+
+    v4 = dfGB_v4.sort_values(GB_COLS).reset_index(drop=True)
+    sw = dfGB_sw.sort_values(GB_COLS).reset_index(drop=True)
+
+    assert len(v4) == len(sw), f"Row count: v4={len(v4)}, sw={len(sw)}"
+
+    slope_cols = [c for c in v4.columns if 'slope' in c]
+    for col in slope_cols:
+        if col in sw.columns:
+            v4_vals = v4[col].values
+            sw_vals = sw[col].values
+            valid = np.isfinite(v4_vals) & np.isfinite(sw_vals)
+            if valid.sum() > 0:
+                np.testing.assert_allclose(
+                    sw_vals[valid], v4_vals[valid],
+                    rtol=1e-6, atol=1e-10,
+                    err_msg=f"SW ≠ V4 for {col} with fit_intercept=False")
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 6: SW numba ≡ SW numpy with fit_intercept=False (INVARIANCE)
+# ═══════════════════════════════════════════════════════════════
+
+def test_sw_fit_intercept_false_numba_matches_numpy(poly_df):
+    """Numba path ≡ numpy path with fit_intercept=False in SW."""
+    ws = {'row_bin': 1}
+
+    dfGB_numpy = make_sliding_window_fit(
+        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+        linear_columns=LIN_COLS, window_spec=ws,
+        suffix='_test', fit_intercept=False, min_stat=10,
+        backend='numpy',
+    )
+
+    try:
+        dfGB_numba = make_sliding_window_fit(
+            df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+            linear_columns=LIN_COLS, window_spec=ws,
+            suffix='_test', fit_intercept=False, min_stat=10,
+            backend='numba',
+        )
+    except Exception:
+        pytest.skip("Numba not available")
+
+    np_s = dfGB_numpy.sort_values(GB_COLS).reset_index(drop=True)
+    nb_s = dfGB_numba.sort_values(GB_COLS).reset_index(drop=True)
+
+    assert len(np_s) == len(nb_s)
+
+    for name, df_check in [('numpy', np_s), ('numba', nb_s)]:
+        _check_no_failures(df_check, '_test', f'SW-{name}')
+
+    slope_cols = [c for c in np_s.columns if 'slope' in c]
+    for col in slope_cols:
+        if col in nb_s.columns:
+            np_vals = np_s[col].values
+            nb_vals = nb_s[col].values
+            valid = np.isfinite(np_vals) & np.isfinite(nb_vals)
+            if valid.sum() > 0:
+                np.testing.assert_allclose(
+                    nb_vals[valid], np_vals[valid],
+                    rtol=1e-6, atol=1e-10,
+                    err_msg=f"numba ≠ numpy for {col} with fit_intercept=False")
+
+
+# ═══════════════════════════════════════════════════════════════
+# Test 7: Cross-fitter parity V2 ≡ V3 ≡ V4 (INVARIANCE)
+# ═══════════════════════════════════════════════════════════════
+
+def test_cross_fitter_parity_fit_intercept_false(poly_df):
+    """All per-bin fitters agree with fit_intercept=False."""
+    results = {}
+
+    for name, func in [('V2', make_parallel_fit_v2),
+                        ('V3', make_parallel_fit_v3),
+                        ('V4', make_parallel_fit_v4)]:
+        _, dfGB = func(
+            df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
+            linear_columns=LIN_COLS, suffix='_test',
+            fit_intercept=False, min_stat=10,
+        )
+        results[name] = dfGB.sort_values(GB_COLS).reset_index(drop=True)
+
+    # Compare V2 and V3 against V4 (reference)
+    ref = results['V4']
+    slope_cols = [c for c in ref.columns if 'slope' in c]
+
+    for name in ['V2', 'V3']:
+        other = results[name]
+        for col in slope_cols:
+            if col in other.columns:
+                ref_vals = ref[col].values
+                other_vals = other[col].values
+                valid = np.isfinite(ref_vals) & np.isfinite(other_vals)
+                if valid.sum() > 0:
+                    np.testing.assert_allclose(
+                        other_vals[valid], ref_vals[valid],
+                        rtol=1e-4, atol=1e-8,
+                        err_msg=f"{name} ≠ V4 for {col} with fit_intercept=False")