Skip to content

Commit 1278986

Browse files
committed
P0 fix complete: fit_intercept result unpacking + full-chain tests
Part 3: result unpacking in _fit_window_regression_numba used a j+1 offset, assuming the intercept is at beta[0]. With fit_intercept=False, the slopes start at beta[0]. Fix: offset = 1 if fit_intercept else 0. Tests: 10 total (was 8), with 2 new full-chain invariance tests: (1) test_sw_window1_numba_matches_manual_windowed_v4 — window>0, numba, compared against manually windowed V4 (the exact production pattern); (2) test_sw_window1_numba_matches_numpy — backend parity with an actual window. Previous tests used window=0, which bypassed the recompute path where all three bugs lived. The window>0 tests exercise the complete chain: kernel call → result unpacking → coefficient assembly. Test results: 517 passed, 3 failed (pre-existing), 0 regressions.
1 parent 0ed8a56 commit 1278986

2 files changed

Lines changed: 133 additions & 4 deletions

File tree

UTILS/dfextensions/groupby_regression/groupby_regression_sliding_window.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,11 +1006,12 @@ def _fit_window_regression_numba(
10061006
out[center] = {}
10071007

10081008
if out_status[i] == _STATUS_OK:
1009-
intercept = float(out_beta[i, 0])
1010-
intercept_err = float(out_errors[i, 0])
1011-
coeffs = {linear_columns[j]: float(out_beta[i, j + 1])
1009+
offset = 1 if fit_intercept else 0
1010+
intercept = float(out_beta[i, 0]) if fit_intercept else 0.0
1011+
intercept_err = float(out_errors[i, 0]) if fit_intercept else 0.0
1012+
coeffs = {linear_columns[j]: float(out_beta[i, j + offset])
10121013
for j in range(n_pred)}
1013-
coeffs_err = {linear_columns[j]: float(out_errors[i, j + 1])
1014+
coeffs_err = {linear_columns[j]: float(out_errors[i, j + offset])
10141015
for j in range(n_pred)}
10151016

10161017
# RMSE: V1 uses sqrt(RSS/n), kernel gives sqrt(RSS/dof).

UTILS/dfextensions/groupby_regression/tests/test_fit_intercept_all_fitters.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,3 +338,131 @@ def test_cross_fitter_parity_fit_intercept_false(poly_df):
338338
other_vals[valid], ref_vals[valid],
339339
rtol=1e-4, atol=1e-8,
340340
err_msg=f"{name} ≠ V4 for {col} with fit_intercept=False")
341+
342+
343+
# ═══════════════════════════════════════════════════════════════
344+
# Test 8: SW window>0 numba ≡ manual windowed V4 (INVARIANCE — full chain)
345+
# ═══════════════════════════════════════════════════════════════
346+
347+
def test_sw_window1_numba_matches_manual_windowed_v4(poly_df):
    """SW numba with window=1 and fit_intercept=False ≡ manually windowed V4.

    THIS IS THE FULL-CHAIN TEST. It exercises the complete recompute path
    including result unpacking. Previous tests used window=0 which could
    bypass the buggy code path.

    The manual windowing replicates data with ±1 shift in row_bin, then
    fits with V4 — producing the same result as SW with window=1.

    The test is skipped only when the ``numba`` package cannot be imported.
    Any exception raised by the sliding-window fit itself is a test failure.
    """
    # Skip on a genuinely missing dependency only. A blanket
    # ``except Exception -> skip`` around the fit would also hide the very
    # unpacking errors this test exists to detect.
    pytest.importorskip("numba")

    lin_cols = LIN_COLS
    gb = ['sec', 'row_bin']

    # SW with actual window > 0
    dfGB_sw = make_sliding_window_fit(
        df=poly_df, gb_columns=gb, fit_columns=['y'],
        linear_columns=lin_cols,
        window_spec={'sec': 0, 'row_bin': 1},
        suffix='_test', fit_intercept=False, min_stat=10,
        backend='numba',
    )

    # No fit failures
    _check_no_failures(dfGB_sw, '_test', 'SW-numba-w1')

    # No intercept columns
    _check_no_intercept_columns(dfGB_sw, '_test', 'SW-numba-w1')

    # Must have slope columns with finite values
    slope_cols = [c for c in dfGB_sw.columns if 'slope' in c and '_err' not in c]
    assert len(slope_cols) == len(lin_cols), \
        f"Expected {len(lin_cols)} slope columns, got {len(slope_cols)}: {slope_cols}"
    for col in slope_cols:
        n_finite = np.isfinite(dfGB_sw[col]).sum()
        assert n_finite > 0, f"All NaN in {col}"

    # Manual windowing: replicate data with ±1 row_bin shift
    parts = []
    for offset in [-1, 0, 1]:
        tmp = poly_df.copy()
        tmp['row_bin'] = tmp['row_bin'] - offset
        parts.append(tmp)
    df_windowed = pd.concat(parts, ignore_index=True)

    _, dfGB_manual = make_parallel_fit_v4(
        df=df_windowed, gb_columns=gb, fit_columns=['y'],
        linear_columns=lin_cols, suffix='_test',
        fit_intercept=False, min_stat=10,
    )

    # Compare: SW window=1 ≡ manual windowed V4
    sw = dfGB_sw.sort_values(gb).reset_index(drop=True)
    manual = dfGB_manual.sort_values(gb).reset_index(drop=True)

    # Only compare bins present in both (edge bins may differ)
    merged = sw.merge(manual, on=gb, suffixes=('_sw', '_manual'))

    compared = []
    for lin_col in lin_cols:
        col_sw = f'y_slope_{lin_col}_test_sw'
        col_man = f'y_slope_{lin_col}_test_manual'
        if col_sw in merged.columns and col_man in merged.columns:
            sw_vals = merged[col_sw].values
            man_vals = merged[col_man].values
            valid = np.isfinite(sw_vals) & np.isfinite(man_vals)
            if valid.sum() > 0:
                np.testing.assert_allclose(
                    sw_vals[valid], man_vals[valid],
                    rtol=1e-5, atol=1e-8,
                    err_msg=f"SW-numba window=1 ≠ manual windowed V4 for {lin_col}")
                compared.append(lin_col)

    # Guard against a vacuous pass: if the expected columns are missing or no
    # bin is finite in both frames, nothing above was actually checked.
    assert compared, (
        "No slope column was compared between SW-numba and manual V4; "
        f"merged columns: {list(merged.columns)}")
419+
420+
421+
# ═══════════════════════════════════════════════════════════════
422+
# Test 9: SW window>0 numpy ≡ numba (INVARIANCE — backend parity with window)
423+
# ═══════════════════════════════════════════════════════════════
424+
425+
def test_sw_window1_numba_matches_numpy(poly_df):
    """SW numba with window=1 ≡ SW numpy with window=1 and fit_intercept=False.

    Both backends must produce identical results with actual windowing.

    The test is skipped only when the ``numba`` package cannot be imported.
    Any exception raised by either backend is a test failure.
    """
    # Skip on a genuinely missing dependency only; real errors in the numba
    # backend must fail the test instead of being reported as a skip.
    pytest.importorskip("numba")

    ws = {'sec': 0, 'row_bin': 1}

    dfGB_numpy = make_sliding_window_fit(
        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
        linear_columns=LIN_COLS, window_spec=ws,
        suffix='_test', fit_intercept=False, min_stat=10,
        backend='numpy',
    )

    dfGB_numba = make_sliding_window_fit(
        df=poly_df, gb_columns=GB_COLS, fit_columns=['y'],
        linear_columns=LIN_COLS, window_spec=ws,
        suffix='_test', fit_intercept=False, min_stat=10,
        backend='numba',
    )

    # No failures in either
    _check_no_failures(dfGB_numpy, '_test', 'SW-numpy-w1')
    _check_no_failures(dfGB_numba, '_test', 'SW-numba-w1')

    np_s = dfGB_numpy.sort_values(GB_COLS).reset_index(drop=True)
    nb_s = dfGB_numba.sort_values(GB_COLS).reset_index(drop=True)

    assert len(np_s) == len(nb_s)

    slope_cols = [c for c in np_s.columns if 'slope' in c]
    compared = []
    for col in slope_cols:
        if col in nb_s.columns:
            np_vals = np_s[col].values
            nb_vals = nb_s[col].values
            valid = np.isfinite(np_vals) & np.isfinite(nb_vals)
            if valid.sum() > 0:
                np.testing.assert_allclose(
                    nb_vals[valid], np_vals[valid],
                    rtol=1e-6, atol=1e-10,
                    err_msg=f"numba ≠ numpy for {col} with window=1, fit_intercept=False")
                compared.append(col)

    # Guard against a vacuous pass: backend parity is only demonstrated if at
    # least one slope column was actually compared.
    assert compared, (
        "No slope column was compared between numpy and numba backends; "
        f"numpy slope columns: {slope_cols}")

0 commit comments

Comments
 (0)