From 79fad24f9247ac8f8244357fcd2e4945d075821a Mon Sep 17 00:00:00 2001 From: SuryaSunil287 Date: Sun, 10 May 2026 14:18:40 +0530 Subject: [PATCH 1/2] fix: dynamic fill_zero in get_binned_data to prevent KL/PSI distortion The hardcoded fill value (0.0001 or min/1e6) for zero-probability bins in get_binned_data caused two problems: 1. When min non-zero percent <= 0.0001, filling with min/1e6 produced an astronomically small value that inflated KL divergence scores. 2. Reference and current distributions received different fill values, introducing asymmetry in divergence metrics. Fix: compute a single fill value as min_nonzero_both / 10, where min_nonzero_both is the smallest non-zero probability across both distributions. This guarantees the fill is always smaller than any genuine probability and is applied symmetrically. Fixes #334 --- .../legacy/calculations/stattests/utils.py | 22 ++++----- .../spark/calculations/stattests/utils.py | 18 ++----- tests/stattests/test_stattests.py | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/src/evidently/legacy/calculations/stattests/utils.py b/src/evidently/legacy/calculations/stattests/utils.py index 1957632451..f2beeba520 100644 --- a/src/evidently/legacy/calculations/stattests/utils.py +++ b/src/evidently/legacy/calculations/stattests/utils.py @@ -42,20 +42,16 @@ def get_binned_data( current_percents = np.array([current_feature_dict[key] / len(current_data) for key in keys]) if feel_zeroes: - np.place( - reference_percents, - reference_percents == 0, - min(reference_percents[reference_percents != 0]) / 10**6 - if min(reference_percents[reference_percents != 0]) <= 0.0001 - else 0.0001, - ) - np.place( - current_percents, - current_percents == 0, - min(current_percents[current_percents != 0]) / 10**6 - if min(current_percents[current_percents != 0]) <= 0.0001 - else 0.0001, + # Use a single fill value derived from both distributions so that + # reference and current are treated 
symmetrically. The fill is 1/10 of + # the smallest genuine non-zero probability seen in either distribution, + # which keeps it below every observed probability and limits its effect on KL, PSI, etc. + all_nonzero = np.concatenate( + [reference_percents[reference_percents > 0], current_percents[current_percents > 0]] ) + fill_zero_value = float(all_nonzero.min()) / 10 if all_nonzero.size > 0 else 1e-4 + reference_percents = np.where(reference_percents == 0, fill_zero_value, reference_percents) + current_percents = np.where(current_percents == 0, fill_zero_value, current_percents) return reference_percents, current_percents diff --git a/src/evidently/legacy/spark/calculations/stattests/utils.py b/src/evidently/legacy/spark/calculations/stattests/utils.py index e43ab40194..aa895c98ea 100644 --- a/src/evidently/legacy/spark/calculations/stattests/utils.py +++ b/src/evidently/legacy/spark/calculations/stattests/utils.py @@ -52,19 +52,11 @@ def get_binned_data( current_percents = current_percents / current_percents.sum() if fill_zeroes: - np.place( - reference_percents, - reference_percents == 0, - min(reference_percents[reference_percents != 0]) / 10**6 - if min(reference_percents[reference_percents != 0]) <= 0.0001 - else 0.0001, - ) - np.place( - current_percents, - current_percents == 0, - min(current_percents[current_percents != 0]) / 10**6 - if min(current_percents[current_percents != 0]) <= 0.0001 - else 0.0001, + all_nonzero = np.concatenate( + [reference_percents[reference_percents > 0], current_percents[current_percents > 0]] ) + fill_zero_value = float(all_nonzero.min()) / 10 if all_nonzero.size > 0 else 1e-4 + reference_percents = np.where(reference_percents == 0, fill_zero_value, reference_percents) + current_percents = np.where(current_percents == 0, fill_zero_value, current_percents) return reference_percents, current_percents diff --git a/tests/stattests/test_stattests.py b/tests/stattests/test_stattests.py index 33ed818466..3808a69941 100644 --- 
a/tests/stattests/test_stattests.py +++ b/tests/stattests/test_stattests.py @@ -5,6 +5,8 @@ from scipy import stats from evidently.legacy.calculations.stattests import z_stat_test +from evidently.legacy.calculations.stattests.kl_div import kl_div_stat_test +from evidently.legacy.calculations.stattests.utils import get_binned_data from evidently.legacy.calculations.stattests.anderson_darling_stattest import anderson_darling_test from evidently.legacy.calculations.stattests.chisquare_stattest import chi_stat_test from evidently.legacy.calculations.stattests.cramer_von_mises_stattest import cramer_von_mises @@ -335,3 +337,48 @@ def test_t_test() -> None: reference = pd.Series([38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0]) current = pd.Series([39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8]) assert t_test.func(reference, current, "num", 0.05) == (approx(0.084, abs=1e-3), False) + + +def test_get_binned_data_fill_zero_is_dynamic() -> None: + # Regression test for https://github.com/evidentlyai/evidently/issues/334. + # When the minimum non-zero probability is smaller than 0.0001 the old code + # used min/1e6 as fill, which is astronomically small and inflates KL divergence. + # The fix uses min/10 so the fill stays proportional to the real data. + from evidently.legacy.core import ColumnType + + # Build a categorical distribution where one bucket has a very small percent. + # 99_999 "a" and 1 "b" → p("b") ≈ 1e-5, well below the old 0.0001 threshold. 
+ reference = pd.Series(["a"] * 99_999 + ["b"]) + current = pd.Series(["a"] * 99_999 + ["c"]) # "c" absent in reference, "b" absent in current + + ref_pct, cur_pct = get_binned_data(reference, current, ColumnType.Categorical, n=10) + + # All zero slots must be filled with a positive value + assert np.all(ref_pct > 0), "reference percents must be strictly positive after fill" + assert np.all(cur_pct > 0), "current percents must be strictly positive after fill" + + # The fill value must be strictly smaller than the smallest genuine probability + min_genuine = min(reference.value_counts(normalize=True).min(), current.value_counts(normalize=True).min()) + fill_used = min(ref_pct.min(), cur_pct.min()) + assert fill_used < min_genuine, "fill value must be smaller than any real non-zero probability" + + # KL divergence must not be inflated: identical distributions should score near 0 + ref_same = pd.Series(["a"] * 50 + ["b"] * 50) + cur_same = pd.Series(["a"] * 50 + ["b"] * 50) + score_same, _ = kl_div_stat_test.func(ref_same, cur_same, ColumnType.Categorical, 0.1) + assert score_same == approx(0.0, abs=1e-6), "KL of identical distributions must be ~0" + + +def test_get_binned_data_fill_zero_symmetric() -> None: + # Both reference and current must receive the same fill value so that + # KL divergence is not artificially asymmetric. + from evidently.legacy.core import ColumnType + + reference = pd.Series(["a"] * 90 + ["b"] * 10) + current = pd.Series(["a"] * 95 + ["c"] * 5) # "b" zero in current, "c" zero in reference + + ref_pct, cur_pct = get_binned_data(reference, current, ColumnType.Categorical, n=10) + + # The fill value is derived from both distributions combined, so the minimum + # of ref and current percents must be identical. 
+ assert ref_pct.min() == approx(cur_pct.min(), rel=1e-9), "fill value must be identical for ref and current" From 62a5f0c6d83b00c77cdbee22b77935a829185ab7 Mon Sep 17 00:00:00 2001 From: SuryaSunil287 Date: Mon, 18 May 2026 21:45:26 +0530 Subject: [PATCH 2/2] fix: GroupBy(ValueDrift) widget title now includes group-by context When ValueDrift is wrapped in GroupBy, the counter widget label was hardcoded as "Drift in column ''" regardless of the GroupBy label. Thread display_name() through _render() so the patched name ("... group by '' for label: '