Fixes #899: fix boolean short-circuit bug in dask zonal.stats()

brendancol · brendancol · commit 94a474761a8d · 2026-02-26T18:24:13.000-08:00
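Both flag-selection conditions in `_stats_dask_numpy` --
`if 'mean' or 'std' or 'var' in stats_funcs` and
`if 'std' or 'var' in stats_funcs` -- always evaluated as truthy:
`in` binds tighter than `or`, so the first parses as
`'mean' or 'std' or ('var' in stats_funcs)`, and a non-empty string
literal such as 'mean' is truthy. This caused compute_sum,
compute_count, and compute_sum_squares to always be set, wasting work
on every dask zonal.stats() call regardless of which stats were
requested.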

Fix: use `any(s in stats_funcs for s in (...))` for correct membership
testing. Add regression tests covering 7 stat subsets on both numpy
and dask backends to exercise each compute flag independently.
diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py
@@ -590,6 +590,47 @@ def test_majority_with_ties(backend):
     check_results(backend, df_result, expected_result)
 
 
+@pytest.mark.parametrize("stats_funcs, expected_cols", [
+    (['min', 'max'], ['zone', 'min', 'max']),
+    (['mean'], ['zone', 'mean']),
+    (['std'], ['zone', 'std']),
+    (['var'], ['zone', 'var']),
+    (['count'], ['zone', 'count']),
+    (['sum'], ['zone', 'sum']),
+    (['min', 'max', 'count'], ['zone', 'min', 'max', 'count']),
+])
+@pytest.mark.parametrize("backend", ['numpy', 'dask+numpy'])
+def test_stats_subset_columns(backend, data_zones, data_values_2d,
+                              stats_funcs, expected_cols):
+    """Requesting a subset of stats returns only those columns.
+
+    Regression test for GH-899: the dask path had a boolean short-circuit
+    bug (``if 'mean' or 'std' or 'var' in stats_funcs``) that always
+    evaluated to True, causing unnecessary intermediate stats to be
+    computed.  After the fix, each subset exercises a distinct code path
+    for compute_sum / compute_count / compute_sum_squares flags.
+    """
+    if 'dask' in backend and not dask_array_available():
+        pytest.skip("Requires Dask")
+
+    df_result = stats(zones=data_zones, values=data_values_2d,
+                      stats_funcs=stats_funcs)
+
+    # Verify values are correct for the requested stats
+    all_expected = {
+        'zone':  [0, 1, 2, 3],
+        'mean':  [0, 1, 2, 2.4],
+        'max':   [0, 1, 2, 3],
+        'min':   [0, 1, 2, 0],
+        'sum':   [0, 6, 8, 12],
+        'std':   [0, 0, 0, 1.2],
+        'var':   [0, 0, 0, 1.44],
+        'count': [5, 6, 4, 5],
+    }
+    expected = {k: all_expected[k] for k in expected_cols}
+    check_results(backend, df_result, expected)
+
+
 def test_zonal_stats_against_qgis(elevation_raster_no_nans, raster, qgis_zonal_stats):
     stats_funcs = list(set(qgis_zonal_stats.keys()) - set(['zone']))
     zones_agg = create_test_raster(raster)
diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py
@@ -240,11 +240,11 @@ def _stats_dask_numpy(
     compute_sum = False
     compute_count = False
 
-    if 'mean' or 'std' or 'var' in stats_funcs:
+    if any(s in stats_funcs for s in ('mean', 'std', 'var')):
         compute_sum = True
         compute_count = True
 
-    if 'std' or 'var' in stats_funcs:
+    if any(s in stats_funcs for s in ('std', 'var')):
         compute_sum_squares = True
 
     basis_stats = [s for s in _DASK_BLOCK_STATS if s in stats_funcs]