Skip to content

Commit ec6b8b8

Browse files
committed
fix: enhance NaN handling in dict_sweep and _val_to_delete with improved type checks and tests
1 parent 6eb5acb commit ec6b8b8

2 files changed

Lines changed: 119 additions & 55 deletions

File tree

biothings/utils/dataload.py

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,38 +21,50 @@
2121

2222
csv.field_size_limit(10000000) # default is 131072, too small for some big files
2323

24+
25+
def _missing_value_kind(val):
26+
"""Return a stable kind for NaN-like values without importing optional deps."""
27+
val_cls = val.__class__
28+
cls_module = getattr(val_cls, "__module__", "")
29+
cls_name = getattr(val_cls, "__name__", "")
30+
31+
if (cls_module == "pandas" or cls_module.startswith("pandas.")) and cls_name in ("NAType", "NaTType"):
32+
return cls_name
33+
34+
try:
35+
if math.isnan(val):
36+
return "NaN"
37+
except (TypeError, ValueError):
38+
pass
39+
40+
return None
41+
42+
2443
def _val_to_delete(val, vals):
2544
"""Return True if val is considered as a value to delete, False otherwise.
2645
2746
NaN-like values (float NaN, pandas.NA, pandas.NaT) are only removed when
2847
explicitly included in the vals list.
2948
"""
30-
try:
31-
# float('nan') != float('nan') by IEEE 754, so `val in vals` always
32-
# returns False for NaN floats. Handle it with math.isnan instead.
33-
if isinstance(val, float) and math.isnan(val):
34-
return any(isinstance(v, float) and math.isnan(v) for v in vals)
35-
to_delete = val in vals
36-
except (TypeError, ValueError):
37-
# pandas.NA raises TypeError on comparison; pandas.NaT may raise
38-
# ValueError. Catch both so either type is handled.
49+
if is_str(vals):
50+
vals = [vals]
51+
52+
val_missing_kind = _missing_value_kind(val)
53+
54+
for candidate in vals:
55+
candidate_missing_kind = _missing_value_kind(candidate)
56+
if val_missing_kind or candidate_missing_kind:
57+
if val_missing_kind == candidate_missing_kind:
58+
return True
59+
continue
60+
3961
try:
40-
val_is_nan = val.__module__ == "pandas" and val.__class__.__name__ in ("NAType", "NaTType")
41-
except AttributeError:
42-
return False
43-
# Only remove if the caller explicitly put a pandas NA/NaT in vals.
44-
nan_in_vals = False
45-
for v in vals:
46-
try:
47-
if v.__module__ == "pandas" and v.__class__.__name__ in ("NAType", "NaTType"):
48-
nan_in_vals = True
49-
break
50-
except AttributeError:
51-
# Regular values (str, int, …) don't have __module__.
52-
continue
53-
to_delete = val_is_nan and nan_in_vals
62+
if val == candidate:
63+
return True
64+
except (TypeError, ValueError):
65+
continue
5466

55-
return to_delete
67+
return False
5668

5769

5870
def dict_sweep(d, vals=None, remove_invalid_list=False):
@@ -96,17 +108,16 @@ def dict_sweep(d, vals=None, remove_invalid_list=False):
96108
else:
97109
d[key] = val
98110
else:
99-
new_val = []
100-
for item in val:
111+
i = 0
112+
while i < len(val):
113+
item = val[i]
101114
if _val_to_delete(item, vals):
102-
continue
115+
del val[i]
103116
elif isinstance(item, dict):
104117
dict_sweep(item, vals, remove_invalid_list=remove_invalid_list)
105-
new_val.append(item)
106-
if not new_val:
118+
i += 1
119+
if not val:
107120
del d[key]
108-
else:
109-
d[key] = new_val
110121
elif isinstance(val, dict):
111122
dict_sweep(val, vals, remove_invalid_list=remove_invalid_list)
112123
# if len(val) == 0:

tests/utils/test_dataload.py

Lines changed: 77 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,15 @@
1414

1515
# ---------------------------------------------------------------------------
1616
# Fake pandas-like NA / NaT types for testing without importing pandas.
17-
# __module__ is set to "pandas" so that _val_to_delete's module check works.
17+
# __module__ mirrors real pandas internals closely enough to exercise fallback
18+
# detection without importing pandas.
1819
# ---------------------------------------------------------------------------
1920

21+
2022
class NAType:
2123
"""Mimics pandas.NA (pandas.core.arrays.masked.NAType)."""
22-
__module__ = "pandas"
24+
25+
__module__ = "pandas._libs.missing"
2326

2427
def __bool__(self):
2528
raise TypeError("boolean value of NA is ambiguous")
@@ -36,7 +39,8 @@ def __repr__(self):
3639

3740
class NaTType:
3841
"""Mimics pandas.NaT (pandas._libs.tslibs.nattype.NaTType)."""
39-
__module__ = "pandas"
42+
43+
__module__ = "pandas._libs.tslibs.nattype"
4044

4145
def __bool__(self):
4246
raise TypeError("boolean value of NaT is ambiguous")
@@ -58,10 +62,21 @@ def __repr__(self):
5862
_VALS_WITH_NAN = _DEFAULT_VALS + [float("nan"), _NA, _NaT]
5963

6064

65+
@pytest.fixture(scope="module")
66+
def pandas():
67+
return pytest.importorskip("pandas")
68+
69+
70+
@pytest.fixture(scope="module")
71+
def numpy():
72+
return pytest.importorskip("numpy")
73+
74+
6175
# ---------------------------------------------------------------------------
6276
# _val_to_delete helper tests
6377
# ---------------------------------------------------------------------------
6478

79+
6580
class TestValToDelete:
6681
# -- default vals (no NaN entries) -----------------------------------
6782

@@ -73,6 +88,9 @@ def test_regular_values_not_deleted(self):
7388
for val in [0, 1, -1, "hello", None, [], {}, 0.0, 1.5, True, False]:
7489
assert _val_to_delete(val, _DEFAULT_VALS) is False, f"should keep {val!r}"
7590

91+
def test_string_vals_matched(self):
92+
assert _val_to_delete("NA", "NA") is True
93+
7694
def test_float_nan_not_deleted_by_default(self):
7795
"""float NaN is kept when vals does not contain a NaN float."""
7896
assert _val_to_delete(float("nan"), _DEFAULT_VALS) is False
@@ -96,11 +114,19 @@ def test_na_deleted_when_in_vals(self):
96114
def test_nat_deleted_when_in_vals(self):
97115
assert _val_to_delete(_NaT, _VALS_WITH_NAN) is True
98116

117+
def test_na_and_nat_are_not_interchangeable(self):
118+
assert _val_to_delete(_NA, _DEFAULT_VALS + [_NaT]) is False
119+
assert _val_to_delete(_NaT, _DEFAULT_VALS + [_NA]) is False
120+
121+
def test_regular_match_after_na_like_value(self):
122+
assert _val_to_delete("", [_NA, ""]) is True
123+
99124

100125
# ---------------------------------------------------------------------------
101126
# dict_sweep — default vals (NaN kept)
102127
# ---------------------------------------------------------------------------
103128

129+
104130
class TestDictSweepDefaultKeepsNan:
105131
def test_float_nan_kept(self):
106132
d = {"a": 1, "b": float("nan")}
@@ -135,6 +161,7 @@ def test_default_vals_still_removed(self):
135161
# dict_sweep — opt-in NaN removal (NaN in vals)
136162
# ---------------------------------------------------------------------------
137163

164+
138165
class TestDictSweepOptInNanRemoval:
139166
def test_float_nan_removed(self):
140167
d = {"a": 1, "b": float("nan")}
@@ -161,6 +188,7 @@ def test_multiple_nan_types_removed(self):
161188
# dict_sweep — NaN inside lists (opt-in)
162189
# ---------------------------------------------------------------------------
163190

191+
164192
class TestDictSweepNanInList:
165193
def test_nan_removed_from_list(self):
166194
d = {"a": [1, float("nan"), 2]}
@@ -192,11 +220,22 @@ def test_all_nan_list_removed_with_remove_invalid_list(self):
192220
result = dict_sweep(d, vals=_VALS_WITH_NAN, remove_invalid_list=True)
193221
assert "a" not in result
194222

223+
def test_all_invalid_list_preserves_false_mode_behavior(self):
224+
d = {"gene": [None, None], "site": ["Intron", None], "snp_build": 136}
225+
result = dict_sweep(d, vals=[None], remove_invalid_list=False)
226+
assert result == {"gene": [None], "site": ["Intron"], "snp_build": 136}
227+
228+
def test_all_invalid_list_removed_with_remove_invalid_list(self):
229+
d = {"gene": [None, None], "site": ["Intron", None], "snp_build": 136}
230+
result = dict_sweep(d, vals=[None], remove_invalid_list=True)
231+
assert result == {"site": ["Intron"], "snp_build": 136}
232+
195233

196234
# ---------------------------------------------------------------------------
197235
# dict_sweep — NaN in nested dicts (opt-in)
198236
# ---------------------------------------------------------------------------
199237

238+
200239
class TestDictSweepNanNested:
201240
def test_nan_in_nested_dict(self):
202241
d = {"a": {"b": float("nan"), "c": 1}}
@@ -223,6 +262,7 @@ def test_nan_in_list_of_dicts(self):
223262
# dict_sweep — default vals behaviour unchanged
224263
# ---------------------------------------------------------------------------
225264

265+
226266
class TestDictSweepDefaultBehavior:
227267
def test_normal_values_kept(self):
228268
d = {"a": 1, "b": "hello", "c": [1, 2], "d": {"nested": True}}
@@ -234,38 +274,51 @@ def test_mixed_nan_and_default_vals_with_optin(self):
234274
result = dict_sweep(d, vals=_VALS_WITH_NAN)
235275
assert result == {"d": "keep"}
236276

277+
def test_string_vals_supported(self):
278+
d = {"a": "NA", "b": "keep", "c": ["NA", "keep"]}
279+
result = dict_sweep(d, vals="NA")
280+
assert result == {"b": "keep", "c": ["keep"]}
281+
237282

238283
# ---------------------------------------------------------------------------
239-
# Tests using real pandas and numpy types
284+
# Tests using real pandas and numpy types, when the optional deps are installed.
240285
# ---------------------------------------------------------------------------
241286

242-
pandas = pytest.importorskip("pandas")
243-
numpy = pytest.importorskip("numpy")
244-
245287

246288
class TestValToDeleteRealTypes:
247-
def test_numpy_nan_not_deleted_by_default(self):
289+
def test_numpy_nan_not_deleted_by_default(self, numpy):
248290
assert _val_to_delete(numpy.nan, _DEFAULT_VALS) is False
249291

250-
def test_pandas_na_not_deleted_by_default(self):
292+
def test_pandas_na_not_deleted_by_default(self, pandas):
251293
assert _val_to_delete(pandas.NA, _DEFAULT_VALS) is False
252294

253-
def test_pandas_nat_not_deleted_by_default(self):
295+
def test_pandas_nat_not_deleted_by_default(self, pandas):
254296
assert _val_to_delete(pandas.NaT, _DEFAULT_VALS) is False
255297

256-
def test_numpy_nan_deleted_when_in_vals(self):
298+
def test_numpy_nan_deleted_when_in_vals(self, numpy):
257299
vals_with_nan = _DEFAULT_VALS + [float("nan")]
258300
assert _val_to_delete(numpy.nan, vals_with_nan) is True
259301

260-
def test_pandas_na_deleted_when_in_vals(self):
302+
def test_pandas_na_deleted_when_in_vals(self, pandas):
261303
vals_with_na = _DEFAULT_VALS + [pandas.NA]
262304
assert _val_to_delete(pandas.NA, vals_with_na) is True
263305

264-
def test_pandas_nat_deleted_when_in_vals(self):
306+
def test_pandas_nat_deleted_when_in_vals(self, pandas):
265307
vals_with_nat = _DEFAULT_VALS + [pandas.NaT]
266308
assert _val_to_delete(pandas.NaT, vals_with_nat) is True
267309

268-
def test_numpy_float64_nan_deleted_when_in_vals(self):
310+
def test_pandas_na_and_nat_are_not_interchangeable(self, pandas):
311+
assert _val_to_delete(pandas.NA, _DEFAULT_VALS + [pandas.NaT]) is False
312+
assert _val_to_delete(pandas.NaT, _DEFAULT_VALS + [pandas.NA]) is False
313+
314+
def test_regular_match_after_pandas_na(self, pandas):
315+
assert _val_to_delete("", [pandas.NA, ""]) is True
316+
317+
def test_numpy_float32_nan_deleted_when_in_vals(self, numpy):
318+
vals_with_nan = _DEFAULT_VALS + [float("nan")]
319+
assert _val_to_delete(numpy.float32("nan"), vals_with_nan) is True
320+
321+
def test_numpy_float64_nan_deleted_when_in_vals(self, numpy):
269322
vals_with_nan = _DEFAULT_VALS + [float("nan")]
270323
assert _val_to_delete(numpy.float64("nan"), vals_with_nan) is True
271324

@@ -276,52 +329,52 @@ class TestDictSweepRealPandas:
276329
def _vals_with(self, *extras):
277330
return _DEFAULT_VALS + list(extras)
278331

279-
def test_pandas_na_top_level(self):
332+
def test_pandas_na_top_level(self, pandas):
280333
d = {"a": 1, "b": pandas.NA}
281334
result = dict_sweep(d, vals=self._vals_with(pandas.NA))
282335
assert result == {"a": 1}
283336

284-
def test_pandas_nat_top_level(self):
337+
def test_pandas_nat_top_level(self, pandas):
285338
d = {"a": 1, "b": pandas.NaT}
286339
result = dict_sweep(d, vals=self._vals_with(pandas.NaT))
287340
assert result == {"a": 1}
288341

289-
def test_numpy_nan_top_level(self):
342+
def test_numpy_nan_top_level(self, numpy):
290343
d = {"a": 1, "b": numpy.nan}
291344
result = dict_sweep(d, vals=self._vals_with(float("nan")))
292345
assert result == {"a": 1}
293346

294-
def test_pandas_na_in_list(self):
347+
def test_pandas_na_in_list(self, pandas):
295348
d = {"a": [1, pandas.NA, 2]}
296349
result = dict_sweep(d, vals=self._vals_with(pandas.NA))
297350
assert result == {"a": [1, 2]}
298351

299-
def test_pandas_nat_in_list(self):
352+
def test_pandas_nat_in_list(self, pandas):
300353
d = {"a": [1, pandas.NaT, 2]}
301354
result = dict_sweep(d, vals=self._vals_with(pandas.NaT))
302355
assert result == {"a": [1, 2]}
303356

304-
def test_numpy_nan_in_list(self):
357+
def test_numpy_nan_in_list(self, numpy):
305358
d = {"a": [1, numpy.nan, 2]}
306359
result = dict_sweep(d, vals=self._vals_with(float("nan")))
307360
assert result == {"a": [1, 2]}
308361

309-
def test_pandas_na_in_nested_dict(self):
362+
def test_pandas_na_in_nested_dict(self, pandas):
310363
d = {"a": {"b": pandas.NA, "c": 1}}
311364
result = dict_sweep(d, vals=self._vals_with(pandas.NA))
312365
assert result == {"a": {"c": 1}}
313366

314-
def test_pandas_na_in_list_with_remove_invalid_list(self):
367+
def test_pandas_na_in_list_with_remove_invalid_list(self, pandas):
315368
d = {"a": [pandas.NA, pandas.NaT, "valid"]}
316369
result = dict_sweep(d, vals=self._vals_with(pandas.NA, pandas.NaT), remove_invalid_list=True)
317370
assert result == {"a": ["valid"]}
318371

319-
def test_all_real_nan_types_removed(self):
372+
def test_all_real_nan_types_removed(self, pandas, numpy):
320373
d = {"a": numpy.nan, "b": pandas.NA, "c": pandas.NaT, "d": "keep"}
321374
result = dict_sweep(d, vals=self._vals_with(float("nan"), pandas.NA, pandas.NaT))
322375
assert result == {"d": "keep"}
323376

324-
def test_real_nan_kept_by_default(self):
377+
def test_real_nan_kept_by_default(self, pandas, numpy):
325378
"""Without opting in, real NaN types are preserved."""
326379
d = {"a": numpy.nan, "b": pandas.NA, "c": pandas.NaT, "d": "keep"}
327380
result = dict_sweep(d)

0 commit comments

Comments
 (0)