Skip to content

Commit 47c9c49

Browse files
authored
Merge pull request #442 from biothings/fix/dict-sweep-nan-handling
fix: handle NaN-like values in dict_sweep
2 parents 2775ebf + ec6b8b8 commit 47c9c49

2 files changed

Lines changed: 442 additions & 7 deletions

File tree

biothings/utils/dataload.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# from __future__ import unicode_literals
1010
import itertools
1111
import json
12+
import math
1213
import os
1314
import os.path
1415
from collections import Counter, OrderedDict
@@ -21,9 +22,57 @@
2122
csv.field_size_limit(10000000) # default is 131072, too small for some big files
2223

2324

25+
def _missing_value_kind(val):
26+
"""Return a stable kind for NaN-like values without importing optional deps."""
27+
val_cls = val.__class__
28+
cls_module = getattr(val_cls, "__module__", "")
29+
cls_name = getattr(val_cls, "__name__", "")
30+
31+
if (cls_module == "pandas" or cls_module.startswith("pandas.")) and cls_name in ("NAType", "NaTType"):
32+
return cls_name
33+
34+
try:
35+
if math.isnan(val):
36+
return "NaN"
37+
except (TypeError, ValueError):
38+
pass
39+
40+
return None
41+
42+
43+
def _val_to_delete(val, vals):
    """Tell whether ``val`` matches one of the values selected for deletion.

    A NaN-like value (float NaN, pandas.NA, pandas.NaT) matches only a
    candidate of the same missing-value kind, so it is removed only when such
    a value is explicitly present in ``vals``. Every other value is matched
    with ``==``.

    Args:
        val: the value to check.
        vals: a single string, or an iterable of values to delete.

    Returns:
        True if ``val`` should be deleted, False otherwise.
    """
    candidates = [vals] if is_str(vals) else vals
    kind_of_val = _missing_value_kind(val)

    for entry in candidates:
        kind_of_entry = _missing_value_kind(entry)

        if not kind_of_val and not kind_of_entry:
            # Neither side is NaN-like: fall back to plain equality. The
            # comparison (or the truth test of its result) may raise for
            # some types; such a candidate simply does not match.
            try:
                if val == entry:
                    return True
            except (TypeError, ValueError):
                pass
        elif kind_of_val == kind_of_entry:
            # Both sides are NaN-like and of the same kind.
            return True

    return False
68+
69+
2470
def dict_sweep(d, vals=None, remove_invalid_list=False):
2571
"""
26-
Remove keys whose values are ".", "-", "", "NA", "none", " "; and remove empty dictionaries
72+
Remove keys whose values are ".", "-", "", "NA", "none", " "; and remove empty dictionaries.
73+
74+
NaN-like values (float NaN, pandas.NA, pandas.NaT) are only removed when
75+
explicitly included in the ``vals`` list.
2776
2877
Args:
2978
d (dict): a dictionary
@@ -45,11 +94,11 @@ def dict_sweep(d, vals=None, remove_invalid_list=False):
4594
# set default supported vals for empty values
4695
vals = vals or [".", "-", "", "NA", "none", " ", "Not Available", "unknown"]
4796
for key, val in list(d.items()):
48-
if val in vals:
97+
if _val_to_delete(val, vals):
4998
del d[key]
5099
elif isinstance(val, list):
51100
if remove_invalid_list:
52-
val = [v for v in val if v not in vals]
101+
val = [v for v in val if not _val_to_delete(v, vals)]
53102
for item in val:
54103
if isinstance(item, dict):
55104
dict_sweep(item, vals, remove_invalid_list=remove_invalid_list)
@@ -59,12 +108,14 @@ def dict_sweep(d, vals=None, remove_invalid_list=False):
59108
else:
60109
d[key] = val
61110
else:
62-
for item in val:
63-
if item in vals:
64-
val.remove(item)
111+
i = 0
112+
while i < len(val):
113+
item = val[i]
114+
if _val_to_delete(item, vals):
115+
del val[i]
65116
elif isinstance(item, dict):
66117
dict_sweep(item, vals, remove_invalid_list=remove_invalid_list)
67-
# if len(val) == 0:
118+
i += 1
68119
if not val:
69120
del d[key]
70121
elif isinstance(val, dict):

0 commit comments

Comments
 (0)